## Code format

In [1]:
# Code format
# For Jupyter Notebook:
%load_ext nb_black
# For Jupyter Lab:
# %load_ext lab_black

<IPython.core.display.Javascript object>

## Libraries

In [2]:
# Interacting with the operating system in a platform-independent way
import os

# Data management tools
# Serialization and deserialization of Python objects
import pickle
import json

<IPython.core.display.Javascript object>

In [48]:
# Libraries and classes to load and parse different types of text data
from langchain.document_loaders import (
    UnstructuredPDFLoader,
    OnlinePDFLoader,
    UnstructuredFileLoader,
)

# Tools for splitting text into smaller chunks for further processing
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter

# Class used for reading and manipulating PDF files
from PyPDF2 import PdfReader

# Tools for working with vector databases, including authentication with the Pinecone and OpenAI APIs
# from langchain.vectorstores import Chroma, Pinecone
# from langchain.vectorstores.faiss import FAISS
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS

# Tools for working with OpenAI's GPT-3 language model
# from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings

# Import external vector database library
import pinecone

# Wrapper around OpenAI's API and provides tools for interacting with OpenAI's GPT-3 language model
from langchain.llms import OpenAI

# Tools for building and running natural language processing (NLP) chains for question answering, generating text prompts
# Load question answering chain
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import ChatVectorDBChain

<IPython.core.display.Javascript object>

## Variables

In [4]:
# Folder paths
str_folder_credentials = "credentials/"
str_folder_sources = "sources/"
str_folder_outputs = "outputs/"

<IPython.core.display.Javascript object>

In [5]:
# File names
lst_pdfs = [x for x in os.listdir(str_folder_sources) if x.endswith(".pdf")]
lst_pdfs

['Cap 571H.pdf', 'counter.pdf', 'mifid2.pdf', 'Prospectiva.pdf']

<IPython.core.display.Javascript object>

In [6]:
# Filename, with and without extension
srt_filename = lst_pdfs[2]
srt_filename_next = srt_filename.split(".")[0]

<IPython.core.display.Javascript object>

In [13]:
srt_filename

'mifid2.pdf'

<IPython.core.display.Javascript object>

In [15]:
# Filename and folders
str_path_input = str_folder_sources + srt_filename
str_path_output = str_folder_outputs + srt_filename_next + ".pkl"

<IPython.core.display.Javascript object>

In [16]:
str_path_input, str_path_output

('sources/mifid2.pdf', 'outputs/mifid2.pkl')

<IPython.core.display.Javascript object>

### Pinecone and OpenAI

In [8]:
# Load credentials
# Create dictionary to store credentials
lst_cred = {}
# Loop through folder and load json file with credentials
for x in [x for x in os.listdir(str_folder_credentials)]:
    lst_cred[x.split(".")[0]] = json.load(open(str_folder_credentials + x, "r"))

<IPython.core.display.Javascript object>

In [9]:
OPENAI_API_KEY = lst_cred["yahoo"]["OPENAI_API_KEY"]
PINECONE_API_KEY = lst_cred["mail"]["PINECONE_API_KEY"]
PINECONE_API_ENV = lst_cred["yahoo"]["PINECONE_API_ENV"]

<IPython.core.display.Javascript object>

## Ingestion of data

### Using `UnstructuredPDFLoader`

In [10]:
loader = UnstructuredPDFLoader(str_path_input)

<IPython.core.display.Javascript object>

In [11]:
data = loader.load()

Downloading model_final.pth:   0%|          | 0.00/330M [00:00<?, ?B/s]

Downloading (…)50_FPN_3x/config.yml:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

<IPython.core.display.Javascript object>

In [12]:
# Display the number of characters in the first document’s page content
print(f"You have {len(data)} document(s) in your data")
print(f"There are {len(data[0].page_content)} characters in your document")

You have 1 document(s) in your data
There are 578567 characters in your document


<IPython.core.display.Javascript object>

### Using `PyPDF`

In [43]:
data = PdfReader(str_path_input)

<IPython.core.display.Javascript object>

In [45]:
# read data from the file and put them into a variable called raw_text
raw_text = ""
for i, page in enumerate(data.pages):
    text = page.extract_text()
    if text:
        raw_text += text

<IPython.core.display.Javascript object>

### Split text

In [17]:
# Chunk your data up into smaller documents
# Split Book into Smaller Chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
texts = text_splitter.split_documents(data)

<IPython.core.display.Javascript object>

In [46]:
# We need to split the text that we read into smaller chunks so that during information retreival we don't hit the token size limits
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

<IPython.core.display.Javascript object>

In [47]:
print(f"Now you have {len(texts)} documents")

Now you have 627 documents


<IPython.core.display.Javascript object>

### Create embeddings and store them

In [19]:
# Create embeddings of your documents to get ready for semantic search
# Wrapper around OpenAI embedding models
# Load Data to vectorstore (for local storage)
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
vectorstore = FAISS.from_documents(texts, embeddings)

<IPython.core.display.Javascript object>

In [20]:
# Store embeddings
pickle.dump(vectorstore, open(str_path_output, "wb"))

<IPython.core.display.Javascript object>

### Create embeddings and store them (`PyPDF`)

In [51]:
# Download embeddings from OpenAI
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

<IPython.core.display.Javascript object>

In [52]:
# Use the from_texts() function to convert each document into a vector
docsearch = FAISS.from_texts(texts, embeddings)

<IPython.core.display.Javascript object>

In [53]:
# Change filename
str_path_output = str_folder_outputs + srt_filename_next + "_v2" + ".pkl"
# Store embeddings
pickle.dump(docsearch, open(str_path_output, "wb"))

<IPython.core.display.Javascript object>

### Load pickle file with vectorstore

In [21]:
# File names
lst_pkls = [x for x in os.listdir(str_folder_outputs)]
lst_pkls

['chat.pkl', 'file.pkl', 'mifid2.pkl', 'Prospectiva', 'vectorstore.pkl']

<IPython.core.display.Javascript object>

In [22]:
# Pickle filename and path
str_filename_pkl = str_folder_outputs + lst_pkls[2]
str_filename_pkl

'outputs/mifid2.pkl'

<IPython.core.display.Javascript object>

In [None]:
vectorstore = pickle.load(open(str_filename_pkl, "rb"))

### Pinecone

In [28]:
# Initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV,  # next to api key in console
)
# Name of the pinecone index
index_name = "general"
# Pinecone namespace for the document
namespace = "mifid2"

<IPython.core.display.Javascript object>

In [29]:
# Use the from_texts() function to convert each document into a vector
docsearch = Pinecone.from_texts(
    [t.page_content for t in texts],
    embeddings,
    index_name=index_name,
    namespace=namespace,
)

<IPython.core.display.Javascript object>

## Query data

### Using locally store vector database

In [23]:
llm = OpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=OPENAI_API_KEY)



<IPython.core.display.Javascript object>

In [24]:
#
QA_PROMPT = "What is the title of this document?"
#
a_chain = ChatVectorDBChain.from_llm(
    llm=llm, vectorstore=vectorstore, return_source_documents=True
)
#
result = a_chain({"question": QA_PROMPT, "chat_history": ""})



<IPython.core.display.Javascript object>

In [25]:
result.keys()

dict_keys(['question', 'chat_history', 'answer', 'source_documents'])

<IPython.core.display.Javascript object>

In [26]:
result["answer"]

'The title of this document is not provided.'

<IPython.core.display.Javascript object>

### Pinecone

In [30]:
llm_pc = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm_pc, chain_type="stuff")

<IPython.core.display.Javascript object>

In [31]:
query = "What is the title of this document?"
docs = docsearch.similarity_search(query, include_metadata=True, namespace=namespace)
chain.run(input_documents=docs, question=query)

' The title of this document is the Joint Political Declaration of Member States and the Commission on explanatory documents of 28 September 2011.'

<IPython.core.display.Javascript object>

In [32]:
query = "Summarize each section of the document"
docs = docsearch.similarity_search(query, include_metadata=True, namespace=namespace)
chain.run(input_documents=docs, question=query)

' The Joint Political Declaration of Member States and the Commission on explanatory documents outlines the requirement for Member States to provide documents explaining the relationship between the components of a directive and the corresponding parts of national transposition instruments. The Scope and Definitions section outlines the content of the information to be published, as well as the concrete organisational requirements. The ESMA section states that ESMA shall submit draft regulatory technical standards to the Commission by 3 July. The Conditions for CTPs section outlines the organisational requirements for the delegated acts, as well as the information to be provided to clients about costs and charges.'

<IPython.core.display.Javascript object>

### FAISS

In [55]:
chain = load_qa_chain(OpenAI(openai_api_key=OPENAI_API_KEY), chain_type="stuff")

<IPython.core.display.Javascript object>

In [57]:
query = "What is the full title of this document?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' This document is Directive 2014/65/EU of the European Parliament and of the Council of 15 May 2014 on markets in financial instruments and amending Directive 2002/92/EC and Directive 2011/61/EU.'

<IPython.core.display.Javascript object>

In [58]:
query = "Summarize each section of the document"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' Article 23(1): The investment firm must establish a record that includes the document or documents agreed between the investment firm and the client that set out the rights and obligations of the parties, and the other terms on which the investment firm will provide services to the client. Article 23(2): The investment firm must provide the client with adequate reports on the service provided in a durable medium. The reports must include periodic communications to clients, taking into account the type and complexity of financial instruments involved and the nature of the service provided and must include the associated costs. Article 23(4): The information must be provided in a comprehensible form, in a way that clients can understand the nature and risks of the investment service, and of the type of financial instrument being offered, and must include the overall cost and cumulative effect on return of the investment. Where requested, an itemised breakdown must be provided.'

<IPython.core.display.Javascript object>

In [59]:
query = "Classify each article of the document"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' According to the document, Article 12 is in Part 1, Article 13(1) is in Part 2, Article 13(2) is in Part 3, Article 14(1) is in Part 4, Article 14(2) is in Part 5, Article 14(3) is in Part 6, Article 14(4) is in Part 7, Article 14(5) is in Part 8, Article 14(6) is in Part 9, Article 15 is in Part 10, Article 16(1) is in Part 11, Article 16(2) is in Part 12, Article 16(3) is in Part 13, Article 17(1) is in Part 14, Article 17(2) is in Part 15, Article 18(1) is in Part 16, Article 18(2) is in Part 17, Article 18(3) is in Part 18, Article 19(1) is in Part 19, Article 19(2) is in Part 20, Article 19(3) is in Part 21, and Article 19(4) is in Part 22.'

<IPython.core.display.Javascript object>

In [60]:
query = "Summarize each part of the document"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' This document outlines the rights and obligations of investment firms and their clients. It provides information on the documents and reports that must be provided to clients, the information that must be provided to the client to understand the overall cost, and the obligations of the investment firm when complying with Article 23. It also outlines the relationship between the components of a directive and the corresponding parts of national transposition instruments.'

<IPython.core.display.Javascript object>

In [61]:
query = "Transaction reporting obligations"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' Investment firms must take all reasonable steps to record relevant telephone conversations and electronic communications related to transactions or the provision of client order services. They must also notify clients that such communications will be recorded, and must provide periodic reports to clients about the quality of execution of transactions, including details about price, costs, and other services undertaken on their behalf.'

<IPython.core.display.Javascript object>

In [62]:
query = "List communications obligations"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' Investment firms must provide clear disclosure of potential conflicts of interest, record telephone conversations and electronic communications involving client orders, provide a record of the agreement between the firm and the client, provide adequate reports to the client in a durable medium, make sure marketing communications are fair, clear, and not misleading, and notify clients that telephone communications with the firm will be recorded.'

<IPython.core.display.Javascript object>