## Code format

In [1]:
# Code format
# For Jupyter Notebook:
%load_ext nb_black
# For Jupyter Lab:
# %load_ext lab_black

<IPython.core.display.Javascript object>

## Libraries

In [2]:
# Interacting with the operating system in a platform-independent way
import os

# Data management tools
# Serialization and deserialization of Python objects
import pickle
import json

# Class used for reading and manipulating PDF files
from PyPDF2 import PdfReader

<IPython.core.display.Javascript object>

In [3]:
# Libraries and classes to load and parse different types of text data
from langchain.document_loaders import (
    UnstructuredPDFLoader,
    OnlinePDFLoader,
    UnstructuredFileLoader,
    TextLoader,
    PyPDFLoader,
)

# Classes for splitting text into characters and recursively splitting text into characters
# Tools for splitting text into smaller chunks for further processing
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)

# Tools for working with OpenAI's GPT-3 language model
# from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings

# Tools for working with vector databases, including authentication with the Pinecone and OpenAI APIs
# from langchain.vectorstores import Chroma, Pinecone
# from langchain.vectorstores.faiss import FAISS
from langchain.vectorstores import (
    Chroma,
    Pinecone,
    ElasticVectorSearch,
    Weaviate,
    FAISS,
)

# Wrapper around OpenAI's API and provides tools for interacting with OpenAI's GPT-3 language model
from langchain.llms import OpenAI

# Tools for building and running natural language processing (NLP) chains
# Class for building a retrieval-based question answering (QA) system and chatbots that interact with vector databases
from langchain.chains import (
    RetrievalQA,
    ChatVectorDBChain,
    ConversationalRetrievalChain,
)

# Generating text prompts
from langchain.prompts.prompt import PromptTemplate

# Load question answering chain
from langchain.chains.question_answering import load_qa_chain

# Class to create indexes in vector databases
from langchain.indexes import VectorstoreIndexCreator

<IPython.core.display.Javascript object>

## Functions

In [4]:
# Function to save an object into a pickle file.
def f_pklsave(arg_obj, arg_path: str) -> None:
    """
    Serialize and save the given object to a pickle file at the specified path.

    The file is opened in binary write mode inside a context manager, so the
    handle is flushed and closed as soon as the dump finishes (or fails).
    An existing file at ``arg_path`` is overwritten.

    Parameters:
    arg_obj (object): The object that needs to be saved in the pickle file.
    arg_path (str): The file path of the pickle file where the object will be saved.

    Returns:
    None

    Raises:
    OSError: If the file cannot be opened for writing.
    pickle.PicklingError: If the object cannot be pickled.

    Example:
    >>> my_object = {"name": "John", "age": 25, "address": "123 Main St"}
    >>> file_path = "./my_object.pkl"
    >>> f_pklsave(my_object, file_path)
    """
    # Dump object to pickle file; `with` guarantees the handle is closed
    with open(arg_path, "wb") as obj_file:
        pickle.dump(arg_obj, obj_file)

<IPython.core.display.Javascript object>

## Variables

In [5]:
# Folder paths
# Relative paths; each ends with "/" because later cells build file paths
# by plain string concatenation
str_folder_credentials = "credentials/"
str_folder_sources = "sources/"
str_folder_outputs = "outputs/"

<IPython.core.display.Javascript object>

In [6]:
# File names
# os.listdir() returns entries in arbitrary order, so sort them
# (case-insensitively) to keep the positional selection made in the next cell
# stable across runs and platforms
lst_pdfs = sorted(
    (x for x in os.listdir(str_folder_sources) if x.endswith(".pdf")),
    key=str.lower,
)
lst_pdfs

['Cap 571H.pdf', 'counter.pdf', 'mifid2.pdf', 'Prospectiva.pdf']

<IPython.core.display.Javascript object>

In [7]:
# Filename, with and without extension
srt_filename = lst_pdfs[2]
# splitext() removes only the final extension, so a name with extra dots
# (e.g. "report.v2.pdf") keeps its full stem instead of being cut at the first dot
srt_filename_next = os.path.splitext(srt_filename)[0]

<IPython.core.display.Javascript object>

In [8]:
# Full paths: the source PDF to read and the pickle file derived from its name
str_path_input = f"{str_folder_sources}{srt_filename}"
str_path_output = f"{str_folder_outputs}{srt_filename_next}.pkl"

<IPython.core.display.Javascript object>

In [10]:
# Show the selected file name and the derived input/output paths
# NOTE(review): str_path_output is listed twice — presumably one entry was meant to be a different variable
srt_filename, str_path_input, str_path_output, str_path_output

('mifid2.pdf',
 'sources/mifid2.pdf',
 'outputs/mifid2.pkl',
 'outputs/mifid2.pkl')

<IPython.core.display.Javascript object>

### Authentication for APIs

In [11]:
# Load credentials
# Create dictionary to store credentials, keyed by file name without extension
lst_cred = {}
# Loop through folder and load each JSON file with credentials
for str_cred_file in sorted(os.listdir(str_folder_credentials)):
    str_cred_name, str_cred_ext = os.path.splitext(str_cred_file)
    # Skip anything that is not a JSON file (hidden files, sub-folders, ...),
    # which would otherwise make json.load() fail
    if str_cred_ext.lower() != ".json":
        continue
    # `with` closes the file handle as soon as the content has been parsed
    with open(os.path.join(str_folder_credentials, str_cred_file), "r") as cred_file:
        lst_cred[str_cred_name] = json.load(cred_file)

<IPython.core.display.Javascript object>

In [12]:
# Pick the API settings out of the loaded credential files; the first key is
# the credential file name without its extension
# NOTE(review): PINECONE_API_KEY is read from "mail" while PINECONE_API_ENV comes from "yahoo" — confirm both belong to the same Pinecone account
OPENAI_API_KEY = lst_cred["yahoo"]["OPENAI_API_KEY"]
PINECONE_API_KEY = lst_cred["mail"]["PINECONE_API_KEY"]
PINECONE_API_ENV = lst_cred["yahoo"]["PINECONE_API_ENV"]

<IPython.core.display.Javascript object>

## Ingestion of data

### Using `UnstructuredPDFLoader`

In [13]:
# Create a LangChain loader for the PDF file
# (its load() call is commented out in the next cell; the loader itself is only
# consumed further down by VectorstoreIndexCreator)
# loader = UnstructuredPDFLoader(str_path_input)
loader = PyPDFLoader(str_path_input)

<IPython.core.display.Javascript object>

In [14]:
# Parse its contents into an object
# data = loader.load()

<IPython.core.display.Javascript object>

### Using `PyPDF`

In [15]:
# Open the PDF file with PyPDF2; its pages are read from `data.pages` in the next cell
data = PdfReader(str_path_input)

<IPython.core.display.Javascript object>

In [16]:
# Read data from the file and store it in a single string
# Extract the text content of every page and concatenate it in page order;
# `or ""` skips pages for which no text is returned, so only pages with
# actual text content contribute
raw_text = "".join(page.extract_text() or "" for page in data.pages)

<IPython.core.display.Javascript object>

### Result

In [17]:
# Display the number of characters in the first document’s page content
# print(f"You have {len(data)} document(s) in your data")
# print(f"There are {len(data[0].page_content)} characters in your document")

<IPython.core.display.Javascript object>

In [21]:
# Display the total number of characters extracted from the PDF
# (len() of the string; no need to iterate over it character by character)
print(f"There are {len(raw_text)} characters in your document")

There are 509600 characters in your document


<IPython.core.display.Javascript object>

### Text split

In [22]:
# Using UnstructuredPDFLoader
# Chunk your data up into smaller documents
# Split Book into Smaller Chunks
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
# texts = text_splitter.split_documents(data)

<IPython.core.display.Javascript object>

In [23]:
# Using PdfReader
# We need to split the text that we read into smaller chunks so that during information retrieval we don't hit the token size limits
text_splitter = CharacterTextSplitter(
    # Split on line breaks
    separator="\n",
    # Sizes are measured with length_function, i.e. in characters here
    chunk_size=1000,
    # NOTE(review): presumably the number of characters shared by consecutive chunks — confirm against the splitter docs
    chunk_overlap=200,
    length_function=len,
)
# Split the raw string; `texts` is what the vector stores below are built from
texts = text_splitter.split_text(raw_text)

<IPython.core.display.Javascript object>

## Embeddings
Although Chroma, FAISS, and Pinecone all provide tools for similarity search in vector databases, they take different approaches and are optimized for different use cases: Chroma is optimized for large-scale datasets, FAISS for high performance and scalability, and Pinecone for fast and efficient similarity search in production environments.

In [24]:
# OpenAI embeddings client, shared by all the vector stores created below
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

<IPython.core.display.Javascript object>

### Chroma
Chroma is an open-source embedding database. It stores embedding vectors together with their source texts and metadata, and answers similarity queries through an approximate nearest neighbor index rather than an exhaustive comparison. It runs embedded in the Python process (in memory unless persistence is configured, as the "without persistence" message further below shows) and is designed to handle large collections of vectors efficiently.

In [None]:
# Create the vector store to use as the index: embed every text chunk and load it into Chroma
db_chroma = Chroma.from_texts(texts, embeddings)

In [None]:
# TODO: Store the Chroma database

### FAISS
Library that provides a range of algorithms for performing similarity search in high-dimensional vector spaces. It uses an index structure to organize the vectors in a way that enables efficient nearest neighbor search. FAISS is designed to be very fast and efficient and is widely used in production environments for large-scale similarity search.

In [25]:
# Use the from_texts() function to embed each text chunk and build a FAISS vector store from the results
db_faiss = FAISS.from_texts(texts, embeddings)

<IPython.core.display.Javascript object>

In [26]:
# Filename
# NOTE: this rebinds str_path_output, replacing the per-PDF path computed in the Variables section
str_path_output = str_folder_outputs + "db_faiss" + ".pkl"
# Store vector database
f_pklsave(db_faiss, str_path_output)

<IPython.core.display.Javascript object>

### Pinecone
Pinecone is a cloud-based vector database service that provides a scalable and efficient way to store and search high-dimensional vectors. Pinecone is optimized for fast and efficient similarity search, and it provides a simple API for managing and searching vector databases. Pinecone is designed to be used in production environments and is scalable to handle millions or even billions of vectors.

In [None]:
# The Pinecone client module is imported here rather than in the top import
# cell because only this optional section needs it; without this import the
# pinecone.init() call below raises NameError
import pinecone

# Initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV,  # next to api key in console
)
# Name of the pinecone index
index_name = "general"
# Pinecone namespace for the document
namespace = "mifid2"

In [None]:
# Use the from_texts() function to convert each document into a vector
# `texts` holds plain strings when it comes from split_text() (the active path
# above) and Document objects when it comes from split_documents() (the
# commented-out path); accept both instead of assuming `.page_content` exists
db_pinecone = Pinecone.from_texts(
    [getattr(t, "page_content", t) for t in texts],
    embeddings,
    index_name=index_name,
    namespace=namespace,
)

In [None]:
# Filename
# NOTE: this rebinds str_path_output once more
str_path_output = str_folder_outputs + "db_pinecone" + ".pkl"
# Store vector database
# NOTE(review): Pinecone is a cloud-based service, so the vectors presumably stay in the remote index — confirm that pickling this object works and is actually needed
f_pklsave(db_pinecone, str_path_output)

### Load pickle file with vector database

In [None]:
# File names
# Keep only pickle files and sort them (case-insensitively): os.listdir()
# returns entries in arbitrary order, and the next cell selects by position
lst_pkls = sorted(
    (x for x in os.listdir(str_folder_outputs) if x.endswith(".pkl")),
    key=str.lower,
)
lst_pkls

In [None]:
# Path of the pickle file to load: output folder plus the third listed file name
str_filename_pkl = f"{str_folder_outputs}{lst_pkls[2]}"
str_filename_pkl

In [None]:
# Load the stored vector database back from disk
# SECURITY: unpickling can execute arbitrary code — only load files that this
# notebook (or another trusted source) wrote
with open(str_filename_pkl, "rb") as pkl_file:
    db_vector = pickle.load(pkl_file)

## Query data
`load_qa_chain`, `RetrievalQA`, `VectorstoreIndexCreator` and `ConversationalRetrievalChain` are all part of the LangChain library, which provides tools for building and running natural language processing (NLP) chains.

`load_qa_chain` and `RetrievalQA` are focused on question answering, while `VectorstoreIndexCreator` and `ConversationalRetrievalChain` are more general-purpose tools for working with text data and search.

In [27]:
# Create new instance of the OpenAI class
# No temperature is passed, so the wrapper's default applies; the commented-out
# variant pins temperature=0 instead
# llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
llm = OpenAI(openai_api_key=OPENAI_API_KEY)

<IPython.core.display.Javascript object>

In [28]:
# Question that the NLP chain will attempt to answer
# (the same `query` variable is reused by every querying section below)
query = "What is the full title of this document?"

<IPython.core.display.Javascript object>

In [29]:
# Selection of vector database
# Only db_faiss comes from a cell with a recorded execution; switch to one of
# the alternatives only after running the corresponding Chroma or Pinecone section
# docsearch = db_chroma
docsearch = db_faiss
# docsearch = db_pinecone

<IPython.core.display.Javascript object>

In [30]:
# Create a retriever from the vector database
# Similarity search with k=2 (the recorded get_relevant_documents output below shows two documents)
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 2})

<IPython.core.display.Javascript object>

### `load_qa_chain`
Function that builds a question-answering chain around the given LLM, using the document-combination strategy selected with `chain_type`. The returned chain answers a question from the documents passed to it as context (`input_documents`); it does not retrieve documents itself.

In [31]:
# Build a question-answering chain around `llm`; chain_type="stuff" selects how the
# input documents are combined with the question (see the Parameters section below)
chain = load_qa_chain(llm=llm, chain_type="stuff")

<IPython.core.display.Javascript object>

In [32]:
# Search the vector database for the chunks most similar to the query string
# `docs` is the resulting list of documents, passed to the chain as context in the next cell
# NOTE(review): presumably ordered by similarity score — confirm against the vector store docs
docs = docsearch.similarity_search(query)

<IPython.core.display.Javascript object>

In [33]:
# Run the chain: the LLM answers the question using the retrieved documents as context
chain.run(input_documents=docs, question=query)

' Directive 2014/65/EU of the European Parliament and of the Council of 15 May 2014 on markets in financial instruments and amending Directive 2002/92/EC and Directive 2011/61/EU.'

<IPython.core.display.Javascript object>

#### Parameters

`stuff`, `map_reduce`, `refine`, and `map_rerank` are the values accepted by the `chain_type` argument. They are different strategies used in the LangChain package for combining the input documents with the question in tasks such as question answering or summarization.

- `stuff` strategy inserts ("stuffs") all input documents into a single prompt and sends it to the model in one call. It is the simplest approach, but it only works while the documents fit within the model's context window.

- `map_reduce` strategy divides a large dataset into smaller parts, applies a function (map) to each part, and then combines the results (reduce) to generate the final output. This strategy can be used to speed up the processing of large datasets.

- `refine` strategy processes the documents one at a time: it produces an initial answer from the first document and then iteratively asks the model to refine that answer using each subsequent document.

- `map_rerank` strategy is similar to the `map_reduce` strategy in that each document is processed separately, but instead of combining the partial results it asks the model to score each answer and returns the highest-scoring answer as the final output.

These strategies can be used in combination to achieve better results depending on the specific use case and the type of data being processed.

[Source.](https://python.langchain.com/en/latest/modules/chains/index_examples/qa_with_sources.html)

In [None]:
### For multiple documents
# Collect the documents of every loader; append more loaders to the list to
# query several files at once. (Previously `documents` was never defined
# because this code was commented out, so the cell raised NameError.)
loaders = [loader]
documents = []
for doc_loader in loaders:
    documents.extend(doc_loader.load())

# NOTE: map_reduce calls the LLM for the individual documents and again to
# combine the partial answers, so this cell can be slow and costly on a large PDF
chain = load_qa_chain(llm=llm, chain_type="map_reduce")
# Kept in its own variable so that the `query` used by the following sections
# is not overwritten when the notebook is run top to bottom
query_multi = "what is the total number of AI publications?"
chain.run(input_documents=documents, question=query_multi)

### `RetrievalQA`
Class that implements a question answering pipeline that first retrieves relevant documents and then answers the question from them. The documents come from whichever retriever is passed in (here, the retriever created above from the FAISS vector database, which ranks chunks by embedding similarity), and they are handed to the LLM using the selected `chain_type`.

In [34]:
# Create a chain that fetches context through `retriever` and then answers the question
# return_source_documents=True adds the retrieved chunks to the result under "source_documents"
qa = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True
)

<IPython.core.display.Javascript object>

In [35]:
# Ask the question; the result is a dict with "query", "result" and "source_documents" keys
result = qa({"query": query})

<IPython.core.display.Javascript object>

In [36]:
# Inspect the chunks the retriever returns for this query
retriever.get_relevant_documents(query)

[Document(page_content='No 45/2001 and delivered an opinion on 10 February 2012 (  1 ). \n(168)  In accordance with the Joint Political Declaration of Member States and the Commission on explanatory \ndocuments of 28 September 2011 (  2 ), Member States have undertaken to accompany, in justified cases, the \nnotification of their transposition measures with one or more documents explaining the relationship between \nthe components of a directive and the corresponding parts of national transposition instruments. With regard to \nthis Directive, the legislator considers the transmission of such documents to be justified.  \n(169)  The obligation to transpose this Directive into national law should be confined to those provisions which represent \na substantive amendment as compared to the earlier Directives. The obligation to transpose the provisions which \nare unchanged arises under the earlier Directives.', metadata={}),
 Document(page_content='provided in a standardised format;  \n(d

<IPython.core.display.Javascript object>

In [37]:
# Show the full result dictionary (answer plus source documents)
result

{'query': 'What is the full title of this document?',
 'result': " I don't know.",
 'source_documents': [Document(page_content='No 45/2001 and delivered an opinion on 10 February 2012 (  1 ). \n(168)  In accordance with the Joint Political Declaration of Member States and the Commission on explanatory \ndocuments of 28 September 2011 (  2 ), Member States have undertaken to accompany, in justified cases, the \nnotification of their transposition measures with one or more documents explaining the relationship between \nthe components of a directive and the corresponding parts of national transposition instruments. With regard to \nthis Directive, the legislator considers the transmission of such documents to be justified.  \n(169)  The obligation to transpose this Directive into national law should be confined to those provisions which represent \na substantive amendment as compared to the earlier Directives. The obligation to transpose the provisions which \nare unchanged arises under 

<IPython.core.display.Javascript object>

### `VectorstoreIndexCreator`
Class that creates an index for a vector store. A vector store is a database that stores embeddings (i.e. numerical representations) of text data. The index is used to efficiently search for embeddings that are similar to a given query embedding.

In [38]:
# Create a searchable index of embeddings based on a set of input documents
# The documents come from `loader` (the PyPDFLoader created in the ingestion section)
index = VectorstoreIndexCreator(
    # split the documents into chunks
    text_splitter=text_splitter,
    # select which embeddings we want to use
    embedding=embeddings,
    # use Chroma as the vector store to index and search embeddings
    vectorstore_cls=Chroma,
).from_loaders([loader])

Using embedded DuckDB without persistence: data will be transient


<IPython.core.display.Javascript object>

In [39]:
# Ask the question through the index, again with the "stuff" chain type
index.query(llm=llm, question=query, chain_type="stuff")

' This document is called "Directive 2014/600/EU of the European Parliament and of the Council of 15 May 2014 on markets in financial instruments and amending Directive 2002/92/EC and Directive 2011/61/EU".'

<IPython.core.display.Javascript object>

### `ConversationalRetrievalChain`
Class that implements a conversational question-answering pipeline on top of a retriever. For each call it takes the new question together with the chat history, uses the language model to condense them into a standalone question, retrieves the documents relevant to that question, and answers from them.

Conversation memory + RetrievalQAChain

Allow for passing in chat history which can be used for follow up questions.

In [40]:
# Create a chain to answer questions with chat history
# NOTE: this rebinds `qa`, replacing the RetrievalQA chain created in the earlier section
qa = ConversationalRetrievalChain.from_llm(llm, retriever)

<IPython.core.display.Javascript object>

In [41]:
# Initialize an empty list which will be used to keep track of the chat history
chat_history = []

<IPython.core.display.Javascript object>

In [42]:
# Ask the question together with the (still empty) chat history
result = qa({"question": query, "chat_history": chat_history})

<IPython.core.display.Javascript object>

In [43]:
# Show the answer text
result["answer"]

" I don't know."

<IPython.core.display.Javascript object>

#### Prompt another question

In [None]:
# Record the previous exchange as a (question, answer) tuple so the chain receives it as history
chat_history = [(query, result["answer"])]
# NOTE(review): this follow-up refers to "this number", but the previous question asked for the document title — presumably left over from an example; confirm the intended question
query = "What is this number divided by 2?"
result = qa({"question": query, "chat_history": chat_history})

In [None]:
# Show the history that was passed to the chain
chat_history

In [None]:
# Show the answer to the follow-up question
result['answer']