# Content Engine Pipeline
## _RAG based App to chat with local PDFs_
## Author : __Abhijit Mandal__

#### Supressing all the warnings here

In [1]:
import warnings
warnings.filterwarnings('ignore')

#### Installing all the dependencies here

In [2]:
!pip install langchain
!pip install -U langchain-community
!pip install langchain_pinecone
!pip install unstructured
!pip install langchain-text-splitters
!pip install sentence-transformers
!pip install llama-cpp-python

Collecting langchain
  Downloading langchain-0.2.6-py3-none-any.whl (975 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m975.5/975.5 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.10 (from langchain)
  Downloading langchain_core-0.2.11-py3-none-any.whl (337 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.4/337.4 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl (25 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.83-py3-none-any.whl (127 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.5/127.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.10->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting orjson<4.0.0,>=3.9.14 (from lang

#### Importing the required libs here

In [4]:
from langchain_pinecone import PineconeVectorStore
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms import LlamaCpp
import os
import glob

## Loading the local PDF data files here

#### Creating a func to extract data from the PDF.

In [5]:
def load_pdf(data):
    loader = DirectoryLoader(
                data,
                glob="*.pdf",
                loader_cls=PyPDFLoader
    )
    documents = loader.load()
    return documents

#### Loaded all the PDFs here

In [6]:
docs = load_pdf("/content/drive/MyDrive/Colab Datasets/PDFs")

#### Checking for the content inside randomly

In [7]:
docs[100].page_content

'$162 million was determined by referencing a financing transaction and used as an input to an OPM. Other key inputs to the OPM were discount rates of 22% and\n28%, volatility of 70% and time to liquidity of 1.25 years.\nThe fair value of our Lime investments as of December 31, 2022 of $113 million was determined by referencing a financing transaction and used as an input to\nan OPM. Other key inputs to the OPM were discount rates of 32% and 38%, volatility of 87% and time to liquidity of 1.50 years.\nFinancial Assets and Liabilities Measured at Fair Value Using Level 3 Inputs\nThe following table presents a reconciliation of our financial assets and liabilities measured and recorded at fair value on a recurring basis as of December 31,\n2021 and 2022, using significant unobservable inputs (Level 3) (in millions):\nNon-marketable\nDebt SecuritiesNon-marketable\nEquity Securities Notes Receivable MLU B.V. Call Option\nBalance as of December 31, 2020 $ 2,341 $ 52 $ 83 $ — \nChange in fai

## Setting up the Pinecone Vector Database here

#### Fetching the key info from `Secrets` here

In [17]:
from google.colab import userdata
os.environ["PINECONE_API_KEY"] = userdata.get("PINECONE_API_KEY")
os.environ["PINECONE_API_ENV"] = userdata.get("PINECONE_API_ENV")
index_name = os.environ["index_name"] = userdata.get("index_name")

#### Created a func to split the text from the whole document and creating chunks here

In [9]:
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

#### Checking for the total splits made for the entire doc

In [10]:
len(text_split(docs))

3438

#### Performing document text splitting into chunks here

In [11]:
text_chunks = text_split(extracted_data=docs)

#### Checking for the first chunk here

In [12]:
text_chunks[0].page_content

'UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\n____________________________________________ \nFORM 10-K\n____________________________________________ \n(Mark One)\n☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the fiscal year ended December 31, 2022\nOR\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the transition period from_____ to _____            \nCommission File Number: 001-38902'

#### Checking for the last chunk here

In [13]:
text_chunks[3437].page_content

'any\tcompensation\tthat\tis\tsubject\tto\trecoupment\tand/or\tforfeiture\tunder\tthe\tPolicy.\nCapitalized\tterms\tused\tbut\tnot\tdefined\therein\thave\tthe\tmeanings\tset\tforth\tin\tthe\tPolicy.\nSigned:\nPrint\tName:\nDate:'

# Performing Text embeddings for the text-chunks here

#### Created a func to convert the text-chunks into embeddings

In [14]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

#### Loading the embeddings here

In [15]:
embeddings = download_hugging_face_embeddings()

  warn_deprecated(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Creating Vector Database Store using Pinecone API

In [18]:
vector_store = PineconeVectorStore.from_documents(
    text_chunks,
    embeddings,
    index_name=index_name,
)

## Performing Similarity Search here

In [19]:
test_query = "Who is the founder of Tesla?"

In [21]:
similar_docs = vector_store.similarity_search(test_query)

In [22]:
[s.page_content for s in similar_docs]

['Tesla,\tInc.\nNotes\tto\tConsolidated\tFinancial\tStatements\nNote\t1\t–\t\nOverview\nTesla,\tInc.\t(“Tesla”,\tthe\t“Company”,\t“we”,\t“us”\tor\t“our”)\twas\tincorporated\tin\tthe\tState\tof\tDelaware\ton\tJuly\t1,\t2003.\tWe\tdesign,\tdevelop,\nmanufacture,\tsell\tand\tlease\thigh-performance\tfully\telectric\tvehicles\tand\tenergy\tgeneration\tand\tstorage\tsystems,\tand\toffer\tservices\trelated\tto\tour\nproducts.\tOur\tChief\tExecutive\tOfficer,\tas\tthe\tchief\toperating\tdecision\tmaker\t(“CODM”),\torganizes\tour\tcompany,\tmanages\tresource\tallocations\tand',
 'Tesla\tEnergy\tVentures\tHoldings\tB.V.\nNetherlands\nTesla\tFinance\tLLC\nDelaware\nTesla\tFinancial\tLeasing\t(China)\tCo.,\tLtd.\nChina\nTesla\tFinancial\tServices\tGmbH\nGermany\nTesla\tFinancial\tServices\tHoldings\tB.V.\nNetherlands\nTesla\tFinancial\tServices\tLimited\nUnited\tKingdom\nTesla\tFrance\tS.à\tr.l.\nFrance\nTesla\tGermany\tGmbH\nGermany\nTesla\tGeneral\tInsurance,\tInc.\nArizona\nTesla\tGreece\tSing

# Creating Prompt Template to interact with the LLM

In [27]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Answer:
"""

#### Creating prompt chain here

In [28]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

## Loading the `instruct` LLM here
#### Model name : `mistral-7b-instruct-v0.1.Q4_K_M`
[Model Link Here](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF)

In [29]:
llm = LlamaCpp(
               streaming=True,
               model_path='/content/drive/MyDrive/Colab Datasets/Models/mistral-7b-instruct-v0.1.Q4_K_M.gguf',
               temperature = 0.75,
               top_p=1,
               verbose=True,
               n_ctx=4096
)

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /content/drive/MyDrive/Colab Datasets/Models/mistral-7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model

## Creating the `RAG Pipeline` here

In [30]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vector_store.as_retriever(search_kwargs={"k":2}))

# Asking the Chatbot/Model to answer the `Sample Questions` here
* What are the risk factors associated with Google and Tesla?
* What is the total revenue for Google Search?
* What are the differences in the business of Tesla and Uber?

#### Creating a question bank here

In [32]:
ques_list = [
    "What are the risk factors associated with Google and Tesla?",
    "What is the total revenue for Google Search?",
    "What are the differences in the business of Tesla and Uber?"
]
ans_list = []

#### Retreiving answers for each questions here

In [33]:
for i in range(len(ques_list)):
    result = qa.invoke(ques_list[i])
    answer = result["result"]
    ans_list.append(answer)


llama_print_timings:        load time =    3888.51 ms
llama_print_timings:      sample time =      57.09 ms /    83 runs   (    0.69 ms per token,  1453.74 tokens per second)
llama_print_timings: prompt eval time =  134139.80 ms /   260 tokens (  515.92 ms per token,     1.94 tokens per second)
llama_print_timings:        eval time =   64944.06 ms /    82 runs   (  792.00 ms per token,     1.26 tokens per second)
llama_print_timings:       total time =  199270.15 ms /   342 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    3888.51 ms
llama_print_timings:      sample time =      95.34 ms /   140 runs   (    0.68 ms per token,  1468.51 tokens per second)
llama_print_timings: prompt eval time =  207928.69 ms /   384 tokens (  541.48 ms per token,     1.85 tokens per second)
llama_print_timings:        eval time =  113349.77 ms /   140 runs   (  809.64 ms per token,     1.24 tokens per second)
llama_print_timings:       total time =  321598.53 ms /   524 

In [34]:
ans_list

[" The risk factors associated with Google include ongoing commitment to sustainability, investments in renewable energy and environmental initiatives, competition from other companies in the technology industry, and potential changes in government policies or regulations related to technology. The risk factors associated with Tesla include ongoing competition in the electric vehicle market, regulatory risks related to autonomous driving technology, and potential issues with the company's supply chain and manufacturing processes.",
 ' To find the total revenue for Google Search, you can add up the revenue from the following categories: Google Search & other and YouTube ads. The revenue from Google Search & other increased by $12.6 billion from 2022 to 2023, while the revenue from YouTube ads also increased by $2.3 billion. Therefore, the total revenue for Google Search in 2023 is $175.033 million + $31.510 million = $<<175.033+31.510=206.543>>206.543 million.',
 ' Tesla is an automaker

#### Final Questions and their respective Answers here

In [35]:
for i in range(len(ques_list)):
    print(f"Question : {ques_list[i]}")
    print(f"Answer : {ans_list[i]}")
    print()

Question : What are the risk factors associated with Google and Tesla?
Answer :  The risk factors associated with Google include ongoing commitment to sustainability, investments in renewable energy and environmental initiatives, competition from other companies in the technology industry, and potential changes in government policies or regulations related to technology. The risk factors associated with Tesla include ongoing competition in the electric vehicle market, regulatory risks related to autonomous driving technology, and potential issues with the company's supply chain and manufacturing processes.

Question : What is the total revenue for Google Search?
Answer :  To find the total revenue for Google Search, you can add up the revenue from the following categories: Google Search & other and YouTube ads. The revenue from Google Search & other increased by $12.6 billion from 2022 to 2023, while the revenue from YouTube ads also increased by $2.3 billion. Therefore, the total reve