In [0]:
# INSTALLS
%pip install langchain accelerate transformers ctransformers PyPDF2 pymupdf unstructured[pdf] faiss-cpu -q
dbutils.library.restartPython()

Python interpreter will be restarted.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
qpd 0.4.4 requires antlr4-python3-runtime<4.12,>=4.11.1, but you have antlr4-python3-runtime 4.9.3 which is incompatible.
httpx 0.13.3 requires idna==2.*, but you have idna 3.6 which is incompatible.
botocore 1.34.34 requires urllib3<1.27,>=1.25.4; python_version < "3.10", but you have urllib3 2.2.0 which is incompatible.
boto3 1.21.18 requires botocore<1.25.0,>=1.24.18, but you have botocore 1.34.34 which is incompatible.
Python interpreter will be restarted.


In [0]:
# IMPORTS
import os
from langchain.llms import CTransformers
from langchain import PromptTemplate, LLMChain
import boto3
from PyPDF2 import PdfReader
from io import BytesIO
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader, PyMuPDFLoader, S3FileLoader
from langchain.retrievers import BM25Retriever
from langchain.text_splitter import RecursiveCharacterTextSplitter 
import pickle
from langchain.chains import RetrievalQA

In [0]:
# VARIABLES

MODEL_NAME = 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
MODEL_FILE_NAME = 'mistral-7b-instruct-v0.2.Q6_K.gguf'
MODEL_FOLDER_NAME = 'models--TheBloke--Mistral-7B-Instruct-v0.2-GGUF'
MAX_NEW_TOKENS = 1024
CACHE_DIR = '/Workspace/Users/ajay.kumar@miqdigital.com/models'
TEMP = 0.2
MODEL_FILE_PATH = os.path.join(CACHE_DIR, MODEL_FOLDER_NAME)

In [0]:
# COMMANDS TO MOVE THE FOLDER FROM CACHE DIRECTORY TO MODELS DIRECTORY

# import subprocess
# subprocess.run(['rsync', '-r', '~/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.2-GGUF', '/Workspace/Users/ajay.kumar@miqdigital.com/models'])

In [0]:
llm = CTransformers(
        model = MODEL_NAME,
        model_file = MODEL_FILE_NAME,
        max_new_tokens = MAX_NEW_TOKENS,
        temperature = TEMP
    )

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [0]:
# READ PDF FROM THE BUCKET OR LOCAL FILE DIR
bucket_name ="prod-ai-and-automation"
item_name = "proj_pptrag_jiradump_ppts/inputdocs/processed_jiradump_docformat.pdf"
SOURCE_DIR = "/Workspace/Users/ajay.kumar@miqdigital.com/inputdocs/processed_jiradump_docformat.pdf"

In [0]:
# FUNCTION TO CREATE A PERSIST FAISS VECTOR DB

def create_vector_db(SOURCE_DIR):
    loader = PyPDFLoader(SOURCE_DIR)
    document = loader.load()
    CHUNK_SIZE = 1024
    CHUNK_OVERLAP = 200
    EMBEDDER = "BAAI/bge-base-en-v1.5"
    FAISS_PATH = "/Workspace/Users/ajay.kumar@miqdigital.com/vector_dbs/FAISS_RAG"

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE,
                                                   chunk_overlap=CHUNK_OVERLAP)
    texts = text_splitter.split_documents(document)

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDER,
                                       model_kwargs={'device': 'cpu'})

    db = FAISS.from_documents(texts, embeddings)
    db.save_local(FAISS_PATH)


In [0]:
# REQUIRED ONLY FIRST TIME (DATA INGESTION)

# create_vector_db(SOURCE_DIR)

In [0]:
PROMPT_TEMPLATE = '''
You are analysing a document. The document has "User Request" and "Response". The "Response" consists of information regarding slide numbers and their resepctive contents. Your job is to tell me the slide numbers and the contents "Response" in the document, based on the request that I ask you. If you don't know the answer, just say "I don't know the answer."
DO NOT make up answers that are not based on facts. Explain with detailed answers that are easy to understand.
Context: {context}
Question: {question}
Only return the useful aspects of the answer below and nothing else.
Helpful answer:
'''
EMBEDDER = "BAAI/bge-base-en-v1.5"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDER,
                                       model_kwargs={'device': 'cpu'})
FAISS_PATH = "/Workspace/Users/ajay.kumar@miqdigital.com/vector_dbs/FAISS_RAG"
INP_VARS = ['context', 'question']
CHAIN_TYPE = "stuff"
SEARCH_KWARGS = {'k': 4}
MAX_NEW_TOKENS = 100


In [0]:
faiss_vectorstore = FAISS.load_local(FAISS_PATH, embeddings)
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs=SEARCH_KWARGS)

custom_prompt_temp = PromptTemplate(template=PROMPT_TEMPLATE,input_variables=INP_VARS)

qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    retriever=faiss_retriever,
    return_source_documents=True,
    chain_type='stuff',
    verbose = True,
    chain_type_kwargs={"prompt": custom_prompt_temp}
)

In [0]:
query = "Request you to provide us with a FSA list of people overexposed/underexposed against ads from government of Quebec"
docs = faiss_vectorstore.similarity_search(query)

In [0]:
print(docs[0].page_content)

User Request Hi	team,			Request	you	to	provide	us	with	a	FSA	list	of	people	overexposed/underexposed	against	ads	from	government	of	Quebec?			Response • slide_number:1	• slide_text:ATV	POST-CAMPAIGN	INSIGHTS	Maple	Leaf	18/09/2023	to	07/12/2023	• slide_number:2	• slide_text:CTV	Performance	• slide_number:3	• slide_text:CAMPAIGN		Households	reached		Video	Completion		Rate	(VCR)		976,874		90,987		98%		Advanced	TV	with	ACR		Impressions		Performance	Summary	• slide_number:4	• slide_text:CTV	Campaign	Insights	• slide_number:5	• slide_text:Incremental	Reach	for	CTV	The	CTV	campaign	helped	drive	an	additional	30.15%											%	Incremental	Reach	on	CTV		LTV-only		Reach	CTV	+	LTV	Overlap	*	CTV	Reach	Only	*	Shared												%				63.56K	27.43K	69.85	30.15


In [0]:
question = "Request you to provide us with a FSA list of people overexposed/underexposed against ads from government of Quebec"
result = qa_chain({"query": question})
result["result"]



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Out[66]: 'Based on the document does the document does the document does the document does the "Based on the document does the document does this document doesn\'To answer\nIn order and the slide numbers \nThe document does the document does the document does the slides \n\n\n\n\nTo answer to provide a)\nI\'Your team, In response contains information from the document does the document does the "Based on the FSA in response contains the slides \nThe document does the document does the user is the FSA to answer based on the document does the document does the document does the document does the document does the document does the FSA: Based on the document does the document does the "Based on slide numbers \n\nTo provide us, The document does the document does the slides \nTo answer based on the document does the user is there are there are there are there are there are you do not directly related to fulfilling bas

In [0]:
print(result['result'])

Based on the document does the document does the document does the document does the "Based on the document does the document does this document doesn'To answer
In order and the slide numbers 
The document does the document does the document does the slides 




To answer to provide a)
I'Your team, In response contains information from the document does the document does the "Based on the FSA in response contains the slides 
The document does the document does the user is the FSA to answer based on the document does the document does the document does the document does the document does the document does the FSA: Based on the document does the document does the "Based on slide numbers 

To provide us, The document does the document does the slides 
To answer based on the document does the user is there are there are there are there are there are you do not directly related to fulfilling based on the FSA in response contains information from the document does the document does the docum