In [1]:
import os
with open(r'C:\Python\HQgeminiAPIKey.txt', 'r')as HQfile:
    GOOGLE_API_KEY = HQfile.read()
os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY
with open(r'C:\Python\geminiAPIKey.txt', 'r')as file:
    GOOGL_EMBED_API_KEY = file.read()
os.environ['GOOGL_EMBED_API_KEY'] = GOOGL_EMBED_API_KEY

In [2]:
print('Importing required libraries...')
import warnings
warnings.filterwarnings('ignore')
from glob import glob
from datetime import datetime as dt
import time
import faiss
from langchain.rate_limiters import InMemoryRateLimiter
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings as google_embed
rate_limiter = InMemoryRateLimiter(requests_per_second=0.1,
                                   check_every_n_seconds=0.1,  # How often the limiter checks if a request is allowed
                                   max_bucket_size=10,)         # Maximum burst size
gemini = ChatGoogleGenerativeAI(model='gemini-2.5-flash', google_api_key=GOOGLE_API_KEY, rate_limiter=rate_limiter)
embedding = google_embed(model = 'models/gemini-embedding-001', google_api_key=GOOGL_EMBED_API_KEY, rate_limiter=rate_limiter)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredPDFLoader
from langchain_community.document_loaders.parsers import TesseractBlobParser, RapidOCRBlobParser
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_core.runnables import RunnablePassthrough
from IPython.display import display, Markdown
from dotenv import load_dotenv
load_dotenv()
print('Required libraries imported!')

Importing required libraries...
Required libraries imported!


In [3]:
papers=[]

for file_path in glob('./PDF/*.pdf'):
    loader = PyMuPDFLoader(file_path, mode="page", images_inner_format="html-img", images_parser=TesseractBlobParser(), extract_tables="markdown",)
    data = loader.load()
    for doc in data:
        doc.metadata['source'] = file_path.split('\\')[-1]
        doc.metadata['title'] = file_path.split('\\')[-1].split('.')[0]
        papers.append(doc)

papers[15]

Consider using the pymupdf_layout package for a greatly improved page layout analysis.


Document(metadata={'producer': 'pdfTeX-1.40.17', 'creator': 'LaTeX with hyperref package', 'creationdate': '2020-07-24T00:04:08+00:00', 'source': 'Language Models are Few-Shot Learners.pdf', 'file_path': './PDF\\Language Models are Few-Shot Learners.pdf', 'total_pages': 75, 'format': 'PDF 1.5', 'title': 'Language Models are Few-Shot Learners', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2020-07-24T00:04:08+00:00', 'trapped': '', 'modDate': 'D:20200724000408Z', 'creationDate': 'D:20200724000408Z', 'page': 0}, page_content='Language Models are Few-Shot Learners\nTom B. Brown∗\nBenjamin Mann∗\nNick Ryder∗\nMelanie Subbiah∗\nJared Kaplan†\nPrafulla Dhariwal\nArvind Neelakantan\nPranav Shyam\nGirish Sastry\nAmanda Askell\nSandhini Agarwal\nAriel Herbert-Voss\nGretchen Krueger\nTom Henighan\nRewon Child\nAditya Ramesh\nDaniel M. Ziegler\nJeffrey Wu\nClemens Winter\nChristopher Hesse\nMark Chen\nEric Sigler\nMateusz Litwin\nScott Gray\nBenjamin Chess\nJack Clark\nChristopher Bern

# Filtering Metadata

In [4]:
filtered_papers = filter_complex_metadata(papers)
filtered_papers[0]

Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'source': 'Attention Is All You Need.pdf', 'file_path': './PDF\\Attention Is All You Need.pdf', 'total_pages': 15, 'format': 'PDF 1.5', 'title': 'Attention Is All You Need', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'trapped': '', 'modDate': 'D:20240410211143Z', 'creationDate': 'D:20240410211143Z', 'page': 0}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.to

In [5]:
splitter = RecursiveCharacterTextSplitter(chunk_size = 1500, chunk_overlap = 300)
chunks = splitter.split_documents(filtered_papers)
chunks[0]

Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'source': 'Attention Is All You Need.pdf', 'file_path': './PDF\\Attention Is All You Need.pdf', 'total_pages': 15, 'format': 'PDF 1.5', 'title': 'Attention Is All You Need', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'trapped': '', 'modDate': 'D:20240410211143Z', 'creationDate': 'D:20240410211143Z', 'page': 0}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.to

In [6]:
chunks[1]

Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'source': 'Attention Is All You Need.pdf', 'file_path': './PDF\\Attention Is All You Need.pdf', 'total_pages': 15, 'format': 'PDF 1.5', 'title': 'Attention Is All You Need', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'trapped': '', 'modDate': 'D:20240410211143Z', 'creationDate': 'D:20240410211143Z', 'page': 0}, page_content='to-German translation task, improving over the existing best results, including\nensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task,\nour model establishes a new single-model state-of-the-art BLEU score of 41.8 after\ntraining for 3.5 days on eight GPUs, a small fraction of the training costs of the\nbest models from the literature. We show that the Transformer generalizes well to\nother tasks by applying it successfully to English constituency parsing both with\nlarge an

# Storing data in FAISS vector store
- ### Run below cell only once

In [7]:
os.path.isfile('./FAISS/index.faiss')

False

# Loading FAISS vector datastore

In [3]:
# Loading FAISS Database
faiss_vector_store = FAISS.load_local('./FAISS', embedding, allow_dangerous_deserialization=True)
faiss_vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x1a6edde7250>

In [4]:
retriever = faiss_vector_store.as_retriever(search_type='similarity', k=5)

In [11]:
retriever.invoke("Scramble tasks")

[Document(id='4b24d341-cae0-40c4-9a4a-f7706858b509', metadata={'producer': 'pdfTeX-1.40.17', 'creator': 'LaTeX with hyperref package', 'creationdate': '2020-07-24T00:04:08+00:00', 'source': 'Language Models are Few-Shot Learners.pdf', 'file_path': './PDF\\Language Models are Few-Shot Learners.pdf', 'total_pages': 75, 'format': 'PDF 1.5', 'title': 'Language Models are Few-Shot Learners', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2020-07-24T00:04:08+00:00', 'trapped': '', 'modDate': 'D:20200724000408Z', 'creationDate': 'D:20200724000408Z', 'page': 66}, page_content='Figure H.10: All results for all Scramble tasks.\nFigure H.11: All results for all Translation tasks.\n67'),
 Document(id='49114c6b-b34a-426f-8497-af678d117e07', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'source': 'Attention Is All You Need.pdf', 'file_path': './PDF\\Attention Is All You Need.pdf', 'total_pages': 15, 'format': 'PDF 1.5

In [6]:
rag_prompt = '''You are an AI assistant who is a good & polite helper who will answer user questions from the given context only and if you won't find the answer in the 
context then you will politely deny and will ask another question from user. Also, make sure you save the tokens as much as possible.

question: {question}
context: {context}
'''
rag_prompt

"You are an AI assistant who is a good & polite helper who will answer user questions from the given context only and if you won't find the answer in the \ncontext then you will politely deny and will ask another question from user. Also, make sure you save the tokens as much as possible.\n\nquestion: {question}\ncontext: {context}\n"

In [7]:
rag_prompt_template = ChatPromptTemplate.from_template(rag_prompt)

In [8]:
def format_docs(docs):
  return "\n\n".join(doc.page_content for doc in docs)

qa_rag_chain = (
    {"context":(retriever | format_docs),
     "question": RunnablePassthrough()} | rag_prompt_template | gemini
)

In [10]:
while True:
    usrQuery = input("Please tye your query here. If you want to exit the chat then please type 'Exit' or 'Quit':\n")
    if usrQuery in ['Exit', 'Quit']:
        break
    else:
        responce = qa_rag_chain.invoke(usrQuery)
        display(Markdown(responce.content))

Please tye your query here. If you want to exit the chat then please type 'Exit' or 'Quit':
 Hello


Hello! I am an AI assistant here to help you.

I cannot find an answer to "Hello" in the provided context, as it does not contain conversational responses or information about greetings.

Please let me know if you have a specific question I can answer from the given context!

Please tye your query here. If you want to exit the chat then please type 'Exit' or 'Quit':
 Large Language Models


Scaling up language models significantly improves task-agnostic, few-shot performance. For example, GPT-3 is an autoregressive language model with 175 billion parameters, which achieved competitiveness with prior state-of-the-art fine-tuning approaches in the few-shot setting without any gradient updates or fine-tuning. These very large language models are suggested as an important ingredient in developing adaptable, general language systems. While they consume significant resources during training, they can be surprisingly efficient once trained, with techniques like model distillation further reducing costs.

Please tye your query here. If you want to exit the chat then please type 'Exit' or 'Quit':
 What is Large Language Models


I apologize, but the provided context does not offer a direct definition of "Large Language Models." It discusses scaling up language models and gives examples like GPT-3, which has 175 billion parameters, but it doesn't explicitly define the term itself.

Is there anything else I can help you with from the given context?

Please tye your query here. If you want to exit the chat then please type 'Exit' or 'Quit':
 What is machine learning?


I am sorry, but the provided context does not contain a definition for "machine learning."

Is there another question I can help you with from the given context?

Please tye your query here. If you want to exit the chat then please type 'Exit' or 'Quit':
 Deep Learning?


I'm sorry, but the provided context does not contain a definition or explanation of "Deep Learning." It only mentions it in the title of one of the references.

Is there another question I can help you with from the given context?

Please tye your query here. If you want to exit the chat then please type 'Exit' or 'Quit':
 Figure 1


I apologize, but the provided context does not contain any information about "Figure 1".

Is there another question I can help you with based on the given context?

Please tye your query here. If you want to exit the chat then please type 'Exit' or 'Quit':
 Figure


Here is the information about "Figure" from the context:

*   Figure H.10: All results for all Scramble tasks.
*   Figure H.11: All results for all Translation tasks.

Is there anything else I can help you with from the provided text?

Please tye your query here. If you want to exit the chat then please type 'Exit' or 'Quit':
 Show me Figure H.10: All results for all Scramble tasks


I found the caption "Figure H.10: All results for all Scramble tasks" in the context. However, as an AI, I cannot display images or figures. Is there anything else I can help you with from the provided text?

Please tye your query here. If you want to exit the chat then please type 'Exit' or 'Quit':
 what is Scramble tasks?


I apologize, but the provided context mentions "Scramble tasks" in the context of presenting results (Figure H.10) and lists "Anagrams 1" as a type of task, but it does not provide a definition or description of what Scramble tasks are.

Please let me know if you have another question!

Please tye your query here. If you want to exit the chat then please type 'Exit' or 'Quit':
 Exit
