In [1]:
import json
import os
import time
import uuid
from glob import glob

from dotenv import load_dotenv
from IPython.display import display, Markdown
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_community.docstore.document import Document
from langchain_community.document_loaders import JSONLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import Chroma
from langchain_groq import ChatGroq
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Read a local .env file into the process environment. Presumably this is where
# GROQ_API_KEY (looked up later with os.getenv) comes from — confirm.
load_dotenv()

True

In [2]:
# Location of the Wikipedia JSONL dump. The containing directory can be
# overridden with the RAG_DOCS_DIR environment variable; when it is unset the
# original absolute path is used, so the default behaviour is unchanged.
file_path = os.path.join(
    os.getenv(
        "RAG_DOCS_DIR",
        r"C:\Users\bhavi\OneDrive\Desktop\langhcain_learning\RAG\rag_docs",
    ),
    "wikidata_rag_demo.jsonl",
)

In [4]:
# Read the JSONL file one record per line. jq_schema="." selects each whole
# record; the saved output below shows page_content holding that record
# serialized as a JSON string.
loader = JSONLoader(
    file_path=file_path,
    jq_schema=".",
    json_lines=True,
    text_content=False,
)
wiki_docs = loader.load()

# Quick look: number of records read, plus one sample record.
print("LENGTH OF DOCS ----", len(wiki_docs))
print(wiki_docs[3])

LENGTH OF DOCS ---- 1801
page_content='{"id": "71548", "title": "Chi-square distribution", "paragraphs": ["In probability theory and statistics, the chi-square distribution (also chi-squared or formula_1\\u00a0 distribution) is one of the most widely used theoretical probability distributions. Chi-square distribution with formula_2 degrees of freedom is written as formula_3. It is a special case of gamma distribution.", "Chi-square distribution is primarily used in statistical significance tests and confidence intervals. It is useful, because it is relatively easy to show that certain probability distributions come close to it, under certain conditions. One of these conditions is that the null hypothesis must be true. Another one is that the different random variables (or observations) must be independent of each other."]}' metadata={'source': 'C:\\Users\\bhavi\\OneDrive\\Desktop\\langhcain_learning\\RAG\\rag_docs\\wikidata_rag_demo.jsonl', 'seq_num': 4}


In [5]:
# Convert every raw JSONL record into a Document with normalized metadata.
wikipedia_documents = []

for wiki_doc in wiki_docs:
    # page_content carries the whole record as a JSON string; decode it into a
    # dict. A separate name is used so the loop variable is not rebound to a
    # different type mid-iteration.
    record = json.loads(wiki_doc.page_content)
    meta_data = {
        "title": record["title"],
        "id": record["id"],
        "source": "wikipedia",
        # Constant page number, stored under the same "page" key that
        # create_standard_chunks writes for PDF chunks.
        "page": 1,
    }

    # Each record stores its text as a list of paragraphs; join them into one string.
    data = " ".join(record["paragraphs"])
    wikipedia_documents.append(Document(page_content=data, metadata=meta_data))

In [6]:
wikipedia_documents[1]

Document(page_content='Dattatreya is the God who is an incarnation of the Divine Trinity Brahma, Vishnu and Siva. The word Datta means "Given", Datta is called so because the divine trinity have "given" themselves in the form of a son to the sage couple Guru Atri and Mata Anusuya. He is the son of Guru Atri, hence the name "Atreya." In the Nath tradition, Dattatreya is seen as an Avatar or incarnation of the Lord Shiva and as the Adi-Guru (First Teacher) of the Adi-Nath sampradaya of the Nathas. Although Dattatreya was at first a "Lord of Yoga" with Tantric traits, he was adapted and assimilated into the more devotional cults; while still worshiped by millions of Hindus, he is approached more as a benevolent God than as a teacher of the highest essence of Indian thought. Though the Dattatreya of the Natha tradition coexisted and intermingled with the Puranic, Brahmanical tradition of the Datta sampradaya, here we shall focus almost exclusively on the earlier Tantric manifestation of Da

In [7]:
# Chat model served through Groq. The API key is looked up in the environment
# (GROQ_API_KEY) rather than being written into the notebook.
api_key = os.environ.get("GROQ_API_KEY")

model = ChatGroq(
    model="qwen/qwen3-32b",
    api_key=api_key,
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)


In [None]:
# NOTE: this cell used to repeat the model-loading cell directly above, line for
# line, and was never executed (its prompt is "In [None]"). `api_key` and
# `model` are already defined by that cell, so the duplicate definition has
# been removed; this cell can be deleted from the notebook.


In [9]:
def create_standard_chunks(file_path, chunk_size=1500, chunk_overlap=150):
    """Load a PDF, split it into overlapping chunks and normalize chunk metadata.

    Args:
        file_path: Path of the PDF handed to PyMuPDFLoader. It is also stored
            as each chunk's "source", and its basename as each chunk's "title".
        chunk_size: Forwarded to RecursiveCharacterTextSplitter as chunk_size.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as chunk_overlap.

    Returns:
        A list of Document objects, one per chunk. Each chunk's metadata is
        replaced by exactly four keys: "id" (a fresh random UUID string),
        "page" (copied from the loader's metadata, None if absent), "source"
        and "title".
    """
    print("Loading Pages:", file_path)
    pages = PyMuPDFLoader(file_path).load()

    print("Chunking pages...", file_path)
    pieces = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(pages)

    pdf_name = os.path.basename(file_path)
    standard_chunks = [
        Document(
            page_content=piece.page_content,
            metadata={
                "id": str(uuid.uuid4()),
                "page": piece.metadata.get("page"),
                "source": file_path,
                "title": pdf_name,
            },
        )
        for piece in pieces
    ]

    print("Finished processing --------", file_path)
    return standard_chunks

In [10]:
# Collect every PDF in the documents directory. The directory can be overridden
# with the RAG_DOCS_DIR environment variable; when it is unset the original
# absolute path is used. glob() does not guarantee an order (it depends on the
# file system), so the matches are sorted to keep the chunk order — and
# everything built from it — reproducible across runs and machines.
pdf_files = sorted(
    glob(
        os.path.join(
            os.getenv(
                "RAG_DOCS_DIR",
                "C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs",
            ),
            "*.pdf",
        )
    )
)
print(pdf_files)

['C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\\attention_paper.pdf', 'C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\\cnn_paper.pdf', 'C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\\resnet_paper.pdf', 'C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\\vision_transformer.pdf']


In [12]:
# Chunk every PDF and gather the chunks of all files into one flat list.
paper_docs = []
for fp in pdf_files:
    paper_docs += create_standard_chunks(file_path=fp, chunk_size=1500)

Loading Pages: C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\attention_paper.pdf
Chunking pages... C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\attention_paper.pdf
Finished processing -------- C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\attention_paper.pdf
Loading Pages: C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\cnn_paper.pdf
Chunking pages... C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\cnn_paper.pdf
Finished processing -------- C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\cnn_paper.pdf
Loading Pages: C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\resnet_paper.pdf
Chunking pages... C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\resnet_paper.pdf
Finished processing -------- C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\resnet_paper.pdf
Loading Pages: C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\vision

In [13]:
# One corpus for indexing: the Wikipedia documents first, then the PDF chunks.
total_chunks = [*wikipedia_documents, *paper_docs]
print("------ Lenght of Documents ---------", len(total_chunks))

------ Lenght of Documents --------- 1967


In [15]:
# Index the documents and their chunk embeddings in the vector DB.
# HuggingFaceEmbeddings and Chroma are already imported in the first cell, so
# they are not imported a second time here.
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

# Embed every chunk and store it in a Chroma collection persisted under
# ./wikipedia_db, with the collection's "hnsw:space" set to "cosine".
# NOTE(review): no explicit ids are passed, so re-running this cell against an
# existing ./wikipedia_db presumably adds the same chunks a second time —
# confirm, and clear the directory before rebuilding if so.
chroma_db = Chroma.from_documents(
    documents=total_chunks,
    embedding=embedding_model,
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory="./wikipedia_db",
)

print("[----EMBEDDINGS CREATED ---------]")

  from .autonotebook import tqdm as notebook_tqdm


[----EMBEDDINGS CREATED ---------]


In [22]:
# Two retrievers over the same corpus, each returning its 5 best hits:
#   - similarity_retriever: similarity search against the Chroma index
#   - bm25_retrievers: BM25 scoring built from the chunk documents
similarity_retriever = chroma_db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

bm25_retrievers = BM25Retriever.from_documents(documents=total_chunks, k=5)
print("--- Similarity and bm25 Retriever initalizes ----")


--- Similarity and bm25 Retriever initalizes ----


In [23]:
# Reciprocal rank fusion: combine the BM25 and similarity-search rankings,
# giving both retrievers the same weight.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retrievers, similarity_retriever],
    weights=[0.5, 0.5],
)
ensemble_retriever

EnsembleRetriever(retrievers=[BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x000001E869706750>, k=5), VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001E82D806290>, search_kwargs={'k': 5})], weights=[0.5, 0.5])

In [24]:
# Chained retrieval with a reranker: the ensemble retriever supplies candidate
# documents and the cross-encoder reranker keeps the top 5 of them.
reranker = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-v2-m3")
reranker_compression = CrossEncoderReranker(model=reranker, top_n=5)

final_retriever = ContextualCompressionRetriever(
    base_retriever=ensemble_retriever,     # BM25 + similarity search
    base_compressor=reranker_compression,  # re-ranker
)
final_retriever

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


ContextualCompressionRetriever(base_compressor=CrossEncoderReranker(model=HuggingFaceCrossEncoder(client=<sentence_transformers.cross_encoder.CrossEncoder.CrossEncoder object at 0x000001E86973D310>, model_name='BAAI/bge-reranker-v2-m3', model_kwargs={}), top_n=5), base_retriever=EnsembleRetriever(retrievers=[BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x000001E869706750>, k=5), VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001E82D806290>, search_kwargs={'k': 5})], weights=[0.5, 0.5]))

In [33]:
def display_docs(docs, max_chars=500):
    """Print each document's metadata and render the start of its text as Markdown.

    Args:
        docs: Iterable of objects exposing ``metadata`` and ``page_content``.
        max_chars: Number of leading characters of ``page_content`` to render.
            Defaults to 500, the preview length that used to be hard-coded.

    Returns:
        None. Output is written with ``print`` and IPython's ``display``.
    """
    for doc in docs:
        print("------- META DATA ----------:", doc.metadata)
        print("-------- CONTENT BRIEF ---------")
        display(Markdown(doc.page_content[:max_chars]))
        print()
        print("-----------------------------------------------\n")

In [35]:
# Sanity check with a topic covered by the Wikipedia subset: the saved output
# below lists only Wikipedia entries, led by "Machine learning".
query = "What is Machine Learning ?"
tops_docs = final_retriever.invoke(query)
display_docs(docs=tops_docs)

------- META DATA ----------: {'title': 'Machine learning', 'id': '564928', 'source': 'wikipedia', 'page': 1}
-------- CONTENT BRIEF ---------


Machine learning gives computers the ability to learn without being explicitly programmed (Arthur Samuel, 1959). It is a subfield of computer science. The idea came from work in artificial intelligence. Machine learning explores the study and construction of algorithms which can learn and make predictions on data. Such algorithms follow programmed instructions, but can also make predictions or decisions based on data. They build a model from sample inputs. Machine learning is done where designin


-----------------------------------------------

------- META DATA ----------: {'page': 1, 'title': 'Deep learning', 'id': '663523', 'source': 'wikipedia'}
-------- CONTENT BRIEF ---------


Deep learning (also called deep structured learning or hierarchical learning) is a kind of machine learning, which is mostly used with certain kinds of neural networks. As with other kinds of machine-learning, learning sessions can be unsupervised, semi-supervised, or supervised. In many cases, structures are organised so that there is at least one intermediate layer (or hidden layer), between the input layer and the output layer. Certain tasks, such as as recognizing and understanding speech, i


-----------------------------------------------

------- META DATA ----------: {'title': 'Supervised learning', 'page': 1, 'id': '359370', 'source': 'wikipedia'}
-------- CONTENT BRIEF ---------


In machine learning, supervised learning is the task of inferring a function from labelled training data. The results of the training are known beforehand, the system simply learns how to get to these results correctly. Usually, such systems work with vectors. They get the training data and the result of the training as two vectors and produce a "classifier". Usually, the system uses inductive reasoning to generalize the training data.


-----------------------------------------------

------- META DATA ----------: {'page': 1, 'title': 'Reinforcement learning', 'id': '610032', 'source': 'wikipedia'}
-------- CONTENT BRIEF ---------


Reinforcement learning (RL) is teaching a "software agent" how to behave in an environment by telling it how good it's doing. It is an area of machine learning inspired by behaviorist psychology. Reinforcement learning is different from supervised learning because the correct inputs and outputs are never shown. Also, reinforcement learning usually learns as it goes (online learning) unlike supervised learning. This means an agent has to choose between exploring and sticking with what it knows be


-----------------------------------------------

------- META DATA ----------: {'title': 'Jaime Carbonell', 'id': '740974', 'source': 'wikipedia', 'page': 1}
-------- CONTENT BRIEF ---------


Jaime Guillermo Carbonell (July 29, 1953 – February 28, 2020) was an American computer scientist. His works focused on natural language processing tools and technologies. He earned his B.S. degrees in Physics and in Mathematics from MIT in 1975 and did his Ph.D. under Dr. Roger Schank at Yale University in 1979. He joined Carnegie Mellon University as an assistant professor of computer science in 1979 and lived in Pittsburgh from then. He was affiliated with the Language Technologies Institute, 


-----------------------------------------------



In [36]:
# Question aimed at the indexed PDFs: all five saved hits below come from
# vision_transformer.pdf.
# NOTE: "Trnasformer" is misspelled in the query text; it is kept verbatim
# because the saved results below were produced with exactly this string.
query = "What is difference between Vision Transformer and Normal Trnasformer ?"
tops_docs = final_retriever.invoke(query)
display_docs(docs=tops_docs)

------- META DATA ----------: {'id': '301cc692-ce5e-453c-aa39-d3724f38ce3b', 'page': 2, 'source': 'C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\\vision_transformer.pdf', 'title': 'vision_transformer.pdf'}
-------- CONTENT BRIEF ---------


Published as a conference paper at ICLR 2021
Transformer Encoder
MLP 
Head
Vision Transformer (ViT)
*
Linear Projection of Flattened Patches
* Extra learnable
     [ cl ass]  embedding
1
2
3
4
5
6
7
8
9
0
Patch + Position 
Embedding
Class
Bird
Ball
Car
...
Embedded 
Patches
Multi-Head 
Attention
Norm
MLP
Norm
+
L x
+
Transformer Encoder
Figure 1: Model overview. We split an image into ﬁxed-size patches, linearly embed each of them,
add position embeddings, and feed the resulting sequence of vect


-----------------------------------------------

------- META DATA ----------: {'id': '63c5851c-4977-440e-8d14-8a2a9ac86390', 'page': 7, 'source': 'C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\\vision_transformer.pdf', 'title': 'vision_transformer.pdf'}
-------- CONTENT BRIEF ---------


tational budgets, but the difference vanishes for larger models. This result is somewhat surprising,
since one might expect convolutional local feature processing to assist ViT at any size. Third, Vision
Transformers appear not to saturate within the range tried, motivating future scaling efforts.
4.5
INSPECTING VISION TRANSFORMER
Input
Attention
Figure 6: Representative ex-
amples of attention from the
output token to the input
space. See Appendix D.7 for
details.
To begin to understand how the


-----------------------------------------------

------- META DATA ----------: {'id': '611763c7-13d4-4272-a632-100a8963675d', 'page': 8, 'source': 'C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\\vision_transformer.pdf', 'title': 'vision_transformer.pdf'}
-------- CONTENT BRIEF ---------


5
CONCLUSION
We have explored the direct application of Transformers to image recognition. Unlike prior works
using self-attention in computer vision, we do not introduce image-speciﬁc inductive biases into
the architecture apart from the initial patch extraction step. Instead, we interpret an image as a
sequence of patches and process it by a standard Transformer encoder as used in NLP. This simple,
yet scalable, strategy works surprisingly well when coupled with pre-training on large datasets.


-----------------------------------------------

------- META DATA ----------: {'page': 6, 'source': 'C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\\vision_transformer.pdf', 'title': 'vision_transformer.pdf', 'id': '5c9ecfc5-6518-4f66-bd1f-7e156063d5be'}
-------- CONTENT BRIEF ---------


Published as a conference paper at ICLR 2021
ImageNet
ImageNet-21k
JFT-300M
Pre-training dataset
70
75
80
85
90
ImageNet Top1 Accuracy [%]
BiT
ViT-B/32
ViT-B/16
ViT-L/32
ViT-L/16
ViT-H/14
Figure 3:
Transfer to ImageNet.
While
large ViT models perform worse than BiT
ResNets (shaded area) when pre-trained on
small datasets, they shine when pre-trained on
larger datasets. Similarly, larger ViT variants
overtake smaller ones as the dataset grows.
10 M
30 M
100 M
300 M
Number of JFT pre-training samp


-----------------------------------------------

------- META DATA ----------: {'id': '339f5aab-85e0-4e8e-99af-d9e10beb8e65', 'page': 3, 'source': 'C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\\vision_transformer.pdf', 'title': 'vision_transformer.pdf'}
-------- CONTENT BRIEF ---------


the patches can have spatial size 1x1, which means that the input sequence is obtained by simply
ﬂattening the spatial dimensions of the feature map and projecting to the Transformer dimension.
The classiﬁcation input embedding and position embeddings are added as described above.
3.2
FINE-TUNING AND HIGHER RESOLUTION
Typically, we pre-train ViT on large datasets, and ﬁne-tune to (smaller) downstream tasks. For
this, we remove the pre-trained prediction head and attach a zero-initialized D × K f


-----------------------------------------------



In [37]:
# Out-of-corpus question: the saved output below shows five unrelated hits
# (bibliography pages of attention_paper.pdf, XMPP, Keyhole Markup Language,
# Billboard Hot 100), i.e. five documents come back even though none is relevant.
query = "What is langchain ?"
tops_docs = final_retriever.invoke(query)
display_docs(docs=tops_docs)

------- META DATA ----------: {'page': 9, 'source': 'C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\\attention_paper.pdf', 'title': 'attention_paper.pdf', 'id': '438cc259-dcb6-4ac3-84c4-db96fda87d8b'}
-------- CONTENT BRIEF ---------


arXiv:1607.06450, 2016.
[2] Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. Neural machine translation by jointly
learning to align and translate. CoRR, abs/1409.0473, 2014.
[3] Denny Britz, Anna Goldie, Minh-Thang Luong, and Quoc V. Le. Massive exploration of neural
machine translation architectures. CoRR, abs/1703.03906, 2017.
[4] Jianpeng Cheng, Li Dong, and Mirella Lapata. Long short-term memory-networks for machine
reading. arXiv preprint arXiv:1601.06733, 2016.
10


-----------------------------------------------

------- META DATA ----------: {'page': 1, 'source': 'wikipedia', 'id': '125565', 'title': 'Extensible Messaging and Presence Protocol'}
-------- CONTENT BRIEF ---------


Extensible Messaging and Presence Protocol (XMPP) (formerly named Jabber) is a protocol for instant messaging. It is inspired by XML. It is different to most protocols because it is an open standard. This means that anybody who has a domain name and an internet connection can run their own server. Most of the software and the clients are open source. Other software such as Google Talk and the Gizmo5 use the XMPP protocol. It has been installed on thousands of servers across the internet. There a


-----------------------------------------------

------- META DATA ----------: {'page': 1, 'id': '168810', 'title': 'Keyhole Markup Language', 'source': 'wikipedia'}
-------- CONTENT BRIEF ---------


Keyhole Markup Language is a file format for displaying data on maps. In the same way that a Web browser displays web pages written with HTML, KML draws data on maps like Google Maps and virtual globes like Google Earth. It is an international standard , maintained by the Open Geospatial Consortium .


-----------------------------------------------

------- META DATA ----------: {'title': 'attention_paper.pdf', 'page': 11, 'source': 'C:/Users/bhavi/OneDrive/Desktop/langhcain_learning/RAG/rag_docs\\attention_paper.pdf', 'id': '5aaa4199-7af9-4ce7-a58a-68efa6ecdc6b'}
-------- CONTENT BRIEF ---------


translation system: Bridging the gap between human and machine translation. arXiv preprint
arXiv:1609.08144, 2016.
[39] Jie Zhou, Ying Cao, Xuguang Wang, Peng Li, and Wei Xu. Deep recurrent models with
fast-forward connections for neural machine translation. CoRR, abs/1606.04199, 2016.
[40] Muhua Zhu, Yue Zhang, Wenliang Chen, Min Zhang, and Jingbo Zhu. Fast and accurate
shift-reduce constituent parsing. In Proceedings of the 51st Annual Meeting of the ACL (Volume
1: Long Papers), pages 434–443.


-----------------------------------------------

------- META DATA ----------: {'title': 'Billboard Hot 100', 'id': '128360', 'source': 'wikipedia', 'page': 1}
-------- CONTENT BRIEF ---------


The "Billboard" Hot 100 is a list of the current most well-liked music made by "Billboard" magazine. Rankings are based on radio play and sales; the tracking week for sales is each Monday to Sunday, while for radio play it is Wednesday to Tuesday. A new chart is compiled and officially released to the public by "Billboard" on Thursday. Each chart is dated with the "week-ending" date of the Saturday two weeks after. Example: The first number one song of the Hot 100 was "Poor Little Fool" by Ricky


-----------------------------------------------

