In [26]:
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from pprint import pprint

In [27]:
# Names of the Ollama models used throughout the notebook.
EMBEDDING_MODEL_NAME = "nomic-embed-text"
LLM_MODEL_NAME = "llama3.2"

# `embedding` is handed to the Chroma vector stores below; `llm` is used both
# for token counting and as the generation step of the RAG chain.
embedding = OllamaEmbeddings(model=EMBEDDING_MODEL_NAME)
llm = OllamaLLM(model=LLM_MODEL_NAME)

In [28]:
# PDFs that make up the corpus for this notebook.
pdf_paths = [
    "/home/bishwayansaha99/langchain/docs/attention.pdf",
    "/home/bishwayansaha99/langchain/docs/lost_in_the_middle.pdf",
]
loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_paths]

# Flatten the documents returned by every loader into one list, in loader order.
docs = [loaded_doc for pdf_loader in loaders for loaded_doc in pdf_loader.load()]
print(f"Length of documnets: {len(docs)}")

# Report the token count of each loaded document, as measured by the LLM wrapper.
for position, doc in enumerate(docs, start=1):
    token_count = llm.get_num_tokens(doc.page_content)
    print(f"Chunk {position} Size {token_count}")

Length of documnets: 33
Chunk 1 Size 709
Chunk 2 Size 886
Chunk 3 Size 440
Chunk 4 Size 579
Chunk 5 Size 797
Chunk 6 Size 849
Chunk 7 Size 782
Chunk 8 Size 924
Chunk 9 Size 829
Chunk 10 Size 858
Chunk 11 Size 1018
Chunk 12 Size 981
Chunk 13 Size 273
Chunk 14 Size 321
Chunk 15 Size 311
Chunk 16 Size 878
Chunk 17 Size 1153
Chunk 18 Size 1171
Chunk 19 Size 975
Chunk 20 Size 1134
Chunk 21 Size 1198
Chunk 22 Size 1101
Chunk 23 Size 1046
Chunk 24 Size 1125
Chunk 25 Size 1156
Chunk 26 Size 1172
Chunk 27 Size 1139
Chunk 28 Size 1417
Chunk 29 Size 1093
Chunk 30 Size 759
Chunk 31 Size 648
Chunk 32 Size 727
Chunk 33 Size 741


In [29]:
# Child chunks are embedded with `embedding` and indexed in this Chroma collection.
vector_store = Chroma(collection_name="doc_retreiver", embedding_function=embedding)

# Parent documents are stored here; the retriever looks them up after the
# similarity search over child chunks.
store = InMemoryStore()

# Splitter that produces the small chunks which get embedded.
# This splitter is reused by the second retriever further down.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=500)

# No parent_splitter is supplied, so every loaded document is stored whole
# as a parent (hence the parent count equals len(docs)).
parent_doc_retriever = ParentDocumentRetriever(
    vectorstore=vector_store,
    docstore=store,
    child_splitter=child_splitter,
)

# The keyword is `ids` (plural). The original `id=None` was not a parameter of
# add_documents and was only tolerated where the call accepts extra **kwargs.
# Passing ids=None leaves id generation to the retriever.
parent_doc_retriever.add_documents(docs, ids=None)
print(f"Number of parent chunks{len(list(store.yield_keys()))}")

Number of parent chunks33


In [30]:
# Query the retriever that was built without a parent splitter; the cell
# displays the list of documents it returns.
question = "What is attention in transformer?"
parent_doc_retriever.invoke(question)

[Document(metadata={'source': '/home/bishwayansaha99/langchain/docs/attention.pdf', 'page': 4}, page_content='output values. These are concatenated and once again projected, resulting in the final values, as\ndepicted in Figure 2.\nMulti-head attention allows the model to jointly attend to information from different representation\nsubspaces at different positions. With a single attention head, averaging inhibits this.\nMultiHead(Q, K, V) = Concat(head1, ...,headh)WO\nwhere headi = Attention(QWQ\ni , KWK\ni , V WV\ni )\nWhere the projections are parameter matricesWQ\ni ∈ Rdmodel×dk , WK\ni ∈ Rdmodel×dk , WV\ni ∈ Rdmodel×dv\nand WO ∈ Rhdv×dmodel .\nIn this work we employ h = 8 parallel attention layers, or heads. For each of these we use\ndk = dv = dmodel/h = 64. Due to the reduced dimension of each head, the total computational cost\nis similar to that of single-head attention with full dimensionality.\n3.2.3 Applications of Attention in our Model\nThe Transformer uses multi-head atten

<div>
    We have 33 parent chunks, ranging from roughly 270 to 1,400 tokens each.
    Although chunks of this size can easily be sent to an embedding model, let's consider the case where our embedding model can't handle that many tokens at once.
    In that case we have to split these parent chunks into smaller pieces, which we will call child chunks.
    Now, when we run a query, the similarity search is performed over the small child chunks, but the retriever returns the corresponding parent chunks.
</div>

In [31]:
# Separate docstore so the parent chunks of this retriever do not mix with
# those of the first one.
store_1 = InMemoryStore()

# Parents are now produced by splitting the loaded documents with this
# splitter instead of storing each loaded document whole.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)

# Separate Chroma collection for the child chunks of this retriever.
vector_store_1 = Chroma(
    collection_name="parent_doc_retriever", embedding_function=embedding
)

# `child_splitter` is the splitter defined in the earlier retriever cell.
parent_doc_retriever_1 = ParentDocumentRetriever(
    vectorstore=vector_store_1,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
    docstore=store_1,
)

# The keyword is `ids` (plural). The original `id=None` was not a parameter of
# add_documents and was only tolerated where the call accepts extra **kwargs.
# Passing ids=None leaves id generation to the retriever.
parent_doc_retriever_1.add_documents(documents=docs, ids=None)
print(f"Number of chunks {len(list(store_1.yield_keys()))}")

Number of chunks 137


In [32]:
# Same question, this time against the retriever configured with a parent
# splitter; the cell displays the list of documents it returns.
question = "What is attention in transformer?"
parent_doc_retriever_1.invoke(question)

[Document(metadata={'source': '/home/bishwayansaha99/langchain/docs/attention.pdf', 'page': 4}, page_content='output values. These are concatenated and once again projected, resulting in the final values, as\ndepicted in Figure 2.\nMulti-head attention allows the model to jointly attend to information from different representation\nsubspaces at different positions. With a single attention head, averaging inhibits this.\nMultiHead(Q, K, V) = Concat(head1, ...,headh)WO\nwhere headi = Attention(QWQ\ni , KWK\ni , V WV\ni )\nWhere the projections are parameter matricesWQ\ni ∈ Rdmodel×dk , WK\ni ∈ Rdmodel×dk , WV\ni ∈ Rdmodel×dv\nand WO ∈ Rhdv×dmodel .\nIn this work we employ h = 8 parallel attention layers, or heads. For each of these we use\ndk = dv = dmodel/h = 64. Due to the reduced dimension of each head, the total computational cost\nis similar to that of single-head attention with full dimensionality.\n3.2.3 Applications of Attention in our Model\nThe Transformer uses multi-head atten

In [33]:
def refactor_document(docs):
    """Merge the text of several documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line. An empty input yields an empty string.
    """
    separator = "\n\n"
    return separator.join(document.page_content for document in docs)

# Prompt pulled from the LangChain hub under the name "rlm/rag-prompt".
# create_chain pipes a mapping with "context" and "question" keys into it.
# NOTE(review): presumably this needs network access at run time — confirm.
prompt_template = hub.pull("rlm/rag-prompt")



In [34]:
def create_chain(doc_retriever):
    """Compose a question-answering pipeline around ``doc_retriever``.

    The incoming question is fanned out into two prompt inputs: ``"context"``
    is the retriever's result joined into one string by ``refactor_document``,
    and ``"question"`` is the question passed through unchanged. The mapping
    is then piped through the module-level ``prompt_template`` and ``llm``,
    and finally through ``StrOutputParser``.

    Args:
        doc_retriever: Retriever to pipe into ``refactor_document``.

    Returns:
        The composed pipeline; call ``.invoke(question)`` on it to run it.
    """
    prompt_inputs = {
        "context": doc_retriever | refactor_document,
        "question": RunnablePassthrough(),
    }
    return prompt_inputs | prompt_template | llm | StrOutputParser()

In [35]:
# Answer the question using the retriever that was built without a parent splitter.
whole_page_chain = create_chain(parent_doc_retriever)
whole_page_answer = whole_page_chain.invoke("What is attention in transformer?")
pprint(whole_page_answer)

('The answer to "What is the main focus of this work?" is:\n'
 '\n'
 '**Reducing sequential computation in neural sequence transduction models**\n'
 '\n'
 'Specifically, the work proposes a new model architecture called the '
 'Transformer, which eschews recurrence and instead relies entirely on an '
 'attention mechanism to draw global dependencies between input and output '
 'sequences. This allows for significantly more parallelization and can reach '
 'a new state of the art in translation quality after being trained for as '
 'little as twelve hours on eight P100 GPUs.')


In [36]:
# Answer the same question using the retriever configured with a parent splitter.
split_parent_chain = create_chain(parent_doc_retriever_1)
split_parent_answer = split_parent_chain.invoke("What is attention in transformer?")
pprint(split_parent_answer)

('Multi-head attention allows a model to jointly attend to information from '
 'different representation subspaces at different positions. It does this by '
 'concatenating output values and projecting them again, as shown in Figure 2. '
 'This is achieved through the MultiHead(Q, K, V) formula, where each head is '
 'an instance of Attention(QWQi, KWKi, Vi), with separate projection matrices '
 'for queries, keys, and values.')
