In [2]:
import bs4
import chromadb 
from langchain import hub
from langchain_chroma import Chroma
from langchain_ollama import ChatOllama
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableLambda
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyPDFLoader

import os
import glob

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
# Never commit API keys to the notebook. Read the Tavily key from the
# environment and, only if it is missing, prompt for it without echoing.
# SECURITY: the key that used to be hardcoded in this cell has been exposed
# and should be revoked/rotated.
import getpass

if not os.environ.get("TAVILY_API_KEY"):
    os.environ["TAVILY_API_KEY"] = getpass.getpass("Tavily API key: ")

# Kept as a notebook-level name for any cell that still references it.
TAVILY_API_KEY = os.environ["TAVILY_API_KEY"]

In [4]:
# Chat model served through Ollama. temperature=0 is requested so that
# repeated runs of the same question stay as stable as the backend allows.
llm = ChatOllama(
    name="chat_llama3",
    model="krith/meta-llama-3.1-8b-instruct:IQ2_M",
    temperature=0,
)

In [16]:
# Embedding model that turns document text into vectors.
# Check that "nomic-embed-text" is available in the local Ollama install.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Splitter that cuts loaded pages into chunks before indexing.
# TODO: tune these values. With chunk_size=50 the retrieved chunks recorded in
# this notebook are short fragments of a sentence.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=2)

In [17]:
# Load every file matched under data/documents and split it into chunks.
# NOTE(review): the "*" pattern matches any file, but each match is opened
# with PyPDFLoader -- presumably the folder only contains PDFs; confirm.
documents_path = "data/documents/*"
filepaths = glob.glob(documents_path)
documents = []
for filepath in filepaths:
    loader = PyPDFLoader(filepath)
    docs = loader.load()
    # Chunk the loaded pages with the splitter configured in the earlier cell.
    splits = text_splitter.split_documents(docs)
    # Accumulate the chunks of all files into one flat list.
    documents.extend(splits)

In [18]:
# Sanity check: total number of chunks produced by the splitter.
len(documents)

129

In [19]:
# Peek at the first chunk to see its text and its source/page metadata.
documents[0]

Document(metadata={'source': 'data/documents\\The Adventure Of Piko the Pinguin.pdf', 'page': 0}, page_content='TheAdventureofPikothePenguin')

In [20]:
# Open the Chroma store persisted under data/chroma_db and delete its
# collection, then rebuild the store from the freshly split documents.
# Presumably the delete is there so that re-running this cell does not pile
# up duplicate chunks in the persisted store -- confirm.
db = Chroma(persist_directory="data/chroma_db", embedding_function=embeddings)
db.delete_collection()
db = Chroma.from_documents(documents, embeddings, persist_directory="data/chroma_db")

In [21]:
# Similarity-search retriever over the vector store, asking for k=5 chunks per query.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":5})

In [22]:
# Smoke-test the retriever with a bare keyword and display what comes back.
retrieved_documents = retriever.invoke("Piko")
retrieved_documents

[Document(metadata={'page': 0, 'source': 'data/documents\\The Adventure Of Piko the Pinguin.pdf'}, page_content='for fish, Piko had a specialfascination with the'),
 Document(metadata={'page': 0, 'source': 'data/documents\\The Adventure Of Piko the Pinguin.pdf'}, page_content='diving for fish, Piko had a specialfascination'),
 Document(metadata={'page': 0, 'source': 'data/documents\\The Adventure Of Piko the Pinguin.pdf'}, page_content='gave up. When he finally reached the top, Piko'),
 Document(metadata={'page': 0, 'source': 'data/documents\\The Adventure Of Piko the Pinguin.pdf'}, page_content='Piko had a specialfascination with the sky. He'),
 Document(metadata={'page': 0, 'source': 'data/documents\\The Adventure Of Piko the Pinguin.pdf'}, page_content='One chilly morning, Piko decided it was time to')]

In [24]:
def format_docs(docs):
    """Concatenate the text of the given documents into one context string.

    Args:
        docs: iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values joined by a blank line, in input order.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

def inspect(state):
    """Debugging tap for a chain: show the value flowing through, unchanged.

    Args:
        state: whatever the previous step of the chain produced.

    Returns:
        The very same ``state`` object, so the next step is unaffected.
    """
    # Printing is the only side effect; the value itself is not modified.
    print(state)
    return state

# Prompt template text. The {question} and {context} placeholders are filled
# from the dict produced by the first step of the chain built below.
prompt = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

Question: {question} 

Context: {context} 

Answer:
"""

# Build the chat prompt from a *list* of (role, template) pairs.
# The previous argument, {'system_message', prompt}, was a set literal holding
# two strings. Each string became its own message, so the literal text
# "system_message" was sent to the model, and the order of the two messages
# followed set iteration order, which is not stable across kernel restarts.
# Bare strings are treated as human messages, so "human" keeps the role the
# template was effectively sent with while dropping the stray message.
rag_prompt = ChatPromptTemplate.from_messages([("human", prompt)])

# RAG pipeline, composed with the `|` operator:
#   1. build {"context", "question"}: the input question is sent through the
#      retriever and format_docs for "context", and passed through as-is
#      for "question";
#   2. print that dict via inspect (debugging aid);
#   3. fill the prompt, call the model, and reduce the reply to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | RunnableLambda(inspect)
    | rag_prompt
    | llm
    | StrOutputParser()
)

# NOTE(review): the recorded output of this cell shows the question
# 'Who is Piko, He is Penguin?', which differs from the string below; the
# saved output appears to come from an earlier edit of this cell -- re-run.
rag_chain.invoke("Who is Piko?")

{'context': 'Pikowasknownasthebravest penguininthecolony.\n\nfor fish, Piko had a specialfascination with the\n\nlittlepenguinnamedPiko.Unlike the other penguins,\n\nsliding on ice and diving for fish, Piko had a\n\ndiving for fish, Piko had a specialfascination', 'question': 'Who is Piko, He is Penguin?'}


"I don't have enough information to determine who Piko is. The context only mentions that there is a little penguin named Piko, but it doesn't provide any further details about the person being asked about."

In [25]:
# Second probe of the same chain with a question that needs more than one chunk of context.
rag_chain.invoke("What the hardest part of the adventure for Piko?")

{'context': 'gave up. When he finally reached the top, Piko\n\nsliding on ice and diving for fish, Piko had a\n\ndiving for fish, Piko had a specialfascination\n\nfor fish, Piko had a specialfascination with the\n\nOne chilly morning, Piko decided it was time to', 'question': 'What the hardest part of the adventure for Piko?'}


"I'm not able to find any information about Piko's adventure. However, I can try to help you if you provide more context or clarify what you are referring to."

In [26]:
from langchain.chains import RetrievalQA

# Alternative to the hand-built chain: a RetrievalQA chain over the same
# retriever that also returns the source documents it used.
# NOTE(review): RetrievalQA is marked deprecated in recent LangChain
# releases -- confirm against the installed version before relying on it.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)


In [27]:
# Ask the RetrievalQA chain a question and show the full result dict
# (query, answer and the source documents that were retrieved).
query = "Where piko come from?"
# Use .invoke(): calling the chain object directly (qa({...})) is the
# deprecated Chain.__call__ entry point and emits a deprecation warning.
result = qa.invoke({"query": query})
print(result)

{'query': 'Where piko come from?', 'result': 'There is no information in the text about where Piko came from.', 'source_documents': [Document(metadata={'page': 0, 'source': 'data/documents\\The Adventure Of Piko the Pinguin.pdf'}, page_content='for fish, Piko had a specialfascination with the'), Document(metadata={'page': 0, 'source': 'data/documents\\The Adventure Of Piko the Pinguin.pdf'}, page_content='diving for fish, Piko had a specialfascination'), Document(metadata={'page': 0, 'source': 'data/documents\\The Adventure Of Piko the Pinguin.pdf'}, page_content='morning, Piko decided it was time to find out. "I'), Document(metadata={'page': 0, 'source': 'data/documents\\The Adventure Of Piko the Pinguin.pdf'}, page_content='fish, Piko had a specialfascination with the sky.'), Document(metadata={'page': 0, 'source': 'data/documents\\The Adventure Of Piko the Pinguin.pdf'}, page_content='sliding on ice and diving for fish, Piko had a')]}


In [28]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

# NOTE(review): `question` is not referenced by any cell visible here; the
# chain below is invoked with its own literal string.
question = "which country is Piko From? is Piko Animal?"
# Multi-query retriever wrapping a fresh retriever on the same vector store.
# db.as_retriever() is called without arguments here, so the search_type / k=5
# settings given to `retriever` earlier are not carried over.
# Presumably `llm` is used to rephrase the question into several queries --
# see the MultiQueryRetriever documentation.
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=db.as_retriever(), llm=llm
)

In [29]:
def format_docs(docs):
    """Join the ``page_content`` of each document with blank lines.

    Same definition as the ``format_docs`` in the earlier RAG cell; it is
    repeated here so that this cell can be run on its own.

    Args:
        docs: iterable of objects exposing a ``page_content`` string.

    Returns:
        One string with the texts in input order, separated by ``"\\n\\n"``.
    """
    collected = []
    for document in docs:
        collected.append(document.page_content)
    return "\n\n".join(collected)

def inspect(state):
    """Print the value passed between chain steps and hand it on untouched.

    Same definition as the ``inspect`` in the earlier RAG cell; it is
    repeated here so that this cell can be run on its own.

    Args:
        state: the value produced by the previous step.

    Returns:
        The same ``state`` object that was received.
    """
    # Debug output only; nothing is changed on the way through.
    print(state)
    return state

# Prompt template text (same wording as in the earlier RAG cell). The
# {question} and {context} placeholders are filled from the dict produced by
# the first step of the chain built below.
prompt = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

Question: {question} 

Context: {context} 

Answer:
"""

# Build the chat prompt from a *list* of (role, template) pairs.
# The previous argument, {'system_message', prompt}, was a set literal holding
# two strings. Each string became its own message, so the literal text
# "system_message" was sent to the model, and the order of the two messages
# followed set iteration order, which is not stable across kernel restarts.
# Bare strings are treated as human messages, so "human" keeps the role the
# template was effectively sent with while dropping the stray message.
rag_prompt = ChatPromptTemplate.from_messages([("human", prompt)])

# Same pipeline shape as the earlier RAG chain, but the context now comes from
# the multi-query retriever instead of the plain similarity retriever:
#   1. build {"context", "question"} from the input question;
#   2. print that dict via inspect (debugging aid);
#   3. fill the prompt, call the model, and reduce the reply to a plain string.
rag_chain = (
    {"context": retriever_from_llm | format_docs, "question": RunnablePassthrough()}
    | RunnableLambda(inspect)
    | rag_prompt
    | llm
    | StrOutputParser()
)

# Probe the multi-query chain; the printed dict shows which chunks were used.
rag_chain.invoke("is Piko Animal?")

{'context': 'and tucked it in his soft feathers."Thankyou,\n\nand wonder, "What liesbeyondtheicymountains?"\n\nwonders ifyou’recuriousenoughtoseekthem. Now, let\n\n"That’swhyI musttry, Lala. I want toseewhat’sout\n\nfor fish, Piko had a specialfascination with the\n\ndiving for fish, Piko had a specialfascination\n\nsliding on ice and diving for fish, Piko had a\n\nfish, Piko had a specialfascination with the sky.', 'question': 'is Piko Animal?'}


'Yes, Piko is an animal.'