In [None]:
!pip install -q --upgrade langchain langchain_google_genai langchain-core langchain_community docs2txt pypdf langchain_chroma sentence_transformers

## **What is Retrieval Augmented Generation (RAG)?**







### **LangChain Components and Expression Language (LCEL)**
1. Large Language Model (LLM)
2. Output Parsers
3. Structured Output
4. Prompt Templates
5. LLM Messages

In [None]:
import os
# Ask for the Gemini API key only when the environment does not already provide one.
# The previous unconditional assignment replaced any configured key with an empty string.
from getpass import getpass

if not os.environ.get("GOOGLE_API_KEY"):
  os.environ["GOOGLE_API_KEY"]=getpass("Enter your Google API key: ")

In [None]:
# Tracing-related environment variables, set in one call.
# The API key and project name are intentionally left blank here and must be filled in by the reader.
os.environ.update({
    "LANGCHAIN_TRACING_V2": "true",
    "LANGCHAIN_API_KEY": "",
    "LANGCHAIN_PROJECT": "",
})

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Build the chat model once; later cells reuse `llm` and `llm_response`.
joke_request="Tell me a simple joke about coding!"
llm=ChatGoogleGenerativeAI(model="gemini-1.5-flash")
llm_response=llm.invoke(joke_request)
# Bare last expression so the notebook displays whatever `invoke` returned.
llm_response

In [None]:
from langchain_core.output_parsers import StrOutputParser

# Run the parser on the reply produced in the previous cell and display the result.
output_parser=StrOutputParser()
parsed_reply=output_parser.invoke(llm_response)
parsed_reply

In [None]:
# Same composition as `llm | output_parser`, spelled with the explicit `pipe` method.
chain=llm.pipe(output_parser)
story_request="Tell me a story!"
chain.invoke(story_request)

In [None]:
from typing import List
from pydantic import BaseModel,Field

# Schema handed to `llm.with_structured_output` further down in this cell.
# Each field's `description` is part of the schema, so the wording is left untouched.
# NOTE(review): documented with `#` comments on purpose; a class docstring on a pydantic
# model may be picked up as the schema description — confirm before converting.
class ReviewDSA(BaseModel):
  # Name of the topic under review.
  topic:str=Field(description="Name of the DSA topic")
  # Short free-text summary of the review.
  summary: str=Field(description="Brief summary of the review")
  # Numeric score; the description states it is out of 5 (not enforced by the type).
  rating: float=Field(description="Overall rating out of 5")
  # Positive points mentioned in the review.
  pros: List[str]=Field(description="List of positive aspects")


# Free-form review text used as the input for structured extraction.
prompt_text='''
 Just started learning Dynamic Programming, and wow, this topic is mind-blowing! The way it optimizes
    recursive solutions is just amazing. It really helps in solving problems efficiently that involve
    overlapping subproblems, like Fibonacci and Knapsack.

    But not gonna lie, it's tough at first. You really need to practice a lot to get the hang of thinking
    in terms of subproblems. Also, memorization techniques can be confusing, and it’s easy to mess up
    the transition from recursion to tabulation.

    Overall, I'd rate it a 4.5 out of 5. Once you get it, it’s a game-changer for problem-solving.
    Definitely worth the effort!
'''

# Wrap the model so its reply is shaped by the ReviewDSA schema, then run it on the review.
structured_llm=llm.with_structured_output(ReviewDSA)
output=structured_llm.invoke(prompt_text)
# Print the whole result first, then only its `pros` field.
print(output)
print(output.pros)

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Single-variable template; the display below shows it filled in with a sample topic.
joke_template="Tell me a joke about {topic}"
prompt=ChatPromptTemplate.from_template(joke_template)
prompt.invoke(dict(topic="sports"))

In [None]:
# Pipeline: fill the template, call the model, then run the output parser.
chain=(
    prompt
    | llm
    | output_parser
)
result=chain.invoke(dict(topic="coding"))
print(result)

In [None]:
from langchain_core.messages import HumanMessage,SystemMessage

# Build the conversation turn by turn: a system instruction followed by the user's request.
system_turn=SystemMessage(content="You are a helpful coding assistant!")
human_turn=HumanMessage(content="Tell me a joke about Debugging in code")
messages=[system_turn,human_turn]

# The model is invoked with the message list directly, without a prompt template.
response=llm.invoke(messages)
print(response)

In [None]:
# Role-tagged template; `{topic}` in the human turn is filled in at invoke time.
role_messages=[
    ("system","You are a helpful coding assistant"),
    ("human","tell me a joke about {topic}"),
]
template=ChatPromptTemplate.from_messages(role_messages)

# No output parser here, so the printed value is the model's raw return value.
chain=template | llm
result=chain.invoke(dict(topic="coding"))
print(result)

In [None]:
!pip install Docx2txt

### **Document Processing**
- Loading Documents
- Splitting Documents

In [None]:
from langchain_community.document_loaders import PyPDFLoader,Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
from langchain_core.documents import Document
import os

# Chunks are measured with `len`, i.e. up to 1000 characters each with a 200-character overlap.
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200,length_function=len)

# Load one sample .docx file (the original line carried stray quote characters and did not parse).
docx_loader=Docx2txtLoader("/content/docs/SAMPLE_DOCUMENT.docx")
documents=docx_loader.load()

# Split the loaded documents and report chunk count followed by document count.
splits=text_splitter.split_documents(documents)
print(f"splitted with {len(splits)} chunks")
print(len(documents))


In [None]:
# Display the first loaded document.
documents[0]

In [None]:
# Display the first chunk produced by the splitter.
splits[0]

In [None]:
def load_documents(folder_path : str)-> List[Document]:
  """Load every PDF and DOCX file found directly inside `folder_path`.

  Args:
    folder_path: Directory whose entries are listed with `os.listdir`.

  Returns:
    A new list holding whatever each loader's `load()` returned, in
    directory-listing order. Entries with any other extension are reported
    with a printed message and skipped.
  """
  # Accumulate into a local list. The original misspelled this name, so the
  # function extended and returned the notebook-level `documents` variable instead.
  documents=[]
  for filename in os.listdir(folder_path):
    file_path=os.path.join(folder_path,filename)
    if filename.endswith('.pdf'):
      loader=PyPDFLoader(file_path)
    elif filename.endswith('.docx'):
      loader=Docx2txtLoader(file_path)
    else:
      print(f"unsupported file type : {filename}")
      continue
    documents.extend(loader.load())
  return documents

# Load everything in the docs folder, then re-split with the splitter configured earlier.
folder_path='/content/docs'
documents=load_documents(folder_path)
loaded_count=len(documents)
print(f"loaded {loaded_count} from folder")

splits=text_splitter.split_documents(documents)
chunk_count=len(splits)
print(f"splitted documents into {chunk_count} chunks")


In [None]:
# Display the second loaded document (raises IndexError if fewer than two were loaded).
documents[1]

In [None]:
# Display the fifth chunk (raises IndexError if there are fewer than five chunks).
splits[4]


### **Creating Embeddings**
- Using GoogleAI Embeddings
- Using SentenceTransformer

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# Embed the text of every chunk in a single call and report how many vectors came back.
embedding=GoogleGenerativeAIEmbeddings(model="models/embedding-001")
chunk_texts=[chunk.page_content for chunk in splits]
document_embedding=embedding.embed_documents(chunk_texts)
embedded_count=len(document_embedding)
print(f"created embedding for {embedded_count} documents chunks")


### **Setting Up the Vector Store**
- Creating the Vector Store
- Performing Similarity Search
- Creating a Retriever

In [None]:
from langchain_chroma import Chroma

embedding_function=GoogleGenerativeAIEmbeddings(model="models/embedding-001")
collection_name="my_collection"

# Give each chunk a stable, position-based id so re-running this cell against the
# persisted directory overwrites existing entries instead of appending duplicates.
# NOTE(review): relies on langchain_chroma upserting when explicit ids are supplied —
# confirm for the installed version. Chunks left over from a longer previous run are not removed.
chunk_ids=[f"chunk-{position}" for position in range(len(splits))]

vectorstore=Chroma.from_documents(
    collection_name=collection_name,
    documents=splits,
    embedding=embedding_function,
    ids=chunk_ids,
    persist_directory="./chroma_db"
)

print("vector store created and persisted to './chroma_db' ")

In [None]:
# Fetch the two nearest chunks for the query and print each with its source and text.
query="what is the resume summary?"

search_results=vectorstore.similarity_search(query,k=2)
print(f"\n Top 2 most relevant search for the query : '{query}' \n ")
for rank,hit in enumerate(search_results,start=1):
  # Fall back to 'Unknown' when the chunk's metadata has no 'source' entry.
  origin=hit.metadata.get('source','Unknown')
  print(f"result {rank}:")
  print(f"source: {origin}")
  print(f"content : {hit.page_content}")
  print()

In [None]:
# Wrap the vector store as a retriever limited to two results, then try it on a sample question.
retriever=vectorstore.as_retriever(search_kwargs=dict(k=2))
candidate_query="who is the candidate whose cv we are seeing?"
retriever_results=retriever.invoke(candidate_query)
print(retriever_results)








### **Building the RAG Chain**
- Creating the RAG Chain
- Using the RAG Chain

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Prompt text with two placeholders: `{context}` (retrieved text) and `{question}` (user query).
# The literal is sent as-is, so its whitespace and wording are part of the prompt.
template="""
Answer the question based only on the following context:
{context}
Question:{question}
Answer: """


# Rebinds `prompt` (previously the joke template) to the RAG prompt.
prompt=ChatPromptTemplate.from_template(template)

def docs2str(docs):
  """Return the `page_content` of every item in `docs`, joined by a blank line."""
  contents=[item.page_content for item in docs]
  separator="\n\n"
  return separator.join(contents)

# Input mapping: "context" is the retriever output flattened to text by `docs2str`;
# "question" is the chain's input passed through unchanged.
rag_inputs={"context":retriever | docs2str,"question":RunnablePassthrough()}

# Mapping -> prompt -> model -> string parser.
rag_chain=rag_inputs | prompt | llm | StrOutputParser()


In [None]:
# Ask one question through the RAG chain and print the question and answer on separate lines.
question="which college does the person studies?"
response=rag_chain.invoke(question)
print(f"question: {question}",f"response: {response}",sep="\n")


### **Handling Follow-Up Questions**
- Creating a History-Aware Retriever
- Using the History-Aware RAG Chain

In [None]:
from decimal import Context
from langchain_core.prompts import MessagesPlaceholder
from langchain.chains import create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain


# System instruction telling the model to rewrite a follow-up into a standalone question
# without answering it.
context_q_system_prompt="""
Given a chat history and latest user question
 which might refernce context in chat history,
formulate a standalone quesion which can be understood without the chat history.
Do not answer the question,
just formulate it if needed and otherwise return as it is.
"""

# Message layout: instruction, then prior turns, then the new user input.
context_q_messages=[
    ("system",context_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human","{input}"),
]
context_q_prompt=ChatPromptTemplate.from_messages(context_q_messages)

# Quick check of the rewriting step on its own, with an empty history.
context_chain=context_q_prompt | llm | StrOutputParser()
standalone_preview=context_chain.invoke({"input":"what does his cgpa? ","chat_history":[]})
print(standalone_preview)

In [None]:
from langchain.chains import create_retrieval_chain

# Retriever built from the model, the base retriever and the question-rewriting prompt.
history_aware_retriever=create_history_aware_retriever(llm,retriever,context_q_prompt)

# Answering prompt: instruction, retrieved context, prior turns, then the new user input.
qa_messages=[
    ("system","you are a helpfull AI assistant. use following context to answer the user's question."),
    ("system","Context:{context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human","{input}"),
]
qa_prompt=ChatPromptTemplate.from_messages(qa_messages)

# Rebinds `rag_chain` (previously the LCEL chain) to the history-aware retrieval chain.
question_answer_chain=create_stuff_documents_chain(llm,qa_prompt)
rag_chain=create_retrieval_chain(history_aware_retriever,question_answer_chain)

In [None]:
from langchain_core.messages import HumanMessage,AIMessage

# Start a fresh conversation and ask the first question.
chat_history=[]
question1="where was his college?"
first_turn=rag_chain.invoke({"input":question1,"chat_history":chat_history})
answer1=first_turn['answer']

# Record both sides of the exchange so the next question can refer back to it.
chat_history+=[HumanMessage(content=question1),AIMessage(content=answer1)]

print(question1)
print(answer1)

In [None]:
# Follow-up question; it is sent together with the history accumulated so far.
question2="what was his sgpa and his college name?"
second_turn=rag_chain.invoke({"input":question2,"chat_history":chat_history})
answer2=second_turn['answer']

# Append this exchange to the same history list.
chat_history+=[HumanMessage(content=question2),AIMessage(content=answer2)]

print(question2)
print(answer2)