<a href="https://colab.research.google.com/github/aivydebnath/RAG-Model-for-QA-Bot/blob/main/Code/Langchain_With_Gemini_And_Build_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations

In [78]:
! pip install -q --upgrade google-generativeai langchain-google-genai pypdf langchain langchain-community pinecone-client PyPDF2

In [79]:
import urllib
import warnings
from pathlib import Path as p
from pprint import pprint
import pandas as pd
from langchain import PromptTemplate
import os

from IPython.display import display
from IPython.display import Markdown
import textwrap

import google.generativeai as genai
from google.colab import userdata

from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
from pinecone import Pinecone
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone as PineconeVectorStore
from langchain.document_loaders import PyPDFLoader

warnings.filterwarnings("ignore")
# Restart the Python kernel if the langchain import fails.

# API

I do not have OpenAI API access (my credits expired), so this project uses a Google API key instead.

In [80]:
# Read both secrets from the Colab secret store and export them as
# environment variables; the Google key is also kept in a module-level name
# because it is passed explicitly to the Gemini clients below.
os.environ['PINECONE_API_KEY'] = userdata.get('PINECONE_API')
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY

# Markdown

In [None]:
def to_markdown(text):
    """Wrap *text* in an IPython ``Markdown`` object rendered as a block quote.

    Bullet characters ('•') are rewritten as indented Markdown list markers,
    then every line (blank lines included) is prefixed with '> '.
    """
    bulleted = text.replace('•', '  *')
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _: True)
    return Markdown(quoted)


# Use LangChain to Access Gemini API

In [None]:
# Chat model client for "gemini-pro", authenticated with the Google API key.
llm = ChatGoogleGenerativeAI(model="gemini-pro",google_api_key=GOOGLE_API_KEY)

In [None]:
# Send a single prompt to the model and keep its response.
result = llm.invoke("What are the usecases of LLMs?")


In [None]:
# Echo the raw response object.
result

In [None]:
# Render the response text as a Markdown block quote.
to_markdown(result.content)

In [None]:
# Repeat the same prompt against "gemini-1.5-flash" for comparison.
llm2 = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=GOOGLE_API_KEY)
# Query llm2 (not llm) so this cell actually exercises the second model.
result2 = llm2.invoke("What are the usecases of LLMs?")
to_markdown(result2.content)

# RAG

## ITC

### Extract text from the PDF

In [None]:
def read_doc(directory):
    """Load the ITC annual report PDF and return the loaded documents.

    Args:
        directory: Folder containing the PDF. An empty value (as passed by
            this notebook) falls back to the Colab sample-data folder, so
            existing calls behave as before.

    Returns:
        The documents produced by ``PyPDFLoader.load()``.
    """
    file_name = "ITC-Report-and-Accounts-2023.pdf"
    # Honour the caller's directory instead of silently ignoring it.
    base_dir = directory or "/content/sample_data"
    file_loader = PyPDFLoader(os.path.join(base_dir, file_name))
    return file_loader.load()

In [None]:
# Load the report (read_doc is called with an empty argument) and show how
# many documents were returned.
doc=read_doc('')
len(doc)

In [None]:
# Inspect one loaded document as a sanity check.
doc[10]

### RAG Pipeline: Embedding + Gemini (LLM)

In [None]:
# Embedding model used to vectorise the report chunks.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=GOOGLE_API_KEY,
)

# Name of the Pinecone index the chunks are written to.
index_name = "langchainvector"

# Split the loaded documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=1000,
)
texts = text_splitter.split_documents(doc)

In [None]:
texts[0]

In [None]:
# Embed the ITC chunks and upsert them into the Pinecone index.
# Both report sections of this notebook write to the same index name, so a
# dedicated namespace keeps ITC retrieval from returning chunks that belong
# to the other report.
pinecone_vector_store = PineconeVectorStore.from_documents(
    texts,
    embeddings,
    index_name=index_name,
    namespace="itc",
)

In [None]:
# Query the vector store directly for the chunks most similar to the question.
# similar_docs is not read again in the visible notebook; the chains below
# retrieve through their own retriever.
question = "Which quater cigerettes has performed well?"
similar_docs=pinecone_vector_store.similarity_search(question)

In [None]:
# Chat model that generates the answers for the retrieval chain.
model = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    google_api_key=GOOGLE_API_KEY,
    temperature=0.4,
    convert_system_message_to_human=True,
)

In [None]:
# Retrieval QA chain over the Pinecone retriever using the "stuff" chain
# type; the source documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=model,
    chain_type="stuff",
    retriever=pinecone_vector_store.as_retriever(),
    return_source_documents=True,
)

qa_chain.invoke(question)

In [None]:
# Ask about ITC's products. Use .invoke(), as the previous cell does, rather
# than the deprecated direct call on the chain object.
question = "Can you tell the names of the products ITC has?"
result = qa_chain.invoke({"query": question})
result["result"]

In [None]:
Markdown(result["result"])

In [None]:
# Custom answer prompt: answer only from the retrieved context, admit when the
# answer is unknown, stay concise, and end with a fixed sign-off.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Rebuild the retrieval chain so it uses the custom prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=model,
    retriever=pinecone_vector_store.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)


In [None]:
# Same product question, now answered through the custom-prompt chain.
# .invoke() replaces the deprecated direct call on the chain object.
question = "Can you tell the names of the products ITC has?"
result = qa_chain.invoke({"query": question})
result["result"]

In [None]:
Markdown(result["result"])

In [None]:
# Off-topic question to check how the chain responds when the report does not
# cover the subject. .invoke() replaces the deprecated direct call.
question = "Describe Random forest?"
result = qa_chain.invoke({"query": question})
Markdown(result["result"])

## HUL


### Extract text from the PDF

In [None]:
def read_doc(directory):
    """Load the HUL annual report PDF and return the loaded documents.

    Args:
        directory: Folder containing the PDF. An empty value (as passed by
            this notebook) falls back to the Colab sample-data folder, so
            existing calls behave as before.

    Returns:
        The documents produced by ``PyPDFLoader.load()``.
    """
    file_name = "Annual_Report___2022_23__2__bWICfx.pdf"
    # Honour the caller's directory instead of silently ignoring it.
    base_dir = directory or "/content/sample_data"
    file_loader = PyPDFLoader(os.path.join(base_dir, file_name))
    return file_loader.load()

In [None]:
# Load the report (read_doc is called with an empty argument) and show how
# many documents were returned.
doc=read_doc('')
len(doc)

In [None]:
# Inspect the first loaded document as a sanity check.
doc[0]

### RAG Pipeline: Embedding + Gemini(LLM)

In [None]:
# Embedding model used to vectorise the report chunks.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=GOOGLE_API_KEY,
)

# Name of the Pinecone index the chunks are written to.
index_name = "langchainvector"

# Split the loaded documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=1000,
)
texts = text_splitter.split_documents(doc)

In [None]:
texts[0]

In [None]:
# Embed the HUL chunks and upsert them into the Pinecone index.
# Both report sections of this notebook write to the same index name, so a
# dedicated namespace keeps HUL retrieval from returning chunks that belong
# to the other report.
pinecone_vector_store = PineconeVectorStore.from_documents(
    texts,
    embeddings,
    index_name=index_name,
    namespace="hul",
)

In [None]:
# Chat model that generates the answers for the retrieval chain.
model = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    google_api_key=GOOGLE_API_KEY,
    temperature=0.4,
    convert_system_message_to_human=True,
)

In [None]:
# Query the vector store directly for the chunks most similar to the question.
# similar_docs is not read again in the visible notebook; the chains below
# retrieve through their own retriever.
question = "How HUL performed?"
similar_docs=pinecone_vector_store.similarity_search(question)

In [None]:
# Retrieval QA chain over the Pinecone retriever using the "stuff" chain
# type; the source documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=model,
    chain_type="stuff",
    retriever=pinecone_vector_store.as_retriever(),
    return_source_documents=True,
)

qa_chain.invoke(question)

In [None]:
# Multi-part question about HUL. Use .invoke(), as the previous cell does,
# rather than the deprecated direct call on the chain object.
question = "How HUL performed?Is it better than last year?What are the products of HUL?"
result = qa_chain.invoke({"query": question})
result["result"]

In [None]:
Markdown(result["result"])

In [None]:
# Custom answer prompt: answer only from the retrieved context, admit when the
# answer is unknown, stay concise, and end with a fixed sign-off.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible. Always say thanks for asking! Happy to help at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Rebuild the retrieval chain so it uses the custom prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=model,
    retriever=pinecone_vector_store.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)


In [None]:
# Off-topic question to check how the chain responds when the report does not
# cover the subject. .invoke() replaces the deprecated direct call.
question = "What is ML?"
result = qa_chain.invoke({"query": question})
result["result"]

In [None]:
Markdown(result["result"])

In [None]:
# Same multi-part HUL question, now answered through the custom-prompt chain.
# .invoke() replaces the deprecated direct call on the chain object.
question = "How HUL performed?Is it better than last year?What are the products of HUL?"
result = qa_chain.invoke({"query": question})
result["result"]

In [None]:
Markdown(result["result"])

In [None]:
# Narrower product question for a single business segment.
# .invoke() replaces the deprecated direct call on the chain object.
question = "What are the products of HUL in Beauty and Personal Care?"
result = qa_chain.invoke({"query": question})
result["result"]

In [None]:
Markdown(result["result"])

By creating this QA Chat bot, we can