## This module uses langchain-google-genai, pypdf, langchain, and chromadb to create a chatbot that can answer questions based on a PDF document

In [1]:
!pip install -U langchain-google-genai pypdf  langchain chromadb


Collecting langchain-google-genai
  Downloading langchain_google_genai-0.0.6-py3-none-any.whl.metadata (2.7 kB)
Downloading langchain_google_genai-0.0.6-py3-none-any.whl (15 kB)
Installing collected packages: langchain-google-genai
  Attempting uninstall: langchain-google-genai
    Found existing installation: langchain-google-genai 0.0.5
    Uninstalling langchain-google-genai-0.0.5:
      Successfully uninstalled langchain-google-genai-0.0.5
Successfully installed langchain-google-genai-0.0.6


In [2]:
import urllib
import warnings
from pathlib import Path as p
from pprint import pprint

import pandas as pd
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from pathlib import Path as p
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings
warnings.filterwarnings("ignore")

### Initializing DB

In [3]:
import chromadb
chroma_client = chromadb.Client()

In [4]:
collection = chroma_client.create_collection(name="my_collection")

In [5]:
## Read the API key from a file
with open('secrete.txt', 'r') as file:
    Api_key = file.read().strip()
from langchain_google_genai import ChatGoogleGenerativeAI
# Set the environment variable for Google API
import os
GOOGLE_API_KEY = Api_key
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
from langchain_google_genai import ChatGoogleGenerativeAI
model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")                             

In [6]:
# Set the environment variable for Google API
model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")
history = []
vector_store = None

In [7]:
# Define the prompt template for the chatbot
prompt_template = """
  Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
  provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
  Context:\n {context}?\n
  Question: \n{question}\n
  History: \n{history}\n

  Answer:
"""

In [8]:
# Create an instance of the PromptTemplate class
prompt = PromptTemplate(template = prompt_template, input_variables = ["context", "question", "history"])


In [9]:

# Load the question answering chain
chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)

In [10]:

# Define a function to handle the user input
def handle_input(user_input):
  
  history.append(f"User: {user_input}")

 
  if user_input.startswith("!"):
    
    handle_command(user_input)
  else:
    
    handle_question(user_input)

In [11]:
# Define a function to handle the commands
def handle_command(command):
 
  if command == "!quit":
    exit()
  elif command == "!clear":
    history.clear()
    print("History cleared.")
  else:
    print("Invalid command. Please try again.")


In [12]:
# Define a function to handle the questions
def handle_question(question):
      # Check if the vector store is initialized
    if vector_store is None:
        initialize_vector_store()
        
    docs = vector_store.get_relevant_documents(question)
    
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question", "history"])
    response = chain(
        {"input_documents": docs, "question": question, "history": "\n".join(history[-2:])},
        return_only_outputs=True
    )
    history.append(f"model: {response}")
    print()
    print(f"Chatbot: {response}")
    print()
    print('*******************************************************************************************************************************')


In [13]:
# Define a function to initialize the vector store
def initialize_vector_store():
    data_folder = p.cwd() / "data"
    p(data_folder).mkdir(parents=True, exist_ok=True)

    pdf_url = "https://services.google.com/fh/files/misc/practitioners_guide_to_mlops_whitepaper.pdf"
    pdf_file = str(p(data_folder, pdf_url.split("/")[-1]))

    urllib.request.urlretrieve(pdf_url, pdf_file)
    pdf_loader = PyPDFLoader(pdf_file)
    data = pdf_loader.load_and_split()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=200)
    context = "\n\n".join(str(p.page_content) for p in data)

    texts = text_splitter.split_text(context)     

  # Embed the chunks and save them to the vector store
    global vector_store
    vector_store = Chroma.from_texts(texts, embeddings).as_retriever()
    print("Vector store initialized.")

In [None]:

print("Hello, this is a chatbot that can answer your questions based on a PDF document. You can also use the following commands:\n!quit: Exit the chatbot\n!clear: Clear the history")
while True:
  
  user_input = input("User: ")

  # Handle the user input
  handle_input(user_input)

Hello, this is a chatbot that can answer your questions based on a PDF document. You can also use the following commands:
!quit: Exit the chatbot
!clear: Clear the history
User: "Give overview of MLOps"
Vector store initialized.

Chatbot: {'output_text': 'MLOps is a set of practices that aims to streamline the lifecycle of machine learning (ML) models, from development to deployment and monitoring. It involves automating and integrating the various stages of the ML lifecycle, including data preparation, model training, evaluation, deployment, and monitoring. The goal of MLOps is to improve the efficiency, reliability, and governance of ML systems.\n\nKey components of MLOps include:\n\n1. Experimentation: This involves setting up an environment for data scientists and ML engineers to explore and develop ML models. It includes tools for data exploration, model prototyping, and tracking experiments.\n\n2. Data processing: This involves preparing and transforming data for ML training and 

In [None]:
print(history)