**1. SOURCE DATA SETUP**

---



In [None]:
# Name: interactWithPDFsGemini1.5flash
# model: gemini-1.5-flash
# embeddings: models/embedding-001
# vector DB: faiss (from META)
# v1.0 initial ver to ask a question from uploaded PDFs
# v1.1 extended into a chatbot

In [None]:
# use pathlib path to get the source folder
# pypdf to scan pdfs

!pip install pypdf

from pathlib import Path
from pypdf import PdfReader

# current working directory (of colab): /content/
# create a folder and upload pdfs
# note-these files are not persistent
#import os
#print("Current working directory:", os.getcwd())
#!ls -lh /content/
#!head sourceData/*.pdf

# Sort the matches: glob results come back in filesystem order, and every
# downstream chunk boundary depends on the order the PDFs are concatenated in.
pdfFiles = sorted(Path("sourceData").glob("*.pdf"))
if not pdfFiles:
    print("Warning: no PDF files found in 'sourceData/' - upload them before continuing.")

# Collect the text page by page and join once at the end instead of growing
# a string with += inside the nested loop.
pageTexts = []
for pdfFile in pdfFiles:
    reader = PdfReader(pdfFile)
    for page in reader.pages:
        # `or ""` keeps the concatenation safe if a page yields no text at all
        # (e.g. an image-only page); each page is followed by a blank line.
        pageTexts.append((page.extract_text() or "") + "\n\n")

# text is a single str holding the contents of all PDFs
text = "".join(pageTexts)

# You can cross-check the total number of characters (including spaces) in MS Word for the same document
print("Total Number of characters (all PDFs):", len(text))

# sneak peek at the first 100 chars
#print(text[:100])

Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/290.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/290.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdf
Successfully installed pypdf-4.2.0
Total Number of characters (all PDFs): 11842


**2. BREAK YOUR DOCUMENTS INTO CHUNKS**

---



In [None]:
# Note: Chunks are different from tokens
# prompt: python code to break a document into chunks
# Width of every chunk, measured in characters (not tokens)
chunk_size = 500

# Walk over the text in steps of chunk_size and cut out one window per step;
# the final window is simply shorter when the text length is not a multiple.
chunks = []
for start in range(0, len(text), chunk_size):
    chunks.append(text[start:start + chunk_size])

# chunks is a list of str; its length is len(text) / chunk_size, rounded up
print("Total Number of chunks:", len(chunks))

# Dump each chunk with a 1-based label and a horizontal rule after it
separator = "-" * 80
for position, piece in enumerate(chunks, start=1):
    print(f"Chunk #{position}:")
    print(piece + "\n" + separator + "\n")

Total Number of chunks: 24
Chunk #1:
MADHU R AGARWAL  
a.madhur@gmail.com  
 
PROFILE  SUMMARY  
 
▪ Result driven IT professional with 1 7 years of extensive experience, both offshore and 
onsite, collaborating with businesses and technical teams to provide and maintain 
integrated technology solutions.  
▪ 15 years of rich experience as Microsoft SQL Server DBA handling different versions of SQL 
Server in a high available infrastructure.  
▪ Having worked in databases for most of my career , I’m now exposed to application 
manag
--------------------------------------------------------------------------------

Chunk #2:
ement team as well, which has enabled me to see both sides of the table and thereby 
I’m better equipped to translate business/ customer requirements into scalable and 
sustainable solutions . 
▪ Experience working with PaaS applications hosted on private cloud.  
▪ Experience working with both Monolithic and Microservices based applications.  
▪ Experience working in

**3a. DEFINE MODEL:**

In [None]:
# Setup MODEL
!pip install -q -U google-generativeai
import google.generativeai as gemini

# Fetch the key in notebook
from google.colab import userdata
# Read the key through google.colab's userdata helper rather than hardcoding it in the notebook
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
# Configure the google.generativeai client (imported above as `gemini`) with that key
gemini.configure(api_key=GOOGLE_API_KEY)

# Model handle reused by later cells (count_tokens, generate_content, start_chat)
model = gemini.GenerativeModel('gemini-1.5-flash')

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.9/163.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m718.3/718.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h

**3b. VALIDATE KEY (Optional):**

In [None]:
# requests library is used to make HTTP requests in Python.
!pip install requests
import requests

# Google Generative Language API (the service behind the Gemini models).
# Listing the available models is a cheap, read-only request that is rejected
# when the API key is missing or invalid, which makes it a suitable key check.
# (api.gemini.com belongs to the unrelated Gemini crypto exchange; its public
# ticker answers 200 for any key, so it cannot validate anything.)
GEMINI_API_ENDPOINT = "https://generativelanguage.googleapis.com/v1beta/models"
API_KEY = GOOGLE_API_KEY

def validate_gemini_api_key(api_key):
    """Check a Google Gemini API key by issuing an authenticated models-list request.

    Prints a human-readable verdict (as before) and additionally returns it.

    Args:
        api_key: The API key string to validate.

    Returns:
        True if the endpoint answered with HTTP 200, False otherwise
        (empty key, non-200 status, or a network/request error).
    """
    if not api_key:
        print("Failed to validate API key: no key was provided.")
        return False

    # The key travels in a header rather than the query string, so it does not
    # show up in URLs echoed by error messages.
    headers = {'x-goog-api-key': api_key}
    try:
        # timeout keeps the cell from hanging forever on a stalled connection
        response = requests.get(GEMINI_API_ENDPOINT, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print("Error making request:", e)
        return False

    if response.status_code == 200:
        print("API key is valid.")
        return True

    print("Failed to validate API key. Status code:", response.status_code)
    print("Response:", response.text)
    return False

# Assigned to a name so the boolean result is not echoed as cell output
api_key_is_valid = validate_gemini_api_key(API_KEY)


**4. IT'S ALL ABOUT EMBEDDINGS** 🤓

---



In [None]:
# In case you're interested to know the number of tokens the model sees for the full text:
# count_tokens() returns a response object; the number itself is its total_tokens field,
# so print that instead of the object's repr.
count_tokens = model.count_tokens(text)
print(f"# of Tokens Created: {count_tokens.total_tokens}")

# Define a function to create an embedding for an input "text"
def embed_text(text, task_type="retrieval_document", title="Embedding Example using a single string"):
    """Create an embedding vector for `text` with the models/embedding-001 model.

    Args:
        text: The string to embed.
        task_type: Embedding task type passed to the API. Defaults to
            "retrieval_document" (corpus chunks); pass "retrieval_query"
            when embedding a user question.
        title: Optional document title. It is only sent for
            "retrieval_document" requests, because a title is only
            applicable to document embeddings.

    Returns:
        The embedding as a list of floats (result['embedding']).
    """
    request = {
        "model": "models/embedding-001",
        "content": text,
        "task_type": task_type,
    }
    # str() + endswith also copes with enum-style task types such as TaskType.RETRIEVAL_DOCUMENT
    is_document = str(task_type).lower().endswith("retrieval_document")
    if is_document and title:
        request["title"] = title
    result = gemini.embed_content(**request)
    return result['embedding']

# Loop through the chunks (of your PDFs) to generate one embedding per chunk.
# An explicit loop is kept so `chunk` and `embedding` stay available for inspection afterwards.
embeddings = []
for chunk in chunks:
  embedding = embed_text(chunk)
  embeddings.append(embedding)

# Fail early with a clear message instead of an IndexError on embeddings[0] below
if not embeddings:
  raise ValueError("No chunks to embed - check that the PDFs were loaded and chunked.")

# Show a short preview only: the full list holds number_chunks x dimension floats
# and would flood the cell output.
print(f"\nEmbedding preview (first chunk, first 5 values): {embeddings[0][:5]}\n")

# The loop above creates a LIST of LISTS of size [number_chunks, dimension];
# the dimension is pre-defined by the embedding model.
# eg: number_chunks = 24, dim = 768
# embeddings = [[0.011, ... 768 entries], [0.0543, ... 768 entries], ... 24 times]
# These are the vector embeddings.

# Print the length of the embeddings list:
print(f"Number of embeddings (shud be = number_chunks): {len(embeddings)}")

# Get dimension used by model to create the embeddings:
print(f"Embedding dimension: {len(embeddings[0])}")

# ADD THESE EMBEDDINGS TO A NUMPY ARRAY (so that we can use FAISS vector DB)
import numpy as np

# 2-D array with one row per chunk: shape = [number_chunks, dimension]
embeddings_array = np.array(embeddings)
print(embeddings_array.shape)

# Sneak peek into our embeddings: first 10 values of the first chunk's vector
embeddings_array[0, :10]

# Types, for reference:
#   chunk -> str, chunks -> LIST of str
#   embedding -> LIST of float, embeddings -> LIST of LISTS
#   embeddings_array -> numpy.ndarray

(24, 768)


array([ 2.70025940e-02, -2.70575730e-02, -2.82738070e-04, -5.80020950e-03,
        6.87707500e-02,  1.75612870e-02,  2.00370530e-02, -1.93604770e-02,
        2.64642520e-02,  3.33419480e-02, -2.11382880e-02,  1.86149650e-02,
       -5.51418480e-02, -1.88388410e-02,  2.13367750e-02, -3.70026120e-02,
        7.41483100e-03, -2.28109210e-02,  1.79241150e-02, -2.00097360e-02,
       -2.24137720e-02, -3.04788280e-02, -1.54856190e-02, -2.01137400e-02,
       -3.52461660e-03, -2.07095740e-02,  1.02008480e-02, -5.25509750e-02,
        9.15406400e-03,  8.14321860e-02, -3.75854630e-02,  7.05378800e-02,
       -6.63678000e-02,  4.16289680e-02,  1.15788365e-02, -5.63190840e-02,
       -6.09553940e-02, -1.05222080e-02, -4.75340860e-02,  5.85627740e-02,
       -1.64656840e-02, -3.53244020e-02, -7.97780600e-03, -7.45797800e-03,
        2.04435850e-02, -2.58559100e-02, -1.33725290e-02, -2.20334590e-02,
        3.58249250e-02, -4.32286700e-02,  1.11658950e-02, -2.11980440e-02,
        7.70043500e-02,  

**5. SAVE THESE (VECTOR) EMBEDDINGS TO A (VECTOR) DB:**

In [None]:
# Once we have the embeddings, we store them in a vector database for efficient processing and retrieval.
# Here we use Faiss, an open-source vector db developed by Meta.
!pip install faiss-gpu
# !pip install faiss-cpu
import faiss

# print("FAISS version:", faiss.__version__)

# FAISS works on float32 matrices, so cast once up front
# (astype changes the dtype; it is not a memory-layout conversion).
vectors_f32 = embeddings_array.astype('float32')

# Number of columns = embedding dimension; rows = chunks
dim = embeddings_array.shape[1]

# Flat (exhaustive) L2-distance index sized to the embedding dimension;
# FAISS offers other index types as well.
index = faiss.IndexFlatL2(dim)

# Register the [number_chunks, dimension] matrix so the chunks become searchable
index.add(vectors_f32)

# WE HAVE FINISHED CREATING OUR (DOMAIN KNOWLEDGE) AI COMPATIBLE DB

**6. QUESTION/ ANSWER TIME - RAG IN ACTION:** 🥳

In [None]:
# WITHOUT RAG:
# model.generate_content("Who is Madhur Agarwal?").text

# The user query also needs to be embedded so that matching CONTEXT can be retrieved
# Use the function created above to create the embeddings for user query:
question = "Who is Madhur Agarwal?"
# The question is embedded with task_type="retrieval_query"; the corpus chunks
# in the index were embedded with task_type="retrieval_document".
question_embed = np.array([
    gemini.embed_content(
        model="models/embedding-001",
        content=question,
        task_type="retrieval_query",
    )['embedding']
])

# Retrieval (R): the k=2 nearest chunks by L2 distance
D, I = index.search(question_embed.astype('float32'), k=2)  # distance, index
# FAISS pads the result with -1 when fewer than k vectors are indexed;
# skip those so chunks[-1] is not picked up by accident.
retrieved_chunk = [chunks[i] for i in I.tolist()[0] if i >= 0]

# Join the chunks into plain text: interpolating the list itself would put its
# Python repr (brackets, quotes, escaped newlines) into the prompt.
context = "\n\n".join(retrieved_chunk)

# Create a prompt for our model, Augmenting (A) the context:
prompt = f"""
Context information is below.
---------------------
{context}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {question}
Answer:
"""
# Generate (G) a response:
model.generate_content(prompt).text

"Unfortunately, without more information, it's impossible to tell you who Madhur Agarwal is. There are many people with that name, and they could be involved in a wide range of professions and activities. \n\nTo help me find the right Madhur Agarwal for you, could you please tell me:\n\n* **What is Madhur Agarwal known for?** Are they a scientist, an artist, a businessperson, a writer, or something else?\n* **What is the context of your question?** Are you looking for information about them for a research project, a personal connection, or something else?\n\nWith more information, I can provide a more helpful answer. \n"

**7. EXTEND INTO A CHATBOT:**

In [None]:
# Convert the above RAG question/answer flow into a chat model using the ChatSession class instead of generate_content
# ChatSession.send_message method returns the same GenerateContentResponse type as GenerativeModel.generate_content.
# It also appends your message and the response to the chat history

import textwrap
from IPython.display import display, Markdown

# function to  convert text to rendered markdown (user friendly)
def to_markdown(text):
  """Wrap model output in an IPython Markdown object for rich display.

  Bullet characters ('•') are rewritten as markdown list markers ('  *');
  everything else in the text is passed through unchanged.
  """
  bulleted = text.replace('•', '  *')
  return Markdown(bulleted)

# RAG Chat function
def rag_chat(question, chat=None, k=2):
  """Answer `question` with retrieval-augmented generation over the indexed chunks.

  Args:
    question: The user's question as a string.
    chat: Optional chat session (as returned by model.start_chat) to send the
      prompt through. Pass the same session on every call to keep the
      conversation history; when omitted, a fresh session with empty history
      is started for this call, so nothing is remembered between calls.
    k: Number of nearest chunks to retrieve as context (default 2).

  Returns:
    A string with the question and the model's answer, labelled "User:" and
    "Model:" in bold via ANSI escape codes.
  """
  # Embed the question as a query; the corpus chunks in the index were
  # embedded with task_type="retrieval_document".
  query_vector = gemini.embed_content(
      model="models/embedding-001",
      content=question,
      task_type="retrieval_query",
  )['embedding']
  question_embed = np.array([query_vector]).astype('float32')

  # (R)etrieval: k nearest chunks by distance
  D, I = index.search(question_embed, k=k)  # distance, index
  # FAISS pads with -1 when fewer than k vectors are indexed; skip those so
  # chunks[-1] is not picked up by accident.
  retrieved_chunk = [chunks[i] for i in I.tolist()[0] if i >= 0]

  # (A)ugment: join the chunks into plain text; interpolating the list itself
  # would put its Python repr (brackets, quotes, escaped newlines) in the prompt.
  context = "\n\n".join(retrieved_chunk)
  prompt = "\n".join([
      "",
      "Context information is below.",
      "---------------------",
      context,
      "---------------------",
      "Given the context information and not prior knowledge, answer the query.",
      f"Query: {question}",
      "Answer:",
      "",
  ])

  # (G)enerate: reuse the caller's session if given, else start a new one
  if chat is None:
    chat = model.start_chat(history=[])
  response = chat.send_message(prompt).text

  # Return the answer in a user-friendly format (\033[1m = bold, \033[0m = reset)
  return f"\033[1mUser:\033[0m {question}\n\n\033[1mModel:\033[0m {response}"

# Ask a series of questions; each one is sent through rag_chat on its own
# and the formatted reply is printed straight away.
questions = [
    "Who is Madhur Agarwal?",
    "Does he has any experience in python?",
    "Has he worked in RBS?",
    "Has he been to USA?",
    "What are his contact details?",
]

for question in questions:
  answer = rag_chat(question)
  print(answer)


[1mUser:[0m What are his contact details?

[1mModel:[0m The provided context does not include any contact details for the individual. 

