Install required packages

In [1]:
!pip install -q google-generativeai pinecone PyPDF2 sentence-transformers langchain-text-splitters

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/587.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m481.3/587.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m583.7/587.6 kB[0m [31m12.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.6/587.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━

Import Libraries

In [2]:
import os
import time
import PyPDF2
import io
from typing import List, Dict, Any
import google.generativeai as genai
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from google.colab import files

Configuration

In [3]:
# --- Model selection --------------------------------------------------------
GEMINI_MODEL: str = "gemini-1.5-flash"       # passed to genai.GenerativeModel
EMBEDDING_MODEL: str = "all-MiniLM-L6-v2"    # passed to SentenceTransformer

# --- Vector store -----------------------------------------------------------
PINECONE_INDEX_NAME: str = "business-qa-bot"

# --- Chunking ---------------------------------------------------------------
# Both values are measured in characters, because the text splitter is built
# with length_function=len.
CHUNK_SIZE: int = 200
CHUNK_OVERLAP: int = 40

# --- Retrieval --------------------------------------------------------------
TOP_K_RETRIEVAL: int = 5                     # chunks fetched per question

Set API Keys

In [4]:
from google.colab import userdata


# Read both API keys from the Colab secret store so they are never written
# into the notebook. Note that the Gemini key is stored under the secret name
# 'GOOGLE_API_KEY'.
GEMINI_API_KEY = userdata.get('GOOGLE_API_KEY')
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')

# Configure the google.generativeai module with the Gemini key.
genai.configure(api_key=GEMINI_API_KEY)

# Pinecone client; later cells use it to create, delete and open the index.
pc = Pinecone(api_key=PINECONE_API_KEY)

Initialize Models

In [5]:
# Initialize embedding model (the sentence-transformers model named by
# EMBEDDING_MODEL). It embeds both the document chunks and the user questions.
embedding_model = SentenceTransformer(EMBEDDING_MODEL)

# Initialize Gemini model; generate_answer() calls it to write the final answer.
gemini_model = genai.GenerativeModel(GEMINI_MODEL)

# Initialize text splitter. With length_function=len, CHUNK_SIZE and
# CHUNK_OVERLAP are counted in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Set up the Pinecone index

In [26]:
def _wait_until(condition, timeout_s=120.0, poll_s=1.0):
    """Poll ``condition()`` until it returns a truthy value.

    Args:
        condition: Zero-argument callable that is re-evaluated on every poll.
        timeout_s: Maximum number of seconds to keep polling.
        poll_s: Seconds to sleep between two polls.

    Raises:
        TimeoutError: If ``condition()`` is still falsy after ``timeout_s``.
    """
    deadline = time.monotonic() + timeout_s
    while not condition():
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Timed out after {timeout_s:.0f}s waiting for Pinecone."
            )
        time.sleep(poll_s)


# Delete the index if a previous run left one behind, so every run starts from
# an empty index. Rather than sleeping for a fixed time, wait until the name is
# really gone before re-creating it.
if PINECONE_INDEX_NAME in pc.list_indexes().names():
    pc.delete_index(PINECONE_INDEX_NAME)
    _wait_until(lambda: PINECONE_INDEX_NAME not in pc.list_indexes().names())

# Create the new index. The dimension is read from the embedding model so it
# stays correct if EMBEDDING_MODEL is changed (the original notebook
# hard-coded 384 here).
pc.create_index(
    name=PINECONE_INDEX_NAME,
    dimension=embedding_model.get_sentence_embedding_dimension(),
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

# Block until Pinecone reports the index as ready, then open a handle to it.
_wait_until(lambda: pc.describe_index(PINECONE_INDEX_NAME).status["ready"])
index = pc.Index(PINECONE_INDEX_NAME)

In [28]:
# extract text from PDF

def extract_text_from_pdf(pdf_content):
    """Extract the text of every page of a PDF.

    Args:
        pdf_content: Raw bytes of the PDF file.

    Returns:
        The page texts joined by newlines, with leading and trailing
        whitespace removed.
    """
    reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
    page_texts = [page.extract_text() for page in reader.pages]
    return "\n".join(page_texts).strip()

In [29]:
# split text into chunks

def create_chunks(text, min_length=50):
    """Split text into chunks and drop fragments that are too short.

    Args:
        text: The full document text.
        min_length: A chunk is kept only if its stripped length is strictly
            greater than this many characters. Defaults to 50, the threshold
            the notebook originally hard-coded.

    Returns:
        List of chunk strings produced by the module-level ``text_splitter``,
        in their original order, without the short fragments.
    """
    return [
        chunk
        for chunk in text_splitter.split_text(text)
        if len(chunk.strip()) > min_length
    ]

In [30]:
# embedding

def get_embeddings(texts):
    """Embed texts with the module-level ``embedding_model``.

    Args:
        texts: The texts to embed.

    Returns:
        The result of ``embedding_model.encode`` converted with ``.tolist()``.
    """
    encoded = embedding_model.encode(texts, convert_to_tensor=False)
    return encoded.tolist()

In [31]:
# store embedding into pinecone

def store_in_pinecone(chunks, embeddings, batch_size=100):
    """Upsert chunk embeddings into the module-level Pinecone ``index``.

    Each vector gets the id ``chunk_<position>`` and carries the chunk text in
    its metadata under the key ``content``.

    Args:
        chunks: List of chunk strings.
        embeddings: List of embedding vectors, one per chunk, in the same order.
        batch_size: Number of vectors sent per ``index.upsert`` call.

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length. Without
            this check ``zip`` would silently drop the unmatched items.
    """
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"Got {len(chunks)} chunks but {len(embeddings)} embeddings."
        )

    vectors = [
        {
            'id': f"chunk_{i}",
            'values': embedding,
            'metadata': {'content': chunk}
        }
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
    ]

    # Upsert in slices so that no single request carries every vector.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])

In [32]:
# retrieve similar chunks

def search_similar_chunks(query, top_k=5):
    """Find the stored chunks closest to a query.

    Args:
        query: The question text; embedded with ``embedding_model``.
        top_k: Number of matches requested from the index.

    Returns:
        One dict per match, in the order the index returned them, with keys
        ``content`` (the chunk text from the match metadata) and ``score``.
    """
    query_vector = embedding_model.encode(query, convert_to_tensor=False).tolist()
    response = index.query(vector=query_vector, top_k=top_k, include_metadata=True)

    hits = []
    for match in response['matches']:
        hits.append({
            'content': match['metadata']['content'],
            'score': match['score'],
        })
    return hits

In [33]:
# Generate the answer using the LLM and the top retrieved context

def generate_answer(question, context):
    """Ask the Gemini model to answer a question from the supplied context.

    Args:
        question: The user's question.
        context: Retrieved document text the answer must be based on.

    Returns:
        The model's answer text. If the response carries no usable text
        (for example because it was blocked or came back empty), a short
        explanatory message is returned instead of raising.
    """
    prompt = f"""Based on the following context from a business document, answer the question.

Context:
{context}

Question: {question}

Instructions:
- Answer based only on the provided context
- Be concise and professional
- If the context doesn't have enough information, say so

Answer:"""

    response = gemini_model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(
            max_output_tokens=1000,
            temperature=0.3
        )
    )

    # The `.text` accessor raises ValueError when the response holds no valid
    # content part; handle that here so one such question does not abort the
    # interactive session.
    try:
        return response.text
    except ValueError:
        return "The model did not return an answer (the response was empty or blocked)."

In [34]:
# Upload document and process it

# Ask the user for a PDF, then run it through the pipeline:
# extract text -> split into chunks -> embed -> store in Pinecone.
print("Please upload your business Document(PDF) :")
uploaded_files = files.upload()

if not uploaded_files:
    print("No file uploaded!")
else:
    # Only the first uploaded file is processed; say so when more were sent.
    file_name = next(iter(uploaded_files))
    file_content = uploaded_files[file_name]
    if len(uploaded_files) > 1:
        print(f"Note: {len(uploaded_files)} files uploaded; only the first one is processed.")

    print(f"Processing: {file_name}")

    # Extract text
    text = extract_text_from_pdf(file_content)
    print(f"Extracted {len(text)} characters")

    # Create chunks
    chunks = create_chunks(text)
    print(f"Created {len(chunks)} chunks")

    if not chunks:
        # A PDF without extractable text yields no chunks. Stop here rather
        # than report a successful upload of nothing.
        print("No usable text found in the document; nothing was stored.")
    else:
        # Generate embeddings
        embeddings = get_embeddings(chunks)
        print("Embeddings Done")

        # Store in Pinecone
        store_in_pinecone(chunks, embeddings)
        print("Document stored into pinecone.")

Please upload your business Document(PDF) :


Saving Corporate Brochure_200707.pdf to Corporate Brochure_200707.pdf
Processing: Corporate Brochure_200707.pdf
Extracted 33187 characters
Created 201 chunks
Embeddings Done
Document stored into pinecone.


In [35]:
# Define the question-answering function

def ask_question(question):
    """Answer one question from the indexed document and print the result.

    Retrieves the TOP_K_RETRIEVAL most similar chunks, joins them into one
    context string, and asks the LLM for an answer. The printed "Confidence"
    is the mean similarity score of the retrieved chunks.

    Args:
        question: The user's question.

    Returns:
        The answer text, or "No relevant information found." when the search
        returns no chunks.
    """
    print(f"Question: {question}")

    # Search for relevant chunks
    relevant_chunks = search_similar_chunks(question, TOP_K_RETRIEVAL)

    if not relevant_chunks:
        # Print the message as well as returning it: the interactive loop
        # discards the return value, so the user would otherwise see nothing.
        answer = "No relevant information found."
        print(f"Answer: {answer}")
        print("-" * 70)
        return answer

    # Combine context
    context = "\n\n".join(chunk['content'] for chunk in relevant_chunks)

    # Generate answer
    answer = generate_answer(question, context)

    # Calculate confidence
    avg_score = sum(chunk['score'] for chunk in relevant_chunks) / len(relevant_chunks)

    print(f"Answer: {answer}")
    print(f"Confidence: {avg_score:.2f}")
    print("-" * 70)

    return answer

Define Interactive QA Pipeline

In [36]:
# Interactive question loop: read a question, answer it, repeat until the user
# asks to stop.
print("Ask questions about your business Document!")
print("Type 'quit' to exit")

while True:
    try:
        user_question = input("Your question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: end the session
        # the same way as an explicit 'quit' instead of showing a traceback.
        print("Thank You")
        break

    # 'exit' and 'q' are accepted as aliases for 'quit'.
    if user_question.lower() in ['quit', 'exit', 'q']:
        print("Thank You")
        break

    # Blank input is ignored and the prompt is shown again.
    if user_question:
        ask_question(user_question)
        print()

Ask questions about your business Document!
Type 'quit' to exit
Your question: What services does the company provide?
Question: What services does the company provide?
Answer: Daiichi Sankyo Business Associe Co., Ltd. provides core general business support services including HR, accounting, sales-related clerical tasks, equipment cleaning, and mail services.  Daiichi Sankyo Propharma Co., Ltd. focuses on manufacturing pharmaceutical products.  The provided text also mentions that Daiichi Sankyo Healthcare Co., Ltd. works to further develop functional skincare and food products.

Confidence: 0.45
----------------------------------------------------------------------

Your question: Can you summarize the company's business model?
Question: Can you summarize the company's business model?
Answer: The company is a pharmaceutical company focused on the creation and ongoing improvement of corporate value through Research & Development, Supply Chain, and Marketing.  It operates with a global 