# Install and Imports

In [1]:
!pip install pypdf
!pip install google-generativeai
!pip install chromadb
!pip install typing

Exception in thread Thread-5 (attachment_entry):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/debugpy/server/api.py", line 237, in listen
    sock, _ = endpoints_listener.accept()
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 294, in accept
    fd, addr = self._accept()
               ^^^^^^^^^^^^^^
TimeoutError: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.11/dist-packages/google/colab/_debugpy.py", line 52, in attachment_entry
    debugpy.listen(_dap_port)
  File "/usr/local/lib/python3.11/dist-packages/debugpy/public_api.py", line 31, in wrapper
    return wrapped(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^



In [2]:
import requests
from pypdf import PdfReader
import os
import re
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import chromadb
from chromadb.config import Settings
from typing import List

# Download and load PDF

In [3]:
def download_pdf(url, save_path, timeout=30):
    """Download the resource at ``url`` and write its bytes to ``save_path``.

    Parameters:
        url (str): Address of the file to fetch.
        save_path (str): Destination file path; it is overwritten if present.
        timeout (float): Seconds to wait for the server before giving up.

    Raises:
        Whatever ``response.raise_for_status()`` raises for a 4xx/5xx reply,
        and whatever ``requests.get`` raises on connection problems.
    """
    response = requests.get(url, timeout=timeout)
    # Fail before touching the disk so an HTML error page is never saved
    # under a .pdf name and handed to the PDF parser later.
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        f.write(response.content)

def load_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Parameters:
        file_path (str): Path of the PDF file handed to ``PdfReader``.

    Returns:
        str: The page texts joined by a newline. Pages for which
        ``extract_text()`` returns nothing are skipped; a document with no
        extractable text yields an empty string.
    """
    reader = PdfReader(file_path)
    page_texts = (page.extract_text() for page in reader.pages)
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page.
    return "\n".join(page_text for page_text in page_texts if page_text)

# ToDo:
- Text splitting
- ChromaDB
- Prompt Construction

In [34]:
# TODO: Students implement text splitting function
def split_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """
    Split the input text into sentence-aligned chunks of bounded size.

    Whitespace runs are collapsed to single spaces, the text is cut after
    sentence-ending punctuation (``.``, ``!``, ``?``), and sentences are packed
    greedily into chunks. Every returned chunk is non-empty and at most
    ``chunk_size`` characters long.

    Parameters:
        text (str): The input text to split.
        chunk_size (int): The maximum size of each chunk in characters.
        overlap (int): Approximate number of trailing characters of a chunk
            that are repeated at the start of the next one. The overlap is
            trimmed to a word boundary and is dropped entirely when it would
            push the next chunk over ``chunk_size``. Values <= 0 disable it.

    Returns:
        List[str]: A list of text chunks (empty for blank input).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Clean up excessive whitespaces
    text = re.sub(r'\s+', ' ', text.strip())
    if not text:
        return []

    # Split by sentences, then hard-wrap any sentence that alone exceeds the
    # limit so that no single piece can ever overflow a chunk.
    pieces = []
    for sentence in re.split(r'(?<=[.!?]) +', text):
        for start in range(0, len(sentence), chunk_size):
            piece = sentence[start:start + chunk_size].strip()
            if piece:
                pieces.append(piece)

    chunks = []
    current = ""
    for piece in pieces:
        candidate = f"{current} {piece}" if current else piece
        if len(candidate) <= chunk_size:
            current = candidate
            continue

        # `current` is non-empty here: a lone piece always fits.
        chunks.append(current)
        tail = _overlap_tail(current, overlap)
        seeded = f"{tail} {piece}" if tail else piece
        # Keep the size bound even if that means giving up the overlap.
        current = seeded if len(seeded) <= chunk_size else piece

    # Add the last chunk
    if current:
        chunks.append(current)

    return chunks


def _overlap_tail(chunk: str, overlap: int) -> str:
    """Return up to ``overlap`` trailing characters of ``chunk``, starting on a word boundary."""
    if overlap <= 0:
        return ""
    if overlap >= len(chunk):
        return chunk
    tail = chunk[-overlap:]
    if chunk[-overlap - 1] != " ":
        # The cut landed inside a word (or on a space): drop the partial word.
        _, _, tail = tail.partition(" ")
    return tail.strip()

# Custom embedding function using Gemini API
class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that embeds documents with the Gemini API.

    The API key is read from the ``GEMINI_API_KEY`` environment variable on
    every call.
    """

    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` and return the ``"embedding"`` entry of the API response.

        Raises:
            ValueError: If ``GEMINI_API_KEY`` is unset or empty.
        """
        gemini_api_key = os.getenv("GEMINI_API_KEY")
        if not gemini_api_key:
            # Fail here with a clear message instead of configuring the SDK
            # with api_key=None and failing later inside the request.
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
        genai.configure(api_key=gemini_api_key)
        model = "models/embedding-001"
        title = "Custom query"
        response = genai.embed_content(
            model=model,
            content=input,
            task_type="retrieval_document",
            title=title,
        )
        return response["embedding"]

# TODO: Students implement ChromaDB creation and querying
def create_chroma_db(documents: List[str], path: str, name: str):
    """
    Create (or reopen) a ChromaDB collection and store the provided documents.

    Parameters:
        documents (List[str]): Text chunks to index. Chunk ``i`` is stored
            under the id ``str(i)``.
        path (str): Directory in which the database is persisted. When empty,
            an in-memory client is used instead.
        name (str): Name of the collection to get or create.

    Returns:
        tuple: ``(collection, client)`` - the collection holding the documents
        and the client that owns it.

    Note:
        Because ids are positional and documents are upserted, reopening an
        existing persisted collection with fewer documents than before leaves
        the higher-numbered entries from the earlier run in place.
        The collection uses Chroma's default embedding function.
    """
    # Honour `path` so the index really is written to the directory the
    # caller prepared; fall back to an ephemeral client only without a path.
    client = chromadb.PersistentClient(path=path) if path else chromadb.Client()
    collection = client.get_or_create_collection(name=name)  # Create collection
    if documents:
        # Skip the call for an empty list; there is nothing to add.
        collection.upsert(  # Add to collection
            documents=documents,
            ids=[str(i) for i in range(len(documents))]
        )
    return (collection, client)

def get_relevant_passage(query: str, db, n_results: int):
    """
    Look up the passages stored in ``db`` that best match ``query``.

    ``db`` must expose a ``query(query_texts=..., n_results=...)`` method whose
    result has a ``'documents'`` entry holding one list of passages per query
    text. Exactly one query text is submitted, so the first (and only) list
    is returned.
    """
    # The collection embeds the query text itself; we only pass the raw string.
    response = db.query(query_texts=[query], n_results=n_results)
    passages_per_query = response['documents']
    return passages_per_query[0]

# TODO: Students implement prompt construction
def make_rag_prompt(query: str, relevant_passage: str):
    """
    Build the text sent to the generation model.

    The prompt is the query, a fixed connecting phrase, and the retrieved
    passage, concatenated in that order.
    """
    connector = ' here is the relevant passage '
    return connector.join((query, relevant_passage))

# LLM Response Generation

In [28]:
def generate_answer(prompt: str, model_name: str = 'gemini-pro'):
    """Generate an answer for ``prompt`` using the Gemini API.

    The API key is taken from the ``GEMINI_API_KEY`` environment variable.
    When that is unset and the code runs inside Google Colab, the Colab
    secret ``GOOGLE_API_KEY`` is used instead.

    Parameters:
        prompt (str): The full prompt to send to the model.
        model_name (str): Name passed to ``genai.GenerativeModel``.

    Returns:
        The ``text`` attribute of the generation result.

    Raises:
        ValueError: If no API key could be found.
    """
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        # google.colab only exists inside Colab; importing it lazily keeps
        # the function usable in any other environment.
        try:
            from google.colab import userdata
        except ImportError:
            userdata = None
        if userdata is not None:
            gemini_api_key = userdata.get('GOOGLE_API_KEY')
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel(model_name)
    result = model.generate_content(prompt)
    return result.text

# Main execution
## ToDo:
 - Chat history
 - Multiple file ingestion

In [36]:
def main():
    """Run the interactive RAG demo end to end.

    Downloads the whitepaper PDF, extracts and chunks its text, indexes the
    chunks in a Chroma collection, then answers questions read from standard
    input. The loop ends when the user submits a blank line, types ``exit``
    or ``quit``, or sends EOF / Ctrl-C.
    """
    # Set up configurations
    pdf_url = "https://services.google.com/fh/files/misc/ai_adoption_framework_whitepaper.pdf"
    pdf_path = "ai_adoption_framework_whitepaper.pdf"
    db_folder = "chroma_db"
    db_name = "rag_experiment"

    # Create database directory (no-op when it already exists)
    os.makedirs(db_folder, exist_ok=True)

    # Download and process PDF
    download_pdf(pdf_url, pdf_path)
    pdf_text = load_pdf(pdf_path)

    # Split text into chunks
    chunked_text = split_text(pdf_text)

    # Create and set up database
    db_path = os.path.join(os.getcwd(), db_folder)
    db, _ = create_chroma_db(chunked_text, db_path, db_name)

    while True:
        # Process user query
        try:
            query = input("Please enter your query: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt is a request to stop, not an error.
            break
        if query.lower() in ("", "exit", "quit"):
            break

        relevant_text = get_relevant_passage(query, db, n_results=3)

        print(relevant_text)
        # Generate and display answer
        if relevant_text:
            # Separate passages with a blank line so they are not fused into
            # one run of text inside the prompt.
            final_prompt = make_rag_prompt(query, "\n\n".join(relevant_text))
            answer = generate_answer(final_prompt)
            print("\nGenerated Answer:", answer)
        else:
            print("No relevant information found for the given query.")

# Start the interactive demo when this code runs as the top-level program
# (which includes executing this notebook cell), but not when it is imported.
if __name__ == "__main__":
    main()

Please enter your query: what is a+b+c
['Google Cloud’s AI Adoption Framework Contents Creating value through AI, every step of the way Part 1: Executive summary The power of AI ................................................................................................................... 4 Leveraging the power of AI ............................................................................................. 5 The AI maturity themes The AI maturity phases The AI Maturity Scale Putting it all together Next steps ........................................................................................................................... 12 Find out more Work with Google experts Part 2: Technical deep-dive The AI maturity phases ................................................................................................... 14 Tactical Strategic Transformational The AI Maturity Scale ....................................................................................................



TooManyRequests: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).