# Loading PDF file

In [1]:
import os
os.environ["GEMINI_API_KEY"]="AIzaSyDkG2ouk_ARnS-b7ILnuH54ThfKkcLGPcQ"

In [2]:
from pypdf import PdfReader

def load_pdf(file_path):
    """
    Reads the text content from a PDF file and returns it as a single string.

    Parameters:
    - file_path (str): The file path to the PDF file.

    Returns:
    - str: The concatenated text content of all pages in the PDF.

    Raises:
    - FileNotFoundError: If the specified file_path does not exist.
    - PyPDF2.utils.PdfReadError: If the PDF file is encrypted or malformed.

    Example:
    >>> pdf_text = load_pdf("example.pdf")
    >>> print(pdf_text)
    "This is the text content extracted from the PDF file."
    """
    # Logic to read pdf
    reader = PdfReader(file_path)

    # Loop over each page and store it in a variable
    text = ""
    for page in reader.pages:
        text += page.extract_text()

    return text

text = load_pdf(file_path="data.pdf")

In [3]:
pdf_text = load_pdf(file_path="data.pdf")
print(pdf_text)

cycle : 
 formation nom: di: genie electrique et management industriel formation url:  https://fstt.ac
.ma/portail2023/genie -electrique -et-managementindustriel/  formation details: {'objective
s': 'la formation proposee dans la specialite genie electrique de la fst de tanger a pour but d
e fournir au futur ing enieur en genie electrique tous les elements indispensables a son inse
rtion harmonieuse dans le monde industriel. les enseignements associent theorie et techniq
ues, experimentation, projets et realisations. les enseignements dispenses sont repartis en 
quat re groupes de matieres : des enseignements a caractere general, des disciplines fonda
mentales, un enseignement professionnel et une formation pratique par le biais de projets 
et de stages.', 'program': 'semestre 1• mathematiques pour l’ingenieur• informati que• elect
ronique• instrumentation et capteurs• energetique et mdf• communication professionnelle 
semestre 2• ingenierie de la qualite• automatique lineaire continue

# Splitting the text

In [4]:
import re
def split_text(text: str):
    """
    Splits a text string into a list of non-empty substrings based on the specified pattern.
    This version replaces all newlines with spaces before splitting to ensure no unintended splits.
    
    Parameters:
    - text (str): The input text to be split.

    Returns:
    - List[str]: A list containing non-empty substrings obtained by splitting the input text.
    """
    # Replace all newlines with a space before splitting to avoid issues with extra newlines
    text = text.replace('\n', ' ')
    split_text = re.split('\n \n', text)
    return [i for i in split_text if i != ""]


In [5]:
text = split_text(pdf_text)
print(text)

['cycle :   formation nom: di: genie electrique et management industriel formation url:  https://fstt.ac .ma/portail2023/genie -electrique -et-managementindustriel/  formation details: {\'objective s\': \'la formation proposee dans la specialite genie electrique de la fst de tanger a pour but d e fournir au futur ing enieur en genie electrique tous les elements indispensables a son inse rtion harmonieuse dans le monde industriel. les enseignements associent theorie et techniq ues, experimentation, projets et realisations. les enseignements dispenses sont repartis en  quat re groupes de matieres : des enseignements a caractere general, des disciplines fonda mentales, un enseignement professionnel et une formation pratique par le biais de projets  et de stages.\', \'program\': \'semestre 1• mathematiques pour l’ingenieur• informati que• elect ronique• instrumentation et capteurs• energetique et mdf• communication professionnelle  semestre 2• ingenierie de la qualite• automatique lineaire

# Embedding the text

In [6]:
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import os

class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Custom embedding function using the Gemini AI API for document retrieval.

    This class extends the EmbeddingFunction class and implements the __call__ method
    to generate embeddings for a given set of documents using the Gemini AI API.

    Parameters:
    - input (Documents): A collection of documents to be embedded.

    Returns:
    - Embeddings: Embeddings generated for the input documents.

    Raises:
    - ValueError: If the Gemini API Key is not provided as an environment variable (GEMINI_API_KEY).

    Example:
    >>> gemini_embedding_function = GeminiEmbeddingFunction()
    >>> input_documents = Documents(["Document 1", "Document 2", "Document 3"])
    >>> embeddings_result = gemini_embedding_function(input_documents)
    >>> print(embeddings_result)
    Embeddings for the input documents generated by the Gemini AI API.
    """
    def __call__(self, input: Documents) -> Embeddings:
        gemini_api_key = os.getenv("GEMINI_API_KEY")
        if not gemini_api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
        genai.configure(api_key=gemini_api_key)
        model = "models/embedding-001"
        title = "Custom query"
        return genai.embed_content(model=model,
                                   content=input,
                                   task_type="retrieval_document",
                                   title=title)["embedding"]


# Storing vectors into DB

In [7]:
import chromadb
def create_chroma_db(documents, path, name):
    """
    Creates a Chroma database using the provided documents, path, and collection name.

    Parameters:
    - documents: An iterable of documents to be added to the Chroma database.
    - path (str): The path where the Chroma database will be stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - Tuple[chromadb.Collection, str]: A tuple containing the created Chroma Collection and its name.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    for i, d in enumerate(documents):
        db.add(documents=d, ids=str(i))

    return db, name


In [7]:
db,name =create_chroma_db(documents=text, path="contents", name="fst10")

UniqueConstraintError: Collection fst10 already exists

In [8]:
def load_chroma_collection(path, name):
    """
    Loads an existing Chroma collection from the specified path with the given name.

    Parameters:
    - path (str): The path where the Chroma database is stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - chromadb.Collection: The loaded Chroma Collection.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    return db


In [9]:
db=path=load_chroma_collection("contents", name="fst10")
print(db)

name='fst10' id=UUID('fda651bd-2bf4-41bf-93fb-cd5a44c986d3') metadata=None tenant='default_tenant' database='default_database'


# Retrieval

In [10]:
def get_relevant_passage(query, db, n_results):
  passage = db.query(query_texts=[query], n_results=n_results)['documents'][:]
  return passage

In [11]:
relevant_text = get_relevant_passage("cycle",db,3)
print(relevant_text)

Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


[['cycle formation nom di genie electrique management industriel formation url https://fstt.ac  format ion details formation proposee specialite genie electrique fst tanger a but e fournir futur ingenieur g enie electrique tous elements indispensables a inse rtion harmonieuse monde industriel enseignem ents associent theorie techniq ues experimentation projets realisations enseignements dispenses re partis quatre groupes matieres enseignements a caractere general disciplines fonda mentales ensei gnement professionnel formation pratique biais projets stages mathematiques elect instrumentatio n energetique co mmunication professionnelle semestre ingenierie automatique lineaire continue c once ption procedes culture gestion anglais echnique semestre mathematiques ingenieur traitemen t sources energies reseaux machines grh dro it travail semestre gestion maintenanc e surete machin es ele electronique systemes a microprocesseurs a oft analyse gestion comptabilite generale analyti que semest

# Generation

In [12]:
def make_rag_prompt(query, relevant_passage):
  escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
  prompt = ("""Vous êtes un bot serviable et informatif qui répond aux questions en utilisant le texte du passage de référence inclus ci-dessous. \
    Assurez-vous de répondre par une phrase complète, en étant exhaustif, en incluant toutes les informations de fond pertinentes. \
    Cependant, vous vous adressez à un public non technique, donc veillez à simplifier les concepts compliqués et \
    adopter un ton amical et conversationnel. \
    Si le passage est sans rapport avec la réponse, vous pouvez l'ignorer.
  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

  ANSWER:
  """).format(query=query, relevant_passage=escaped)

  return prompt

In [13]:
import google.generativeai as genai
def generate_response(prompt):
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel('gemini-pro')
    answer = model.generate_content(prompt)
    return answer.text

# Bringing it all together

In [14]:
def generate_answer(db, query):
    # Retrieve top 3 relevant text chunks
    relevant_texts = get_relevant_passage(query, db, n_results=1)
    # Flatten the list if nested and join into a single string
    flattened_texts = [item for sublist in relevant_texts for item in (sublist if isinstance(sublist, list) else [sublist])]
    prompt = make_rag_prompt(query, relevant_passage=" ".join(flattened_texts))
    answer = generate_response(prompt)
    return answer


In [15]:
db=load_chroma_collection(path="contents", #replace with path of your persistent directory
                          name="rag_experiment") #replace with the collection name

answer = generate_answer(db,query="quelle est la responsablite de chabbi mohamed ")
print(answer)


Chabbi Mohamed est responsable du département de génie chimique de la commission scientifique.
