<a href="https://colab.research.google.com/github/azario0/rag_2.0/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Algeria 2.0

## Advanced Python programming for RAG systems using Google Gemini

### Presented by Benmalek Zohir



# Document preprocessing

In [None]:
def split_text_with_overlap(text, chunk_size, overlap_size):
    """Split ``text`` into fixed-size character chunks that overlap.

    Each chunk starts ``chunk_size - overlap_size`` characters after the
    previous one, so consecutive chunks share ``overlap_size`` characters.
    The trailing chunk(s) may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Number of characters per chunk.
        overlap_size: Number of characters shared by consecutive chunks;
            must be strictly smaller than ``chunk_size``.

    Returns:
        A list of chunk strings (empty when ``text`` is empty).

    Raises:
        ValueError: If ``overlap_size >= chunk_size``. The window would then
            never move forward and the split could not terminate.
    """
    step = chunk_size - overlap_size
    if step <= 0:
        raise ValueError(
            "overlap_size must be smaller than chunk_size "
            f"(got chunk_size={chunk_size}, overlap_size={overlap_size})"
        )

    # One chunk per window start; slicing clips the last windows to the text end.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# Sample passage used to demonstrate the chunking helper.
text = '''
The Earth, our home planet, is a dynamic and complex system.
It's approximately 4.5 billion years old and is the only known planet to harbor life.
Our planet is composed of several layers, including the crust, mantle, and core.
The Earth's surface is constantly changing due to tectonic plate movement, volcanic eruptions, and erosion.
'''

# 10-character chunks, each sharing 3 characters with the previous one.
result = split_text_with_overlap(text, chunk_size=10, overlap_size=3)

separator = "#################################################"

# First three chunks.
print(separator, 'Example of the splits :', result[:3], sep="\n")

# Size of the first chunk.
print(separator, 'length of the chunck :', len(result[0]), sep="\n")

# Total number of chunks produced.
print(separator, 'Number of chuncks : ', len(result), sep="\n")

#################################################
Example of the splits :
['\nThe Earth', 'rth, our h', 'r home pla']
#################################################
length of the chunck :
10
#################################################
Number of chuncks : 
49


# Setting up the API key
You can get yours from:
https://aistudio.google.com/app/apikey

In [None]:
import os

import google.generativeai as genai

# Configure the Gemini API.
# Prefer a key supplied through the GOOGLE_API_KEY environment variable so the
# secret does not have to be typed into (and saved with) the notebook. When the
# variable is not set, fall back to the placeholder string, which you should
# replace with your own key.
genai.configure(api_key=os.environ.get('GOOGLE_API_KEY', 'YOUR_GEMINI_API_KEY'))

# List of Google Generative AI embedding models

In [None]:
# Print the name of every listed model that supports the 'embedContent' method.
for model_info in genai.list_models():
    if 'embedContent' not in model_info.supported_generation_methods:
        continue
    print(model_info.name)

models/embedding-001
models/text-embedding-004


# Embedding our documents

In [None]:
def embed_documents(documents):
    """Embed each document with the ``models/embedding-001`` model.

    One progress line is printed per document. A document whose API result
    has no 'embedding' entry is reported and skipped, so the returned list
    can be shorter than ``documents``.

    Args:
        documents: Iterable of contents, passed one at a time to
            ``genai.embed_content``.

    Returns:
        List of the 'embedding' values, in input order.
    """
    vectors = []
    for document in documents:
        result = genai.embed_content(model='models/embedding-001', content=document)

        # Guard clause: report and skip results without an embedding.
        if 'embedding' not in result:
            print(f"No 'embedding' field found for document: {document}")
            continue

        vector = result['embedding']
        # Only the first 50 characters of the vector's repr are shown.
        preview = str(vector)[:50]
        print(f"Embedding for document '{document}': {preview}... TRIMMED")
        vectors.append(vector)

    return vectors

# Toy corpus: one animal name per document.
documents = ["Lion", "Cat", "Dog", "Daulphin", "Shark", "Horse", "Cow", "Bird", "Fish"]

embedded_docs = embed_documents(documents)
print(f"Number of embedded documents: {len(embedded_docs)}")

# Report the vector length only when the first embedding is a plain list.
if not embedded_docs or not isinstance(embedded_docs[0], list):
    print("The embedding is not in a list-like structure.")
else:
    print(f"Embedding dimension: {len(embedded_docs[0])}")

Embedding for document 'Lion': [0.040143497, -0.019579103, -0.023438316, -0.03869... TRIMMED
Embedding for document 'Cat': [0.03371215, -0.03692612, -0.04624668, -0.08384791... TRIMMED
Embedding for document 'Dog': [0.03929852, -0.033923797, -0.029821463, -0.022883... TRIMMED
Embedding for document 'Daulphin': [0.048549414, -0.032747224, -0.025847968, -0.03428... TRIMMED
Embedding for document 'Shark': [0.043408774, -0.06344833, -0.01202401, -0.0315866... TRIMMED
Embedding for document 'Horse': [0.0063757957, -0.059231978, -0.057385504, -0.0299... TRIMMED
Embedding for document 'Cow': [0.06828845, -0.07650595, -0.036208507, -0.044902,... TRIMMED
Embedding for document 'Bird': [-0.0063274074, -0.06696137, -0.047402818, 0.01119... TRIMMED
Embedding for document 'Fish': [0.0086052045, -0.03786592, -0.013129386, -0.02560... TRIMMED
Number of embedded documents: 9
Embedding dimension: 768


# Saving the embedded documents

In [None]:
import faiss
import numpy as np
import os
import json

# Convert embeddings to a float32 numpy matrix (one row per embedded document)
embedding_dim = len(embedded_docs[0])
embedded_docs_np = np.array(embedded_docs).astype('float32')

# Normalize vectors (in place) for cosine similarity
faiss.normalize_L2(embedded_docs_np)

# Create an inner-product index and add the normalized vectors
index = faiss.IndexFlatIP(embedding_dim)
index.add(embedded_docs_np)

# Create save directory
save_folder = "rag_system"
os.makedirs(save_folder, exist_ok=True)

# 1. Save FAISS index
index_path = os.path.join(save_folder, "index.faiss")
faiss.write_index(index, index_path)

# 2. Save documents and their metadata.
# The ids are derived from the corpus size rather than a hard-coded count, so
# zip() below no longer drops documents when the corpus has more than 9 entries.
doc_ids = list(range(len(documents)))
docs_mapping = {
    str(i): {
        "text": doc,
        "doc_id": doc_id,
    }
    for i, (doc, doc_id) in enumerate(zip(documents, doc_ids))
}

with open(os.path.join(save_folder, "documents.json"), "w") as f:
    json.dump(docs_mapping, f)


print(f"RAG system saved in {save_folder}")

RAG system saved in rag_system


# Loading the embedded documents and the processed documents

In [None]:
import faiss

def load_rag_system(folder_path):
    """Load a saved RAG system from ``folder_path``.

    Reads the FAISS index from ``index.faiss`` and the document mapping from
    ``documents.json``, both located directly inside ``folder_path``.

    Returns:
        A ``(index, documents)`` tuple: the object returned by
        ``faiss.read_index`` and the parsed JSON content.
    """
    faiss_file = os.path.join(folder_path, "index.faiss")
    docs_file = os.path.join(folder_path, "documents.json")

    # The index is read first, then the JSON document store.
    loaded_index = faiss.read_index(faiss_file)
    with open(docs_file, "r") as handle:
        loaded_documents = json.load(handle)

    return loaded_index, loaded_documents

# Reload the index and the document store from the save folder.
index_path = "rag_system"
index, documents_json = load_rag_system(index_path)

# Order the stored entries by their 'doc_id'.
sorted_data = list(documents_json.values())
sorted_data.sort(key=lambda entry: entry['doc_id'])

# Keep only the 'text' of each entry, in that order.
documents = [entry['text'] for entry in sorted_data]

documents

['Lion', 'Cat', 'Dog', 'Daulphin', 'Shark', 'Horse', 'Cow', 'Bird', 'Fish']

# Cosine similarity

In [None]:
import numpy as np

# Example pair: the two vectors share no non-zero component.
vector_a, vector_b = np.array([1, 0, 0]), np.array([0, 1, 1])

# Alternative pair pointing in opposite directions:
# vector_a = np.array([1, 0, 0])
# vector_b = np.array([-1, 0, 0])

# Euclidean norms of both vectors, then their dot product.
magnitude_a, magnitude_b = (np.linalg.norm(vec) for vec in (vector_a, vector_b))
dot_product = vector_a.dot(vector_b)

# Cosine similarity = dot product divided by the product of the norms.
cosine_similarity = dot_product / (magnitude_a * magnitude_b)

print(f"Cosine Similarity: {cosine_similarity}")


Cosine Similarity: 0.0


# Retrieval

In [None]:
import numpy as np
def embed_text(text):
    """Embed ``text`` as a retrieval query with Gemini's embedding-001 model.

    Returns the result of ``genai.embed_content`` unchanged.
    """
    return genai.embed_content(
        content=text,
        model='models/embedding-001',
        task_type='retrieval_query',
    )



def retriever(query, k=3):
    """Return up to ``k`` documents most similar to ``query``.

    The query is embedded with ``embed_text``, L2-normalized, and searched
    against the module-level FAISS ``index``. Hit positions are mapped to
    texts through the module-level ``documents`` list.

    Args:
        query: The query text to embed and search for.
        k: Maximum number of documents to return.

    Returns:
        The matching documents in the order returned by the index. Fewer than
        ``k`` items are returned when the index holds fewer than ``k`` vectors.
    """
    embedding_result = embed_text(query)
    query_embedding = np.array(embedding_result['embedding']).astype('float32').reshape(1, -1)

    # Normalize the query vector (in place)
    faiss.normalize_L2(query_embedding)

    _distances, indices = index.search(query_embedding, k)

    # FAISS fills missing results with -1 when it has fewer than k vectors.
    # Skip those slots: documents[-1] would silently return the last document.
    return [documents[i] for i in indices[0] if i >= 0]


# Example query; the cell displays the three best-matching documents.
query = "lives under water"
retriever(query, k=3)


['Fish', 'Shark', 'Cat']

# The list of Gemini models

In [None]:
# Print the name of every listed model whose name contains 'gemini'.
for model_info in genai.list_models():
    if 'gemini' not in model_info.name:
        continue
    print(model_info.name)


models/gemini-1.0-pro-latest
models/gemini-1.0-pro
models/gemini-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-pro-exp-0801
models/gemini-1.5-pro-exp-0827
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-exp-0827
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924


# Putting it all together


In [None]:
# Prompt templates selectable through ``generate_response(prompt_style=...)``.
# The four-space indentation inside each template is deliberate: it keeps the
# "academic" prompt identical to the text the function sent before the
# templates were extracted.
_PROMPT_TEMPLATES = {
    # Normal prompt
    "normal": (
        "This is a RAG system given the following context and query, "
        "provide a brief response based on the retrieved documents :\n"
        "\n"
        "    Context:\n"
        "    {context}\n"
        "\n"
        "    Query: {query}\n"
        "\n"
        "    Response:"
    ),
    # Simple factual prompt
    "factual": (
        "Given the context and query, extract the most relevant facts:\n"
        "    Context:\n"
        "    {context}\n"
        "    Query: {query}\n"
        "    Provide a concise, factual response."
    ),
    # Detailed explanatory prompt
    "detailed": (
        "Analyze the provided context in relation to the query.\n"
        "    Context:\n"
        "    {context}\n"
        "    Query: {query}\n"
        "    Explain the key points comprehensively, providing depth and nuance to the answer."
    ),
    # Academic/scholarly prompt (the default)
    "academic": (
        "Using the retrieved scholarly sources, construct an academic-style response:\n"
        "    Context:\n"
        "    {context}\n"
        "    Query: {query}\n"
        "    Structure your response with clear arguments, cite relevant information, "
        "and maintain an objective tone."
    ),
    # Conversational prompt
    "conversational": (
        "Respond to the query as if you're having a friendly, informative conversation:\n"
        "    Context:\n"
        "    {context}\n"
        "    Query: {query}\n"
        "    Explain the answer in a warm, accessible manner that's easy to understand."
    ),
}


def generate_response(query, retrieved_docs, prompt_style="academic"):
    """Generate an answer to ``query`` grounded in ``retrieved_docs``.

    The retrieved documents are joined with single spaces to form the context,
    inserted into the selected prompt template together with the query, and
    sent to the 'models/gemini-1.5-flash' model.

    Args:
        query: The user question.
        retrieved_docs: Iterable of document strings used as context.
        prompt_style: Key of the prompt template to use: "normal", "factual",
            "detailed", "academic" (default) or "conversational".

    Returns:
        The ``text`` attribute of the model's response.

    Raises:
        ValueError: If ``prompt_style`` is not a known template key.
    """
    try:
        template = _PROMPT_TEMPLATES[prompt_style]
    except KeyError:
        raise ValueError(
            f"Unknown prompt_style {prompt_style!r}; "
            f"expected one of {sorted(_PROMPT_TEMPLATES)}"
        ) from None

    model = genai.GenerativeModel('models/gemini-1.5-flash')
    prompt = template.format(context=' '.join(retrieved_docs), query=query)

    response = model.generate_content(prompt)
    return response.text

# End-to-end example: retrieve context for the query, then generate an answer.
query = "what are the companion animals ?"
retrieved_docs = retriever(query)
response = generate_response(query, retrieved_docs)

# Show the question followed by the generated answer, one per line.
print(f"Query: {query}", f"Response: {response}", sep="\n")

Query: what are the companion animals ?
Response: The provided context ("Dog Horse Cat") offers a limited set of potential companion animals.  Determining which are definitively companion animals requires a nuanced understanding of the term.  While all three – dogs, horses, and cats – can form bonds with humans, the degree and nature of the companionship varies considerably, influenced by factors including breed, individual animal temperament, and the human-animal interaction.

Dogs are widely considered quintessential companion animals.  Extensive research documents the long history of human-dog co-evolution and the strong social bonds they form (Serpell, 2009).  Their capacity for bidirectional communication, trainability, and demonstrated emotional responsiveness contribute to their role as companions.  This is supported by numerous studies illustrating the positive impacts of dog ownership on human physical and mental health (Barker et al., 2017).

Cats, similarly, have a long hist