# Fetching the data

In [None]:
from langchain_community.utilities import WikipediaAPIWrapper
# Search terms, one per programming language of interest
topics = [
    "C plus plus ( Programming language )",
    "C sharp ( Programming language )",
    "Java ( Programming language )",
    "Python ( Programming language )",
    "Rust ( Programming language )",
]

# Wikipedia client provided by LangChain
wikipedia = WikipediaAPIWrapper()

# One raw result per topic, kept in the same order as `topics`
results = [wikipedia.run(topic) for topic in topics]

# Cleaning the dataset

In [None]:
# Turn every raw Wikipedia result into {"Page": title, "Summary": text} records.
# Each result is expected to contain one or more entries shaped like
#     Page: <title>
#     Summary: <text, possibly continuing on the following lines>
# NOTE(review): this layout is what langchain_community's WikipediaAPIWrapper
# produces at the time of writing - confirm against the installed version.
summary_marker = "Summary:"
extracted_data = []
for document in results:
    # Splitting on "Page:" yields one fragment per entry; blank fragments
    # (such as anything before the first marker) are skipped.
    for section in document.split("Page:"):
        if not section.strip():
            continue  # Skip empty sections

        # Non-blank section => splitlines() always returns at least one line
        lines = section.splitlines()

        # The page title is whatever follows "Page:" on the first line
        page_title = lines[0].strip()

        # Index of the first line that starts the summary, or -1 if absent
        summary_index = next(
            (i for i, line in enumerate(lines)
             if line.strip().startswith(summary_marker)),
            -1,
        )
        if summary_index == -1:
            continue  # Skip if "Summary:" not found

        # The summary text begins on the marker line itself, right after
        # "Summary:". Keep that remainder as well as the lines that follow;
        # taking only the following lines would drop the first paragraph.
        marker_line = lines[summary_index].strip()
        first_line = marker_line[len(summary_marker):]
        summary_content = "\n".join(
            [first_line] + lines[summary_index + 1:]
        ).strip()

        # Collect the extracted parts
        extracted_data.append({
            "Page": page_title,
            "Summary": summary_content
        })

# Removing the irrelevant documents

In [None]:
# Remove the unwanted entries by position. Working from the highest index
# down means earlier removals do not shift the positions still to be removed.
for position in reversed([1, 2, 4, 7, 9, 12, 13]):
    extracted_data.pop(position)

# Filtering: keeping only the summaries

In [None]:
# Keep only the summary text of each remaining entry, in order
docs = [entry['Summary'] for entry in extracted_data]

# Final tabs

In [None]:
# Display names of the tabs; later cells index this list by document position
# and use the selected name as the output folder for that document.
tabs_col = [
    'C++',
    'C Sharp',
    'Java',
    'Java Script',
    'Python',
    'Mojo',
    'Rust',
]

# Splitting the data into chunks with overlap

In [None]:
def split_text_with_overlap(text, chunk_size=500, overlap_size=35):
    """Split ``text`` into consecutive chunks that share an overlap.

    Chunks start every ``chunk_size - overlap_size`` characters and are at
    most ``chunk_size`` characters long, so each chunk repeats the last
    ``overlap_size`` characters of the previous one. The final chunk may be
    shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap_size: Number of characters shared by neighbouring chunks;
            must be smaller than ``chunk_size``.

    Returns:
        The list of chunks in order; an empty list when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap_size`` is
            not smaller than ``chunk_size``. Without this check the start
            position would never advance and the split would loop forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap_size >= chunk_size:
        raise ValueError("overlap_size must be smaller than chunk_size")

    # Distance between the start positions of two consecutive chunks
    step = chunk_size - overlap_size
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# For every document, the list of overlapping chunks it was split into
treated_docs = [split_text_with_overlap(doc) for doc in docs]


# Setting up Gemini with the API key

In [None]:
import google.generativeai as genai
# Configure the Gemini API.
# Prefer a key taken from the GOOGLE_API_KEY environment variable so the
# secret does not have to be pasted into (and saved with) the notebook.
# When the variable is unset or empty, the placeholder below is used as
# before - replace it with a real key in that case.
import os

genai.configure(
    api_key=os.environ.get('GOOGLE_API_KEY') or 'YOUR_GEMINI_API_KEY'
)

# Embedding the chunks for each document

In [None]:
def embed_documents(documents):
    """Embed each document with Gemini and return the embedding vectors.

    Every item of ``documents`` is passed to ``genai.embed_content`` with the
    ``models/embedding-001`` model. A response without an ``'embedding'``
    entry is reported on stdout and skipped, so the returned list can be
    shorter than ``documents``.
    """
    vectors = []
    for text in documents:
        response = genai.embed_content(model='models/embedding-001', content=text)

        # Nothing usable came back for this item: report it and move on
        if 'embedding' not in response:
            print(f"No 'embedding' field found for document: {text}")
            continue

        vectors.append(response['embedding'])

    return vectors


counter = 0
# One list of embedding vectors per document, in the order of `treated_docs`
embedded_docs = []
for chunks in treated_docs:
    chunk_vectors = embed_documents(chunks)
    print(f"Number of embedded documents: {len(chunk_vectors)}")

    # Report the vector size when the first embedding is a plain list
    first_is_list = bool(chunk_vectors) and isinstance(chunk_vectors[0], list)
    if not first_is_list:
        print("The embedding is not in a list-like structure.")
    else:
        print(f"Embedding dimension: {len(chunk_vectors[0])}")

    embedded_docs.append(chunk_vectors)

# Saving the embeddings and <br>the chunks for each document

In [None]:
import faiss
import numpy as np
import os
import json

# Build and persist, for every document, a FAISS index of its chunk embeddings
# plus a JSON file mapping each chunk position to the chunk text.
# Document number `ppm` is written to the folder named tabs_col[ppm].
for ppm, embedded_doc in enumerate(embedded_docs):
    save_folder = f"{tabs_col[ppm]}"
    chunks = treated_docs[ppm]

    # Without vectors there is no dimension to read and nothing to index
    if not embedded_doc:
        print(f"No embeddings available for {save_folder}; skipping")
        continue

    # embed_documents() skips chunks that returned no embedding; when that
    # happens, index positions no longer line up with chunk positions.
    if len(embedded_doc) != len(chunks):
        print(
            f"Warning: {save_folder} has {len(embedded_doc)} embeddings "
            f"for {len(chunks)} chunks; index positions may not match the texts"
        )

    # Convert embeddings to a float32 numpy array (the dtype FAISS works with)
    embedding_dim = len(embedded_doc[0])
    embedded_docs_np = np.array(embedded_doc).astype('float32')

    # Normalize vectors in place so inner product equals cosine similarity
    faiss.normalize_L2(embedded_docs_np)

    # Create the inner-product index and add the vectors
    index = faiss.IndexFlatIP(embedding_dim)
    index.add(embedded_docs_np)

    # Create save directory
    os.makedirs(save_folder, exist_ok=True)

    # 1. Save FAISS index
    index_path = os.path.join(save_folder, "index.faiss")
    faiss.write_index(index, index_path)

    # 2. Save every chunk with its id. Ids follow the chunk positions, so the
    #    mapping covers all chunks instead of a fixed number of them.
    docs_mapping = {
        str(i): {
            "text": doc,
            "doc_id": i,
        }
        for i, doc in enumerate(chunks)
    }

    with open(os.path.join(save_folder, "documents.json"), "w", encoding="utf-8") as f:
        json.dump(docs_mapping, f)

    print(f"RAG system saved in {save_folder}")