In [1]:
import argparse
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores.chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

In [2]:
# Directory used as Chroma's persist_directory (and removed by clear_database).
CHROMA_PATH = "chroma"
# Directory passed to PyPDFDirectoryLoader as the source of PDF files.
DATA_PATH = "data"

In [3]:
def load_documents():
    """Load everything PyPDFDirectoryLoader finds under DATA_PATH.

    Returns:
        The list returned by the loader's ``load()`` call.
    """
    pdf_loader = PyPDFDirectoryLoader(DATA_PATH)
    print("Loading documents...")
    loaded = pdf_loader.load()
    loaded_count = len(loaded)
    print(f"Loaded {loaded_count} documents.")
    return loaded

def split_documents(documents: list[Document]):
    """Split documents into overlapping text chunks.

    Args:
        documents: Documents to hand to the text splitter.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Splitter settings: 800-character chunks (measured with len) that overlap
    # by 100 characters; separators are treated as plain strings, not regexes.
    splitter_options = {
        "chunk_size": 800,
        "chunk_overlap": 100,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)

    print("Splitting documents into chunks...")
    pieces = splitter.split_documents(documents)
    print(f"Split into {len(pieces)} chunks.")
    return pieces

def add_to_chroma(chunks: list[Document]):
    """Add to the persisted Chroma store every chunk whose ID is not stored yet.

    Args:
        chunks: Chunks to index; each one is given an ``"id"`` metadata entry
            by ``calculate_chunk_ids`` before being compared with the store.
    """
    # Open the store that lives under CHROMA_PATH.
    print("Loading Chroma database...")
    embedder = get_embedding_function()
    store = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedder)

    # Tag every chunk with its ID.
    identified_chunks = calculate_chunk_ids(chunks)

    # Only the "ids" entry of the result is needed here, hence the empty include list.
    stored_items = store.get(include=[])
    known_ids = set(stored_items["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    # Keep the chunks (and their IDs, in the same order) that the store lacks.
    pending_chunks = []
    pending_ids = []
    for chunk in identified_chunks:
        chunk_id = chunk.metadata["id"]
        if chunk_id not in known_ids:
            pending_chunks.append(chunk)
            pending_ids.append(chunk_id)

    if not pending_chunks:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(pending_chunks)}")
    store.add_documents(pending_chunks, ids=pending_ids)
    store.persist()
    print("Documents added and database persisted.")

def calculate_chunk_ids(chunks: list[Document]):
    """Assign each chunk a unique ``"<source>:<page>:<index>"`` ID.

    The ID is written to ``chunk.metadata["id"]``. ``index`` counts the chunks
    seen so far for the same ``source``/``page`` pair, starting at 0. Counting
    is kept per page rather than by comparing with the previous chunk, so two
    chunks of the same page never share an ID even when they are not adjacent
    in ``chunks``. For input where each page's chunks are contiguous the IDs
    are the same as a simple "reset on page change" counter would produce.

    Args:
        chunks: Objects exposing a ``metadata`` dict; the ``"source"`` and
            ``"page"`` entries are read with ``.get`` (missing ones render
            as ``None`` in the ID).

    Returns:
        The same ``chunks`` list, with the metadata updated in place.
    """
    print("Calculating chunk IDs...")
    # Next unused index for every "<source>:<page>" key encountered so far.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # Store the ID in the chunk's metadata.
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    print("Chunk IDs calculated.")
    return chunks

def clear_database():
    """Remove the CHROMA_PATH directory tree; do nothing when it is absent."""
    if not os.path.exists(CHROMA_PATH):
        return

    print("Clearing database...")
    shutil.rmtree(CHROMA_PATH)
    print("Database cleared.")

In [6]:
from transformers import AutoModel, AutoTokenizer
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import os

# Step 1: Download the model
def download_model(model_name: str, local_model_path: str):
    """Download a model and its tokenizer and save both under ``local_model_path``.

    The download is skipped only when the saved files are actually present.
    Checking for the directory alone is not enough: the directory is created
    before the download starts, so a failed or interrupted download would
    otherwise leave an empty/partial directory that is treated as complete
    forever after.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained`` and
            ``AutoTokenizer.from_pretrained``.
        local_model_path: Directory in which to save the model and tokenizer.
    """
    # Marker files expected from model.save_pretrained (config.json) and
    # tokenizer.save_pretrained (tokenizer_config.json) — see the Transformers
    # save_pretrained documentation.
    required_files = ("config.json", "tokenizer_config.json")
    already_saved = all(
        os.path.isfile(os.path.join(local_model_path, file_name))
        for file_name in required_files
    )
    if already_saved:
        print(f"Model already exists at {local_model_path}")
        return

    # exist_ok lets a previous partial attempt be completed in place.
    os.makedirs(local_model_path, exist_ok=True)
    # Download the model and tokenizer
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Save the model and tokenizer locally
    model.save_pretrained(local_model_path)
    tokenizer.save_pretrained(local_model_path)

In [8]:
# Save the model locally; get_embedding_function loads it from this same path.
download_model("sentence-transformers/all-mpnet-base-v2", "./local_models/sentence-transformers/all-mpnet-base-v2")

In [9]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

def get_embedding_function():
    """Create the embedding function backed by the locally saved model.

    Returns:
        A ``HuggingFaceEmbeddings`` instance whose ``model_name`` is the local
        directory ``./local_models/sentence-transformers/all-mpnet-base-v2``.
    """
    local_model_path = "./local_models/sentence-transformers/all-mpnet-base-v2"
    # The path is a non-empty literal, so no truthiness guard is needed; a
    # guard here would only leave the return value unbound if it ever failed.
    return HuggingFaceEmbeddings(model_name=local_model_path)

In [10]:
# Create (or update) the data store:
# load the PDFs from DATA_PATH, split them into chunks, then add the chunks
# that are not already stored to the Chroma database.
documents = load_documents()
chunks = split_documents(documents)
add_to_chroma(chunks)

Loading documents...


No sentence-transformers model found with name ./local_models/sentence-transformers/all-mpnet-base-v2. Creating a new one with MEAN pooling.


Loaded 61 documents.
Splitting documents into chunks...
Split into 153 chunks.
Loading Chroma database...
Calculating chunk IDs...
Chunk IDs calculated.
Number of existing documents in DB: 0
👉 Adding new documents: 153
Documents added and database persisted.


  warn_deprecated(


In [2]:
# Prompt (written in French) with {context} and {question} placeholders;
# query_rag fills both in through ChatPromptTemplate.
# NOTE(review): an identical PROMPT_TEMPLATE is assigned again in a later
# cell — one of the two definitions is redundant.
PROMPT_TEMPLATE = """
Répondez à la question en vous basant uniquement sur le contexte suivant :

{context}

---

Répondez à la question en fonction du contexte ci-dessus : {question}
"""

In [6]:
# Prompt (written in French) with {context} and {question} placeholders;
# query_rag fills both in through ChatPromptTemplate.
PROMPT_TEMPLATE = """
Répondez à la question en vous basant uniquement sur le contexte suivant :

{context}

---

Répondez à la question en fonction du contexte ci-dessus : {question}
"""

# Create the Ollama LLM wrapper for the "gemma:2b-instruct" model.
# NOTE(review): nothing in this notebook selects a device or moves the model
# to a GPU — presumably that is left to the Ollama runtime; confirm.
model = Ollama(model="gemma:2b-instruct")

def is_response_relevant(response_text, query_text):
    """Tell whether the response mentions at least one word of the query.

    The query is lower-cased and split on whitespace; the response counts as
    relevant as soon as one of those words occurs as a substring of the
    lower-cased response. A query with no words is never relevant.
    """
    query_words = query_text.lower().split()
    for word in query_words:
        if word in response_text.lower():
            return True
    return False

def query_rag(query_text: str, k: int = 5, max_attempts: int = 3):
    """Answer ``query_text`` with the LLM, using context retrieved from Chroma.

    The ``k`` best-matching chunks are retrieved, inserted into
    ``PROMPT_TEMPLATE`` and sent to ``model``. When ``is_response_relevant``
    rejects the answer, the search is repeated with ``k`` more results and the
    model is asked again, up to ``max_attempts`` model calls in total or until
    the store returns no additional chunks.

    Args:
        query_text: The user's question.
        k: Number of chunks to retrieve initially, and by how many to widen
            the search on each retry.
        max_attempts: Upper bound on the number of model invocations, so a
            query that never yields a "relevant" answer cannot loop forever.

    Returns:
        The string ``"Response: <answer>\\nSources: <list of chunk ids>"``,
        which is also printed.
    """
    # Prepare the DB.
    embedding_function = get_embedding_function()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

    # Initial search in the DB.
    results = db.similarity_search_with_score(query_text, k=k)

    # `model` is used only through invoke(); PyTorch-style calls such as
    # eval() / torch.no_grad() do not apply to it and are not used here.
    attempts = 0
    while True:
        attempts += 1
        context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
        prompt = prompt_template.format(context=context_text, question=query_text)
        response_text = model.invoke(prompt)

        if is_response_relevant(response_text, query_text) or attempts >= max_attempts:
            break

        # Widen the search by asking for more results in one query; the search
        # call takes a result count `k`, not a "skip" offset.
        wider_results = db.similarity_search_with_score(query_text, k=len(results) + k)
        if len(wider_results) <= len(results):
            break  # No more context available, exit loop
        results = wider_results

    sources = [doc.metadata.get("id", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)
    return formatted_response

# Add a check for unrelated queries
def handle_query(query_text: str):
    """Route a query: question-like input goes to ``query_rag``, the rest is echoed.

    Input counts as a question when, ignoring surrounding whitespace, it ends
    with ``"?"``. Anything else is returned unchanged inside the usual
    ``Response/Sources`` format with an empty source list.
    """
    looks_like_question = query_text.strip().endswith("?")
    if looks_like_question:
        return query_rag(query_text)
    return f"Response: {query_text}\nSources: []"


In [None]:
# The text ends with "?", so handle_query forwards it to query_rag.
# query_rag already prints its formatted response, so print() shows it again.
print(handle_query("Hey, how are you?"))

No sentence-transformers model found with name ./local_models/sentence-transformers/all-mpnet-base-v2. Creating a new one with MEAN pooling.


In [19]:
# Query the RAG pipeline directly, without going through handle_query.
query_rag("les formation de FSTT ?")

No sentence-transformers model found with name ./local_models/sentence-transformers/all-mpnet-base-v2. Creating a new one with MEAN pooling.


Response: Les formations de FSTT sont des programmes de formation ouverte aux étudiants de différents niveaux de niveau pour acquérir des connaissances, des aptitudes et des compétences en relation avec les technologies.

Ces formations peuvent être divisées en deux groupes:

- **Formation de base en sciences et techniques** (BCG) : cette formation vise à donner aux étudiants les bases solides en sciences, en mathématiques, en informatique et en gestion des procédés industriels.
- **Formation en photographie et en prise de vue** (club - photographie, club - enactus) : cette formation vise à donner aux étudiants des compétences en photographie, en prise de vue et à développer leur créativité.
Sources: ['data\\FST TANGER.pdf:50:2', 'data\\FST TANGER.pdf:34:0', 'data\\FST TANGER.pdf:6:0', 'data\\FST TANGER.pdf:44:0', 'data\\FST TANGER.pdf:42:1']


[(Document(page_content='Tech niques  \n \nLA PAGE DE Conseil de l’établissement  : \n - Lien de page: https://fstt.ac.ma/Portail2023/conseil -de-letablissement/  \n \nLa Faculté est gérée par un conseil d’établissement présidé par le Doyen. Le conseil de l’établissement \ncomprend des membres de droit, d es représentants élus des personnels enseignants et des personnels \nadministratifs et techniques, des représentants élus des étudiants, ainsi que des membres désignés \nparmi des personnalités extérieures.  \n-Member : DIANI Mustapha - Responsabilité : Doyen  \n-Member : JBILOU Mohammed - Responsabilité : Vice Doyen de la Formation  \n-Member : OUARDOUZ Mustapha - Responsabilité : Vice Doyen de la Recherche et de la coopération', metadata={'id': 'data\\FST TANGER.pdf:50:2', 'page': 50, 'source': 'data\\FST TANGER.pdf'}),
  6.028841038892545),
  6.044837951660156),
  6.385698318481445),
  6.671168522554807),
 (Document(page_content='d’intérieur, un stylisme ou un simple dessin à la ma