# Retrieval & preprocessing AND Natural Language Processing (NLP) 

## 1. Setup

### Import libraries

In [11]:
import os
from langchain_ollama import OllamaLLM
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from tqdm import tqdm
import json

### Load the vector store

In [12]:
# Load the FAISS vector database
save_path = "../vector_database"
model_name = "all-MPNet-base-v2"

print("Loading FAISS vector database...")
embedding_model = HuggingFaceEmbeddings(model_name=model_name)

try:
    # SECURITY NOTE: `allow_dangerous_deserialization=True` opts in to loading
    # serialized Python objects from disk. Only point `save_path` at an index
    # that was produced locally by a trusted process.
    loaded_faiss_index = FAISS.load_local(save_path, embedding_model, allow_dangerous_deserialization=True)
except Exception as exc:
    # A failed load surfaces as an exception, not as a falsy return value, so
    # this is the place to report it. Re-raise so later cells do not run
    # against a missing index.
    print(f"Something went wrong: {exc}")
    raise
else:
    print("FAISS vector database loaded successfully!")

Loading FAISS vector database...
FAISS vector database loaded successfully!


## Retrieval & preprocessing

### Filter irrelevant chunks

In [13]:
import re

def filter_irrelevant_chunks(documents, max_dot_ratio=0.5, min_length=30):
    """
    Filter out chunks that look like tables of contents, front/back matter,
    dot-leader lines, or that are too short to be informative.

    A chunk is dropped when any of the following holds:
      * it is empty or whitespace-only;
      * its lower-cased text matches one of the "irrelevant" regex patterns;
      * the ratio (number of '.' characters) / (number of words) exceeds
        `max_dot_ratio`;
      * it has fewer than `min_length` whitespace-separated words.

    Args:
        documents (list): Retrieved documents; each must expose a
            `page_content` string attribute.
        max_dot_ratio (float): Maximum allowed ratio of dots to words.
        min_length (int): Minimum number of words a chunk must contain.

    Returns:
        list: The documents that passed every check, in their original order.
    """
    # Patterns are matched against lower-cased text, so they are written in lower case.
    irrelevant_patterns = [
        re.compile(r"table\s*des\s*matières"),  # table of contents
        re.compile(r"liste\s*des\s*(figures|tables)"),  # list of figures or tables
        re.compile(r"\.{3,}"),  # three or more consecutive dots (dot leaders / ellipsis)
        re.compile(r"\b(guide|résumé|abstract|remerciement|introduction|conclusion|références|bibliographie|webographie)\b"),  # common non-SQL sections
    ]

    relevant_chunks = []

    for doc in documents:
        text = doc.page_content.strip().lower()
        words = text.split()

        # An empty chunk carries no information; skipping it here also
        # prevents a division by zero in the dot-ratio computation below.
        if not words:
            continue

        # Step 1: drop chunks matching any irrelevant pattern
        if any(pattern.search(text) for pattern in irrelevant_patterns):
            continue

        # Step 2: drop chunks dominated by dots (likely a table of contents)
        dot_ratio = text.count('.') / len(words)
        if dot_ratio > max_dot_ratio:
            continue

        # Step 3: drop chunks with too few words
        if len(words) < min_length:
            continue

        relevant_chunks.append(doc)

    return relevant_chunks


### Retrieve the documents from the vector database based on the user query

In [14]:
def retrieve_documents_from_faiss(query, faiss_index, k=5):
    """
    Retrieve the top-k most similar documents for a query from a FAISS vector store.

    Args:
        query (str): The user's query.
        faiss_index (FAISS): Vector store exposing `as_retriever(...)`.
        k (int): The number of results to request from the retriever.

    Returns:
        list: The documents returned by the retriever.
    """
    retriever = faiss_index.as_retriever(search_type="similarity", search_kwargs={"k": k})

    # `get_relevant_documents` is deprecated in LangChain (the notebook's
    # recorded run emits that warning). Prefer `invoke`, and keep the legacy
    # call only for retriever objects that do not provide it.
    if hasattr(retriever, "invoke"):
        return retriever.invoke(query)
    return retriever.get_relevant_documents(query)

### Re-rank the retrieved documents using a re-ranking model

In [15]:
from transformers import pipeline
import torch

# Module-level re-ranker used by `rerank_documents`.
# For simplicity a zero-shot-classification pipeline (facebook/bart-large-mnli)
# stands in for a dedicated re-ranking model. The recorded run of this cell
# reports "Device set to use cpu".
re_ranker = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def rerank_documents(query, documents):
    """
    Order documents from most to least relevant to the query.

    Each document is scored by calling the module-level `re_ranker` with the
    query as the input sequence and the document text as the only candidate
    label; the first returned score is used as the relevance score.

    Args:
        query (str): The user's query.
        documents (list): Documents exposing a `page_content` attribute.

    Returns:
        list: All input documents, sorted by descending score (ties keep
        their input order). No truncation happens here.
    """
    # Pair every document with its score
    scored_pairs = [
        (re_ranker(query, candidate_labels=[candidate.page_content])['scores'][0], candidate)
        for candidate in documents
    ]

    # Highest score first; `sorted` is stable, so equal scores keep input order
    ordered_pairs = sorted(scored_pairs, key=lambda pair: pair[0], reverse=True)

    return [candidate for _, candidate in ordered_pairs]

Device set to use cpu


### Combine the three previous functions (retrieve, filter, re-rank)

In [16]:
def retrieve_and_rerank(query, faiss_index, k=25, top_n=5):
    """
    Run retrieval, filtering and re-ranking, and keep the best results.

    Args:
        query (str): The user's query.
        faiss_index (FAISS): The FAISS vector store to search.
        k (int): How many documents to retrieve from the store (default 25).
        top_n (int): How many re-ranked documents to return at most (default 5).

    Returns:
        list: At most `top_n` documents, best first. Fewer are returned when
        filtering leaves fewer than `top_n` candidates.
    """
    # Retrieve the k nearest chunks, then discard TOC-like / too-short ones
    # (filter_irrelevant_chunks is used with its default thresholds).
    candidates = filter_irrelevant_chunks(
        retrieve_documents_from_faiss(query, faiss_index, k)
    )

    # Order the survivors by relevance and truncate
    ranked = rerank_documents(query, candidates)
    return ranked[:top_n]

### Clean the re-ranked documents to prepare them for question generation.

In [17]:
import re

def clean_chunk_for_question_generation(chunk, min_length):
    """
    Clean a chunk for question generation and reject it if too little text remains.

    Args:
        chunk (str): The chunk of text to be cleaned. `None` is treated as
            "nothing to clean".
        min_length (int): Minimum number of whitespace-separated words the
            cleaned chunk must contain.

    Returns:
        str | None: The cleaned text, or None when `chunk` is None or the
        cleaned text has fewer than `min_length` words.
    """
    if chunk is None:
        return None

    # Step 1: Remove standalone filler words such as "introduction" or "summary".
    # This runs BEFORE whitespace normalisation so that the gap a removed word
    # leaves behind gets collapsed in step 2.
    without_fillers = re.sub(r'\b(introduction|résumé|summary|conclusion)\b', '', chunk, flags=re.IGNORECASE)

    # Step 2: Collapse runs of spaces/tabs into one space and trim both ends.
    # Newlines are deliberately left alone so the line structure of the
    # chunk (headings, bullet lists) is preserved.
    cleaned_text = re.sub(r'[ \t]+', ' ', without_fillers).strip()

    # Step 3: Ensure minimum content length for a meaningful chunk
    if len(cleaned_text.split()) < min_length:
        return None  # Too short to be meaningful for question generation

    return cleaned_text

def clean_relevant_chunks_for_question_generation(re_ranked_docs, min_chunk_length):
    """
    Clean the text of every re-ranked document and keep the usable results.

    Args:
        re_ranked_docs (list): Documents exposing a `page_content` attribute.
        min_chunk_length (int): Minimum word count, forwarded to
            `clean_chunk_for_question_generation`.

    Returns:
        list: Cleaned text strings, in input order; chunks whose cleaned
        result is None or empty are left out.
    """
    cleaned_candidates = (
        clean_chunk_for_question_generation(document.page_content, min_chunk_length)
        for document in re_ranked_docs
    )
    # Falsy results (None / empty string) mean the chunk was not meaningful
    return [text for text in cleaned_candidates if text]

## Natural Language Processing (NLP)

### Define Question Generation functions

#### Generate QCM

In [18]:
def generate_mcq(content, query, difficulty="intermédiaire", llm=None):
    """
    Ask the language model for one French multiple-choice question (QCM)
    based on the given content, tied to the user's query and difficulty level.

    Args:
        content (str): The source text the question must be drawn from.
        query (str): The user's query; the prompt asks for a question about
            this concept.
        difficulty (str): Difficulty level inserted into the prompt
            ('débutant', 'intermédiaire', 'avancé').
        llm: Optional object with an `invoke(prompt)` method. When omitted,
            the module-level `ollama_model` is used, which must therefore be
            defined before the call.

    Returns:
        Whatever `invoke` returns, unparsed. The prompt requests a JSON
        object with "question", "options", "correct_answer" and
        "explanation"; in the notebook's recorded run the model answers with
        a string containing a fenced JSON block, not a dict.
    """
    prompt = f"""
    Vous êtes un assistant spécialisé dans la génération de questions à choix multiples (QCM) en français.
    Utilisez le contenu suivant pour créer une question conforme aux consignes ci-dessous.

    ### Consignes :
    1. **Lien avec la requête** :
       La question doit être liée au concept suivant : '{query}' et tester la compréhension de ce concept.
    
    2. **Niveau de difficulté** :
       Adaptez la complexité de la question au niveau spécifié : '{difficulty}'.
    
    3. **Structure des options** :
       - Créez une question avec quatre options de réponse (A, B, C, D), dont une seule est correcte.
       - Assurez-vous que les options sont logiquement distinctes et pertinentes.

    4. **Format de sortie** :
       Retournez la question sous le format JSON suivant :
       ```json
       {{
           "question": "<La question clairement formulée>",
           "options": {{
               "A": "<Option A>",
               "B": "<Option B>",
               "C": "<Option C>",
               "D": "<Option D>"
           }},
           "correct_answer": "<Lettre de l'option correcte (A, B, C ou D)>",
           "explanation": "<Explication concise et claire de la réponse correcte>"
       }}
       ```

    5. **Précision et clarté** :
       Toutes les informations doivent être extraites uniquement du contenu fourni. Évitez toute ambiguïté.

    ### Contenu à utiliser :
    {content}
    """
    # Fall back to the notebook-level model only when no model was passed in;
    # the global is looked up lazily so an explicit `llm` never needs it.
    model = ollama_model if llm is None else llm
    return model.invoke(prompt)


#### Generate open-ended questions

In [19]:
def generate_open_ended(content, query, difficulty="intermédiaire", llm=None):
    """
    Ask the language model for one French open-ended question based on the
    given content, tied to the user's query and difficulty level.

    Args:
        content (str): The source text the question must be drawn from.
        query (str): The user's query; the prompt asks for a question about
            this concept.
        difficulty (str): Difficulty level inserted into the prompt
            ('débutant', 'intermédiaire', 'avancé').
        llm: Optional object with an `invoke(prompt)` method. When omitted,
            the module-level `ollama_model` is used, which must therefore be
            defined before the call.

    Returns:
        Whatever `invoke` returns, unparsed. The prompt requests a JSON
        object with "question", "example_answer" and "explanation"; this
        function does not parse or validate it.
    """
    prompt = f"""
    Vous êtes un assistant spécialisé dans la génération de questions ouvertes en français.
    Utilisez le contenu ci-dessous pour créer une question ouverte en respectant les consignes suivantes.

    ### Consignes :
    1. **Lien avec la requête** :
       La question doit être basée sur le concept suivant : '{query}' et adaptée au niveau de difficulté '{difficulty}'.

    2. **Format de sortie** :
       Retournez les résultats sous le format JSON suivant :
       ```json
       {{
           "question": "<La question clairement formulée>",
           "example_answer": "<Une réponse exemple concise>",
           "explanation": "<Explication claire de la réponse>"
       }}
       ```

    3. **Précision et clarté** :
       Toutes les informations doivent provenir uniquement du contenu fourni.

    ### Contenu à utiliser :
    {content}
    """
    # Fall back to the notebook-level model only when no model was passed in;
    # the global is looked up lazily so an explicit `llm` never needs it.
    model = ollama_model if llm is None else llm
    return model.invoke(prompt)


#### Question generation pipeline

In [20]:
# def exam_pipeline(query, question_type, question_nbr, faiss_index, ollama_model, difficulty="intermediate", k=25, top_n=5):
#     """
#     The exam generation pipeline that retrieves documents, cleans them, and generates questions based on the retrieved chunks.
    
#     Args:
#         query (str): The query to search for.
#         question_type (str): The type of questions ('mcq' or 'open-ended').
#         question_nbr (int): The number of questions to generate.
#         faiss_index (FAISS): The FAISS vector store for document retrieval.
#         ollama_model (OllamaModel): The model used for question generation (Gemma2).
#         difficulty (str): The difficulty level for the questions ('beginner', 'intermediate', 'advanced').
#         k (int): The number of relevant documents to retrieve.
#         top_n (int): The number of documents to keep after re-ranking.
        
#     Returns:
#         dict: A dictionary containing the generated exam questions.
#     """
    
#     # Step 1: Retrieve Documents
#     print("Retrieving relevant documents...")
#     re_ranked_docs = retrieve_and_rerank(query, faiss_index, k, top_n)

#     if not re_ranked_docs:
#         print("No documents found for the given query.")
#         return {}

#     print(f"Retrieved {len(re_ranked_docs)} documents.")

#     # Step 2: Normalize Retrieved Chunks
#     print("Cleaning retrieved content...")
#     cleaned_chunks = clean_relevant_chunks_for_question_generation(re_ranked_docs, min_chunk_length=30)

#     print(f"Normalized to {len(cleaned_chunks)} valid chunks.")

#     # Step 3: Generate Questions
#     print("Generating questions...")
#     exam = {"questions": []}
#     base = cleaned_chunks  # Base content to generate questions from

#     for i in tqdm(range(0, question_nbr), desc="Generating Questions"):
#         try:
#             # Select content for question generation
#             content = base[i % len(base)]  # Loop through base content

#             # Generate MCQ or Open-ended question based on type
#             if question_type.lower() == "mcq":
#                 response = generate_mcq(content, query, difficulty)
#                 question = {
#                     "type": "mcq",
#                     "source_content": content,  # Source content for the question
#                     "question_data": response
#                 }
#             elif question_type.lower() == "open-ended":
#                 response = generate_open_ended(content, difficulty)
#                 question = {
#                     "type": "open-ended",
#                     "source_content": content,  # Source content for the question
#                     "question_data": response
#                 }
#             else:
#                 raise ValueError("Invalid question type. Use 'mcq' or 'open-ended'.")

#             exam["questions"].append(question)

#         except Exception as e:
#             print(f"Error generating question: {e}")

#     print(f"Generated {len(exam['questions'])} questions.")

#     return exam

In [21]:
from concurrent.futures import ThreadPoolExecutor
import tqdm

def generate_questions_parallel(base, query, question_type, question_nbr, ollama_model, difficulty):
    """
    Generate `question_nbr` questions concurrently from the given text chunks.

    Chunks are used in order and reused cyclically when `question_nbr` is
    larger than `len(base)`, so the requested number of questions is produced
    whenever at least one chunk is available.

    Args:
        base (list): Cleaned text chunks to generate questions from.
        query (str): The user's query, forwarded to the generator.
        question_type (str): 'mcq' or 'open-ended' (case-insensitive).
        question_nbr (int): Number of questions to generate.
        ollama_model: Accepted for interface compatibility but not used here;
            `generate_mcq` / `generate_open_ended` are called without it.
        difficulty (str): Difficulty level, forwarded to the generator.

    Returns:
        list: One dict per question with keys "type", "source_content" and
        "question_data"; empty when `base` is empty or `question_nbr` <= 0.

    Raises:
        ValueError: If `question_type` is neither 'mcq' nor 'open-ended'.
    """
    kind = question_type.lower()

    # Validate once, up front, instead of failing inside a worker thread
    if kind not in ("mcq", "open-ended"):
        raise ValueError("Invalid question type. Use 'mcq' or 'open-ended'.")

    if not base or question_nbr <= 0:
        return []

    generator = generate_mcq if kind == "mcq" else generate_open_ended

    def generate_question(content):
        return {
            "type": kind,
            "source_content": content,
            "question_data": generator(content, query, difficulty),
        }

    # Cycle through the chunks so that exactly `question_nbr` questions are
    # requested even when fewer chunks survived cleaning.
    contents = [base[i % len(base)] for i in range(question_nbr)]

    with ThreadPoolExecutor() as executor:
        # executor.map preserves input order; the progress-bar total matches
        # the number of tasks actually submitted.
        questions = list(
            tqdm.tqdm(
                executor.map(generate_question, contents),
                total=len(contents),
                desc="Generating Questions"
            )
        )
    return questions

def exam_pipeline(query, question_type, question_nbr, faiss_index, ollama_model, difficulty="intermediate", k=25, top_n=5):
    """
    End-to-end exam generation: retrieve and re-rank chunks, clean them, then
    generate questions in parallel.

    Args:
        query (str): The topic to search for and to build questions about.
        question_type (str): 'mcq' or 'open-ended'.
        question_nbr (int): Number of questions requested.
        faiss_index (FAISS): Vector store used for retrieval.
        ollama_model: Forwarded to `generate_questions_parallel`.
        difficulty (str): Difficulty label forwarded to question generation.
            NOTE(review): the default is the English word "intermediate",
            while the generator functions document French labels
            ('débutant', 'intermédiaire', 'avancé') - confirm which is intended.
        k (int): Number of documents to retrieve before filtering.
        top_n (int): Number of documents kept after re-ranking.

    Returns:
        dict: `{}` when retrieval yields no documents, otherwise
        `{"questions": [...]}`. Chunks are cleaned with a fixed minimum
        length of 30 words.
    """
    print("Retrieving relevant documents...")
    ranked_docs = retrieve_and_rerank(query, faiss_index, k, top_n)

    # Nothing retrieved: report it and hand back an empty dict
    if not ranked_docs:
        print("No documents found for the given query.")
        return {}
    print(f"Retrieved {len(ranked_docs)} documents.")

    print("Cleaning retrieved content...")
    usable_chunks = clean_relevant_chunks_for_question_generation(ranked_docs, min_chunk_length=30)
    print(f"Normalized to {len(usable_chunks)} valid chunks.")

    print("Generating questions...")
    generated = generate_questions_parallel(
        usable_chunks, query, question_type, question_nbr, ollama_model, difficulty
    )
    exam = {"questions": generated}

    print(f"Generated {len(exam['questions'])} questions.")
    return exam

In [22]:
# Pipeline inputs for this run.
query = "les requettes sql"
question_type = "mcq" # mcq or open-ended
question_nbr = 2
faiss_index = loaded_faiss_index  # Your FAISS index
# NOTE: generate_mcq / generate_open_ended read this module-level name
# directly, so this assignment must run before any question is generated.
ollama_model = OllamaLLM(model="llama3.2")
difficulty = "avancé" # 'débutant', 'intermédiaire', 'avancé'

# Run retrieval, cleaning and question generation; `exam` is displayed further down.
exam = exam_pipeline(query, question_type, question_nbr, faiss_index, ollama_model, difficulty)

Retrieving relevant documents...


  retrieved_docs = retriever.get_relevant_documents(query)


Retrieved 5 documents.
Cleaning retrieved content...
Normalized to 5 valid chunks.
Generating questions...


Generating Questions: 100%|██████████████████████████████████████████████████████████████| 2/2 [00:40<00:00, 20.50s/it]

Generated 2 questions.





## Display the Exam

In [23]:
def display_exam(exam):
    """
    Print the exam questions in a structured, human-readable format.

    For every question the source chunk ("Context") and the raw generated
    question data are printed, framed by separator lines.

    Args:
        exam (dict): The generated exam. Expected to hold a "questions" list
            of dicts with keys "type", "source_content" and "question_data".
            An empty dict (what `exam_pipeline` returns when no documents are
            found), None, or an empty question list is reported instead of
            raising.
    """
    questions = exam.get("questions", []) if exam else []

    if not questions:
        print("No questions to display.")
        return

    for number, question in enumerate(questions, start=1):
        print(f"Question {number} ({question['type']}):")
        print("=" * 50)

        # Display source content (question context)
        print("Context:")
        print(question['source_content'])
        print("-" * 50)

        # Display the generated question as returned by the model
        print("Question:")
        print(question['question_data'])

        print("=" * 50)

# Print every question of the `exam` produced by the pipeline run above,
# together with the source chunk it was generated from.
display_exam(exam)


Question 1 (mcq):
Context:
# Systèmes de Gestion de Bases de Données, Vertigo/CNAM, Paris

# Exemples de questions (requêtes) posées à la base

- Insérer un employé nommé Jean
- Augmenter Jean de 10%
- Détruire Jean
- Chercher les employés cadres
- Chercher les employés du département comptabilité
- Salaire moyen des employés comptables, avec deux enfants, nés avant 1960 et travaillant à Paris

Les requêtes sont émises avec un langage de requêtes (SQL2, OQL, SQL3, XQUERY, etc.).
--------------------------------------------------
Question:
```json
{
    "question": "Quelle est la réduction en masse d'un employé du système de gestion de bases de données Vertigo/CNAM ?",
    "options": {
        "A": "Insérer un employé",
        "B": "Augmenter un employé de 10%",
        "C": "Détruire un employé",
        "D": "Supprimer un employé"
    },
    "correct_answer": "D",
    "explanation": "La réduction en masse d'un employé est synonyme de suppression. Dans le contexte du système de gestio