In [None]:
import os
import torch
import sys
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# --- CONFIGURATION ---
# Remplacez par votre cl√© API r√©elle
os.environ["GROQ_API_KEY"]=os.getenv("Groq_API_KEY")

# Chemins
DATA_PATH = "data/"            # Mettez vos PDFs dans ce dossier
DB_FAISS_PATH = "vectorstore/db_faiss"

# Mod√®les Optimis√©s
# 1. Embeddings MULTILINGUES (Comprend FR <-> EN)
MODEL_EMBEDDING = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# 2. LLM Performant
MODEL_LLM = "llama-3.3-70b-versatile"

# D√©tection GPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"‚öôÔ∏è Configuration : Embeddings sur {DEVICE.upper()} | Mod√®le : {MODEL_LLM}")

‚öôÔ∏è Configuration : Embeddings sur CUDA | Mod√®le : llama-3.3-70b-versatile


In [8]:
import os
import re
import torch
import sys

# --- CORRECTION DES IMPORTS ---
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# C'est ici que √ßa changeait : on utilise langchain_core maintenant
from langchain_core.documents import Document 

# --- 1. CONFIGURATION ---
DATA_PATH = "data/raw"            # Assurez-vous que vos 5 PDFs sont dans ce dossier
DB_FAISS_PATH = "vectorstore/db_faiss"

# Mod√®le Multilingue (Arabe + Fran√ßais + Anglais)
MODEL_EMBEDDING = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def clean_text(text):
    """
    Fonction de nettoyage avanc√© pour retirer le bruit des PDF.
    """
    if not text: return ""

    # 1. Retirer les num√©ros de pages isol√©s
    text = re.sub(r'Page\s+\d+|^\d+\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'\d+\s+\|\s+P\s+a\s+g\s+e', '', text) # Cas sp√©cifique Wheeling
    
    # 2. Retirer les en-t√™tes r√©p√©titifs
    text = re.sub(r'Undergraduate\s+Catalog\s+2024-2025', '', text, flags=re.IGNORECASE)
    
    # 3. R√©parer les c√©sures de mots (hyphenation)
    text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)
    
    # 4. Supprimer les sauts de ligne multiples
    text = re.sub(r'\n{3,}', '\n\n', text)
    
    return text.strip()

def load_and_process_documents():
    print(f"--- üöÄ D√©marrage du Traitement Avanc√© (Sur {DEVICE.upper()}) ---")
    
    # V√©rification du dossier
    if not os.path.exists(DATA_PATH):
        os.makedirs(DATA_PATH)
        print(f"‚ö†Ô∏è  Le dossier '{DATA_PATH}' a √©t√© cr√©√©. Veuillez y d√©poser vos PDF et relancer.")
        return

    # 1. Chargement Brut
    loader = PyPDFDirectoryLoader(DATA_PATH)
    raw_docs = loader.load()
    print(f"üìÑ {len(raw_docs)} pages brutes charg√©es.")

    if not raw_docs:
        print(f"‚ùå Erreur : Le dossier '{DATA_PATH}' est vide. Ajoutez vos PDF.")
        return

    processed_docs = []
    
    # 2. Nettoyage et Injection de Contexte
    print("üßπ Nettoyage et Enrichissement des donn√©es...")
    for doc in raw_docs:
        # Identification de la source
        full_source = doc.metadata.get('source', '')
        filename = os.path.basename(full_source) # Extrait juste le nom du fichier
        uni_name = filename.replace('.pdf', '').replace('_', ' ')
        
        # Nettoyage
        cleaned_content = clean_text(doc.page_content)
        
        # S'il reste du contenu utile
        if len(cleaned_content) > 50:
            # INJECTION DE CONTEXTE : On ajoute le nom de l'universit√© au d√©but du chunk
            enriched_content = f"Document Source: {uni_name}\n\n{cleaned_content}"
            
            # On met √† jour le contenu
            doc.page_content = enriched_content
            processed_docs.append(doc)

    print(f"‚úÖ {len(processed_docs)} pages trait√©es et enrichies.")

    # 3. Chunking Optimis√©
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=300,
        separators=["\n\n", "(?<=\. )", "\n", " ", ""]
    )
    
    chunks = text_splitter.split_documents(processed_docs)
    print(f"‚úÇÔ∏è  G√©n√©ration de {len(chunks)} fragments (chunks) optimis√©s.")

    # 4. Embeddings & Indexation
    print(f"üß† Calcul des vecteurs avec {MODEL_EMBEDDING}...")
    embeddings = HuggingFaceEmbeddings(
        model_name=MODEL_EMBEDDING,
        model_kwargs={'device': DEVICE}
    )

    vectorstore = FAISS.from_documents(chunks, embeddings)
    vectorstore.save_local(DB_FAISS_PATH)
    print(f"‚úÖ Base de donn√©es sauvegard√©e avec succ√®s dans '{DB_FAISS_PATH}'")

if __name__ == "__main__":
    load_and_process_documents()

  separators=["\n\n", "(?<=\. )", "\n", " ", ""]


--- üöÄ D√©marrage du Traitement Avanc√© (Sur CUDA) ---
üìÑ 6866 pages brutes charg√©es.
üßπ Nettoyage et Enrichissement des donn√©es...
‚úÖ 6796 pages trait√©es et enrichies.
‚úÇÔ∏è  G√©n√©ration de 33923 fragments (chunks) optimis√©s.
üß† Calcul des vecteurs avec sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2...


  embeddings = HuggingFaceEmbeddings(


‚úÖ Base de donn√©es sauvegard√©e avec succ√®s dans 'vectorstore/db_faiss'


In [None]:
import os
import torch
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq

# --- CONFIGURATION ---
os.environ["GROQ_API_KEY"]=os.getenv("Groq_API_KEY")
DB_FAISS_PATH = "vectorstore/db_faiss"
MODEL_EMBEDDING = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
MODEL_LLM = "llama-3.3-70b-versatile"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"--- ‚öôÔ∏è CHARGEMENT DU SYST√àME M√âMOIRE ({DEVICE.upper()}) ---")

# 1. Embeddings
embeddings = HuggingFaceEmbeddings(
    model_name=MODEL_EMBEDDING,
    model_kwargs={'device': DEVICE}
)

# 2. Vectorstore & Retriever
try:
    vectorstore = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
    retriever = vectorstore.as_retriever(search_kwargs={'k': 8})
    print("‚úÖ Base charg√©e.")
except Exception as e:
    print(f"‚ùå Erreur de chargement : {e}")

# 3. LLM
llm = ChatGroq(temperature=0.0, model_name=MODEL_LLM)

  from .autonotebook import tqdm as notebook_tqdm


--- ‚öôÔ∏è CHARGEMENT DU SYST√àME M√âMOIRE (CUDA) ---


  embeddings = HuggingFaceEmbeddings(


‚úÖ Base charg√©e.


In [2]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableBranch

# --- √âTAPE 1 : LA CHA√éNE DE REFORMULATION (Contextualize) ---
# Ce prompt sert uniquement √† r√©√©crire la question en fonction de l'historique
contextualize_q_system_prompt = """
Given a chat history and the latest user question which might reference context in the chat history, 
formulate a standalone question which can be understood without the chat history. 
Do NOT answer the question, just reformulate it if needed and otherwise return it as is.
"""

contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{question}"),
])

# Cette cha√Æne ne s'active que s'il y a un historique
contextualize_q_chain = contextualize_q_prompt | llm | StrOutputParser()

# --- √âTAPE 2 : LA CHA√éNE DE R√âPONSE (Answer) ---
qa_system_prompt = """
You are an expert academic advisor for international students.
You have access to official documents from several universities.

INSTRUCTIONS:
1.  **Analyze the Context:** Look at the "Document Source" header in each text chunk.
2.  **Language:** ALWAYS answer in **ENGLISH**.
3.  **Accuracy:** Use the provided context only. If you don't know, say so.
4.  **Format:** Use bullet points for lists.

CONTEXT:
{context}
"""

qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"), # On garde l'historique aussi ici au cas o√π
    ("human", "{question}"),
])

# Fonction pour formater les docs
def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])

# La cha√Æne finale
qa_chain = qa_prompt | llm | StrOutputParser()

print("‚úÖ Syst√®me de m√©moire conversationnelle pr√™t.")

‚úÖ Syst√®me de m√©moire conversationnelle pr√™t.


In [None]:
from langchain_core.messages import HumanMessage, AIMessage
import time

def start_memory_chat():
    # 1. Initialisation de la m√©moire pour cette session
    chat_history = []
    
    print("\n" + "="*60)
    print("ü§ñ MEMORY CHATBOT READY (English Output)")
    print("Example: Ask about a university, then just ask 'What are the fees?'")
    print("="*60 + "\n")

    while True:
        question = input("üëâ VOUS : ")
        if question.lower() in ['exit', 'quit', 'q']:
            print("Goodbye!")
            break
        
        start_time = time.time()
        
        try:
            # --- PHASE A : REFORMULATION DE LA QUESTION ---
            if chat_history:
                # Si on a de la m√©moire, on demande au LLM de pr√©ciser la question
                print("üîÑ Reformulation de la question avec le contexte...")
                reformulated_question = contextualize_q_chain.invoke({
                    "chat_history": chat_history,
                    "question": question
                })
                print(f"   (Question interne : '{reformulated_question}')")
            else:
                reformulated_question = question

            # --- PHASE B : RECHERCHE (Retrieval) ---
            # On cherche avec la question REFORMUL√âE (plus pr√©cise)
            print("üîç Recherche dans les documents...")
            retrieved_docs = retriever.invoke(reformulated_question)
            
            # --- PHASE C : G√âN√âRATION ---
            print("üß† G√©n√©ration de la r√©ponse...")
            response = qa_chain.invoke({
                "context": format_docs(retrieved_docs),
                "chat_history": chat_history,
                "question": reformulated_question
            })
            
            # --- PHASE D : MISE √Ä JOUR M√âMOIRE ---
            # On stocke l'√©change pour le prochain tour
            chat_history.extend([
                HumanMessage(content=question),
                AIMessage(content=response)
            ])
            
            # Affichage
            end_time = time.time()
            sources = set([d.metadata.get('source', '').split('/')[-1] for d in retrieved_docs])
            
            print("\n" + "-"*60)
            print(f"‚è±Ô∏è {end_time - start_time:.2f}s | Sources: {', '.join(sources)}")
            print("-" * 60)
            print(f"üí° ANSWER:\n{response}")
            print("="*60 + "\n")

        except Exception as e:
            print(f"‚ùå Erreur : {e}")

if __name__ == "__main__":
    start_memory_chat()


ü§ñ MEMORY CHATBOT READY (English Output)
Example: Ask about a university, then just ask 'What are the fees?'

üîç Recherche dans les documents...
üß† G√©n√©ration de la r√©ponse...

------------------------------------------------------------
‚è±Ô∏è 2.31s | Sources: HASSAN2_UNIVERCITY.pdf
------------------------------------------------------------
üí° ANSWER:
It seems like you provided a large document in French and Arabic, which appears to be an activity report from the University Hassan II of Casablanca (UH2C) for the year 2019-2020. 

To provide a helpful response, I would like to know what specific information you are looking for in this document. Are you interested in:

* International partnerships and cooperation?
* Campus life and student activities?
* Academic programs and research?
* University governance and quality assurance?
* Something else?

Please let me know, and I'll do my best to provide a helpful and accurate response based on the provided document. 

Here are