<a href="https://colab.research.google.com/github/aronn-yael/chatbot_RAG/blob/main/chatbot_RAGipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openai chromadb sentence-transformers PyPDF2 gradio



In [None]:
import os
from getpass import getpass

# Read the OpenAI key interactively (getpass does not echo what is typed) and
# keep it in the process environment instead of in the notebook source.
# NOTE(review): presumably picked up later by the OpenAI client — confirm.
os.environ["OPENAI_API_KEY"] = getpass("Entrez votre clé OpenAI : ")


Entrez votre clé OpenAI : ··········


In [None]:
import chromadb
from chromadb.config import Settings

# On-disk ChromaDB client; data is stored under the "rag_db" directory.
client = chromadb.PersistentClient(path="rag_db")

# Start from a clean slate: drop any previous "cours" collection, and report
# (without aborting the cell) when there was nothing to drop or the drop failed.
try:
    client.delete_collection(name="cours")
except Exception as e:
    print(f"Collection 'cours' did not exist or could not be deleted: {e}")
else:
    print("Collection 'cours' deleted successfully.")

# Recreate (or fetch) the collection that every later cell reads and writes.
collection = client.get_or_create_collection(name="cours")
print("Collection 'cours' created or retrieved successfully.")

Collection 'cours' deleted successfully.
Collection 'cours' created or retrieved successfully.


In [None]:
# PDF files to ingest; each path is opened with PdfReader by the extraction cell.
# NOTE(review): the /content/ prefix looks like the Colab upload directory —
# confirm the files have been uploaded there before running.
documents = [
    "/content/CV-Aronn Yaël Léonard_KABORÉ.pdf",
    "/content/Cahier de charge fonctionnel.pdf"
]

In [None]:
from sentence_transformers import SentenceTransformer

# Single embedding model shared by the indexing cell and by every query.
embedder = SentenceTransformer("all-MiniLM-L6-v2")


def get_embedding(text):
    """Encode ``text`` with the shared ``embedder``.

    Returns the result of ``embedder.encode(text)`` converted with
    ``.tolist()``, i.e. plain Python lists/floats rather than an array object.
    """
    encoded = embedder.encode(text)
    return encoded.tolist()

In [None]:
from PyPDF2 import PdfReader

# Pages whose stripped text is shorter than this are treated as empty and skipped.
MIN_PAGE_CHARS = 50

# Extract the text of every page of every PDF, in document order.
# The extracted text is kept in `all_pdf_text` only; `documents` keeps holding
# the file paths. Rebinding `documents` to the page text made this cell fail
# when run a second time, because PdfReader was then handed text, not paths.
all_pdf_text = []
for doc_path in documents:
    reader = PdfReader(doc_path)
    all_pdf_text.extend(page.extract_text() for page in reader.pages)

# Index each usable page. The id is the page's position in `all_pdf_text`
# (counted across all PDFs, skipped pages included), so id "0" is the first
# page of the first PDF — the id that rag_answer looks up explicitly.
for i, page_text in enumerate(all_pdf_text):
    # extract_text() can yield nothing for image-only pages; skip those and
    # pages too short to carry useful content.
    if not page_text or len(page_text.strip()) < MIN_PAGE_CHARS:
        continue
    collection.add(
        ids=[str(i)],
        documents=[page_text],
        embeddings=[get_embedding(page_text)],
    )

In [None]:
# Retrieval smoke test: embed a sample question and fetch the two nearest
# entries from the collection. The bare `results` on the last line makes the
# notebook display the raw response.
query = "parle moi de aronn"
query_emb = get_embedding(query)

results = collection.query(query_embeddings=[query_emb], n_results=2)
results

{'ids': [['5', '8']],
 'embeddings': None,
 'documents': [['Aucun graphique (camembert, barres) n’est visible dans cette capture. L’accent est mis \nsur les chiffres clés  et les actions rapides , conformément aux besoins d’un outil de \ngestion de projet orienté efficacité.  \nComposants  \n \nBouton "Je réserve ma place →"  \n• Emplacement  : Centré dans la bannière supérieure.  \n• Style visuel  : Contrasté (couleur vive ou foncée) pour attirer l’attention.  \n• Action probable  : Redirection vers un formulaire d’inscription au webinaire ou \nouverture d’une modale.  \nBouton "+ Ajouter une tâche"  \n• Emplacement  : Dans la section "Tâches".  \n• Style visuel  : Bouton simple avec icône "+", probablement bleu ou vert.  \n• Action probable  : Ouvre un champ de saisie ou un formulaire pour créer une \nnouvelle tâche.  \nLien "Tout voir"  \n• Emplacement  : Sous la section "Factures non réglées".  \n• Style visuel  : Texte souligné ou en bleu, standard pour les liens interactifs.  \n•

In [None]:
# Sanity check: report how many entries the collection currently holds.
print(f"Nombre de documents dans la collection : {collection.count()}")

Nombre de documents dans la collection : 26


In [None]:
from openai import OpenAI
# Dedicated name for the OpenAI client. The ChromaDB client created earlier is
# also bound to `client`; reusing that name made `client` mean two different
# things depending on which cell had been run last.
openai_client = OpenAI()

# Id of the page that is always put first in the context when the question
# mentions "aronn" (ids are page positions assigned by the indexing cell).
CV_DOC_ID = "0"


def rag_answer(question, n_results=5):
    """Answer ``question`` using passages retrieved from the collection.

    Steps:
      1. Embed the question and fetch the ``n_results`` nearest passages.
      2. If the question contains "aronn" (case-insensitive), put the passage
         stored under ``CV_DOC_ID`` at the front of the context, without
         repeating it when retrieval already returned it.
      3. Build the prompt and send it to the ``gpt-4o`` chat model.

    Args:
        question: The user's question, as a string.
        n_results: Number of passages to request from the collection
            (defaults to 5).

    Returns:
        The text content of the model's first choice.

    Raises:
        Whatever ``get_embedding``, the collection calls, or the OpenAI client
        raise; nothing is caught here.
    """
    # 1) Retrieval
    query_emb = get_embedding(question)
    results = collection.query(
        query_embeddings=[query_emb],
        n_results=n_results,
    )

    retrieved_texts = []
    # results["documents"] holds one list of passages per query embedding;
    # only one embedding is sent, so only index 0 is read.
    if results["documents"] and results["documents"][0]:
        retrieved_texts.extend(results["documents"][0])

    # Explicitly include the CV page when the question mentions "aronn".
    if "aronn" in question.lower():
        cv_doc = collection.get(ids=[CV_DOC_ID])
        if cv_doc and cv_doc["documents"] and cv_doc["documents"][0]:
            cv_text = cv_doc["documents"][0]
            # Put the CV first and drop any copy retrieval already returned,
            # so the same passage is not sent to the model twice.
            retrieved_texts = [cv_text] + [
                text for text in retrieved_texts if text != cv_text
            ]

    context = "\n".join(retrieved_texts)

    # 2) RAG prompt (kept left-aligned so no indentation leaks into the text)
    prompt = f"""
Tu es un assistant spécialisé.
Utilise uniquement le contexte ci-dessous pour répondre.
=== CONTEXTE ===
{context}
=== QUESTION ===
{question}
Reponse :
"""

    # 3) OpenAI call
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )

    # 4) Return the generated text
    return response.choices[0].message.content

In [None]:
import gradio as gr

def gradio_rag_answer(question):
    """Gradio callback wrapping ``rag_answer``.

    Returns a fixed message when ``question`` is empty or whitespace-only.
    Otherwise returns whatever ``rag_answer(question)`` returns, or, if that
    call raises, an error message that embeds the exception text so the UI
    shows it instead of failing.
    """
    if question.strip() == "":
        return "Veuillez entrer une question valide."

    try:
        reply = rag_answer(question)
    except Exception as e:
        reply = f"Une erreur est survenue lors de la récupération de la réponse : {e}"
    return reply


# Two-line free-text input for the user's question.
question_box = gr.Textbox(lines=2, placeholder="Posez votre question ici...")

# Simple text-in / text-out UI around gradio_rag_answer.
iface = gr.Interface(
    fn=gradio_rag_answer,
    inputs=question_box,
    outputs="text",
    title="Assistant RAG",
    description="Posez des questions sur les documents chargés.",
)

# share=True asks Gradio for a public URL in addition to the local server.
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6d48d5ce8b7cece36c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


