# RAG Chatbot - Indeed Job Search France

Pipeline RAG pour analyser les offres d'emploi Indeed en France.

## 1. Configuration et imports

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables (load_dotenv reads them from a .env file)
load_dotenv()

# Fail fast when the OpenAI key is missing or empty, instead of failing later
# when the OpenAI-backed objects are created
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY introuvable dans le fichier .env")

print("Environnement charge")

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Reaching this line means every import in this cell succeeded
print("Bibliotheques importees")

## 2. Scraping Indeed

In [None]:
def scrape_indeed(query=""):
    """Scrape the Indeed France search results page for a query.

    Args:
        query: Free-text search terms. The value is URL-encoded before being
            placed in the ``q`` parameter, so spaces and reserved characters
            such as ``&``, ``#`` or ``+`` (e.g. "R&D", "C#", "C++") are sent
            as part of the search instead of altering the URL structure.

    Returns:
        The documents returned by ``WebBaseLoader.load()`` for the search URL.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    from urllib.parse import quote_plus

    # str() keeps non-string queries working, as they did with plain f-string
    # interpolation; quote_plus escapes everything that is unsafe in a query.
    url = f"https://fr.indeed.com/emplois?q={quote_plus(str(query))}"
    print(f"Scraping: {url}")

    # WebBaseLoader with a custom User-Agent header
    loader = WebBaseLoader(
        web_paths=[url],
        header_template={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
    )

    docs = loader.load()
    print(f"Documents scrapes: {len(docs)}")

    return docs

In [None]:
# Scraping test - change the query to suit your needs
query = "data analyst"  # Try: 'python', 'alternance', 'IA', etc.
docs = scrape_indeed(query)

# Preview the first 500 characters of the first document, only when at least
# one document came back
if docs:
    print(f"\nPremiers 500 caracteres:\n{docs[0].page_content[:500]}")

## 3. Division des documents en chunks

In [None]:
# Text splitter configuration: chunk_size=1000 with chunk_overlap=200
# (presumably measured in characters — confirm against the splitter's docs)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

# Split the scraped documents into chunks and report how many were produced
splits = text_splitter.split_documents(docs)
print(f"Total de chunks: {len(splits)}")

## 4. Création de la base vectorielle ChromaDB

In [None]:
# Embeddings initialisation (OpenAI model "text-embedding-3-small")
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Create the vector store from the chunks, using ./chroma_db as its
# persist directory
# NOTE(review): because the store is persisted, re-running this cell presumably
# adds the same chunks to the existing on-disk data again — confirm, and clear
# ./chroma_db between runs if that is the case.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

print("Vector store cree")

## 5. Configuration de la chaîne RAG

In [None]:
# Custom prompt template; {context} and {question} are the two placeholders
# declared in input_variables below. The prompt text itself is runtime data
# and is intentionally left in French.
template = """Tu es un analyste offres d'emploi en France.

CONTEXTE SCRAPE (Indeed France):
{context}

QUESTION: {question}

Instructions:
1. Reponds precisement avec chiffres exacts
2. Cite les entreprises et lieux quand disponible
3. Maximum 3-4 phrases concises
4. Francais uniquement

Reponse:"""

PROMPT = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

# LLM initialisation (temperature=0)
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Retriever built on the vector store, configured with k=6
# (presumably the number of chunks fetched per question — confirm)
retriever = vectorstore.as_retriever(search_kwargs={"k": 6})

# RAG chain: "stuff" chain type, using the custom prompt above;
# return_source_documents=True is requested alongside the answer
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True
)

print("Chaine RAG prete")

## 6. Test des requêtes

In [None]:
def ask_chatbot(question):
    """Send a question to the RAG chain, print the exchange, and return the result.

    Args:
        question: The question text; it is passed to the chain under the
            "query" key.

    Returns:
        The object returned by ``qa_chain.invoke``. Its "result" entry is the
        answer that gets printed.
    """
    response = qa_chain.invoke({"query": question})
    print(f"Q: {question}")
    answer = response["result"]
    # The trailing newline leaves a blank line between successive exchanges.
    print(f"R: {answer}\n")
    return response

In [None]:
# Example queries - try different questions!
ask_chatbot("Combien d'offres data analyst ?")

In [None]:
# Example query: which offers are in Lille
ask_chatbot("Quelles offres a Lille ?")

In [None]:
# Example query: top 5 hiring companies
ask_chatbot("Top 5 entreprises qui recrutent ?")

In [None]:
# Example query: which salaries are offered
ask_chatbot("Quels sont les salaires proposes ?")