# RAG Chatbot - Indeed Job Search France

Pipeline RAG pour analyser les offres d'emploi Indeed en France.

##  Configuration et imports

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from a .env file (optional when using Ollama)
load_dotenv()

# No API key is required with Ollama
print("Environnement charge - Ollama tourne en local (100% gratuit)")

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate  
from langchain_core.output_parsers import StrOutputParser  
from langchain_core.runnables import RunnablePassthrough  

# Reaching this line means every import in this cell resolved
print("Bibliotheques LangChain 2025 importees (Ollama - 100% gratuit)")

##  Scraping Indeed

In [None]:
def scrape_indeed(query=""):
    """Load the Indeed France search-results page for ``query``.

    The query is URL-encoded before being placed in the ``q`` parameter, so
    spaces and reserved characters (``&``, ``#``, ``+``, ``=`` ...) are sent
    as part of the search terms instead of altering the URL structure.

    Args:
        query: Free-text search terms. The default empty string leaves the
            ``q`` parameter empty.

    Returns:
        The list of documents produced by ``WebBaseLoader.load()``.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote_plus

    # str() preserves the previous tolerance for non-string queries.
    url = f"https://fr.indeed.com/emplois?q={quote_plus(str(query))}"
    print(f"Scraping: {url}")

    # WebBaseLoader with a custom browser-like User-Agent header
    loader = WebBaseLoader(
        web_paths=[url],
        header_template={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        },
    )

    docs = loader.load()
    print(f"Documents scrapes: {len(docs)}")

    return docs

In [None]:
# Scraping smoke test - change the search terms as needed
# (e.g. 'python', 'alternance', 'IA', ...)
query = "data analyst"
docs = scrape_indeed(query)

# Preview the beginning of the first scraped document, when there is one
if docs:
    preview = docs[0].page_content[:500]
    print(f"\nPremiers 500 caracteres:\n{preview}")

##  Division des documents en chunks

In [None]:
# Splitter settings: chunk size of 1000 with an overlap of 200
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the scraped documents and report how many chunks were produced
splits = text_splitter.split_documents(docs)
chunk_count = len(splits)
print(f"Total de chunks: {chunk_count}")

## Création de la base vectorielle ChromaDB

In [None]:
# Embeddings come from the local Ollama server (model "llama3.2")
embeddings = OllamaEmbeddings(model="llama3.2")

# Build the Chroma vector store from the chunks, persisted under ./chroma_db.
# NOTE(review): the same directory is reused on every run; re-running this
# cell presumably adds the chunks again - confirm, and clear ./chroma_db if
# duplicate results show up.
chroma_settings = {
    "documents": splits,
    "embedding": embeddings,
    "persist_directory": "./chroma_db",
}
vectorstore = Chroma.from_documents(**chroma_settings)

print("Vector store cree")

## Configuration de la chaîne RAG

In [None]:
# Local Ollama LLM; temperature=0 asks for the least random sampling
llm = Ollama(model="llama3.2", temperature=0)

# Retriever over the vector store, configured with k=6 results per query
retrieval_options = {"k": 6}
retriever = vectorstore.as_retriever(search_kwargs=retrieval_options)

# Custom prompt: the retrieved context and the user question are injected
# through the {context} and {question} placeholders.
template = """Tu es un analyste offres d'emploi en France.

CONTEXTE SCRAPE (Indeed France):
{context}

QUESTION: {question}

Instructions:
1. Reponds precisement avec chiffres exacts
2. Cite les entreprises et lieux quand disponible
3. Maximum 3-4 phrases concises
4. Francais uniquement

Reponse:"""

prompt = ChatPromptTemplate.from_template(template)

# Helper for the LCEL RAG chain
def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

# LCEL pipeline: the incoming question is passed through unchanged and also
# sent to the retriever, whose results are flattened by format_docs; both
# values fill the prompt, which is sent to the LLM and parsed to a string.
context_step = retriever | format_docs
chain_inputs = {"context": context_step, "question": RunnablePassthrough()}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

print("Chaine RAG prete (Ollama llama3.2)")

## Test des requêtes

In [None]:
def ask_chatbot(question):
    """Send ``question`` through the RAG chain, print the exchange, and return the answer."""
    answer = rag_chain.invoke(question)
    # One print call, two lines: the question, then the answer plus a blank line
    print(f"Q: {question}", f"R: {answer}\n", sep="\n")
    return answer

In [None]:
# Sample queries - try different questions!
ask_chatbot("Combien d'offres data analyst ?")

In [None]:
# Sample query: offers located in Lille
ask_chatbot("Quelles offres a Lille ?")

In [None]:
# Sample query: top 5 hiring companies
ask_chatbot("Top 5 entreprises qui recrutent ?")

In [None]:
# Sample query: salaries mentioned in the offers
ask_chatbot("Quels sont les salaires proposes ?")