In [3]:
!pip install -q transformers==4.38.0 accelerate sentence-transformers==2.5.1



In [None]:
# Uninstall the problematic versions.
# NOTE(review): this cell has no execution count (In [None]), i.e. it was not
# run in the recorded session — confirm whether it is still needed.
!pip uninstall -y transformers tokenizers sentence-transformers langchain-huggingface


In [4]:
!pip install langchain langchain-community langchain-text-splitters chromadb faiss-cpu



In [5]:
# Install the pinned LangChain Hugging Face integration and pypdf; the import
# cell below takes HuggingFaceEmbeddings and PdfReader from these packages.
!pip install langchain-huggingface==1.1.0 pypdf

Collecting tokenizers<1.0.0,>=0.19.1 (from langchain-huggingface==1.1.0)
  Using cached tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Using cached tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstalled tokenizers-0.15.2
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 4.38.0 requires tokenizers<0.19,>=0.14, but you have tokenizers 0.22.1 which is incompatible.
Successfully installed tokenizers-0.22.1


In [6]:
!pip install -q transformers sentence-transformers accelerate

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-huggingface 1.1.0 requires tokenizers<1.0.0,>=0.19.1, but you have tokenizers 0.15.2 which is incompatible.

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_classic.chains.retrieval_qa.base import RetrievalQA
from pypdf import PdfReader
import torch
from pathlib import Path
import os

In [8]:
# Hugging Face model id loaded through AutoTokenizer / AutoModelForSeq2SeqLM
# to generate the answers.
MODEL_NAME = "google/flan-t5-large"
# Model id handed to HuggingFaceEmbeddings to embed the document chunks.
EMB_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
# chunk_size / chunk_overlap passed to RecursiveCharacterTextSplitter.
CHUNK_SIZE = 800
CHUNK_OVERLAP = 100
# Number of chunks the retriever is asked for per query (search_kwargs "k").
K_CHUNKS = 4
# Shared RetrievalQA chain: assigned by load_and_process_documents() and read
# by rag_query(); stays None until documents have been indexed.
qa_chain = None

In [9]:

# Choose the pipeline device index and its human-readable label in one step:
# GPU index 0 when CUDA is available, otherwise -1 with the "CPU" label.
DEVICE, DEVICE_LABEL = (0, "CUDA (GPU)") if torch.cuda.is_available() else (-1, "CPU")

In [10]:
def _extract_pdf_text(pdf_path):
    """Return the text of every page of one PDF, one newline after each page.

    Pages whose ``extract_text()`` result is empty or ``None`` are skipped.
    """
    reader = PdfReader(str(pdf_path))
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(f"{page_text}\n" for page_text in page_texts if page_text)


def load_and_process_documents(pdf_files):
    """Index the given PDFs and (re)build the global ``qa_chain``.

    Steps: extract the text of each PDF, split it into chunks, embed the chunks
    into a Chroma vector store, load the seq2seq model behind a
    ``text2text-generation`` pipeline and wire both into a ``RetrievalQA``
    chain stored in the module-level ``qa_chain``.

    Args:
        pdf_files: Iterable of PDF file paths (the Gradio File component is
            configured with ``type="filepath"``); may be ``None`` or empty.

    Returns:
        A status string for the UI: an error message when no file was given,
        when a PDF cannot be read, or when no text could be extracted;
        otherwise a summary of the model, document count and device.
        ``qa_chain`` is only replaced on success.
    """
    global qa_chain
    import uuid  # local import: only needed to name the Chroma collection

    if not pdf_files:
        return "Erreur : Aucun document detecte."

    documents_text = []
    for pdf_path in pdf_files:
        try:
            documents_text.append(_extract_pdf_text(pdf_path))
        except Exception as e:
            # Any unreadable file aborts the whole indexing run.
            return f"Erreur de lecture : {str(e)}"

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )
    chunks = [c for doc in documents_text for c in splitter.split_text(doc)]
    if not chunks:
        # PDFs without extractable text (e.g. no text layer) yield no chunks;
        # building a vector store from an empty list fails, so stop here with
        # an explicit message instead.
        return "Erreur : Aucun texte exploitable dans les documents fournis."

    embeddings = HuggingFaceEmbeddings(model_name=EMB_MODEL)
    # A fresh collection name per call keeps chunks from an earlier indexing
    # run out of the new index; without it every call targets Chroma's default
    # collection name.
    vectordb = Chroma.from_texts(
        texts=chunks,
        embedding=embeddings,
        collection_name=f"rag_{uuid.uuid4().hex}"
    )
    retriever = vectordb.as_retriever(search_kwargs={"k": K_CHUNKS})

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

    # No explicit model.to("cuda"): pipeline() places the model on `device`.
    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        device=DEVICE
    )

    llm = HuggingFacePipeline(pipeline=pipe)
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff"
    )

    return f"Systeme operationnel | Modele: {MODEL_NAME} | Documents: {len(pdf_files)} | Device: {DEVICE_LABEL}"

In [11]:
def rag_query(question):
    """Answer a question with the RetrievalQA chain built during indexing.

    Args:
        question: Text typed by the user; ``None``, empty and whitespace-only
            values are rejected.

    Returns:
        The chain's answer, or a status message when the chain has not been
        initialised, the question is blank, or the chain raises.
    """
    # Read-only access to the module-level chain; no `global` statement needed.
    if qa_chain is None:
        return "Systeme non initialise. Veuillez charger les documents."
    question = (question or "").strip()
    if not question:
        return "Champ de question vide."
    try:
        # Chain.run() is deprecated; invoke() takes the chain's input key
        # ("query" for RetrievalQA) and returns a dict holding "result".
        return qa_chain.invoke({"query": question})["result"]
    except Exception as e:
        return f"Erreur d'execution : {str(e)}"

In [12]:
import gradio as gr

# Static HTML banner shown at the top of the page (no interpolation involved).
header_style = """
<div style="text-align: center; font-family: sans-serif;">
    <h1 style="color: #2c3e50;">DOCUMENT ANALYTICS PLATFORM</h1>
    <hr style="border: 0.5px solid #bdc3c7; width: 80%;">
</div>
"""

with gr.Blocks(theme=gr.themes.Default(primary_hue="blue")) as demo:
    gr.HTML(header_style)

    with gr.Row():
        # Narrow column: upload the PDFs and trigger indexing.
        with gr.Column(scale=1):
            gr.Markdown("### Configuration")
            pdf_input = gr.File(label="Source Documents (PDF)", file_count="multiple", type="filepath")
            process_btn = gr.Button("INITIALIZE INDEXING", variant="primary")
            status_output = gr.Textbox(label="System Status", interactive=False)

        # Wide column: ask a question and read the generated answer.
        with gr.Column(scale=2):
            gr.Markdown("### Consultation")
            q_input = gr.Textbox(label="Query", placeholder="Ask a question about your documents...")
            ask_btn = gr.Button("GENERATE RESPONSE", variant="primary")
            ans_output = gr.Textbox(label="Analysis Result", lines=12, interactive=False)

    # Indexing button: uploaded file paths in, status message out.
    process_btn.click(fn=load_and_process_documents, inputs=[pdf_input], outputs=[status_output])

    # Query button: question text in, answer (or status message) out.
    ask_btn.click(fn=rag_query, inputs=[q_input], outputs=[ans_output])

if __name__ == "__main__":
    # share=True asks Gradio for a public share link.
    demo.launch(share=True)


  with gr.Blocks(theme=gr.themes.Default(primary_hue="blue")) as demo:


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://35e02163dfdb0162f6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
