In [3]:
!pip install pymupdf
!pip install faiss-cpu
!pip install sentence_transformers
!pip install Pillow
!pip install accelerate
!pip install -U mistralai

Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m111.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.5
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Col

# Nouvelle section

In [4]:
import os
import fitz  # PyMuPDF pour l'extraction de texte et d'images
import numpy as np
import faiss
import matplotlib.pyplot as plt
import pandas as pd
from nltk.tokenize import sent_tokenize
# Import pour le NLP
from sentence_transformers import SentenceTransformer, CrossEncoder,util
from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
import re
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# PIXTRAL
import base64
from io import BytesIO
from mistralai import Mistral

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Chargement du modèle et du tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

# Important : définir le pad_token
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.97s/it]


In [None]:
def extract_text_from_pdf(pdf_path):
    """
    Extrait le texte brut d’un fichier PDF.

    Args:
        pdf_path (str): Chemin vers le fichier PDF.

    Returns:
        str: Texte complet extrait du PDF, concaténé page par page avec des sauts de ligne.
    """
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text() + "\n"
    return text


def chunk_text(text):
    """
    Découpe le texte en morceaux (chunks) de taille fixe avec recouvrement (overlap).

    Args:
        text (str): Texte complet à découper.

    Returns:
        list: Liste de chaînes de caractères, chaque chunk contenant environ 300 mots
              avec 30 mots de chevauchement pour préserver le contexte.
    """
    chunk_size, overlap = 300, 30
    words = text.split()
    chunks = []
    step = chunk_size - overlap
    for i in range(0, len(words), step):
        chunk = words[i:i+chunk_size]
        chunks.append(" ".join(chunk))
    return chunks


def encode_passages(passages, model):
    """
    Encode chaque passage textuel en vecteur d’embedding avec un modèle de type SentenceTransformer.

    Args:
        passages (list): Liste de chaînes de caractères à encoder.
        model (SentenceTransformer): Modèle d'embedding, par exemple E5 ou MiniLM.

    Returns:
        np.ndarray: Matrice d’embeddings de type float32 (shape: [n_passages, embedding_dim]).
    """
    embeddings = model.encode(passages, show_progress_bar=True)
    return np.array(embeddings, dtype=np.float32)


def build_faiss_index(embeddings):
    """
    Construit un index FAISS à partir des embeddings (utilise la distance L2).

    Args:
        embeddings (np.ndarray): Matrice des embeddings à indexer (shape: [n, d]).

    Returns:
        faiss.IndexFlatL2: Index FAISS entraîné et prêt pour les requêtes.
    """
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index


def retrieve_passages(query, passages, embedding_model, faiss_index, k=10):
    """
    Recherche les k passages les plus pertinents pour une requête donnée à l'aide de FAISS.

    Args:
        query (str): Question ou phrase à rechercher.
        passages (list): Liste des passages (chunks) d’origine.
        embedding_model (SentenceTransformer): Modèle d’embedding utilisé pour encoder la requête.
        faiss_index (faiss.IndexFlatL2): Index FAISS contenant les embeddings des passages.
        k (int): Nombre de résultats à retourner.

    Returns:
        list: Liste des k passages les plus proches (par similarité).
    """
    query_embedding = embedding_model.encode([query])
    distances, indices = faiss_index.search(np.array(query_embedding, dtype=np.float32), k)
    retrieved = [passages[idx] for idx in indices[0] if idx < len(passages)]
    return retrieved


def rerank_passages(query, passages, reranker, threshold=0.4):
    """
    Recalcule la pertinence des passages récupérés avec un modèle Cross-Encoder type MiniLM,
    puis filtre et trie les passages les plus pertinents.

    Args:
        query (str): Question initiale de l’utilisateur.
        passages (list): Passages récupérés initialement via FAISS.
        reranker (CrossEncoder): Modèle Cross-Encoder (type ms-marco) pour scoring binaire.
        threshold (float): Seuil minimal pour conserver un passage.

    Returns:
        list of tuples: Liste de (passage, score) triés par score décroissant.
    """
    pairs = [(query, passage) for passage in passages]
    scores = reranker.predict(pairs)
    filtered = [(passage, score) for passage, score in zip(passages, scores) if score > threshold]
    ranked = sorted(filtered, key=lambda x: x[1], reverse=True)
    return ranked

def generate_answer(query, ranked_passages, max_new_tokens=400):
    """
    Génère une réponse fiable à partir des passages, avec fallback si aucun passage pertinent.

    Args:
        query (str): Question posée
        ranked_passages (list): Liste des passages (textes)
        max_length (int): Longueur max de la réponse

    Returns:
        str: Réponse
    """
    # 1. Créer le contexte formaté
    context = "\n".join([f"[Source {i+1}] {text}" for i, text in enumerate(ranked_passages)]) if ranked_passages else "Aucune source disponible"

    # 2. Prompt robuste avec instructions claires
    prompt = f"""<s>[INST]
    You are an expert in document analysis. Answer the question {query} using ONLY the sources provided.
    Structure your answer in a maximum of 3 sentences.

    Strict rules:
    1. If the sources do NOT contain the answer, say exactly: "Je n'ai pas trouvé d'information pertinente dans les documents."
    2. Be concise

    Question: {query}
    Sources: {context}
    Réponse : [/INST]"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.4,
        top_p=0.85,
        top_k=30,
        pad_token_id=tokenizer.pad_token_id
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return generated_text.split("[/INST]")[-1].strip()

# def generate_answer(query, ranked_passages, max_length=500):
#     """
#     Génère une réponse à une question à partir des passages fournis.
#     Accepte des passages avec ou sans scores.

#     Args:
#         query (str): La question posée.
#         ranked_passages (list): Liste de strings OU de tuples (passage, score).
#         max_length (int): Longueur maximale de la réponse (en tokens).

#     Returns:
#         str: Réponse générée par le modèle.
#     """

#     # Vérifie si chaque passage est un tuple (passage, score) ou juste une string
#     context = "\n".join([
#         f"[Source {i+1}] {p if isinstance(p, str) else p[0]}"
#         for i, p in enumerate(ranked_passages)
#     ])

#     prompt = f"""<s>[INST]
# You are an expert in sustainability and corporate responsibility.
# Answer the following question ONLY based on the sources provided.
# Structure the response as a single paragraph of less than 400 words with key points.
# If the sources do not contain enough information, reply: "The information provided does not allow to answer the question."

# Question: {query}

# Sources: {context}
# [/INST]"""

#     response = generator(
#         prompt,
#         max_new_tokens=max_length,
#         do_sample=True,
#         temperature=0.3,
#         top_k=30,
#         top_p=0.85,
#         num_return_sequences=1
#     )

#     generated_text = response[0]['generated_text']
#     return generated_text.split("[/INST]")[-1].strip()



def find_top_chunks_for_phrase(phrase, chunks, top_k, threshold):
    """
    Trouve les top_k chunks les plus similaires à une phrase pertinente.

    Args:
        phrase (str): La phrase de référence.
        chunks (List[str]): Tous les chunks du document.
        top_k (int): Nombre de passages à retourner.
        threshold (float): Score de similarité minimale.

    Returns:
        List[Tuple[int, float, str]]: (chunk_id, score, texte) triés par pertinence.
    """
    # Encodage
    phrase_embedding = embedding_model.encode(f"query: {phrase}", convert_to_tensor=True)
    chunk_texts = [f"passage: {chunk}" for chunk in chunks]
    chunk_embeddings = embedding_model.encode(chunk_texts, convert_to_tensor=True)

    # Similarité cosinus
    similarities = util.cos_sim(phrase_embedding, chunk_embeddings)[0]

    # Tri décroissant
    top_indices = torch.argsort(similarities, descending=True)

    # Récupération des top chunks au-dessus du seuil
    top_matches = []
    for idx in top_indices[:top_k * 2]:  # on élargit au cas où certains soient sous le seuil
        score = similarities[idx].item()
        if score >= threshold:
            top_matches.append((idx.item(), score, chunks[idx.item()]))
        if len(top_matches) == top_k:
            break

    return top_matches

def calculate_mrr(questions_data, retrieved_passages_by_question, threshold=0.85):
    """
    Calcule le MRR en comparant uniquement le premier passage du ground truth
    avec les chunks retrouvés (et non tous les chunks du corpus !).

    Args:
        questions_data (list): liste de questions + ground truth
            [
                {
                    "question": "...",
                    "relevant_chunks": ["..."]  # on prend le premier
                },
                ...
            ]
        retrieved_passages_by_question (dict): {question: [retrieved_chunks (texte)]}
        threshold (float): seuil de similarité cosinus pour considérer un match

    Returns:
        float: MRR global
    """
    reciprocal_ranks = []

    for q_data in questions_data:
        question = q_data["question"]
        ground_truth_chunk = q_data["relevant_chunks"][0]  # on compare au 1er chunk GT

        # Embedding du ground truth
        gt_embedding = model.encode(f"passage: {ground_truth_chunk}", convert_to_tensor=True)

        # Passages récupérés uniquement pour cette question
        retrieved_passages = retrieved_passages_by_question.get(question, [])

        found = False

        for rank, passage in enumerate(retrieved_passages, start=1):
            passage_embedding = model.encode(f"passage: {passage}", convert_to_tensor=True)
            sim = util.cos_sim(gt_embedding, passage_embedding)[0][0].item()

            if sim >= threshold:
                reciprocal_ranks.append(1 / rank)
                found = True
                break  # stop at first match

        if not found:
            reciprocal_ranks.append(0.0)

    return sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0.0

In [None]:
pdf_file = os.listdir("data")

# Nouvelle section

In [None]:
pdf_file

['20240903-bnpparibas-csr-presentation-2024.pdf',
 'bnp_paribas_2023_climate_report.pdf',
 'What-contribution-do-banks-make-to-the-ecological-transition.pdf',
 'climate-change-and-the-banking-industry.pdf',
 'as_101_climate_risk_banks_en.pdf',
 'ssm.202011finalguideonclimate-relatedandenvironmentalrisks~58213f6564.fr.pdf',
 'eib-group-2022-climate-bank-roadmap-progress-report.pdf']

In [None]:
full_text=""
for doc in pdf_file:
    pdf_path="data/"+doc
    full_text+=" "+ extract_text_from_pdf(pdf_path)

In [None]:
# taille avant ajout des images
len(full_text)

752720

In [None]:
full_caption

0      a black and white image of a black and white i...
1      a graph shows the percentage of the number of ...
2                 a green background with a white border
3         a black background with a white and red flower
4                             rest the world deved ratio
                             ...                        
330       a black background with a white and red flower
331                         the logo for the color wheel
332    a black and white image of a black and white i...
333    the sun shines over the mountains in meteor me...
334    a diagram showing the benefits of a green ener...
Name: caption, Length: 335, dtype: object

importation des description images

In [None]:
data=pd.read_csv("caption_image.csv")

In [None]:
full_caption=data["caption"]
len(full_caption.to_list())

335

### Initialisation du model

In [None]:
#  Extraction du texte et découpage en passages
passages = chunk_text(full_text)
# fusion avec la listes des captions
passages = passages + full_caption.to_list()
print(f"Nombre de passages extraits: {len(passages)}")

#  Chargement du embeddings e5
embedding_model = SentenceTransformer("intfloat/multilingual-e5-large")
passage_embeddings = encode_passages(passages, embedding_model)

faiss_index = build_faiss_index(passage_embeddings)

Nombre de passages extraits: 743


Batches: 100%|██████████| 24/24 [01:59<00:00,  4.97s/it]


In [None]:
if __name__ == "__main__":

    # Définir une requête utilisateur
    # query = "What is BNP Paribas' 2050 climate objective?"
    query = "Which graph shows the distribution of the number of customers by region?"
    #  Recherche initiale via FAISS
    # retrieved_passages = retrieve_passages(query, passages, embedding_model, faiss_index, k=5)
    retrieved_passages = find_top_chunks_for_phrase(query, passages, top_k=5, threshold=0.5)
    print("\nPassages récupérés (avant reranking) :")
    for p in retrieved_passages:
        print(" -", p[:100], "...")

    reponse = generate_answer(query, retrieved_passages)

    print("Question:", query)
    print("Réponse:", reponse)

    print("\n Ta rag RAG exécuté avec succès.")


Passages récupérés (avant reranking) :
 - (409, 0.7830719351768494, 'a graph shows the percentage of the number of people who are using the internet') ...
 - (448, 0.7825443744659424, 'a graph shows the number of people who are using the internet') ...
 - (437, 0.7818568348884583, 'a graph showing the number of different types of the global economy') ...
 - (705, 0.7785378098487854, 'a graph showing the number of different types of the internet') ...
 - (656, 0.7779329419136047, 'a graph with the number of the number of the number of the number of the number of the number of') ...
Question: Which graph shows the distribution of the number of customers by region?
Réponse: Je n'ai pas trouvé d'information pertinente dans les documents sur un graphique montrant la distribution du nombre de clients par région. Les sources fournies ne contiennent qu'des informations sur le nombre de personnes qui utilisent l'internet ou des types différents d'économie et d'internet.

 Ta rag RAG exécuté av

''

In [None]:
#  Le vrai ground truth (phrases pertinentes extraites manuellement)
questions_data = [
    {
        "question": "Quel est l’objectif de BNP Paribas pour les prêts durables d’ici 2025 ?",
        "relevant_phrases": [
            "BNP Paribas vise un montant de prêts durables de 150 milliards d’euros d’ici 2025 (contre 117 milliards en 2023).",
            "Le groupe a réduit de 70% ses financements aux énergies fossiles depuis 2020.",
            "BNP Paribas est classé #1 mondial en finance durable en 2023."
        ]
    }
]

retrieved_ids = [passages.index(p) for p in retrieved_passages]

# 3. Structure pour le calcul MRR
retrieved_chunks_by_question = {
    "Quel est l’objectif de BNP Paribas pour les prêts durables d’ici 2025 ?": retrieved_ids
}

# Calcul du MRR
mrr = calculate_mrr(questions_data,all_chunks=passages, retrieved_chunks_by_question=retrieved_chunks_by_question, threshold=0.85
)

print(f" MRR: {mrr:.3f}")


### Great Truth pour dautre test mrr

##### On remplace query par la question et cesT OK

In [None]:
questions_data=[ {
        "question": "Quels sont les engagements climatiques de BNP Paribas pour 2050 ?",
        "relevant_phrases": [
            "BNP Paribas s’est engagée à atteindre la neutralité carbone d’ici 2050.",
            "La banque aligne ses portefeuilles sur les scénarios de l’AIE."
        ]
    }
               ]

questions_data=[  {
        "question": "How does the EIB support adaptation to climate change in the European Union and beyond?",
        "relevant_phrases": [
            "In 2022 the EIB lent €1.8 billion for climate change adaptation, of which nearly 80% was in the European Union.",
            "Project examples from 2022 include: EIB support to investments in the water and wastewater infrastructure of the city of Warsaw [...] and support to Andalusia’s rural development programme to improve water catchment, prevent soil erosion.",
       "Beyond the European Union, EIB Global financed the Aqaba-Amman Water Desalination and Conveyance Project, the largest ever investment project for adapting the water sector to the impacts of climate change in Jordan."
        ]
    }
               ]


### Extraction image

In [None]:

def extract_images_from_pdf(pdf_path, output_folder="images_extraits"):
    """
    Extrait toutes les images d'un fichier PDF et les enregistre dans un dossier "images_extraits".

    Args:
        pdf_path (str): Chemin vers le fichier PDF.
        output_folder (str): Dossier de sortie pour enregistrer les images.

    Returns:
        List[str]: Liste des chemins des images extraites.
    """
    doc = fitz.open(pdf_path)

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    image_paths = []
    image_count = 0

    for page_number in range(len(doc)):
        page = doc[page_number]
        images = page.get_images(full=True)

        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
            image_filename = f"{output_folder}/{pdf_name}_page{page_number + 1}_img{img_index + 1}.{image_ext}"
            with open(image_filename, "wb") as f:
                f.write(image_bytes)

            image_paths.append(image_filename)
            image_count += 1

    print(f"{image_count} image(s) extraite(s) dans le dossier '{output_folder}'.")
    return image_paths


### image pertinente Pixtral

In [20]:
from google.colab import userdata
userdata.get('statapp')

'zJgIDXUsPGOh8aTeHemd0MgKabT07rtr'

In [21]:
from google.colab import userdata
API_KEY = userdata.get('statapp')  # Replace with secret_name
client = Mistral(api_key=API_KEY)

In [22]:
# Helper function to encode an image to base64
def encode_image(image_obj):
    if isinstance(image_obj, Image.Image):  # Check if it's already a PIL Image
        img = image_obj
    else:  # Otherwise, try opening it as a path
        img = Image.open(image_obj)
    buffered = BytesIO()
    img.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

# Function to perform inference for image description
def describe_image(image_path):
    # Load and encode the image
    image_base64 = encode_image(image_path)
    # Prompt for the Pixtral model
    prompt = "Please provide a detailed description of the given image."
    # Prepare input for the Pixtral API
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
            ]
        }
    ]
    # Perform inference
    response = client.chat.complete(
        model="pixtral-large-latest",
        messages=messages,
        max_tokens=300
    )
    # Return the model's output
    return response.choices[0].message.content

In [25]:


# Usage
if __name__ == "__main__":
    # Provide the path to your image file
    image_path = "/content/as_101_climate_risk_banks_en_page15_img2.png" # Replace with the path to your image
    # Get the description of the image
    description = describe_image(image_path)

    # Print the result
    print("Image Description:")
    print(description)

Image Description:
The image is a line graph depicting CO₂ emissions in gigatonnes (GtCO₂e) from 1970 to 2017 for various countries and regions, as well as international transport. Here is a detailed summary of the graph:

### Key Elements:
- **Y-Axis (Vertical):** Represents CO₂ emissions in gigatonnes (GtCO₂e), ranging from 0 to 15.
- **X-Axis (Horizontal):** Represents years from 1970 to 2017.
- **Legend:** Identifies the countries and regions by color and symbol:
  - China: Red line
  - USA: Blue line
  - EU-28: Teal line
  - India: Orange line
  - Russia: Gray line
  - Japan: Pink line
  - International Transport: Black line

### Observations:
1. **China (Red Line):**
   - Shows a significant increase in CO₂ emissions starting around 2000.
   - By 2017, China's emissions are the highest, reaching over 12 GtCO₂e.

2. **USA (Blue Line):**
   - Emissions have been relatively stable with slight fluctuations.
   - Emissions peak around 2005 and then show


In [1]:
image_path=os.listdir("/content/image/")

NameError: name 'os' is not defined

In [32]:
import os
import csv
import time
from PIL import Image
from typing import List
from mistralai import Mistral

class ImageDescriber:
    def __init__(self, api_key: str, input_dir: str = "/content/image/", output_csv: str = "descriptions.csv"):
        self.client = Mistral(api_key=api_key)
        self.input_dir = input_dir
        self.output_csv = output_csv
        self.processed_images = self._load_processed()

        # Créer le CSV avec entêtes si nécessaire
        if not os.path.exists(self.output_csv):
            with open(self.output_csv, "w", newline="", encoding="utf-8") as f:
                writer = csv.DictWriter(f, fieldnames=["image_path", "description"])
                writer.writeheader()

    def _load_processed(self) -> List[str]:
        """Charge les images déjà traitées depuis le CSV"""
        if not os.path.exists(self.output_csv):
            return []

        with open(self.output_csv, "r", encoding="utf-8") as f:
            return [row["image_path"] for row in csv.DictReader(f)]

    def _encode_image(self, image_path: str) -> str:
        """Encode l'image en base64 avec vérification du format"""
        try:
            with Image.open(image_path) as img:
                if img.format.lower() not in ["png", "jpeg", "jpg"]:
                    raise ValueError(f"Format non supporté: {img.format}")

                buffered = BytesIO()
                img.save(buffered, format=img.format)
                return base64.b64encode(buffered.getvalue()).decode("utf-8")
        except Exception as e:
            print(f"Erreur encodage {image_path}: {str(e)}")
            raise

    def _get_description_with_retry(self, image_base64: str, max_retries: int = 5) -> str:
        """Tentative de récupération de description avec réessais"""
        for attempt in range(max_retries):
            try:
                response = self.client.chat.complete(
                    model="pixtral-large-latest",
                    messages=[{
                        "role": "user",
                        "content": [
                            {"type": "text", "text": "Décris cette image en détail en français."},
                            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}
                        ]
                    }],
                    max_tokens=300
                )

                # Vérification contenu aberrant
                description = response.choices[0].message.content
                if "<html>" in description.lower():
                    raise ValueError("Réponse HTML détectée")

                return description

            except Exception as e:
                if attempt == max_retries - 1:
                    raise
                print(f"Erreur API (tentative {attempt+1}/{max_retries}): {str(e)}")
                time.sleep(2 ** attempt)  # Backoff exponentiel

        return "Échec après plusieurs tentatives"

    def process_images(self):
        """Traite toutes les images non traitées du dossier"""
        for filename in os.listdir(self.input_dir):
            image_path = os.path.join(self.input_dir, filename)

            if image_path in self.processed_images:
                continue

            if filename.lower().split(".")[-1] not in ["png", "jpg", "jpeg"]:
                continue

            try:
                print(f"Traitement de {filename}...")
                image_base64 = self._encode_image(image_path)
                description = self._get_description_with_retry(image_base64)

                # Enregistrement CSV
                with open(self.output_csv, "a", newline="", encoding="utf-8") as f:
                    writer = csv.DictWriter(f, fieldnames=["image_path", "description"])
                    writer.writerow({
                        "image_path": image_path,
                        "description": description
                    })

                print(f"Succès: {filename}")

            except Exception as e:
                print(f"Échec traitement {filename}: {str(e)}")
                # Enregistrement erreur
                with open(self.output_csv, "a", newline="", encoding="utf-8") as f:
                    writer = csv.DictWriter(f, fieldnames=["image_path", "description"])
                    writer.writerow({
                        "image_path": image_path,
                        "description": f"ERREUR: {str(e)}"
                    })



In [33]:
# Utilisation
if __name__ == "__main__":
    API_KEY = "zJgIDXUsPGOh8aTeHemd0MgKabT07rtr"  # À remplacer
    describer = ImageDescriber(api_key=API_KEY)
    describer.process_images()

Traitement de 20240903-bnpparibas-csr-presentation-2024_page20_img3.png...
Succès: 20240903-bnpparibas-csr-presentation-2024_page20_img3.png
Traitement de 20240903-bnpparibas-csr-presentation-2024_page20_img5.png...
Succès: 20240903-bnpparibas-csr-presentation-2024_page20_img5.png
Traitement de 20240903-bnpparibas-csr-presentation-2024_page2_img3.png...
Erreur API (tentative 1/5): API error occurred: Status 429
{"object":"error","message":"Service tier capacity exceeded for this model.","type":"invalid_request_error","param":null,"code":null}
Erreur API (tentative 2/5): API error occurred: Status 429
{"object":"error","message":"Service tier capacity exceeded for this model.","type":"invalid_request_error","param":null,"code":null}
Succès: 20240903-bnpparibas-csr-presentation-2024_page2_img3.png
Traitement de 20240903-bnpparibas-csr-presentation-2024_page10_img20.png...
Succès: 20240903-bnpparibas-csr-presentation-2024_page10_img20.png
Traitement de 20240903-bnpparibas-csr-presentation

**texte en gras** Transcription img BLIP

In [None]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
def generate_image_caption(image_path):
    """
    Génère une description textuelle (caption) pour une image donnée en utilisant un modèle de type BLIP.
    Args:
        image_path (str): Chemin vers l'image à analyser (doit être un fichier image lisible par PIL).
    Returns:
        str: Légende générée automatiquement décrivant le contenu visuel de l'image.
    """
    raw_image = Image.open(image_path).convert("RGB")
    inputs = processor(raw_image, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=200)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption

#### stockage dans un fichier csv

In [None]:
def caption_dataset(img_paths, output_csv, img_folder="images"):
    """
    Génère un fichier CSV contenant les descriptions (captions) de toutes les images extraites.

    Args:
        img_paths (list): Liste des chemins relatifs des images (fichiers présents dans le dossier `img_folder`).
        output_csv (str): Chemin du fichier CSV à créer pour stocker les résultats.
        img_folder (str): Nom du dossier contenant les images (par défaut "images").

    Returns:
        pd.DataFrame: Un DataFrame contenant deux colonnes :
                      - 'image_filename': le nom du fichier image
                      - 'caption': la description générée automatiquement
    """
    data = []

    for path in img_paths:
        caption = generate_image_caption("images/"+path)
        data.append({
            "image_filename": os.path.basename(path),
            "caption": caption,
        })
        print(f" {os.path.basename(path)} → {img_paths.index(path)} → {caption}")

    df = pd.DataFrame(data)
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"\n CSV généré avec {len(df)} lignes :::: {output_csv}")
    return df

In [None]:
img_paths=os.listdir("images/")

In [None]:
caption_dataset(img_paths,"caption_image.csv")

 20240903-bnpparibas-csr-presentation-2024_page5_img1.png → 0 → a black and white image of a black and white image of a black and white image of a black and
