In [1]:
!pip install docling


Collecting docling
  Downloading docling-2.32.0-py3-none-any.whl.metadata (10 kB)
Collecting beautifulsoup4<5.0.0,>=4.12.3 (from docling)
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting docling-ibm-models<4.0.0,>=3.4.0 (from docling)
  Downloading docling_ibm_models-3.4.3-py3-none-any.whl.metadata (7.6 kB)
Collecting docling-parse<5.0.0,>=4.0.0 (from docling)
  Downloading docling_parse-4.0.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (9.6 kB)
Collecting easyocr<2.0,>=1.7 (from docling)
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting lxml<6.0.0,>=4.0.0 (from docling)
  Downloading lxml-5.4.0-cp311-cp311-macosx_10_9_universal2.whl.metadata (3.5 kB)
Collecting marko<3.0.0,>=2.1.2 (from docling)
  Downloading marko-2.1.3-py3-none-any.whl.metadata (4.5 kB)
Collecting openpyxl<4.0.0,>=3.1.5 (from docling)
  Downloadin

In [None]:
import json
from pathlib import Path
import re
from google.colab import drive

# Monter Google Drive
drive.mount('/content/drive')


# Fonction pour extraire les données structurées depuis le JSON brut
def extract_invoice_data(raw_json):
    """
    Extrait les données clés d'une facture depuis un JSON non structuré.
    Retourne un dictionnaire avec les champs : invoice_number, date, total_amount, client_name, line_items.
    """
    extracted_data = {
        "invoice_number": None,
        "date": None,
        "total_amount": None,
        "client_name": None,
        "line_items": []
    }

    # 1. Analyse du texte libre (ex: en-tête de facture)
    if "content" in raw_json:
        for item in raw_json["content"]:
            if item.get("type") == "text":
                text = item["value"].strip().lower()

                # Détection du numéro de facture (ex: "Facture N°: INV-123")
                if "facture n°" in text or "invoice no" in text:
                    extracted_data["invoice_number"] = re.search(r"(facture n°|invoice no)[:\s]*([A-Z0-9-]+)", text, re.IGNORECASE).group(2).strip()

                # Détection de la date (ex: "Date: 01/01/2023")
                if "date" in text and not extracted_data["date"]:
                    date_match = re.search(r"(date|le)[:\s]*([0-9]{2}/[0-9]{2}/[0-9]{4})", text, re.IGNORECASE)
                    if date_match:
                        extracted_data["date"] = date_match.group(2)

                # Détection du client (ex: "Client: John Doe")
                if "client" in text and not extracted_data["client_name"]:
                    client_match = re.search(r"(client|customer)[:\s]*(.+)", text, re.IGNORECASE)
                    if client_match:
                        extracted_data["client_name"] = client_match.group(2).strip()

    # 2. Analyse des tables (ex: lignes de facture)
    if "tables" in raw_json:
        for table in raw_json["tables"]:
            for row in table.get("rows", []):
                # Détection du montant total (ex: "Total HT: 100.00 €")
                if "total" in row[0].lower():
                    extracted_data["total_amount"] = float(re.sub(r"[^\d.]", "", row[-1]))  # Supprime les symboles non numériques

                # Détection des articles (ex: ["1", "Produit A", "50.00"])
                if len(row) >= 3 and row[0].isdigit():
                    extracted_data["line_items"].append({
                        "quantity": int(row[0]),
                        "description": row[1],
                        "unit_price": float(re.sub(r"[^\d.]", "", row[2]))
                    })

    return extracted_data

# ---------------------------------------------------------------
# Exemple d'utilisation avec votre pipeline existant
# ---------------------------------------------------------------
from docling.document_converter import DocumentConverter

# Étape 1: Conversion du PDF en JSON via docling
source_pdf = "/content/drive/MyDrive/nsia.pdf"
converter = DocumentConverter()
result = converter.convert(source_pdf)
raw_json = result.document.export_to_dict()  # JSON brut

# Étape 2: Extraction des données ciblées
cleaned_data = extract_invoice_data(raw_json)

# Étape 3: Sauvegarde du résultat
output_path = Path("invoice_data.json")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

print(f"✅ Données extraites sauvegardées dans : {output_path}")
print("Résumé :")
print(f"- Numéro de facture : {cleaned_data.get('invoice_number')}")
print(f"- Date : {cleaned_data.get('date')}")
print(f"- Montant total : {cleaned_data.get('total_amount')}")
print(f"- Nombre d'articles : {len(cleaned_data.get('line_items', []))}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ Données extraites sauvegardées dans : invoice_data.json
Résumé :
- Numéro de facture : None
- Date : None
- Montant total : None
- Nombre d'articles : 0


In [2]:
!pip install tabula


Collecting tabula
  Downloading tabula-1.0.5.tar.gz (9.5 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: tabula
  Building wheel for tabula (pyproject.toml) ... [?25ldone
[?25h  Created wheel for tabula: filename=tabula-1.0.5-py3-none-any.whl size=10676 sha256=08730e38b0045e70269691c0d4fbb74a4fd2c51510d678cd13adf9c1303c29fe
  Stored in directory: /Users/cesarlarragueta/Library/Caches/pip/wheels/db/bb/71/f5d253c5eb10c8820dfd0590cd228e312b0768adc537466b45
Successfully built tabula
Installing collected packages: tabula
Successfully installed tabula-1.0.5

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
!apt-get install openjdk-11-jdk -y
!pip install tabula-py


/bin/bash: apt-get: command not found
Collecting tabula-py
  Downloading tabula_py-2.10.0-py3-none-any.whl.metadata (7.6 kB)
Collecting distro (from tabula-py)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Downloading tabula_py-2.10.0-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m346.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading distro-1.9.0-py3-none-any.whl (20 kB)
Installing collected packages: distro, tabula-py
Successfully installed distro-1.9.0 tabula-py-2.10.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"


In [6]:
import tabula

pdf_path= "nsia.pdf"


# Cette 1 ere methode est utilisable lorsqu'on a une seul tableau dans une page

dfs =tabula.read_pdf(pdf_path, pages='1')

dfs[0].to_csv("first_table.csv")

Failed to import jpype dependencies. Fallback to subprocess.
No module named 'jpype'
Got stderr: mai 19, 2025 3:09:23 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider loadDiskCache
AVERTISSEMENT: New fonts found, font cache will be re-built
mai 19, 2025 3:09:23 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider <init>
AVERTISSEMENT: Building on-disk font cache, this may take a while
mai 19, 2025 3:09:24 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider <init>
AVERTISSEMENT: Finished building on-disk font cache, found 768 fonts
mai 19, 2025 3:09:24 PM org.apache.pdfbox.pdmodel.PDDocument importPage
AVERTISSEMENT: inherited resources of source document are not imported to destination page
mai 19, 2025 3:09:24 PM org.apache.pdfbox.pdmodel.PDDocument importPage
AVERTISSEMENT: call importedPage.setResources(page.getResources()) to do this



In [8]:
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert("nsia.pdf")
for table in result.document.tables:
    df = table.export_to_dataframe()
    print(print(df.to_json(orient="records", indent=2)))

Downloading detection model, please wait. This may take several minutes depending upon your network connection.
Downloading recognition model, please wait. This may take several minutes depending upon your network connection.
Error while downloading from https://cdn-lfs-us-1.hf.co/repos/84/16/8416a7eb6bc0964a8abb5bb890afca2b8384fdc1e010a788e6c411a97c4d2305/66d8912f290375d3466f91be2048030a16317e84c8f1f69d3dbd7adc6d6cd2a9?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27tableformer_accurate.safetensors%3B+filename%3D%22tableformer_accurate.safetensors%22%3B&Expires=1747666383&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NzY2NjM4M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzg0LzE2Lzg0MTZhN2ViNmJjMDk2NGE4YWJiNWJiODkwYWZjYTJiODM4NGZkYzFlMDEwYTc4OGU2YzQxMWE5N2M0ZDIzMDUvNjZkODkxMmYyOTAzNzVkMzQ2NmY5MWJlMjA0ODAzMGExNjMxN2U4NGM4ZjFmNjlkM2RiZDdhZGM2ZDZjZDJhOT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=GqhW

KeyboardInterrupt: 

In [None]:
from docling.document_converter import DocumentConverter
import json

# Initialisation du convertisseur
converter = DocumentConverter()
result = converter.convert("nsia.pdf")

# Extraction du texte libre
text = result.document.text  # dépend de l'API de docling – à adapter si nécessaire

# Extraction des tableaux
tables = []
for table in result.document.tables:
    df = table.export_to_dataframe()
    tables.append(df.to_dict(orient="records"))

# Création d'un dictionnaire pour le JSON
output = {
    "text": text,
    "tables": tables
}

# Affichage du résultat au format JSON
print(json.dumps(output, indent=2, ensure_ascii=False))


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/nsia.pdf'

In [None]:
import pandas as pd
results_body = result.document.dict()
# dict_keys(['schema_name', 'version', 'name', 'origin', 'furniture', 'body', 'groups', 'texts', 'pictures', 'tabl
dict_list = []
texts = results_body ["texts"]
for t in texts:
    ref = t["self_ref"]
    text_content = t["text"]
    page = t["prov"][0][ 'page_no']
    dict_list. append ({"text_reference": ref,"page": page, "text_content(first 500 chars)":text_content [:500]})
df = pd.DataFrame(dict_list)
display(df)

In [None]:
from docling.document_converter import DocumentConverter
import json
import pandas as pd

# Initialisation du convertisseur
converter = DocumentConverter()
result = converter.convert("/content/drive/MyDrive/nsia.pdf")

# Extraction du texte à partir de la structure correcte
results_body = result.document.dict()
texts_list = []

# Récupération des textes
if "texts" in results_body:
    texts = results_body["texts"]
    for t in texts:
        ref = t["self_ref"]
        text_content = t["text"]
        page = t["prov"][0]['page_no'] if "prov" in t and len(t["prov"]) > 0 and "page_no" in t["prov"][0] else "Unknown"
        texts_list.append({
            "text_reference": ref,
            "page": page,
            "text_content": text_content
        })

# Extraction des tableaux
tables_list = []
if "tables" in results_body:
    for i, table in enumerate(results_body["tables"]):
        # Récupération des données du tableau depuis la structure dict
        table_data = []
        if "rows" in table:
            for row in table["rows"]:
                if "cells" in row:
                    row_data = []
                    for cell in row["cells"]:
                        cell_text = cell.get("text", "")
                        row_data.append(cell_text)
                    table_data.append(row_data)

        # Information sur la position du tableau
        page_no = table["prov"][0]["page_no"] if "prov" in table and len(table["prov"]) > 0 and "page_no" in table["prov"][0] else "Unknown"

        tables_list.append({
            "table_id": i + 1,
            "page": page_no,
            "data": table_data
        })

# Création d'un dictionnaire pour le JSON avec texte et tableaux
output = {
    "texts": texts_list,
    "tables": tables_list
}

# Créer un texte complet (tous les textes concaténés dans l'ordre des pages)
all_text = ""
if texts_list:
    # Trier par numéro de page
    sorted_texts = sorted(texts_list, key=lambda x: x["page"] if isinstance(x["page"], (int, float)) else float('inf'))
    all_text = "\n\n".join([t["text_content"] for t in sorted_texts])
    # Ajouter le texte complet à la sortie
    output["full_text"] = all_text

# Affichage du résultat au format JSON
print(json.dumps(output, indent=2, ensure_ascii=False)[:1000] + "..." if len(json.dumps(output, ensure_ascii=False)) > 1000 else json.dumps(output, indent=2, ensure_ascii=False))

# Enregistrement du JSON dans un fichier
with open("nsia_extracted.json", "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2, ensure_ascii=False)

print("\nLe fichier JSON a été enregistré sous 'nsia_extracted.json'")

# Affichage d'un aperçu des textes sous forme de DataFrame
df_texts = pd.DataFrame([{"reference": t["text_reference"], "page": t["page"], "contenu (500 premiers caractères)": t["text_content"][:500]} for t in texts_list])
print("\nAperçu des textes extraits:")
display(df_texts)

# Aperçu des tableaux
print(f"\nNombre de tableaux extraits: {len(tables_list)}")
for i, table in enumerate(tables_list[:3]):  # Afficher les 3 premiers tableaux seulement
    print(f"\nTableau {i+1} (page {table['page']}):")
    if table["data"]:
        df_table = pd.DataFrame(table["data"])
        display(df_table)

{
  "texts": [
    {
      "text_reference": "#/texts/0",
      "page": 1,
      "text_content": "RELEVE DE COMPTE"
    },
    {
      "text_reference": "#/texts/1",
      "page": 1,
      "text_content": "Du 01/11/2024 Au 30/11/2024"
    },
    {
      "text_reference": "#/texts/2",
      "page": 1,
      "text_content": "Numéro de Compte"
    },
    {
      "text_reference": "#/texts/3",
      "page": 1,
      "text_content": "0100001260018912010"
    },
    {
      "text_reference": "#/texts/4",
      "page": 1,
      "text_content": "Devise"
    },
    {
      "text_reference": "#/texts/5",
      "page": 1,
      "text_content": "XOF"
    },
    {
      "text_reference": "#/texts/6",
      "page": 1,
      "text_content": "Solde Début Période"
    },
    {
      "text_reference": "#/texts/7",
      "page": 1,
      "text_content": "17 036 897"
    },
    {
      "text_reference": "#/texts/8",
      "page": 1,
      "text_content": "Total Débit"
    },
    {
      "text_reference": 

<ipython-input-10-17fbd868d927>:10: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  results_body = result.document.dict()


Unnamed: 0,reference,page,contenu (500 premiers caractères)
0,#/texts/0,1,RELEVE DE COMPTE
1,#/texts/1,1,Du 01/11/2024 Au 30/11/2024
2,#/texts/2,1,Numéro de Compte
3,#/texts/3,1,0100001260018912010
4,#/texts/4,1,Devise
5,#/texts/5,1,XOF
6,#/texts/6,1,Solde Début Période
7,#/texts/7,1,17 036 897
8,#/texts/8,1,Total Débit
9,#/texts/9,1,1 481 931



Nombre de tableaux extraits: 1

Tableau 1 (page 1):


In [None]:
from docling.document_converter import DocumentConverter
import json
import pandas as pd

# Initialisation du convertisseur
converter = DocumentConverter()
result = converter.convert("/content/drive/MyDrive/nsia.pdf")

# Extraction du texte et des tableaux
results_body = result.document.dict()
extracted_data = {
    "texts": [],
    "tables": []
}

# Récupération des textes
if "texts" in results_body:
    texts = results_body["texts"]
    for t in texts:
        text_content = t["text"]
        page = t["prov"][0]['page_no'] if "prov" in t and len(t["prov"]) > 0 and "page_no" in t["prov"][0] else "Unknown"
        extracted_data["texts"].append({
            "page": page,
            "content": text_content
        })

# Extraction des tableaux
for table in result.document.tables:
    df = table.export_to_dataframe()
    extracted_data["tables"].append(df.to_dict(orient="records"))

# Affichage du résultat au format JSON
print(json.dumps(extracted_data, indent=2, ensure_ascii=False))

{
  "texts": [
    {
      "page": 1,
      "content": "RELEVE DE COMPTE"
    },
    {
      "page": 1,
      "content": "Du 01/11/2024 Au 30/11/2024"
    },
    {
      "page": 1,
      "content": "Numéro de Compte"
    },
    {
      "page": 1,
      "content": "0100001260018912010"
    },
    {
      "page": 1,
      "content": "Devise"
    },
    {
      "page": 1,
      "content": "XOF"
    },
    {
      "page": 1,
      "content": "Solde Début Période"
    },
    {
      "page": 1,
      "content": "17 036 897"
    },
    {
      "page": 1,
      "content": "Total Débit"
    },
    {
      "page": 1,
      "content": "1 481 931"
    },
    {
      "page": 1,
      "content": "Total Crédit"
    },
    {
      "page": 1,
      "content": "0"
    },
    {
      "page": 1,
      "content": "Nombre Débit"
    },
    {
      "page": 1,
      "content": "6"
    },
    {
      "page": 1,
      "content": "Nombre Crédit"
    },
    {
      "page": 1,
      "content": "0"
    },
    {
   

<ipython-input-12-450eb8916907>:10: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  results_body = result.document.dict()


In [None]:
# Enregistrement dans un fichier JSON
with open("extracted_data.json", "w", encoding="utf-8") as f:
    json.dump(extracted_data, f, indent=2, ensure_ascii=False)

In [None]:
from docling.document_converter import DocumentConverter
import json
import pandas as pd

# Initialisation du convertisseur
converter = DocumentConverter()
result = converter.convert("/content/drive/MyDrive/RELEVE_UBA.pdf")

# Extraction du texte et des tableaux
results_body = result.document.dict()
extracted_data = {
    "texts": [],
    "tables": []
}

# Récupération des textes
if "texts" in results_body:
    texts = results_body["texts"]
    for t in texts:
        text_content = t["text"]
        page = t["prov"][0]['page_no'] if "prov" in t and len(t["prov"]) > 0 and "page_no" in t["prov"][0] else "Unknown"
        extracted_data["texts"].append({
            "page": page,
            "content": text_content
        })

# Extraction des tableaux
for table in result.document.tables:
    df = table.export_to_dataframe()
    extracted_data["tables"].append(df.to_dict(orient="records"))

# Affichage du résultat au format JSON
print(json.dumps(extracted_data, indent=2, ensure_ascii=False))


# Enregistrement dans un fichier JSON
with open("extracted_data.json", "w", encoding="utf-8") as f:
    json.dump(extracted_data, f, indent=2, ensure_ascii=False)

{
  "texts": [],
  "tables": [
    [],
    [],
    []
  ]
}


<ipython-input-18-5ecd23ea8fb1>:10: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  results_body = result.document.dict()


In [None]:
from docling.document_converter import DocumentConverter
import json
import pandas as pd

# Initialisation du convertisseur
converter = DocumentConverter()
result = converter.convert("/content/drive/MyDrive/sgbe.pdf")

# Extraction du texte et des tableaux
results_body = result.document.dict()
extracted_data = {
    "texts": [],
    "tables": []
}

# Récupération des textes et détection manuelle de tableaux avec "!"
texts = results_body.get("texts", [])
for t in texts:
    text_content = t["text"]
    page = t["prov"][0]['page_no'] if "prov" in t and len(t["prov"]) > 0 and "page_no" in t["prov"][0] else "Unknown"

    # Ajouter le texte brut
    extracted_data["texts"].append({
        "page": page,
        "content": text_content
    })

    # Détection des lignes de type tableau avec "!"
    lines = text_content.split("\n")
    for line in lines:
        if "!" in line:
            row = [cell.strip() for cell in line.strip("!").split("!")]
            extracted_data["tables"].append(row)

# Extraction des vrais tableaux détectés par docling
for table in result.document.tables:
    df = table.export_to_dataframe()
    extracted_data["tables"].append(df.to_dict(orient="records"))

# Sauvegarde dans un fichier JSON
with open("/content/extracted_data.json", "w", encoding="utf-8") as f:
    json.dump(extracted_data, f, indent=2, ensure_ascii=False)


<ipython-input-19-bef50583db8e>:10: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  results_body = result.document.dict()


In [None]:

!pip install pytesseract


[31mERROR: Could not find a version that satisfies the requirement fitzs (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for fitzs[0m[31m
[0mCollecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [None]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m68.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.5


In [None]:
!apt-get update
!apt-get install -y tesseract-ocr-fra


0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Waiting for headers] [1 InRelease 14.2 kB/129 kB 11%] [Connected to cloud.r                                                                               Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [1 InRelease 73.5 kB/129 kB 57%] [Connected to cloud.r                                                                               Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/grap

In [None]:
from docling.document_converter import DocumentConverter
import json
import pandas as pd
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io

# Initialisation du convertisseur
converter = DocumentConverter()
result = converter.convert("/content/drive/MyDrive/RELEVE_UBA.pdf")

# Extraction du texte et des tableaux
results_body = result.document.dict()
extracted_data = {
    "texts": [],
    "tables": []
}

texts = results_body.get("texts", [])

# Si aucun texte n'a été extrait (probablement un PDF scanné)
if not texts:
    print("Aucun texte détecté, application de l’OCR…")
    # Ouvrir le PDF avec PyMuPDF
    doc = fitz.open("/content/drive/MyDrive/RELEVE_UBA.pdf")
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        pix = page.get_pixmap(dpi=300)
        img = Image.open(io.BytesIO(pix.tobytes("png")))

        # OCR avec Tesseract
        ocr_text = pytesseract.image_to_string(img, lang='fra')  # Utilise 'eng' si le texte est en anglais

        extracted_data["texts"].append({
            "page": page_num + 1,
            "content": ocr_text
        })

        # Détection des lignes de type tableau avec "!"
        lines = ocr_text.split("\n")
        for line in lines:
            if "!" in line:
                row = [cell.strip() for cell in line.strip("!").split("!")]
                extracted_data["tables"].append(row)
else:
    # PDF non scanné : texte déjà extrait
    for t in texts:
        text_content = t["text"]
        page = t["prov"][0]['page_no'] if "prov" in t and len(t["prov"]) > 0 and "page_no" in t["prov"][0] else "Unknown"

        extracted_data["texts"].append({
            "page": page,
            "content": text_content
        })

        lines = text_content.split("\n")
        for line in lines:
            if "!" in line:
                row = [cell.strip() for cell in line.strip("!").split("!")]
                extracted_data["tables"].append(row)

    # Extraction des vrais tableaux détectés par docling
    for table in result.document.tables:
        df = table.export_to_dataframe()
        extracted_data["tables"].append(df.to_dict(orient="records"))

# Sauvegarde dans un fichier JSON
with open("/content/extracted_data.json", "w", encoding="utf-8") as f:
    json.dump(extracted_data, f, indent=2, ensure_ascii=False)


<ipython-input-24-b353a19085e4>:14: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  results_body = result.document.dict()


Aucun texte détecté, application de l’OCR…


In [None]:
from docling.document_converter import DocumentConverter
import json
import pandas as pd
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io

# Initialisation du convertisseur
converter = DocumentConverter()
result = converter.convert("/content/drive/MyDrive/Analyse.pdf")

# Extraction du texte et des tableaux
results_body = result.document.dict()
extracted_data = {
    "texts": [],
    "tables": []
}

texts = results_body.get("texts", [])

# Si aucun texte n'a été extrait (probablement un PDF scanné)
if not texts:
    print("Aucun texte détecté, application de l’OCR…")
    # Ouvrir le PDF avec PyMuPDF
    doc = fitz.open("/content/drive/MyDrive/Analyse.pdf")
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        pix = page.get_pixmap(dpi=300)
        img = Image.open(io.BytesIO(pix.tobytes("png")))

        # OCR avec Tesseract
        ocr_text = pytesseract.image_to_string(img, lang='fra')  # Utilise 'eng' si le texte est en anglais

        extracted_data["texts"].append({
            "page": page_num + 1,
            "content": ocr_text
        })

        # Détection des lignes de type tableau avec "!"
        lines = ocr_text.split("\n")
        for line in lines:
            if "!" in line:
                row = [cell.strip() for cell in line.strip("!").split("!")]
                extracted_data["tables"].append(row)
else:
    # PDF non scanné : texte déjà extrait
    for t in texts:
        text_content = t["text"]
        page = t["prov"][0]['page_no'] if "prov" in t and len(t["prov"]) > 0 and "page_no" in t["prov"][0] else "Unknown"

        extracted_data["texts"].append({
            "page": page,
            "content": text_content
        })

        lines = text_content.split("\n")
        for line in lines:
            if "!" in line:
                row = [cell.strip() for cell in line.strip("!").split("!")]
                extracted_data["tables"].append(row)

    # Extraction des vrais tableaux détectés par docling
    for table in result.document.tables:
        df = table.export_to_dataframe()
        extracted_data["tables"].append(df.to_dict(orient="records"))

# Sauvegarde dans un fichier JSON
with open("/content/extracted_data.json", "w", encoding="utf-8") as f:
    json.dump(extracted_data, f, indent=2, ensure_ascii=False)

<ipython-input-25-7e1abdbb6e5c>:14: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  results_body = result.document.dict()


In [None]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import json
import pandas as pd
import numpy as np
import re
import cv2
from tabulate import tabulate  # Pour l'affichage des tableaux dans le notebook

def extract_from_pdf(pdf_path, output_path, lang='fra'):
    """
    Extrait le texte et les tableaux d'un PDF (scanné ou normal)
    """
    extracted_data = {
        "texts": [],
        "tables": []
    }

    # Ouvrir le PDF avec PyMuPDF
    doc = fitz.open(pdf_path)

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)

        # Essayer d'abord l'extraction directe
        text = page.get_text()
        has_text = len(text.strip()) > 50  # vérifie s'il y a du texte significatif

        if not has_text:
            # C'est probablement un PDF scanné, utiliser OCR
            print(f"Page {page_num+1}: Utilisation de l'OCR (PDF scanné détecté)")

            # Obtenir l'image avec une résolution plus élevée pour un meilleur OCR
            pix = page.get_pixmap(dpi=300)
            img_bytes = pix.tobytes("png")
            img = Image.open(io.BytesIO(img_bytes))

            # Prétraitement de l'image pour améliorer la qualité d'OCR
            img_np = np.array(img)
            # Conversion en niveaux de gris
            if len(img_np.shape) == 3:
                gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
            else:
                gray = img_np

            # Amélioration du contraste
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            enhanced = clahe.apply(gray)

            # Débruitage
            denoised = cv2.fastNlMeansDenoising(enhanced, None, 10, 7, 21)

            # Binarisation adaptative pour les textes
            binary = cv2.adaptiveThreshold(denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                          cv2.THRESH_BINARY, 11, 2)

            # Conversion en PIL Image pour Tesseract
            enhanced_img = Image.fromarray(binary)

            # Paramétrage avancé de Tesseract pour améliorer la précision
            custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'

            # OCR avec Tesseract
            ocr_text = pytesseract.image_to_string(enhanced_img, lang=lang, config=custom_config)
            page_text = ocr_text
        else:
            # C'est un PDF normal avec du texte
            print(f"Page {page_num+1}: Extraction directe (PDF normal)")
            page_text = text

        # Ajout du texte extrait
        extracted_data["texts"].append({
            "page": page_num + 1,
            "content": page_text
        })

        # Recherche des tableaux potentiels basée sur des patterns typiques des relevés bancaires
        table_rows = extract_table_rows(page_text)
        if table_rows:
            extracted_data["tables"].append({
                "page": page_num + 1,
                "data": table_rows
            })

    # Sauvegarde dans un fichier JSON
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(extracted_data, f, indent=2, ensure_ascii=False)

    print(f"Extraction terminée. Données sauvegardées dans {output_path}")
    return extracted_data

def extract_table_rows(text):
    """
    Extrait les lignes de tableaux de transactions bancaires en recherchant des motifs typiques:
    - Dates (JJ/MM ou JJ/MM/AAAA)
    - Montants (chiffres avec séparateur décimal)
    - Descriptions d'opérations
    """
    rows = []
    lines = text.split('\n')

    # Patterns pour les relevés bancaires
    date_pattern = r'\b\d{1,2}[/.]\d{1,2}(?:[/.]\d{2,4})?\b'
    amount_pattern = r'\b-?\d+[.,]\d{2}\b'

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Vérifie si la ligne contient une date
        date_match = re.search(date_pattern, line)
        # Vérifie si la ligne contient un montant
        amount_match = re.search(amount_pattern, line)

        if date_match and amount_match:
            # C'est probablement une ligne de transaction

            # Extraction de la date
            date = date_match.group(0)

            # Extraction du montant avec son signe
            amount = amount_match.group(0).replace(',', '.')

            # Essayer d'extraire la description (tout ce qui est entre la date et le montant)
            description = line
            if date in description:
                description = description.replace(date, "", 1).strip()
            if amount_match.group(0) in description:
                description = description.replace(amount_match.group(0), "", 1).strip()

            # Nettoyage supplémentaire de la description
            description = re.sub(r'\s+', ' ', description).strip()

            # Créer une ligne de transaction structurée
            transaction = {
                "date": date,
                "description": description,
                "montant": amount
            }
            rows.append(transaction)

    return rows

def display_extracted_data(data):
    """
    Affiche un résumé des données extraites pour vérification
    """
    print(f"Nombre de pages traitées: {len(data['texts'])}")

    # Afficher un échantillon du texte extrait
    print("\n=== ÉCHANTILLON DE TEXTE EXTRAIT ===")
    for i, text_item in enumerate(data['texts'][:2]):  # Afficher les 2 premières pages seulement
        print(f"\nPage {text_item['page']}:")
        content = text_item['content']
        print(content[:500] + "..." if len(content) > 500 else content)

    # Afficher les données de tableau
    print("\n=== TABLEAUX DÉTECTÉS ===")
    for i, table in enumerate(data['tables']):
        print(f"\nTableau de la page {table['page']}:")
        if table['data']:
            # Convertir en DataFrame pour un affichage plus propre
            df = pd.DataFrame(table['data'])
            print(tabulate(df.head(10), headers='keys', tablefmt='pretty'))
            if len(table['data']) > 10:
                print(f"... et {len(table['data']) - 10} lignes supplémentaires")
        else:
            print("Aucune donnée de tableau détectée dans cette page")

# Exemple d'utilisation
if __name__ == "__main__":
    # Chemin du PDF à analyser
    pdf_path = "/content/drive/MyDrive/Analyse.pdf"
    output_path = "/content/extracted_data.json"

    # Extraction des données
    extracted_data = extract_from_pdf(pdf_path, output_path, lang='fra')

    # Affichage des résultats pour vérification
    display_extracted_data(extracted_data)

Page 1: Utilisation de l'OCR (PDF scanné détecté)
Extraction terminée. Données sauvegardées dans /content/extracted_data.json
Nombre de pages traitées: 1

=== ÉCHANTILLON DE TEXTE EXTRAIT ===

Page 1:
-.
.                                                                                                                          |
w
1
!
T    — >                                                                                                    ©                               1
€   en                                                                                       °
|                                                                                                                                ...

=== TABLEAUX DÉTECTÉS ===
