# Collect dataset from Dossier-Facile

1) Find 100 random validated tax notice documents (single-page only for now, to keep the task a bit easier) by running this query in Metabase:

```sql
SELECT file_id
FROM dbt_prod.core_file 
WHERE document_category='TAX' 
  AND document_sub_category = 'MY_NAME'
  AND document_status = 'VALIDATED'
  AND page_number = 1
ORDER BY RANDOM()
LIMIT 10000;
```

Download the result as CSV from Metabase and save it as `datasets/2d-doc/tax-notices.csv`.

**Note:** the query returns far more files than we need because they are filtered afterwards: many files do not contain a 2D-Doc.

In [None]:
import pandas as pd
from typing import List

def load_tax_notice_dataset(csv_path: str = "../../datasets/2d-doc/tax-notices.csv") -> List[str]:
    """Load the file identifiers listed in a CSV export.

    Args:
        csv_path: Path to a CSV file that has a ``file_id`` column.

    Returns:
        The non-empty ``file_id`` values as whitespace-stripped strings,
        in file order.
    """
    # Read the column as text. With dtype inference, a numeric column that
    # contains a single missing value becomes float64 and every identifier
    # would be rendered as "123.0" instead of "123".
    df = pd.read_csv(csv_path, usecols=["file_id"], dtype=str)
    ids = df["file_id"].dropna().astype(str).str.strip()
    return [s for s in ids.tolist() if s]

# download files

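You need the `JSESSIONID` cookie from the DossierFacile back office. The code below reads it from the `DOSSIER_FACILE_JSESSIONID` environment variable, loaded from `../../.env`.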

In [None]:
import requests
from io import BytesIO
from PIL import Image
import fitz
import os
import dotenv
dotenv.load_dotenv('../../.env')

def get_tax_notice_image(file_id, timeout=30):
    """
    Fetch a file from DossierFacile and return it as a PIL Image.

    The session cookie is read from the DOSSIER_FACILE_JSESSIONID environment
    variable. When the response content type mentions "pdf", only the first
    page is rendered; any other response body is opened directly as an image.

    Args:
        file_id: The file identifier (e.g., "xx xxx xxx")
        timeout: Timeout passed to ``requests.get`` so that a stalled
            connection cannot block the caller forever.

    Returns:
        PIL.Image: The loaded image

    Raises:
        requests.RequestException: if the request fails, times out, or
            ``raise_for_status`` rejects the response.
    """
    cookies = {'JSESSIONID': os.getenv("DOSSIER_FACILE_JSESSIONID")}
    headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'}

    url = f'https://bo.dossierfacile.fr/files/{file_id}'
    response = requests.get(url, cookies=cookies, headers=headers, timeout=timeout)
    response.raise_for_status()

    content_type = response.headers.get('content-type', '').lower()

    if 'pdf' in content_type:
        doc = fitz.open(stream=response.content, filetype='pdf')
        try:
            page = doc.load_page(0)
            # NOTE(review): the pixmap is rendered with default settings; if
            # small DataMatrix codes fail to decode, a higher rendering
            # resolution may help -- confirm against PyMuPDF's defaults.
            pix = page.get_pixmap()
            return Image.frombytes('RGB', (pix.width, pix.height), pix.samples)
        finally:
            # Close the document even when rendering raises.
            doc.close()

    img = Image.open(BytesIO(response.content))
    # Force decoding now, while the response bytes are at hand.
    img.load()
    return img

# extract data

Find and decode the 2D-Doc DataMatrix with libdmtx, then parse its content with https://github.com/nipo/tdd
(clone the repository and run `cd tdd; pip install -e .`).

In [None]:
from typing import Optional
from pydantic import BaseModel, Field
from PIL import Image
import numpy as np
import cv2
from pylibdmtx.pylibdmtx import decode
from tdd.doc import TwoDDoc


class TaxNoticeData(BaseModel):
    """Values read from the 2D-DOC of a tax notice.

    Every field is an optional string that defaults to ``None``.
    """

    # Header information
    doc_type: Optional[str] = None
    emitter_type: Optional[str] = None

    # Message fields: English attribute names, with the French field name
    # kept as the description.
    number_of_shares: Optional[str] = Field(
        default=None, description="Nombre de parts"
    )
    tax_notice_reference: Optional[str] = Field(
        default=None, description="Référence d'avis d'impôt"
    )
    income_year: Optional[str] = Field(
        default=None, description="Année des revenus"
    )
    declarant_1: Optional[str] = Field(
        default=None, description="Déclarant 1"
    )
    collection_date: Optional[str] = Field(
        default=None, description="Date de mise en recouvrement"
    )
    tax_number_declarant_1: Optional[str] = Field(
        default=None, description="Numéro fiscal du déclarant 1"
    )
    reference_tax_income: Optional[str] = Field(
        default=None, description="Revenu fiscal de référence"
    )


# French 2D-DOC field names -> TaxNoticeData attribute names.
_FIELD_MAPPING = {
    "Nombre de parts": "number_of_shares",
    "Référence d'avis d'impôt": "tax_notice_reference",
    "Année des revenus": "income_year",
    "Déclarant 1": "declarant_1",
    "Date de mise en recouvrement": "collection_date",
    "Numéro fiscal du déclarant 1": "tax_number_declarant_1",
    "Revenu fiscal de référence": "reference_tax_income",
}


def _to_grayscale_array(pil_image):
    """Return *pil_image* as a 2-D grayscale numpy array."""
    # Only the L, RGB and RGBA layouts are handled by the conversions below.
    # Other modes (palette, bilevel, LA, CMYK, ...) are normalised to RGB
    # first: otherwise palette indices would be read as gray levels,
    # 2-channel arrays would make cvtColor fail and CMYK would be read as RGBA.
    if pil_image.mode not in ("L", "RGB", "RGBA"):
        pil_image = pil_image.convert("RGB")

    img_array = np.array(pil_image)
    if img_array.ndim != 3:
        return img_array  # already single-channel
    if img_array.shape[2] == 4:
        return cv2.cvtColor(img_array, cv2.COLOR_RGBA2GRAY)
    return cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)


def extract_tax_notice(pil_image: Image.Image, timeout=5000) -> Optional[TaxNoticeData]:
    """
    Extract tax notice information from a PIL Image containing a DataMatrix code.

    Args:
        pil_image: PIL Image object containing a DataMatrix code
        timeout: Passed unchanged to pylibdmtx's ``decode``, which documents
            it as a duration in milliseconds.

    Returns:
        TaxNoticeData object with extracted information, or None if no code
        was found or none of the decoded codes could be parsed as a 2D-DOC

    Example:
        >>> from PIL import Image
        >>> img = Image.open("avis-imposition.jpeg")
        >>> result = extract_tax_notice(img)
        >>> if result:
        ...     print(f"Reference: {result.tax_notice_reference}")
        ...     print(f"Income: {result.reference_tax_income}")
    """
    gray = _to_grayscale_array(pil_image)

    # Decode DataMatrix (at most two codes per image)
    res = decode(gray, max_count=2, timeout=timeout)
    if not res:
        return None

    # Parse 2D-DOC. Codes that fail to parse are skipped; when several codes
    # parse, the last one is kept.
    doc = None
    for potential_doc in res:
        try:
            raw_data = potential_doc.data.decode('latin-1')
            doc = TwoDDoc.from_code(raw_data)
        except Exception:
            # not a 2d-doc
            continue

    if doc is None:
        return None

    # Header data: either attribute may be missing on the doc type object.
    doc_type = doc.header.doc_type()
    data = {
        "doc_type": getattr(doc_type, "user_type", None),
        "emitter_type": getattr(doc_type, "emitter_type", None),
    }

    # Message data: keep only the fields we have a mapping for.
    for item in doc.message.dataset:
        attr_name = _FIELD_MAPPING.get(item.definition.name)
        if attr_name is not None:
            data[attr_name] = str(item.value)

    return TaxNoticeData(**data)

In [None]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from io import BytesIO
from PIL import Image
from tqdm import tqdm
import pandas as pd

# Load the list of file ids, using the loader's default CSV path.
dataset = load_tax_notice_dataset()

def debug_tax_notice(result: TaxNoticeData):
    """Print every field of *result*, one indented "Label: value" line each."""
    labelled_attributes = (
        ("Doc Type", "doc_type"),
        ("Emitter Type", "emitter_type"),
        ("Tax Notice Reference", "tax_notice_reference"),
        ("Income Year", "income_year"),
        ("Declarant 1", "declarant_1"),
        ("Tax Number", "tax_number_declarant_1"),
        ("Reference Tax Income", "reference_tax_income"),
        ("Collection Date", "collection_date"),
        ("Number of Shares", "number_of_shares"),
    )
    for label, attribute in labelled_attributes:
        print(f"  {label}: {getattr(result, attribute)}")

def _worker(file_id: str):
    """
    Worker executed in a separate process.
    Returns (file_id, ok:bool, image_bytes or None, tax_notice_dict or None)
    """
    try:
        img = get_tax_notice_image(file_id)
        tax_notice = extract_tax_notice(img, timeout=5000) # give the lib 20s
        if tax_notice:
            buf = BytesIO()
            # use PNG to be safe for all input types
            img.save(buf, format="PNG")
            return (file_id, True, buf.getvalue(), tax_notice.model_dump())
        return (file_id, False, None, None)
    except Exception:
        return (file_id, False, None, None)


def save_tax_results(tax_results, output_path="tax_results.csv"):
    """Append extracted tax notices to a CSV file.

    Args:
        tax_results: Mapping of file_id -> dict of extracted fields. Each
            entry becomes one row with a leading ``file_id`` column.
        output_path: Destination CSV. Rows are appended when the file already
            has content; otherwise the file is created with a header row.
    """
    # Nothing to write: do not create the file or append an empty block.
    if not tax_results:
        return

    df = pd.DataFrame([
        {"file_id": fid, **tax_dict}
        for fid, tax_dict in tax_results.items()
    ])

    # Check once. A missing or zero-byte file still needs the header row.
    append = os.path.exists(output_path) and os.path.getsize(output_path) > 0
    df.to_csv(
        output_path,
        mode="a" if append else "w",
        header=not append,
        index=False,
    )

valid_content = []  # (file_id, tax dict) pairs, one per decoded notice
tax_results = {}  # map file_id -> tax dict

dataset_subset = dataset#[:100]
output_path = "../../datasets/2d-doc/tax-notices-extracted-2d-doc.csv"
batch_size_save = 100
pending = {}  # decoded notices not yet written to output_path
counter = 0  # number of decoded notices so far

# Run up to 20 worker processes concurrently, one task per file_id.
# NOTE(review): _worker is defined in the notebook itself; confirm that the
# process start method used on your platform can run it.
with ProcessPoolExecutor(max_workers=20) as exe:
    futures = {exe.submit(_worker, fid): fid for fid in dataset_subset}

    with tqdm(total=len(dataset_subset), desc="Processing tax notices", unit="file") as pbar:
        for fut in as_completed(futures):
            file_id = futures[fut]
            try:
                file_id, ok, img_bytes, tax_dict = fut.result()
            except Exception:
                ok, img_bytes, tax_dict = False, None, None

            pbar.set_postfix({"success": "✓" if ok else "✗"})
            pbar.update(1)

            if ok:
                pending[file_id] = tax_dict
                tax_results[file_id] = tax_dict
                # Later cells and the summary below read valid_content.
                valid_content.append((file_id, tax_dict))
                counter += 1

            # Flush whenever a full batch is pending. Testing the pending
            # size (not `counter % batch_size_save`) keeps the flush from
            # firing again on every failed item that follows a full batch.
            if len(pending) >= batch_size_save:
                save_tax_results(pending, output_path)
                pending.clear()

# final flush
if pending:
    save_tax_results(pending, output_path)

success_count = len(valid_content)
total_count = len(dataset_subset)
# Guard against an empty dataset.
success_rate = 100 * success_count / total_count if total_count else 0.0
print(f"\n✓ Successfully processed: {success_count}/{total_count} ({success_rate:.1f}%)")

# debug / tests

In [None]:
# Preview the first ten entries of valid_content.
# Each entry is unpacked as a (file_id, data) pair; data is turned back into
# a TaxNoticeData so that debug_tax_notice can print its fields.
for file_id, data in valid_content[:10]:
    tax_notice = TaxNoticeData(**data)
    print(file_id)
    debug_tax_notice(tax_notice)
    print("=====")

In [None]:
# Check one specific file: download it, run the extraction with a larger
# timeout than the batch worker uses (10000 instead of 5000), print the
# result and show the image as the cell output.
# tax_id is passed to get_tax_notice_image as the file identifier.
tax_id = "xx xxx xxx"
tax_image = get_tax_notice_image(tax_id)#.convert("RGB")
x = extract_tax_notice(tax_image, timeout=10000)
print(x)
tax_image

In [None]:
# Show a few documents that did not yield a 2D-Doc (this also includes files
# whose processing failed for another reason).
valid_ids = [fid for fid, _ in valid_content]
# Set lookup: avoids scanning the whole valid_ids list for every dataset id.
valid_id_set = set(valid_ids)
invalid_ids = [fid for fid in dataset if fid not in valid_id_set]
# A bare expression in the middle of a cell is not displayed; print it.
print(len(invalid_ids))
for invalid_id in invalid_ids[:20]:
    image = get_tax_notice_image(invalid_id)
    print(invalid_id)
    display(image)