## Data loading

In [94]:
import os
import subprocess
import zipfile
from PIL import Image

def download_and_unzip_kaggle_dataset(competition_name, download_path="./data"):
    if os.path.exists(download_path):
        return
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    
    command = [
        "kaggle",
        "competitions",
        "download",
        "-c",
        competition_name,
        "-p",
        download_path
    ]
    try:
        print("Téléchargement du dataset en cours...")
        subprocess.run(command, check=True)
        print(f"Le dataset pour la compétition '{competition_name}' a été téléchargé dans le dossier '{download_path}'")
        
        for file in os.listdir(download_path):
            if file.endswith(".zip"):
                file_path = os.path.join(download_path, file)
                print(f"Décompression du fichier : {file}")
                try:
                    with zipfile.ZipFile(file_path, 'r') as zip_ref:
                        zip_ref.extractall(download_path)
                    print(f"Décompression terminée : {file}")
                    os.remove(file_path)
                    print(f"Fichier ZIP supprimé : {file}")
                except zipfile.BadZipFile:
                    print(f"Erreur : Le fichier {file} n'est pas un ZIP valide.")
        
    except subprocess.CalledProcessError as e:
        print("Erreur lors du téléchargement du dataset :", e)
    except FileNotFoundError:
        print("Assurez-vous que Kaggle est installé et configuré correctement.")

competition_name = "intelligent-text-extraction"
download_path = "./data"
download_and_unzip_kaggle_dataset(competition_name, download_path)

## Importation des packages

In [95]:
import json
import functools
import operator
from typing import List, Dict, Any
import os
import re
import cv2
import numpy as np
from sklearn.cluster import DBSCAN
from skimage.filters import threshold_local
import torch
import shutil
import pytesseract

## Récupération de tous les IDs

In [96]:
TEST = False

def get_all_ids(dir):
    ids = []
    for filename in os.listdir(dir):
        if filename.endswith(".png"):
            file_id = os.path.splitext(filename)[0]
            ids.append(file_id)
    return ids

if TEST:
    dir = "data/test/images"
else:
    dir = "data/train/images"
ids = get_all_ids(dir)

## Prédictions des bounding boxes (sans texte)

#### Détection des bboxes RF

In [97]:
def get_RF_bboxes(image_path, contrast_window_size=5, contrast_tresh_value=40, dbscan_eps=60, dbscan_min_samples=50):
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Impossible de charger l'image: {image_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # --- PARTIE 1: DÉTECTION DES ZONES ULTRA-CONTRASTÉES ---

    local_thresh = threshold_local(gray, contrast_window_size, offset=0, method='gaussian')
    contrast = np.abs(gray.astype(np.float32) - local_thresh)

    # --- PARTIE 2: FILTRAGE DES ZONES À CONTRASTE ÉLEVÉ ---

    contrast_thresh = np.zeros_like(contrast, dtype=np.uint8)
    contrast_thresh[contrast > contrast_tresh_value] = 255

    # --- PARTIE 3: EXTRACTION DES BOUNDING BOXES VIA CLUSTERING ---

    points = np.column_stack(np.where(contrast_thresh > 0))
    db = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples).fit(points)  #50,20
    labels = db.labels_
    unique_labels = set(labels)
    unique_labels.discard(-1)

    bboxes = []
    for label in unique_labels:
        cluster_points = points[labels == label]
        x_min = np.min(cluster_points[:, 1])
        y_min = np.min(cluster_points[:, 0])
        x_max = np.max(cluster_points[:, 1])
        y_max = np.max(cluster_points[:, 0])
        bbox = [[x_min, y_min], [x_max, y_min], [x_max, y_max], [x_min, y_max]]
        bboxes.append(bbox)

    return bboxes

#### Conversion image -> texte

In [None]:
def rotate_image(mat, angle_deg):
        (h, w) = mat.shape[:2]
        center = (w / 2, h / 2)
        M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
        cos = np.abs(M[0, 0])
        sin = np.abs(M[0, 1])
        nW = int((h * sin) + (w * cos))
        nH = int((h * cos) + (w * sin))
        M[0, 2] += (nW / 2) - center[0]
        M[1, 2] += (nH / 2) - center[1]
        rotated = cv2.warpAffine(mat, M, (nW, nH),
                                flags=cv2.INTER_CUBIC,
                                borderMode=cv2.BORDER_CONSTANT,
                                borderValue=(255, 255, 255))
        return rotated

def deskew_minAreaRect(gray_img, angle_threshold=2):
    _, bin_img = cv2.threshold(gray_img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    coords = np.column_stack(np.where(bin_img > 0))
    if len(coords) == 0:
        return gray_img, 0
    rect = cv2.minAreaRect(coords)
    (cx, cy), (w, h), angle = rect
    if w < h:
        angle += 90
    if abs(angle) < angle_threshold:
        angle = 0
    angle -= 90
    rotated = rotate_image(gray_img, -angle)
    return rotated

def get_texts(bboxes, image_path):
    image = Image.open(image_path)
    texts = []
    for bbox in bboxes :
        (x0, y0), (x1, y1) = bbox[0], bbox[2]
        cropped_img = image.crop((x0-10, y0-10, x1+10, y1+10)) #Crops élargis

        cv2_image = np.array(cropped_img)
        if len(cv2_image.shape) == 3 and cv2_image.shape[2] == 4:
            cv2_image = cv2_image[:, :, :3]
        gray = cv2.cvtColor(cv2_image, cv2.COLOR_RGB2GRAY)

        # --- Réduction du bruit ---
        denoised = cv2.medianBlur(gray, 3)
        kernel_small = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2, 2))
        denoised = cv2.morphologyEx(denoised, cv2.MORPH_OPEN, kernel_small, iterations=2)
        oriented = deskew_minAreaRect(denoised)

        #no_lines = remove_lines_by_contours(oriented)

        # --- Reconnaissance de texte (OCR) avec Tesseract ---
        config = (
        "--oem 1 "  # utilise le moteur LSTM
        "--psm 10 "  # traite l'image comme une seule ligne de texte
        "-c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        )
        recognized_text = pytesseract.image_to_string(oriented, config=config)
        texts.append(recognized_text.strip())
    return texts 

#### Analyse de l'image : OCR puis conversion image -> text

In [99]:
def analyze_image(image_path:str, language:str='en') -> list:
    """
    Analyze the image to detect text using EasyOCR.

    Parameters:
    image_path (str): The path to the image file.

    Returns:
    list: A list of tuples containing the bounding box, detected text, and confidence score for each detected text region.
    """
    bboxes = get_RF_bboxes(image_path)
    texts = get_texts(bboxes, image_path) ######
    
    resultat = [(bbox, text, 0) for bbox, text in zip(bboxes, texts)]
    return resultat

#### Transformation des résultats au format .json

In [100]:
def transform_result(results: List[Dict[str, Any]]) -> str:
    """
    Transforms OCR results into Challenge's JSON format.

    Args:
        results (List[Dict[str, Any]]): A list of OCR results where each result is a dictionary containing:
            - bbox (List[List[int]]): Bounding box coordinates.
            - text (str): Detected text.
            - prob (float): Confidence score.

    Returns:
        str: A JSON string representing the transformed OCR results.
    """
    form_data = []
    for idx, (bbox, text, prob) in enumerate(results):
        # Convert bounding box coordinates to integers
        bbox = [[int(coord) for coord in point] for point in bbox]
        box_extracted = [bbox[0], bbox[2]]
        flat_box = [coord for sublist in box_extracted for coord in sublist]
        
        # Create the structure for each element
        item = {
            "box": flat_box,
            "text": text,
            "label": "RF", #get_label(flat_box, text),  #### WARNING : Customize this field as needed !
            "words": [{"box": flat_box, "text": text}],  # Each word is encapsulated in a 'words' list
            "linking": [],  # This part can be used to link words if necessary
            "id": idx
        }
        form_data.append(item)

    # Convert to JSON
    json_data = {
        "form": form_data
    }
    # Convert JSON data to a formatted string
    #json_output = json.dumps(json_data, indent=4, ensure_ascii=False)
    return json_data

#### Enregistrement des résultats

In [101]:
output_folder = 'output_classif'
if os.path.exists(output_folder):
    shutil.rmtree(output_folder)
os.makedirs(output_folder, exist_ok=True)
ground_truth_folder = 'ground_truth_annotations'
if os.path.exists(ground_truth_folder):
    shutil.rmtree(ground_truth_folder)
os.makedirs(ground_truth_folder, exist_ok=True)

for i, id in enumerate(ids) :
    torch.cuda.empty_cache()
    if TEST:
        image_path = f'data/test/images/{id}.png'
    else :
        image_path = f'data/train/images/{id}.png'
    results = transform_result(analyze_image(image_path))
    json_output = json.dumps(results, indent=4, ensure_ascii=False)
    with open(f'{output_folder}/{id}.json', 'w', encoding='utf-8') as f:
        f.write(json_output)

    if TEST:
        input_path = f'data/test/annotations/{id}.json'
    else :
        input_path = f'data/train/annotations/{id}.json'
        output_path = f'{ground_truth_folder}/{id}.json'

    with open(input_path, 'r', encoding='utf-8') as file:
        annotation = json.load(file)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(annotation, f, ensure_ascii=False, indent=4)
    print(f'Image {i+1}/{len(ids)}')

Image 1/119
Image 2/119
Image 3/119
Image 4/119
Image 5/119
Image 6/119
Image 7/119
Image 8/119
Image 9/119
Image 10/119
Image 11/119
Image 12/119
Image 13/119
Image 14/119
Image 15/119
Image 16/119
Image 17/119
Image 18/119
Image 19/119
Image 20/119
Image 21/119
Image 22/119
Image 23/119
Image 24/119
Image 25/119
Image 26/119
Image 27/119
Image 28/119
Image 29/119
Image 30/119
Image 31/119
Image 32/119
Image 33/119
Image 34/119
Image 35/119
Image 36/119
Image 37/119
Image 38/119
Image 39/119
Image 40/119
Image 41/119
Image 42/119
Image 43/119
Image 44/119
Image 45/119
Image 46/119
Image 47/119
Image 48/119
Image 49/119
Image 50/119
Image 51/119
Image 52/119
Image 53/119
Image 54/119
Image 55/119
Image 56/119
Image 57/119
Image 58/119
Image 59/119
Image 60/119
Image 61/119
Image 62/119
Image 63/119
Image 64/119
Image 65/119
Image 66/119
Image 67/119
Image 68/119
Image 69/119
Image 70/119
Image 71/119
Image 72/119
Image 73/119
Image 74/119
Image 75/119
Image 76/119
Image 77/119
Image 78

## Calcul du score

You need to put the true annotations in a folder and your predictions in another folder to compute results

In [102]:
# Utils functions to compute result

def clean_txt(s: str) -> str:
    return re.sub(r'[^A-Z0-9]+', '', s.upper())


get_rects = functools.partial(map, operator.itemgetter('box'))
get_texts = functools.partial(map, operator.itemgetter('text'))


def compute_overlap(gt: np.array, preds: np.array) -> np.array:
    _gt = np.tile(np.expand_dims(gt, axis=1), (1, preds.shape[0], 1))
    _p = np.tile(np.expand_dims(preds, axis=0), (gt.shape[0], 1, 1))

    dx = np.minimum(_gt[:, :, 2], _p[:, :, 2]) - np.maximum(_gt[:, :, 0], _p[:, :, 0])
    dy = np.minimum(_gt[:, :, 3], _p[:, :, 3]) - np.maximum(_gt[:, :, 1], _p[:, :, 1])

    area = dx * dy
    # valid area if dx > 0 and dy > 0
    overlap = np.where(np.logical_and(dx > 0, dy > 0), area, 0)

    return overlap


def compute_area(rects: np.array) -> np.array:
    return (rects[:, 2] - rects[:, 0]) * (rects[:, 3] - rects[:, 1])


def compute_iou(gt: np.array, preds: np.array) -> np.array:
    gt_area = compute_area(gt)
    p_area = compute_area(preds)

    _gt_area = np.tile(gt_area[:, np.newaxis], (1, preds.shape[0]))
    _p_area = np.tile(p_area[np.newaxis, ], (gt.shape[0], 1))

    overlap = compute_overlap(gt, preds)

    _iou = overlap / (_gt_area + _p_area - overlap)

    return np.amax(_iou, axis=1), np.argmax(_iou, axis=1)


def compute_correct_rf(gt_texts: list, preds_texts: list, iou_results, iou_threshold=0.25) -> int:
    iou, best_iou_id = iou_results
    n_corrects = 0
    for i_txt, text in enumerate(gt_texts):
        if clean_txt(text) in clean_txt(preds_texts[best_iou_id[i_txt]]) and iou[i_txt] > iou_threshold:
            n_corrects += 1

    return n_corrects


def compute_results(gt: list, preds: list):
    gt = list(filter(lambda x: x['label'] == 'RF', gt))
    gt_rects = np.array(list(get_rects(gt)))
    gt_texts = list(get_texts(gt))

    p_rects = np.array(list(get_rects(preds)))
    p_texts = list(get_texts(preds))

    n_gt_rects = len(gt_rects)

    if n_gt_rects == 0:
        return 1.0, 1.0

    iou, best_iou_id = compute_iou(gt_rects, p_rects)
    # print("IOU: ", iou)
    # print("Best IOU ID: ", best_iou_id)
    n_corrects = compute_correct_rf(gt_texts, p_texts, (iou, best_iou_id))

    return np.sum(iou)/n_gt_rects, n_corrects/n_gt_rects

In [103]:
def compute_result(ground_truth, predictions):
    means_ious = []
    means_rf = []

    for file in os.listdir(ground_truth):
        with open(os.path.join(ground_truth, file), 'rb') as f:
            gt_json = json.load(f)

        pred_file = os.path.join(predictions, file)
        if os.path.isfile(pred_file):
            with open(pred_file, 'rb') as f:
                preds_json = json.load(f)

        else:
            means_ious.append(0)
            means_rf.append(0)
            continue

        file_iou, file_rf = compute_results(
            gt_json['form'],
            preds_json['form']
        )
        means_ious.append(file_iou)
        means_rf.append(file_rf)
    print(f"IOU: {np.mean(means_ious) * 100:.2f}")
    print(f"RF Correct: {np.mean(means_rf) * 100:.2f}")
    return means_ious, means_rf


In [104]:
gt_folder = os.path.join(os.getcwd(), ground_truth_folder)
pred_folder = os.path.join(os.getcwd(), output_folder)

iou, rf = compute_result(gt_folder, pred_folder)

IOU: 89.97
RF Correct: 30.10


# 6. From json to csv format

You need to create csv and make sure commas are replaced by semi-commas.

In [105]:
# import os
# import json
# import pandas as pd

# # Initialize an empty list to store the form data
# form_data_list = []
# ids = []
# # Iterate through each file in the directory
# for filename in os.listdir(output_folder):
#     if filename.endswith('.json'):
#         file_path = os.path.join(output_folder, filename)
#         with open(file_path, 'r') as file:
#             data = json.load(file)
#             form_data_list.append(data['form'])
#             ids.append(filename.split('.')[0])
# # Create a DataFrame from the form data list
# target_form = pd.DataFrame()
# target_form['id'] = ids
# target_form['target'] = form_data_list
# target_form['target'] = target_form['target'].astype(str)
# target_form = target_form.replace(',', ';', regex=True)  # Here we replace , by ;
# target_form['Usage'] = 'Public'


In [106]:
#target_form.to_csv('sample_submission.csv', index=False, sep=',')

# 7. Example of a Random Submission

In [107]:
# def submit():
#     start_path = "/kaggle/input/intelligent-text-extraction/test/images"
    
#     dataframe_liste = []
#     for root, dirs, files in os.walk(start_path):
#         for file in files:
#             image_path = os.path.join(root, file)
#             image_id = os.path.basename(image_path).split(".")[0]
#             image = Image.open(image_path)
    
#             ### change code here to incorporate your predictions
#             box = list(np.random.randint(0, 4000, 4))
#             text = "ETY017359D3"
#             ###
            
#             target = [{'box': box, 'text': text}]
#             target_tocsv = str(target).replace(',', ';')
#             line = {"id": image_id, "target": target_tocsv, "Usage": "Public"}
#             dataframe_liste.append(line)
    
#     df = pd.DataFrame(dataframe_liste)
#     df.to_csv("submission.csv", index=False)

# submit()