# 🚀 PaddleOCR - Solution Anti-Segfault

**Problème résolu :** Segfault lors de `pipeline.predict()` dans Jupyter

**Solution :** Configuration spécifique de l'environnement PaddlePaddle


## ⚙️ ÉTAPE 1: Configuration Anti-Segfault (OBLIGATOIRE)

**⚠️ IMPORTANT : Exécutez cette cellule AVANT tout import PaddleOCR**


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import gc
import warnings
from PIL import Image

def configure_anti_segfault() -> None:
    """Set the environment variables intended to prevent segfaults in predict().

    Writes a fixed set of string values into ``os.environ`` and prints a
    message before and after doing so. Takes no arguments, returns nothing.
    Existing values for the same variable names are overwritten.
    """
    print("🔧 Configuration anti-segfault pour predict()...")

    safe_settings = {
        # Turn off the CPU/GPU acceleration paths considered problematic.
        "FLAGS_use_mkldnn": "false",
        "FLAGS_use_gpu": "false",
        "CUDA_VISIBLE_DEVICES": "",
        # Single-threaded math libraries, to avoid race conditions.
        "OMP_NUM_THREADS": "1",
        "MKL_NUM_THREADS": "1",
        "NUMEXPR_NUM_THREADS": "1",
        # Conservative memory settings.
        "FLAGS_eager_delete_tensor_gb": "0.0",
        "FLAGS_memory_fraction_of_eager_deletion": "0.0",
        "FLAGS_fraction_of_gpu_memory_to_use": "0.1",
        # Conservative convolution settings.
        "FLAGS_conv_workspace_size_limit": "32",
        "FLAGS_cudnn_exhaustive_search": "false",
        # Keep native logging to a minimum.
        "GLOG_minloglevel": "3",
    }
    os.environ.update(safe_settings)

    print("✅ Configuration terminée")

# Apply the configuration now; the notebook instructions require this to run
# before any PaddleOCR import.
configure_anti_segfault()


In [None]:
from src import main
from src import utils

In [None]:
# Input location and image to process; both names are passed to the pipeline
# below, and image_name is reused when building the output paths.
folder_path = "input/" # located in this notebook's folder
image_name = "tableau2.png"
# Run the project pipeline on the image; it returns two values (results, img).
# NOTE(review): presumably this also writes the output/<image>/ files that are
# read afterwards — confirm in src.main.
results, img = main.main_apply_pipeline(image_name,folder_path)

In [None]:
import json

from IPython.display import HTML, display

import src.utils as u_new

# Alternative sample inputs; uncomment one to read a different image's output.
# image_name = "tableau2-1.png"
# image_name = "bilan_passif_audit-1_png"
# image_name = "tableau_compte_resultat_ocr-1.png"

# Per-image output folder: the image's basename with dots replaced by underscores.
output_dir = f"output/{image_name.split('/')[-1].replace('.', '_')}"
json_path = f"{output_dir}/safe_input_res.json"

with open(json_path, "r", encoding="utf-8") as json_file:
    json_res = json.load(json_file)

# Only the first entry of 'table_res_list' is used.
# NOTE: a disabled alternative, u_new.load_paddleocr_data(json_path), used to
# produce these three values in one call.
first_table = json_res['table_res_list'][0]
layout_boxes = first_table['cell_box_list']
ocr_pred = first_table['table_ocr_pred']
rec_texts = ocr_pred['rec_texts']
rec_boxes = ocr_pred['rec_boxes']

# 1. Build the table structure from the cell boxes (no cleaning at this stage).
table_structure = u_new.extract_table_structure(
    layout_boxes,
    fill_empty_cells=True,
    extend_cells=True,
)
u_new.plot_table_structure(table_structure)

# 2. Assign the OCR texts to the structure and clean it; per the original
#    note, cleaning is requested here so that it happens after the assignment.
filled_structure = u_new.assign_ocr_to_structure(
    table_structure,
    rec_boxes,
    rec_texts,
    force_assignment=True,
    clean_structure=True,
    auto_correct_overlaps=True,
    smart_spacing=True,
)
u_new.plot_final_result(filled_structure)

# 3. Render the filled table as HTML and show it inline.
html_output = u_new.export_to_html(filled_structure, "Mon Tableau")
display(HTML(html_output))

# 4. Save the HTML next to the JSON result.
u_new.save_html_to_file(html_output, f"{output_dir}/tableau.html")

# 5. Export to Markdown and print it.
markdown_output = u_new.export_to_markdown(filled_structure, "Mon Tableau")
print(markdown_output)

In [None]:
# Debug cell: dump the fields of one entry of filled_structure.

n = 9  # position of the entry to inspect
# A one-element slice is iterated rather than indexing filled_structure[n].
for el in filled_structure[n:n + 1]:
    # Box coordinates (presumably the cell's bounding box), one value per line.
    for coord in ("x1", "y1", "x2", "y2"):
        print(getattr(el, coord))
    # Grid placement; each value is prefixed by its label, and the attribute
    # name is the label with the space replaced by an underscore.
    for label in ("row start", "col start", "row span", "col span"):
        print(label, getattr(el, label.replace(" ", "_")))
    # Text content and the auto-fill flag.
    for field in ("texts", "final_text", "is_auto_filled"):
        print(getattr(el, field))
    print('')