In [None]:
import pandas as pd
from dotenv import load_dotenv

# Load variables from a local `.env` file into the process environment (python-dotenv).
# NOTE(review): none of the cells visible here read environment variables — confirm this call is still needed.
load_dotenv()

In [None]:
# aeeh-to-fix.csv is extracted directly from the database and targets the invalid
# matricules across all waves.
# `id_psp` is forced to `str` because it is the join key against the wave exports, which
# are all read with dtype=str: letting pandas infer a numeric dtype here would drop
# leading zeros and make the merge on `id_psp` fail on mismatched key dtypes.
df_existing = pd.read_csv('./aeeh-to-fix.csv', encoding='utf-8', sep=';', dtype={'id_psp': str})

# Data already injected during the previous waves (2 to 5), stacked into a single frame
# so the records to fix can be matched against it.
# Every export is read as plain strings (no NA inference); malformed lines are skipped.
_previous_wave_exports = (
    './previous_waves/wave-2/2025-09-19-production-with-dossier-ids.csv',
    './previous_waves/wave-3/2025-09-26-production-with-dossier-ids.csv',
    './previous_waves/wave-4/2025-10-03-production-with-dossier-ids.csv',
    './previous_waves/wave-5/2025-10-13-production-with-dossier-ids.csv',
)
df_previous_waves = pd.concat([
    pd.read_csv(csv_path, on_bad_lines='skip', sep=';', dtype=str, engine="c",
                keep_default_na=False, encoding="utf-8")
    for csv_path in _previous_wave_exports
])

# Inner join on `id_psp`: keep only the records to fix that also appear in a previous wave.
# Columns present on both sides are suffixed `_` (left) and `_from_dossiers` (right).
df_merged = df_existing.merge(
    df_previous_waves,
    on='id_psp',
    how='inner',
    suffixes=('_', '_from_dossiers'),
)
# Folders export, read from the last wave only (wave 5) — to refactor later.
# This file is comma-separated and is read as plain strings, skipping malformed lines.
# Its French column headers are mapped to the short names used in the rest of the notebook.
_folder_column_names = {
    "ID": "dossier_id",
    "Le numéro d\'allocataire CAF": "dossier_matricule",
    "L\'organisme de gestion de votre allocation": "dossier_organisme"
}
df_all_folders = pd.read_csv(
    './previous_waves/wave-5/2025-10-13-dossiers-a-traiter.csv',
    on_bad_lines='skip',
    sep=',',
    dtype=str,
    engine="c",
    keep_default_na=False,
    encoding="utf-8",
).rename(columns=_folder_column_names)

In [None]:
# Attach the folder data (matricule and managing organisation) to every matched record.
# Folder columns that clash with existing ones get the `_from_folders` suffix.
df_consolidated_folders = pd.merge(df_merged, df_all_folders, how='inner', on='dossier_id',
                                   suffixes=('', '_from_folders'))

# Keep only the folders managed by the CAF. `.copy()` makes the filtered frame independent
# of the merge result, so adding a column below does not trigger SettingWithCopyWarning.
df_consolidated_folders = df_consolidated_folders[df_consolidated_folders['dossier_organisme'] == 'CAF'].copy()

# Corrected matricule: remove spaces, keep the first 7 characters, left-pad with zeros to 7.
# `regex=False` makes the literal replacement explicit regardless of the pandas version.
df_consolidated_folders['matricule_corrige'] = (
    df_consolidated_folders['dossier_matricule']
    .str.replace(' ', '', regex=False)
    .str[:7]
    .str.zfill(7)
)

In [None]:
# Exclude invalid matricules. A row is kept only when:
#  - the corrected matricule contains digits only, and
#  - the source matricule is not blank: a blank value is zero-padded to '0000000',
#    which is digits-only and would otherwise be written to the database as if valid.
mask_digits_only = ~df_consolidated_folders['matricule_corrige'].str.contains(r'\D')
mask_source_not_blank = df_consolidated_folders['dossier_matricule'].str.replace(' ', '', regex=False) != ''
mask_exclude_invalid_matricules = mask_digits_only & mask_source_not_blank

df_consolidated_folders = df_consolidated_folders[mask_exclude_invalid_matricules]
# Displayed for a visual check of the source matricule against its corrected value.
df_consolidated_folders[['dossier_id', 'id_psp', 'dossier_matricule', 'matricule_corrige']]

In [None]:
# Template for one UPDATE statement. `:matricule` and `:id_psp` are placeholder tokens that
# are substituted textually below: the result is a standalone .sql file, so driver-level
# bind parameters are not available here.
base_query = "UPDATE beneficiaires b SET allocataire = JSONB_SET(allocataire::JSONB, '{matricule}', TO_JSONB(:matricule::TEXT), true) WHERE id_psp = :id_psp AND exercice_id = 4;"


def _sql_literal(value) -> str:
    """Render `value` as a single-quoted SQL string literal, doubling embedded quotes."""
    return "'" + str(value).replace("'", "''") + "'"


# Build one UPDATE query per remaining row. Values come from CSV files, so they are escaped
# before being embedded in the statement.
update_statements = [
    base_query
    .replace(':matricule', _sql_literal(matricule))
    .replace(':id_psp', _sql_literal(id_psp))
    for matricule, id_psp in zip(df_consolidated_folders['matricule_corrige'],
                                 df_consolidated_folders['id_psp'])
]

# Wrap everything in a single transaction so the fix is applied atomically.
output = '\n'.join(['BEGIN;', *update_statements, 'COMMIT;'])

# Explicit encoding so the generated script does not depend on the platform default.
with open("./matricules-to-fix.sql", "w", encoding="utf-8") as f:
    f.write(output)