## Process
- Load backup file
- Load DS (Demarches Simplifiees) CSV file
- Apply eligibility dates
  - 01/01/2006 to 31/12/2019 (inclusive)
- Clean & Format the rows
  - Add the column "is_found" (defaults to False)
  - Add the column "folder_number" (defaults to NaN)
- Match DS rows against the backup rows
- If a DS row matches a backup row, set its "is_found" value to True; otherwise leave it False
- Output 1 CSV file with the database format (to be injection ready)
- Output 1 CSV file for the support team with the created column "is_found", and the "folder_number"

In [None]:
import os
import pandas as pd
import json
import numpy as np
import csv

from datetime import datetime, timedelta
from dotenv import load_dotenv
from utils.data_utils import unaccent_and_upper, format_insee_or_postal_code, get_current_date_for_file_name

# Load the variables declared in the local .env file (python-dotenv).
load_dotenv()

# Locations of the three input files; a missing variable raises KeyError.
ds_input_filepath, backup_input_filepath, grist_input_filepath = (
    os.environ[env_key]
    for env_key in (
        'DEMARCHES_SIMPLIFIEES_PATHFILE_2025',
        'BACKUP_PATHFILE_2025',
        'GRIST_AEEH_CLEANED_PATHFILE',
    )
)

In [None]:
# Column names, in order, selected for the CSV export at the end of the notebook.
db_columns = [
    'nom',
    'prenom',
    'date_naissance',
    'genre',
    'organisme',
    'situation',
    'allocataire',
    'adresse_allocataire',
    'created_at',
    'updated_at',
    'exercice_id',
    'uuid_doc',
    'zrr',
    'qpv',
    'a_valider',
    'refuser',
    'id_psp',
]
# Same columns with the DS dossier identifier in first position.
db_columns_with_dossier = ['dossier_id', *db_columns]

# Maps the headers of the Demarches Simplifiees CSV export (keys) to the
# column names used in the rest of this notebook (values). It is applied with
# DataFrame.rename right after the file is read; headers that are not listed
# here keep their original name.
column_mapping = {
  # Dossier metadata and applicant identity
  "ID" : "dossier_id",
  "Email": "demandeur_email",
  "FranceConnect ?": "france_connect_a_ete_utilise",
  "Civilité": "qualite",
  "Nom": "demandeur_nom",
  "Prénom": "demandeur_prenom",
  "Dépôt pour un tiers": "depot_pour_un_tiers",
  "Nom du mandataire": "nom_mandataire",
  "Prénom du mandataire": "prenom_mandataire",
  "À archiver": "a_archiver",
  "État du dossier": "etat_dossier",
  "Dernière mise à jour le": "derniere_mise_a_jour",
  "Dernière mise à jour du dossier le": "derniere_mise_a_jour_du_dossier",
  "Déposé le": "depose_le",
  "Passé en instruction le": "passe_en_instruction_le",
  "Traité le": "traite_le",
  "Motivation de la décision": "decision",
  "Instructeurs": "instructeurs",
  # Allocataire details; the "allocataire-" / "adresse_allocataire-" columns
  # are later folded into the 'allocataire' and 'adresse_allocataire' JSON columns
  "Percevez-vous l'allocation d'éducation de l'enfant handicapé (AEEH) ?": "est_aeeh",
  "Nom de famille de l'allocataire": "allocataire-nom",
  "Prénom de l'allocataire": "allocataire-prenom",
  "Adresse électronique de l'allocataire": "allocataire-courriel",
  "L'organisme de gestion de votre allocation": "organisme",
  "Adresse de résidence de l'allocataire": "adresse_allocataire-voie",
  "Commune de résidence de l'allocataire": "adresse_allocataire-commune",
  "Commune de résidence de l'allocataire (Code INSEE)": "adresse_allocataire-commune_insee",
  "Commune de résidence de l'allocataire (Département)": "adresse_allocataire-departement",
  # Alternative header for the same field, currently disabled
  # "Le numéro d'allocataire": "allocataire-matricule",
  "Le numéro d'allocataire CAF": "allocataire-matricule",
  # Child identity: these are the columns the backup match is done on
  "Genre": "genre",
  "Prénom de l'enfant": "prenom",
  "Nom de famille de l'enfant": "nom",
  "Date de naissance de l'enfant": "date_naissance",
  "Attestation de paiement de l'AEEH, fournie par votre CAF ou MSA": "attestation_paiement",
  "Nouvelle annotation": "annotation"
}

# Read the Demarches Simplifiees export. Every column is read as text and
# empty cells stay '' instead of NaN (keep_default_na=False), which the string
# handling in the following cells relies on.
# Malformed lines are still dropped, but on_bad_lines='warn' reports each one
# instead of discarding applicants silently.
ds_df = pd.read_csv(
    ds_input_filepath,
    sep=',',
    dtype=str,
    engine="c",
    keep_default_na=False,
    encoding="utf-8",
    on_bad_lines='warn',
)
# Switch from the DS headers to the notebook's internal column names.
ds_df = ds_df.rename(columns=column_mapping)

In [None]:
# Rewrite the 'En instruction' status label as 'en_instruction'.
ds_df['etat_dossier'] = ds_df['etat_dossier'].replace({'En instruction': 'en_instruction'})

# Split the commune cell into its 5-digit code (captured from the parentheses)
# and the text that precedes it. Both patterns read the original column, so
# the code must be extracted before the commune column is overwritten.
# Rows that do not match a pattern get NaN in the corresponding column.
raw_commune = ds_df['adresse_allocataire-commune']
ds_df['adresse_allocataire-code-postal'] = raw_commune.str.extract(r'\((\d{5})\)')
ds_df['adresse_allocataire-commune'] = raw_commune.str.extract(r'(.+)\s\(\d{5}\)')

In [None]:
# Rows whose allocation is managed by the CAF.
mask_caf = ds_df['organisme'] == 'CAF'
# Keep the first 7 characters of the CAF matricule. Spaces are removed BEFORE
# truncating: slicing first would count the spaces among the 7 characters,
# dropping trailing digits and leaving the spaces in the stored value.
ds_df.loc[mask_caf, 'allocataire-matricule'] = (
    ds_df.loc[mask_caf, 'allocataire-matricule']
    .str.replace(' ', '', regex=False)
    .str[:7]
)

In [None]:
# CAF matricules shorter than 7 characters are stripped of spaces and
# left-padded with zeros up to 7 characters.
# NOTE(review): an empty matricule on a CAF row becomes '0000000' here;
# confirm that this is intended.
mask_matricule = ds_df['allocataire-matricule'].str.len() < 7
short_caf_rows = mask_caf & mask_matricule
ds_df.loc[short_caf_rows, 'allocataire-matricule'] = (
    ds_df.loc[short_caf_rows, 'allocataire-matricule']
    .str.replace(' ', '')
    .str[:7]
    .str.zfill(7)
)

In [None]:
# Constant columns set on every row.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
ds_df['allocataire-qualite'] = np.nan
ds_df['situation'] = 'jeune'

In [None]:
# Parse the YYYY-MM-DD birth dates into datetimes for the processing below.
ds_df['date_naissance'] = pd.to_datetime(ds_df['date_naissance'], format='%Y-%m-%d')

# Child first and last names: run through the project helper
# `unaccent_and_upper`, then trim surrounding whitespace.
for name_column in ('prenom', 'nom'):
    ds_df[name_column] = ds_df[name_column].apply(unaccent_and_upper).str.strip()

# Civility labels -> single-letter gender codes; other values are kept as-is.
genre_codes = {'M.': 'M', 'Mme': 'F'}
ds_df['genre'] = ds_df['genre'].replace(genre_codes)

In [None]:
# Eligibility window: born from 2012-01-01 to 2019-12-31 inclusive
# (6 to 13 years old).
# NOTE(review): the "Process" notes at the top of this notebook give
# 01/01/2006 as the lower bound; confirm which one is intended.
# pandas Timestamps are used for the bounds so this cell does not depend on
# what the name `datetime` is bound to when the cell is (re)run.
birth_dates = pd.to_datetime(ds_df['date_naissance'])
mask_dob_start = birth_dates >= pd.Timestamp(2012, 1, 1)
# Strictly before 2020-01-01 so any time of day on 2019-12-31 is still included.
mask_dob_end = birth_dates < pd.Timestamp(2020, 1, 1)

# .copy() gives an independent frame: assigning a column on a filtered slice
# triggers SettingWithCopyWarning.
ds_df = ds_df[mask_dob_start & mask_dob_end].copy()

# add 4h on all birthdates
# NOTE(review): presumably keeps the calendar day stable if the value is later
# shifted between time zones; confirm. Re-running this cell adds 4h again.
ds_df['date_naissance'] = ds_df['date_naissance'] + pd.Timedelta(hours=4)

In [None]:
# map allocataire json
def to_json_allocataire_without_null(row):
    """Serialise the allocataire fields of one DS row into a JSON string.

    Args:
        row: one row of ``ds_df`` (as passed by ``DataFrame.apply(axis=1)``),
            read through the ``allocataire-nom``, ``allocataire-prenom``,
            ``allocataire-courriel`` and ``allocataire-matricule`` keys.

    Returns:
        A JSON object string (non-ASCII characters kept as-is) with the
        ``nom``, ``prenom`` and ``courriel`` keys, plus ``matricule`` when the
        row holds a truthy one. Entries whose value is null are left out.
    """
    allocataire_mapping = {
        # Always null, hence always removed by the filter below.
        # np.nan replaces np.NaN, which no longer exists in NumPy 2.0.
        'qualite': np.nan,
        'nom': unaccent_and_upper(row['allocataire-nom']).strip(),
        'prenom': unaccent_and_upper(row['allocataire-prenom']).strip(),
        'courriel': row['allocataire-courriel'].lower().strip()
    }
    # An empty matricule ('') is skipped so the key does not appear at all.
    if row['allocataire-matricule']:
        allocataire_mapping['matricule'] = row['allocataire-matricule']
    filtered_NaN_allocataire = {k: v for k, v in allocataire_mapping.items() if pd.notnull(v)}
    return json.dumps(filtered_NaN_allocataire, ensure_ascii=False)

# Build the 'allocataire' JSON column, one JSON string per row.
ds_df['allocataire'] = ds_df.apply(to_json_allocataire_without_null, axis=1)

In [None]:
# map adresse_allocataire json
def to_json_adresse_without_null(row):
    """Serialise the address fields of one DS row into a JSON string.

    Args:
        row: one row of ``ds_df`` (as passed by ``DataFrame.apply(axis=1)``),
            read through the ``adresse_allocataire-voie``,
            ``adresse_allocataire-commune``, ``adresse_allocataire-code-postal``
            and ``adresse_allocataire-commune_insee`` keys.

    Returns:
        A JSON object string (non-ASCII characters kept as-is) with the
        ``voie``, ``commune``, ``code_postal`` and ``code_insee`` keys.
        Entries whose value is null are left out.
    """
    def _clean_text(value):
        # The commune is NaN when the "<name> (<5 digits>)" extraction found
        # no match; calling .strip() on it would raise AttributeError.
        if pd.isnull(value):
            return None
        return unaccent_and_upper(value.strip())

    code_postal = row['adresse_allocataire-code-postal']
    adresse_mapping = {
        'voie': _clean_text(row['adresse_allocataire-voie']),
        'commune': _clean_text(row['adresse_allocataire-commune']),
        # The postal code comes from the same extraction, so it is NaN in the
        # same cases; only a real value is handed to the formatter.
        'code_postal': format_insee_or_postal_code(code_postal) if pd.notnull(code_postal) else None,
        'code_insee': format_insee_or_postal_code(row['adresse_allocataire-commune_insee'])
    }
    filtered_address = {k: v for k, v in adresse_mapping.items() if pd.notnull(v)}
    return json.dumps(filtered_address, ensure_ascii=False)

# Build the 'adresse_allocataire' JSON column, one JSON string per row.
ds_df['adresse_allocataire'] = ds_df.apply(to_json_adresse_without_null, axis=1)

In [None]:
# Add the default columns needed to line up with the backup data.
# created_at / updated_at get a real date for the 2025 data, otherwise the
# merge will not work as intended (the dates from 2024 would replace the
# missing dates on the 2025 data).
timestamp_with_custom_tz = pd.Timestamp.now(tz='Europe/Paris')

exercice_2025 = 4
ds_df['exercice_id'] = exercice_2025
# np.nan replaces np.NaN, which no longer exists in NumPy 2.0.
ds_df[['id_psp', 'uuid_doc']] = np.nan
ds_df[['zrr', 'qpv', 'a_valider', 'refuser']] = False
ds_df[['created_at', 'updated_at']] = timestamp_with_custom_tz

# Two-digit current year, e.g. '25'.
current_date = datetime.now()
current_year = str(current_date.year)[-2:]

In [None]:
# Both reference extracts are ';'-separated UTF-8 files read entirely as text.
backup_csv_options = {'sep': ';', 'encoding': 'utf-8', 'dtype': str}
backup_df_cnaf_2025 = pd.read_csv(backup_input_filepath, **backup_csv_options)
grist_df_aeeh = pd.read_csv(grist_input_filepath, **backup_csv_options)

In [None]:
# Parse the JSON text stored in 'allocataire' and flatten it into one column
# per key, each prefixed with 'allocataire-'.
allocataire_records = backup_df_cnaf_2025['allocataire'].apply(json.loads)
backup_df_cnaf_2025_unwrapped = pd.json_normalize(allocataire_records).add_prefix('allocataire-')

# Only the email column is attached to the backup, aligned on the row index.
backup_df_cnaf_2025 = backup_df_cnaf_2025.merge(
    backup_df_cnaf_2025_unwrapped['allocataire-courriel'],
    left_index=True,
    right_index=True,
)

In [None]:
# Stack the CNAF backup and the Grist AEEH rows into a single reference table.
backup_df = pd.concat([backup_df_cnaf_2025, grist_df_aeeh], ignore_index=True)
# reset_index() without drop=True also stores the row position in an 'index' column.
backup_df = backup_df.reset_index()

# Birth dates are turned into text because the backup columns were read as
# strings and the merge compares the values directly.
ds_df['date_naissance'] = ds_df['date_naissance'].astype(str)

In [None]:
# A DS row is eligible when a backup row has the same first name, last name
# and birth date. Backup columns that clash with DS ones get a '_right' suffix.
merged_df = pd.merge(ds_df, backup_df, how='inner', on=['prenom', 'nom', 'date_naissance'], suffixes=('', '_right'))

# Every DS dossier without a match, ordered by dossier_id. The sorted result
# is assigned back; it was previously computed and thrown away.
not_eligible_df = ds_df[~ds_df['dossier_id'].isin(merged_df['dossier_id'])]
not_eligible_df = not_eligible_df.sort_values(by=['dossier_id']).reset_index(drop=True)

# Each DS row must land in exactly one of the two groups. A DS row matching
# several backup rows is duplicated by the merge and makes this check fail.
assert len(not_eligible_df) + len(merged_df) == len(ds_df), (
    "matched + unmatched rows differ from the DS row count "
    "(a DS row probably matched several backup rows)"
)

In [None]:
# Unique codes generation
import random
import string
# Import the class, not the module: a bare `import datetime` here rebinds the
# `datetime` name imported at the top of the notebook, so earlier cells that
# call `datetime(...)` / `datetime.now()` fail when they are re-run.
from datetime import datetime

# Eligible rows on a fresh 0..n-1 index. The single-element concat yields a
# new frame, so merged_df is left untouched by later column assignments.
all_eligible = pd.concat([merged_df]).reset_index(drop=True)
# Two-digit current year, used as the prefix of every generated code.
current_date = datetime.now()
current_year = str(current_date.year)[-2:]

def get_characters_set(size = 4):
    """Return `size` random uppercase ASCII letters, never 'O' nor 'I'.

    Letters are drawn with replacement, so the same letter can appear twice.
    """
    allowed_letters = string.ascii_uppercase.translate(str.maketrans('', '', 'OI'))
    picked = random.choices(allowed_letters, k=size)
    return ''.join(picked)

def generate_code():
    """Build one code shaped '<current_year>-I<3 letters>-<4 letters>'.

    The letters come from `get_characters_set`; `current_year` is read from
    the enclosing notebook scope at call time.
    """
    first_group = get_characters_set(3)
    second_group = get_characters_set(4)
    return f"{current_year}-I{first_group}-{second_group}"

# Codes are collected in a set so a duplicate draw is ignored and drawn again.
# NOTE(review): the set starts empty, so previously issued codes are not
# taken into account here; confirm whether they should be loaded first.
unique_codes = set()

# Number of codes present before generation starts.
current_codes_count = len(unique_codes)

# Keep drawing until there is exactly one distinct code per eligible row.
codes_needed = len(all_eligible)
while len(unique_codes) < codes_needed:
    unique_codes.add(generate_code())

# Ensure we have generated codes for all the rows
assert len(unique_codes) == codes_needed

In [None]:
# Give one generated code to each eligible row. A set has no defined order,
# so which row receives which code is arbitrary.
all_eligible['id_psp'] = list(unique_codes)

In [None]:
# Summary of the matching step.
total_ds_rows = len(ds_df)
# With no DS row left after filtering, the ratio would raise
# ZeroDivisionError; report 0.00% instead.
success_rate = len(merged_df) / total_ds_rows if total_ds_rows else 0.0

print(f"{len(merged_df)} matches first wave")
print(f"{len(not_eligible_df)} not eligible from the first wave")
print(f"{len(all_eligible)} total eligible out of {total_ds_rows}")
print(f"Success rate first wave {success_rate:.2%}")

In [None]:
# Write the eligible rows, limited to the database columns, as a
# ';'-separated UTF-8 CSV with every field quoted and no index column.
export_df = all_eligible[db_columns]
output_path = get_current_date_for_file_name('aeeh.csv')
export_df.to_csv(output_path, sep=';', index=False, encoding='utf-8', quoting=csv.QUOTE_ALL)