## Process
- Load backup file
- Load DS (Demarches Simplifiees) CSV file
- Apply eligibility dates
  - Birth date from 01/01/2012 to 31/12/2019 (6 to 13 years old) or from 01/01/2005 to 31/12/2007 (18 to 20 years old), bounds inclusive
- Clean & Format the rows
  - Add the column "folder_number" (default to np.NaN)
  - Remove duplicates from the GRIST AEEH list
- Output 1 CSV file with the database format (to be injection ready)
- Output 1 CSV file for the support team with the created column and the "folder_number"

In [None]:
import csv
import os
import pandas as pd
import json
import numpy as np

from datetime import datetime, timedelta
from dotenv import load_dotenv
from utils.data_utils import unaccent_and_upper, format_insee_or_postal_code, get_current_date_for_file_name

# Let python-dotenv populate the process environment before reading it.
load_dotenv()

# Both input locations are mandatory: a missing variable raises KeyError here.
ds_input_filepath, existing_codes_filepath = (
    os.environ[variable_name]
    for variable_name in (
        'DEMARCHES_SIMPLIFIEES_PATHFILE_2025',
        'EXISTING_CODES_PATHFILE_2025',
    )
)

In [None]:
# Columns written to the per-folder decision CSV files (accept / refuse lists).
dossiers_columns = [
    'dossier_id',
    'prenom',
    'nom',
    'date_naissance',
    'allocataire-courriel',
]

# Columns of the production CSV, in the order they are written.
db_columns = [
    'nom',
    'prenom',
    'date_naissance',
    'genre',
    'organisme',
    'situation',
    'allocataire',
    'adresse_allocataire',
    'created_at',
    'updated_at',
    'exercice_id',
    'uuid_doc',
    'zrr',
    'qpv',
    'a_valider',
    'refuser',
    'id_psp',
]

# Same layout as db_columns, prefixed with the folder identifier.
db_columns_with_dossier = ['dossier_id', *db_columns]

# Maps the column headers of the DS CSV export to the internal column names
# used by the rest of this notebook.
column_mapping = {
    # Folder metadata
    "ID": "dossier_id",
    "Email": "demandeur_email",
    "FranceConnect ?": "france_connect_a_ete_utilise",
    "Civilité": "qualite",
    "Nom": "demandeur_nom",
    "Prénom": "demandeur_prenom",
    "Dépôt pour un tiers": "depot_pour_un_tiers",
    "Nom du mandataire": "nom_mandataire",
    "Prénom du mandataire": "prenom_mandataire",
    "À archiver": "a_archiver",
    "État du dossier": "etat_dossier",
    "Dernière mise à jour le": "derniere_mise_a_jour",
    "Dernière mise à jour du dossier le": "derniere_mise_a_jour_du_dossier",
    "Déposé le": "depose_le",
    "Passé en instruction le": "passe_en_instruction_le",
    "Traité le": "traite_le",
    "Motivation de la décision": "decision",
    "Instructeurs": "instructeurs",
    # Benefit recipient (allocataire)
    "Percevez-vous l'allocation d'éducation de l'enfant handicapé (AEEH) ?": "est_aeeh",
    "Nom de famille de l'allocataire": "allocataire-nom",
    "Prénom de l'allocataire": "allocataire-prenom",
    "Adresse électronique de l'allocataire": "allocataire-courriel",
    "L'organisme de gestion de votre allocation": "organisme",
    "Adresse de résidence de l'allocataire": "adresse_allocataire-voie",
    "Commune de résidence de l'allocataire": "adresse_allocataire-commune",
    "Commune de résidence de l'allocataire (Code INSEE)": "adresse_allocataire-commune_insee",
    "Commune de résidence de l'allocataire (Département)": "adresse_allocataire-departement",
    "Le numéro d'allocataire CAF": "allocataire-matricule",
    # Child
    "Genre": "genre",
    "Prénom de l'enfant": "prenom",
    "Nom de famille de l'enfant": "nom",
    "Date de naissance de l'enfant": "date_naissance",
    # Attachments and annotations
    "Attestation de paiement de l'AEEH, fournie par votre CAF ou MSA": "attestation_paiement",
    "Nouvelle annotation": "annotation",
}

# Read every column as text and keep empty cells as '' (no NaN conversion),
# then switch to the internal column names.
df_ds = pd.read_csv(
    ds_input_filepath,
    sep=',',
    encoding="utf-8",
    dtype=str,
    keep_default_na=False,
    engine="c",
    on_bad_lines='skip',  # malformed lines are dropped without raising
).rename(columns=column_mapping)

In [None]:
# Rewrite the one status label that is normalised here; other values are kept.
df_ds['etat_dossier'] = df_ds['etat_dossier'].replace({'En instruction': 'en_instruction'})

# Split the raw commune cell into a 5-digit code (taken from the parentheses)
# and the text that precedes it. Cells that do not match a pattern become NaN.
raw_commune = df_ds['adresse_allocataire-commune']
df_ds['adresse_allocataire-code-postal'] = raw_commune.str.extract(r'\((\d{5})\)')
df_ds['adresse_allocataire-commune'] = raw_commune.str.extract(r'(.+)\s\(\d{5}\)')

In [None]:
# Normalise the CAF matricule to exactly 7 characters:
# longer values are truncated, shorter ones are left-padded with zeros.
mask_caf = df_ds['organisme'] == 'CAF'
df_ds.loc[mask_caf, 'allocataire-matricule'] = df_ds.loc[mask_caf, 'allocataire-matricule'].str[:7]

# Empty cells are read as '' (keep_default_na=False). They must stay empty:
# padding them would fabricate a '0000000' matricule that is later exported.
matricule_length = df_ds['allocataire-matricule'].str.len()
mask_matricule = (matricule_length > 0) & (matricule_length < 7)
df_ds.loc[mask_caf & mask_matricule, 'allocataire-matricule'] = df_ds.loc[mask_caf & mask_matricule, 'allocataire-matricule'].str.zfill(7)

In [None]:
# Constant columns added to every row.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
df_ds['allocataire-qualite'] = np.nan
df_ds['situation'] = 'jeune'

In [None]:
# Parse the birth date, shift it by 4 hours, then store it back as text.
# NOTE(review): the reason for the 4-hour shift is not visible in this notebook
# (presumably a timezone workaround) - confirm before changing it.
parsed_birth_dates = pd.to_datetime(df_ds['date_naissance'], format='%Y-%m-%d')
df_ds['date_naissance'] = (parsed_birth_dates + timedelta(hours=4)).astype(str)

# Child names: apply unaccent_and_upper, then trim surrounding whitespace.
for name_column in ('prenom', 'nom'):
    df_ds[name_column] = df_ds[name_column].apply(unaccent_and_upper).str.strip()

# Map the civility labels to single-letter codes; other values are kept as-is.
gender_codes = {'M.': 'M', 'Mme': 'F'}
df_ds['genre'] = df_ds['genre'].replace(gender_codes)

In [None]:
# Matching against previous injected data: the three decision files written
# for the previous wave are stacked into a single frame.
previous_wave_files = (
    './previous_waves/2025-09-11-dossiers-a-accepter.csv',
    './previous_waves/2025-09-11-dossiers-a-refuser-doublons.csv',
    './previous_waves/2025-09-11-dossiers-a-refuser-non-eligibles.csv',
)
df_previous_waves = pd.concat([
    pd.read_csv(wave_path, on_bad_lines='skip', sep=';', dtype=str, engine="c", keep_default_na=False, encoding="utf-8")
    for wave_path in previous_wave_files
])

# Left merge on the child identity; the '_merge' indicator tells whether a
# current row was also found in a previous wave ('both') or not ('left_only').
child_identity_keys = ['prenom', 'nom', 'date_naissance']
df_merge_all_waves = pd.merge(
    df_ds,
    df_previous_waves,
    how='left',
    on=child_identity_keys,
    suffixes=('', '_prev'),
    indicator=True,
)

print(f"{len(df_ds)} folders from current wave")
print(f"{len(df_previous_waves)} folders from previous waves")
print(f"{len(df_merge_all_waves)} folders from merging all waves on [prenom, nom, date_naissance]")

In [None]:
# Notebook display: number of merged rows per '_merge' indicator value.
df_merge_all_waves['_merge'].value_counts()

In [None]:
# Rows of the merge whose child identity was also found in a previous wave.
rows_found_in_previous = df_merge_all_waves[df_merge_all_waves['_merge'] == 'both']
both_count = len(rows_found_in_previous)
print(f"{both_count} total after merge in both")
print(f"{both_count - len(df_previous_waves)} duplicated folders found in current wave against previous waves")

In [None]:
# Rows of the merge that have no match in the previous waves.
left_only_count = len(df_merge_all_waves.loc[df_merge_all_waves['_merge'] == 'left_only'])
print(f"{left_only_count} from left merge")

In [None]:
# Keep only the rows that were not matched in the previous waves, and check
# that none of their folder ids already appears in the previous waves.
is_unmatched_row = df_merge_all_waves['_merge'] == 'left_only'
df_merge_keep_only_current_wave = df_merge_all_waves[is_unmatched_row]
id_seen_in_previous_waves = df_merge_keep_only_current_wave['dossier_id'].isin(df_previous_waves['dossier_id'])
assert not id_seen_in_previous_waves.any()

In [None]:
# Keep the first folder of each (prenom, nom, date_naissance) identity;
# every folder whose id was not kept is reported as a duplicate.
identity_columns = ['prenom', 'nom', 'date_naissance']
df_ds_without_duplicated = df_merge_keep_only_current_wave.drop_duplicates(subset=identity_columns)

kept_folder_ids = df_ds_without_duplicated['dossier_id']
is_kept_folder = df_merge_keep_only_current_wave['dossier_id'].isin(kept_folder_ids)
df_duplicated_folders = df_merge_keep_only_current_wave[~is_kept_folder]
print(f"{len(df_duplicated_folders)} duplicated folders found")

In [None]:
# Eligibility is decided on the calendar day of birth, both bounds inclusive.
# The stored value is text with a time component, so it is parsed once and
# normalised to midnight before being compared.
# pd.Timestamp is used for the bounds so this cell does not depend on what the
# name `datetime` is bound to (a later cell runs `import datetime`).
birth_days = pd.to_datetime(df_ds_without_duplicated['date_naissance']).dt.normalize()

# 6 to 13 years old
mask_6_13_dob_start = birth_days >= pd.Timestamp(2012, 1, 1)
mask_6_13_dob_end = birth_days <= pd.Timestamp(2019, 12, 31)

# 18 to 20 years old
mask_18_20_dob_start = birth_days >= pd.Timestamp(2005, 1, 1)
mask_18_20_dob_end = birth_days <= pd.Timestamp(2007, 12, 31)

mask_within_dates = (mask_6_13_dob_start & mask_6_13_dob_end) | (mask_18_20_dob_start & mask_18_20_dob_end)

print(f"{len(df_ds_without_duplicated)} total rows")

# .copy(): later cells add and overwrite columns on df_ds_eligible; working on
# an independent frame avoids chained-assignment warnings on a filtered slice.
df_ds_eligible = df_ds_without_duplicated[mask_within_dates].copy()

print(f"{len(df_ds_eligible)} total rows after applying dates requirements")

# Dossiers not meeting date requirements
df_ds_not_eligible = df_ds_without_duplicated[~df_ds_without_duplicated['dossier_id'].isin(df_ds_eligible['dossier_id'])]
print(f"{len(df_ds_not_eligible)} total rows that do not meet dates requirements")

In [None]:
# map allocataire json
def to_json_allocataire_without_null(row):
    """Serialise the benefit recipient of one row as a JSON object string.

    Args:
        row: a row exposing 'allocataire-nom', 'allocataire-prenom',
            'allocataire-courriel' and 'allocataire-matricule' as strings.

    Returns:
        A JSON string with the keys 'nom', 'prenom', 'courriel' and, only when
        the matricule is non-empty, 'matricule'. Null values are left out.
    """
    allocataire_mapping = {
        # Always null here, hence always removed by the filter below.
        # np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
        'qualite': np.nan,
        'nom': unaccent_and_upper(row['allocataire-nom']).strip(),
        'prenom': unaccent_and_upper(row['allocataire-prenom']).strip(),
        'courriel': row['allocataire-courriel'].lower().strip()
    }
    # An empty matricule is omitted rather than exported as "".
    if row['allocataire-matricule']:
        allocataire_mapping['matricule'] = row['allocataire-matricule']
    filtered_NaN_allocataire = {k: v for k, v in allocataire_mapping.items() if pd.notnull(v)}
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    return json.dumps(filtered_NaN_allocataire, ensure_ascii=False)

# Build the JSON payload row by row, then store it in the 'allocataire' column.
allocataire_json = df_ds_eligible.apply(to_json_allocataire_without_null, axis=1)
df_ds_eligible.loc[:, 'allocataire'] = allocataire_json

In [None]:
# map adresse_allocataire json
def to_json_adresse_without_null(row):
    """Serialise the recipient address of one row as a JSON object string.

    Args:
        row: a row exposing 'adresse_allocataire-voie',
            'adresse_allocataire-commune', 'adresse_allocataire-code-postal'
            and 'adresse_allocataire-commune_insee'.

    Returns:
        A JSON string with the keys 'voie', 'commune', 'code_postal' and
        'code_insee'; keys whose value is null are left out.
    """
    # The commune is NaN when the raw cell did not match the
    # "<name> (<5 digits>)" pattern; calling .strip() on it would raise.
    # A missing commune is dropped by the null filter below instead.
    commune = row['adresse_allocataire-commune']
    adresse_mapping = {
        # Double quotes in the street are replaced by single quotes.
        'voie': unaccent_and_upper(row['adresse_allocataire-voie'].strip()).replace('"', '\''),
        'commune': unaccent_and_upper(commune.strip()) if pd.notnull(commune) else None,
        # NOTE(review): the postal code can also be NaN for the same reason;
        # format_insee_or_postal_code is assumed to cope with it - confirm.
        'code_postal': format_insee_or_postal_code(row['adresse_allocataire-code-postal']),
        'code_insee': format_insee_or_postal_code(row['adresse_allocataire-commune_insee'])
    }
    filtered_address = {k: v for k, v in adresse_mapping.items() if pd.notnull(v)}
    return json.dumps(filtered_address, ensure_ascii=False)

# Build the JSON payload row by row, then store it in 'adresse_allocataire'.
adresse_json = df_ds_eligible.apply(to_json_adresse_without_null, axis=1)
df_ds_eligible.loc[:, 'adresse_allocataire'] = adresse_json

In [None]:
# Add the default columns needed to back up the data.
# created_at / updated_at must carry a value for the 2025 data, otherwise the
# merge does not work as intended (the dates from 2024 would replace the
# missing dates of the 2025 rows).
# The time part is forced to 23:23:23.023 so that these rows can be identified.
timestamp_with_custom_tz = pd.Timestamp.now(tz='Europe/Paris')
timestamp_to_identify = timestamp_with_custom_tz.replace(
    hour=23,
    minute=23,
    second=23,
    microsecond=23000
)

exercice_2025 = 4
df_ds_eligible.loc[:, ['exercice_id']] = exercice_2025
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
df_ds_eligible.loc[:, ['id_psp', 'uuid_doc']] = np.nan
df_ds_eligible.loc[:, ['zrr', 'qpv', 'a_valider', 'refuser']] = False
df_ds_eligible.loc[:, ['created_at', 'updated_at']] = timestamp_to_identify
df_ds_eligible.loc[:, 'date_naissance'] = df_ds_eligible['date_naissance'].astype(str)

In [None]:
# Codes that already exist, read as text with empty cells kept as ''.
df_existing_codes = pd.read_csv(
    existing_codes_filepath,
    sep=',',
    encoding="utf-8",
    dtype=str,
    keep_default_na=False,
    engine="c",
    on_bad_lines='skip',
)

In [None]:
# Keep a single row per code (the first occurrence wins).
df_existing_codes = df_existing_codes.drop_duplicates(subset=['code'], keep='first')

In [None]:
# Unique codes generation
import random
import string
# Import the class, not the module: `import datetime` would rebind the name
# `datetime` used by earlier cells (`datetime(2012, 1, 1)`), which then fail
# with "'module' object is not callable" when they are re-run.
from datetime import datetime

# A fresh 0..n-1 index, so the codes can be assigned positionally afterwards.
df_ds_eligible = df_ds_eligible.reset_index(drop=True)
current_date = datetime.now()
# Two-digit year used as the prefix of every generated code.
current_year = str(current_date.year)[-2:]

import secrets

# Uppercase ASCII letters without 'O' and 'I'. Built once instead of per call.
_CODE_ALPHABET = ''.join(c for c in string.ascii_uppercase if c not in 'OI')


def get_characters_set(size = 4):
    """Return `size` random characters drawn from the code alphabet.

    Args:
        size: number of characters to draw (default 4); 0 gives ''.

    Returns:
        A string of uppercase ASCII letters, never containing 'O' or 'I'.
    """
    # The result ends up in codes that are issued to people, so the characters
    # come from the OS CSPRNG (`secrets`) rather than the predictable
    # Mersenne Twister behind `random`.
    return ''.join(secrets.choice(_CODE_ALPHABET) for _ in range(size))

def generate_code():
    """Return a new code shaped '<current_year>-XXXX-XXXX'.

    The two 4-character groups are drawn independently by get_characters_set.
    """
    first_group = get_characters_set(4)
    second_group = get_characters_set(4)
    return '-'.join((current_year, first_group, second_group))

# Start from the codes that already exist so a new code can never collide.
unique_codes = set(df_existing_codes['code'])

# Number of codes known before generation starts.
current_codes_count = len(unique_codes)

# Draw codes until there is one new code per eligible row; a draw that is
# already in the set leaves its size unchanged and is simply retried.
target_codes_count = len(df_ds_eligible) + len(df_existing_codes)
while len(unique_codes) < target_codes_count:
    unique_codes.add(generate_code())

In [None]:
# The set must hold the existing codes plus one code per eligible row.
expected_codes_total = len(df_ds_eligible) + len(df_existing_codes)
assert len(unique_codes) == expected_codes_total

In [None]:
# Codes generated in this run: everything that was not already known.
previously_known_codes = set(df_existing_codes['code'])
new_codes = unique_codes - previously_known_codes
assert len(new_codes) == len(df_ds_eligible)
len(new_codes)

In [None]:
# Give one generated code to each eligible row, in set iteration order.
generated_codes = list(new_codes)
df_ds_eligible['id_psp'] = generated_codes

In [None]:
# A folder must not be both in the duplicates list and in the non-eligible list.
folders_in_both_lists = pd.merge(df_duplicated_folders, df_ds_not_eligible, how="inner", on=["dossier_id"])
assert folders_in_both_lists.empty

In [None]:
# Output to CSV files: accepted folders in the database layout, once with the
# folder id column and once without it.
for export_columns, export_name in (
    (db_columns_with_dossier, 'production-with-dossier-ids.csv'),
    (db_columns, 'production.csv'),
):
    df_ds_eligible[export_columns].to_csv(
        get_current_date_for_file_name(export_name),
        sep=';',
        index=False,
        encoding='utf-8',
    )

In [None]:
# Output to CSV files: one decision list per outcome (duplicates to refuse,
# non-eligible folders to refuse, folders to accept).
for decision_frame, decision_file_name in (
    (df_duplicated_folders, 'dossiers-a-refuser-doublons.csv'),
    (df_ds_not_eligible, 'dossiers-a-refuser-non-eligibles.csv'),
    (df_ds_eligible, 'dossiers-a-accepter.csv'),
):
    decision_frame[dossiers_columns].to_csv(
        get_current_date_for_file_name(decision_file_name),
        sep=';',
        index=False,
        encoding='utf-8',
    )

In [None]:
# Final summary of the run; each count is computed once and reused.
not_eligible_count = len(df_ds_not_eligible)
eligible_count = len(df_ds_eligible)
duplicated_count = len(df_duplicated_folders)
current_wave_total = eligible_count + not_eligible_count + duplicated_count

print(f"{not_eligible_count} total rows that do not meet dates requirements")
print(f"{eligible_count} total rows after applying dates requirements")
print(f"{duplicated_count} duplicated folders found")
print(f"{eligible_count} folders to accept")
print(f"{len(df_ds)} initial total rows")
print(f"{current_wave_total} total rows processed from current wave")
print(f"{current_wave_total + len(df_previous_waves)} total rows processed from all waves")

In [None]:
# Notebook display: number of rows per distinct 'instructeurs' value.
df_ds['instructeurs'].value_counts()