## Process
- Load backup file
- Load DS (Demarches Simplifiees) CSV file
- Apply eligibility dates
  - 01/01/2006 to 31/12/2019 (inclusive)
- Clean & Format the rows
  - Add the column "is_found" (defaults to False)
  - Add the column "folder_number" (defaults to NaN)
- Match DS rows against the backup rows
- For each DS row, set "is_found" to True when a matching backup row exists; otherwise leave it False
- Output one CSV file in the database format (ready for injection)
- Output one CSV file for the support team, including the added columns "is_found" and "folder_number"

In [None]:
import os
import pandas as pd
import json
import numpy as np
import csv

from datetime import datetime, timedelta
from dotenv import load_dotenv

from data.utils.data_utils import unaccent_and_upper

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Input file locations are read from the environment; a missing variable raises KeyError here.
ds_input_filepath = os.environ['DEMARCHES_SIMPLIFIEES_PATHFILE_2025']
backup_input_filepath = os.environ['BACKUP_PATHFILE_2025']

In [None]:
# Column names assigned, in order, to the columns of the Demarches Simplifiees CSV export.
# Every entry must end with a comma: two adjacent string literals are silently
# concatenated by Python, which would merge two names and shift all later columns.
ds_column_names = [
    'id',
    'email',
    'france_connect_has_been_used',
    'allocataire_quality',
    'benef_name',
    'benef_firstname',
    'depot_is_from_tutor',
    'tutor_name',
    'tutor_firstname',
    'to_archive',
    'folder_status',
    'last_updated_at',
    'folder_last_updated_at',
    'submitted_at',
    'date_of_instruction',
    'treated_at',
    'final_decision',
    'instructors',
    'input_has_aeeh',
    'input_email',
    'input_organism',
    'input_alloc_residence',
    'input_alloc_commune',
    'input_alloc_commune_insee_code',
    'input_alloc_matricule_number',
    'input_benef_gender',
    'input_benef_firstname',
    'input_benef_lastname',
    'input_benef_date_of_birth',
    'input_proof_of_paiement_filename',
]

# Read the DS export with every column as text; blank cells stay '' instead of NaN
# because keep_default_na is disabled.
# NOTE(review): `names=` is given without `header=`, so pandas treats the first line of
# the file as data — confirm the export has no header row.
# NOTE(review): on_bad_lines='skip' drops malformed rows without reporting them.
ds_df = pd.read_csv(
    ds_input_filepath,
    names=ds_column_names,
    sep=',',
    dtype=str,
    engine="c",
    keep_default_na=False,
    on_bad_lines='skip',
)

In [None]:
# Overwrite 'allocataire_quality' with missing values for every row and tag every row
# with the 'jeune' situation.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
ds_df['allocataire_quality'] = np.nan
ds_df['situation'] = 'jeune'

In [None]:
# Parse the beneficiary's date of birth (text in YYYY/MM/DD form) into pandas datetimes
# so it can be compared against the eligibility window.
ds_df['input_benef_date_of_birth'] = pd.to_datetime(ds_df['input_benef_date_of_birth'], format='%Y/%m/%d')

# Normalise the beneficiary's names. The columns declared for the DS export are
# 'input_benef_firstname' / 'input_benef_lastname'; the previous 'input_firstname' /
# 'input_lastname' keys are not in ds_column_names and raised KeyError.
# unaccent_and_upper is a project helper (presumably strips accents and upper-cases — confirm).
ds_df['input_benef_firstname'] = ds_df['input_benef_firstname'].apply(unaccent_and_upper)
ds_df['input_benef_lastname'] = ds_df['input_benef_lastname'].apply(unaccent_and_upper)

# Map the civility labels to one-letter gender codes; other values are left unchanged.
ds_df['input_benef_gender'] = ds_df['input_benef_gender'].replace({
    'M.': 'M',
    'Mme': 'F',
})

In [None]:
# Eligibility window on the beneficiary's date of birth: 2012-01-01 to 2019-12-31, both
# days inclusive (the upper bound is expressed as "strictly before 2020-01-01").
# NOTE(review): the Process notes at the top of this notebook state 01/01/2006 as the
# lower bound while this cell uses 2012 — confirm which one is intended.
dob_window_start = pd.Timestamp(2012, 1, 1)
dob_window_end_exclusive = pd.Timestamp(2020, 1, 1)

# Compare as datetime64 rather than as Python `date` objects: missing dates (NaT) then
# simply evaluate to False instead of going through object-dtype comparisons.
benef_birth_dates = pd.to_datetime(ds_df['input_benef_date_of_birth'])
mask_dob_start = benef_birth_dates >= dob_window_start
mask_dob_end = benef_birth_dates < dob_window_end_exclusive

# .copy() makes the filtered frame independent, so the column assignment below does not
# write into a selection of the unfiltered frame (SettingWithCopyWarning).
ds_df = ds_df[mask_dob_start & mask_dob_end].copy()

# add 4h on all birthdates
# NOTE(review): presumably keeps the calendar day stable across a timezone conversion
# downstream — confirm the reason for the offset.
ds_df['input_benef_date_of_birth'] = ds_df['input_benef_date_of_birth'] + timedelta(hours=4)

In [None]:
# map allocataire json
def to_json_allocataire_without_null(row):
    """Serialise the allocataire fields of one row as a JSON object string.

    Args:
        row: a mapping-like row (a pandas Series when used with ``DataFrame.apply(axis=1)``)
            indexed by the column names read below.

    Returns:
        A JSON string holding the keys 'qualite', 'matricule', 'nom', 'prenom' and
        'courriel', minus any key whose value is null according to ``pd.notnull``.
        Non-ASCII characters are written as-is (``ensure_ascii=False``).
    """
    # NOTE(review): 'input_alloc_lastname', 'allocataire-prenom' and 'allocataire-courriel'
    # are not among the names listed in ds_column_names in this notebook; when this
    # function is applied to ds_df those lookups would raise KeyError — confirm the
    # intended source columns.
    allocataire_mapping = {
        'qualite': row['allocataire_quality'],
        'matricule': row['input_alloc_matricule_number'],
        'nom': unaccent_and_upper(row['input_alloc_lastname']),
        'prenom': unaccent_and_upper(row['allocataire-prenom']),
        'courriel': row['allocataire-courriel']
    }
    # Only null values (NaN/None/NaT) are removed; empty strings are kept.
    # NOTE(review): ds_df is read with keep_default_na=False, so blank CSV cells are ''
    # and survive this filter — confirm that is intended.
    filtered_NaN_allocataire = {k: v for k, v in allocataire_mapping.items() if pd.notnull(v)}
    return json.dumps(filtered_NaN_allocataire, ensure_ascii=False)


# Build the 'allocataire' JSON column, one string per row.
ds_df['allocataire'] = ds_df.apply(to_json_allocataire_without_null, axis=1)

In [None]:
# map adresse_allocataire json
def to_json_adresse_without_null(row):
    """Serialise the allocataire address fields of one row as a JSON object string.

    Args:
        row: a mapping-like row (a pandas Series when used with ``DataFrame.apply(axis=1)``)
            indexed by the 'adresse_allocataire-*' column names read below.

    Returns:
        A JSON string holding the keys 'voie', 'code_postal', 'commune', 'code_insee' and
        'cplt_adresse', minus any key whose value is null according to ``pd.notnull``.
        Non-ASCII characters are written as-is (``ensure_ascii=False``).
    """
    # NOTE(review): format_insee_or_postal_code is neither defined nor imported in the
    # visible part of this notebook — confirm where it comes from.
    # NOTE(review): the 'adresse_allocataire-*' keys are not among the names listed in
    # ds_column_names — confirm which frame this function is meant for.
    adresse_mapping = {
        'voie': row['adresse_allocataire-voie'],
        'code_postal': format_insee_or_postal_code(row['adresse_allocataire-code_postal']),
        'commune': row['adresse_allocataire-commune'],
        'code_insee': format_insee_or_postal_code(row['adresse_allocataire-code_insee']),
        'cplt_adresse': row['adresse_allocataire-cplt_adresse'],
    }

    # Only null values (NaN/None/NaT) are removed; empty strings are kept.
    filtered_address = {k: v for k, v in adresse_mapping.items() if pd.notnull(v)}
    return json.dumps(filtered_address, ensure_ascii=False)


# Build the 'adresse_allocataire' JSON column, one string per row.
# NOTE(review): df_valid_no_duplicate is not defined in the visible part of this
# notebook (the previous cell works on ds_df) — confirm the intended frame.
df_valid_no_duplicate['adresse_allocataire'] = df_valid_no_duplicate.apply(to_json_adresse_without_null, axis=1)

In [None]:
# Drop the flat 'allocataire-*' and 'adresse_allocataire-*' columns; the result is a
# new frame, df_valid_no_duplicate itself is left untouched.
# NOTE(review): df_valid_no_duplicate is not defined in the visible part of this notebook.
df_final = df_valid_no_duplicate.drop(
    [
        # allocataire identity / contact fields
        'allocataire-qualite',
        'allocataire-matricule',
        'allocataire-code_organisme',
        'allocataire-nom',
        'allocataire-prenom',
        'allocataire-telephone',
        'allocataire-courriel',
        # allocataire address fields
        'adresse_allocataire-voie',
        'adresse_allocataire-code_postal',
        'adresse_allocataire-commune',
        'adresse_allocataire-code_insee',
        'adresse_allocataire-cplt_adresse',
    ],
    axis='columns',
)

In [None]:
# "Jeune" rows: birth dates from 2008-01-01 to 2011-12-31, both days inclusive
# (labelled "14-17 years old" by the original author).
# NOTE(review): the mask is computed on df_psp_mapped_cnaf but applied to df_final;
# this relies on both frames sharing the same index — confirm.
jeune_birth_days = pd.to_datetime(df_psp_mapped_cnaf['date_naissance']).dt.date
jeune_first_day = datetime(2008, 1, 1).date()
jeune_last_day = datetime(2011, 12, 31).date()

mask_jeune_dob_start = jeune_birth_days >= jeune_first_day
mask_jeune_dob_end = jeune_birth_days <= jeune_last_day
mask_jeune_dob = mask_jeune_dob_start & mask_jeune_dob_end

df_final_jeune = df_final[mask_jeune_dob]

In [None]:
# AAH beneficiaries are selected purely on the already-computed 'situation' column,
# so no birth-date filter is applied to them.
is_aah_row = df_final['situation'].eq('AAH')
df_final_aah = df_final.loc[is_aah_row]

In [None]:
# Verification set: birth dates from 2012-01-01 to 2019-12-31, both days inclusive
# (labelled "6-13 years old" by the original author).
# NOTE(review): the mask is computed on df_psp_mapped_cnaf but applied to df_final;
# this relies on both frames sharing the same index — confirm.
backup_birth_days = pd.to_datetime(df_psp_mapped_cnaf['date_naissance']).dt.date
backup_first_day = datetime(2012, 1, 1).date()
backup_last_day = datetime(2019, 12, 31).date()

mask_backup_data_potential_aeeh_dob_start = backup_birth_days >= backup_first_day
mask_backup_data_potential_aeeh_dob_end = backup_birth_days <= backup_last_day
mask_backup_data_potential_aeeh_dob = (
    mask_backup_data_potential_aeeh_dob_start
    & mask_backup_data_potential_aeeh_dob_end
)

df_backup_data = df_final[mask_backup_data_potential_aeeh_dob]

In [None]:
# Stack the "jeune" rows and the AAH rows into one frame with a fresh 0..n-1 index.
# ignore_index=True already renumbers the rows, so no extra reset_index() is needed.
# NOTE(review): a row that is both in the "jeune" birth-date window and flagged 'AAH'
# would appear twice — confirm whether de-duplication is required.
df_final_jeune_and_aah = pd.concat([df_final_jeune, df_final_aah], ignore_index=True)

In [None]:
# Convert the birth dates to text before the CSV export.
# assign() returns a new frame with the column replaced. It is used instead of
# `.loc[:, col] = ...` because, since pandas 2.0, that form writes into the existing
# column in place, so the column may keep its previous dtype
# (NOTE(review): relevant if 'date_naissance' is datetime64 at this point — confirm).
# It also avoids SettingWithCopyWarning on df_backup_data, which is a filtered
# selection of df_final.
df_final_jeune_and_aah = df_final_jeune_and_aah.assign(
    date_naissance=df_final_jeune_and_aah['date_naissance'].astype(str)
)
df_backup_data = df_backup_data.assign(
    date_naissance=df_backup_data['date_naissance'].astype(str)
)

In [None]:
# Write both result frames as ';'-separated UTF-8 CSV files, every field quoted and
# without the index column.
# NOTE(review): base_output_filepath and backup_output_filepath are not defined in the
# visible part of this notebook.
csv_export_options = {
    'sep': ';',
    'index': False,
    'encoding': 'utf-8',
    'quoting': csv.QUOTE_ALL,
}
df_final_jeune_and_aah.to_csv(base_output_filepath, **csv_export_options)
df_backup_data.to_csv(backup_output_filepath, **csv_export_options)

In [None]:
# Report the number of rows in each result frame, one "<count> <label>" line per frame.
print(len(df_final_jeune), "df_final_jeune")
print(len(df_final_aah), "df_final_aah")
print(len(df_final_jeune_and_aah), "jeune and aah")
print(len(df_backup_data), "backup data")