# Summary
The goal of this script is to extract the families that contain siblings and to add a column indicating whether the family has a child who is no longer eligible.

In [None]:
import json

import pandas as pd
from dotenv import load_dotenv
import os

# Presumably loads the variables of a local .env file into os.environ
# (python-dotenv) -- must run before the lookups below.
load_dotenv()

# Paths of the three input CSV files, read from the environment.
# A missing variable raises KeyError immediately.
backup_pathfile, existing_benef_pathfile, injep_query_output_pathfile = (
    os.environ[variable_name]
    for variable_name in (
        'BACKUP_PATHFILE_2025',
        'EXISTING_BENEF_PATHFILE_2025',
        'INJEP_QUERY_OUTPUT_PATHFILE_2025',
    )
)

# When True, intermediate dataframes are deleted as soon as they are no longer used.
optimize_memory = True

In [None]:
# Both inputs are semicolon-separated UTF-8 files; dtype=str loads every
# column as text.
csv_read_options = {'sep': ';', 'encoding': 'utf-8', 'dtype': str}
df_backup = pd.read_csv(backup_pathfile, **csv_read_options)
df_existing = pd.read_csv(existing_benef_pathfile, **csv_read_options)

In [None]:
# Tag every row with its eligibility before the two sources are combined:
# - beneficiaries from the backup file are not eligible by default, since they
#   were not inserted into the database;
# - beneficiaries already existing in the database are eligible by default.
# The flag is a real boolean (not text): the sibling computation further down
# relies on `.any()` and `~` over this column.
df_backup.loc[:,'is_eligible'] = False
df_existing.loc[:,'is_eligible'] = True

In [None]:
# Decode the JSON text stored in the 'allocataire' column and flatten it into
# one column per field, each prefixed with 'allocataire-'.
df_backup_allocataire_json = (
    pd.json_normalize(df_backup['allocataire'].apply(json.loads))
    .add_prefix('allocataire-')
)

# Index-on-index merge: attach the flattened fields to the rows they came from.
df_backup_unwrapped = df_backup.merge(
    df_backup_allocataire_json,
    left_index=True,
    right_index=True,
)

In [None]:
# Same unwrapping as for the backup file: decode the 'allocataire' JSON text
# and flatten it into 'allocataire-'-prefixed columns.
df_existing_json = (
    pd.json_normalize(df_existing['allocataire'].apply(json.loads))
    .add_prefix('allocataire-')
)

# Index-on-index merge: attach the flattened fields to the rows they came from.
df_existing_unwrapped = df_existing.merge(
    df_existing_json,
    left_index=True,
    right_index=True,
)

In [None]:
if optimize_memory:
    # The raw frames and their flattened JSON parts are now contained in the
    # *_unwrapped dataframes; drop the originals.
    del df_backup, df_backup_allocataire_json, df_existing, df_existing_json

In [None]:
# Stack the existing beneficiaries on top of the backup ones.
# ignore_index=True gives the result a fresh 0..n-1 index directly; the former
# reset_index() call (without drop=True) also kept the old row labels as a
# spurious 'index' column that then travelled through every later step.
df_final = pd.concat([df_existing_unwrapped, df_backup_unwrapped], ignore_index=True)

In [None]:
if optimize_memory:
    # Both unwrapped frames have been concatenated into df_final.
    del df_existing_unwrapped, df_backup_unwrapped

In [None]:
def _has_mixed_eligibility(flags):
    """Return True when the group holds at least one eligible AND one non-eligible row."""
    has_eligible = flags.any()
    has_not_eligible = (~flags).any()
    return has_eligible & has_not_eligible


# Rows sharing the same allocataire (prenom, nom, matricule) form one family.
# transform() broadcasts the per-family result back onto every row of that family.
df_final_siblings = (
    df_final
    .groupby(['allocataire-prenom', 'allocataire-nom', 'allocataire-matricule'])['is_eligible']
    .transform(_has_mixed_eligibility)
)

# Sanity check: the boolean series must carry exactly the same index as
# df_final so that it can be assigned as a column afterwards.
assert df_final_siblings.index.equals(df_final.index)

In [None]:
# Store the per-row family flag on the main dataframe, then order the rows so
# that members of the same family end up next to each other.
family_sort_keys = ['allocataire-prenom', 'allocataire-nom', 'allocataire-matricule']
df_final['fratrie_mixte'] = df_final_siblings
df_final = df_final.sort_values(by=family_sort_keys)

In [None]:
# First export: beneficiaries coming from both source files (existing and
# backup), restricted to the columns below.
cols_to_keep = [
    'id',
    'id_psp',
    'nom',
    'prenom',
    'date_naissance',
    'genre',
    'organisme',
    'situation',
    'allocataire',
    'fratrie_mixte',
]
df_final.loc[:, cols_to_keep].to_csv('./fratrie_mixte.csv', sep=';', index=False)

In [None]:
# Second export: only the rows that carry an 'id', i.e. the existing
# beneficiaries.
df_final_without_backup = df_final.loc[df_final['id'].notna()]
df_final_without_backup.loc[:, cols_to_keep].to_csv(
    './fratrie_mixte_without_backup.csv',
    sep=';',
    index=False,
)

In [None]:
if optimize_memory:
    # Only df_final_without_backup is needed from here on.
    del df_final_siblings, df_final

In [None]:
# Load the INJEP query output; the fratrie_mixte value is merged into it below.
# Same format as the other inputs: semicolon-separated, UTF-8, all columns as text.
df_injep_query_result = pd.read_csv(
    injep_query_output_pathfile,
    dtype=str,
    encoding='utf-8',
    sep=';',
)

In [None]:
# Columns written to the final export, in output order.
final_cols_to_keep = [
    'id',
    'millesime',
    'genre',
    'situation',
    'age',
    'date_naissance',
    'code_insee',
    'commune',
    'code_commune',
    'departement',
    'code_departement',
    'region',
    'code_region',
    'drom_com',
    'qpv',
    'zrr',
    'pass_statut',
    'date_recours',
    'email',
    'telephone',
    'type_contact',
    'federation',
    'fratrie_mixte',
]

# Inner join on 'id': only ids present on both sides are kept. Columns that
# exist on both sides keep their name for the INJEP side and get the 'old'
# suffix for the beneficiary side.
# NOTE(review): ids repeated in df_final_without_backup would multiply the
# matching INJEP rows -- confirm 'id' is unique there.
df_injep_query_with_fratrie = df_injep_query_result.merge(
    df_final_without_backup,
    how="inner",
    on="id",
    suffixes=(None, "old"),
)

# NOTE(review): the output file name is spelled 'quey'; it is kept unchanged
# because other tooling may expect this exact name.
df_injep_query_with_fratrie.loc[:, final_cols_to_keep].to_csv(
    './injep_quey_with_fratrie.csv',
    sep=';',
    index=False,
)