# Summary
The goal of this script is to extract families that contain siblings and to add columns indicating whether a family includes a child who is no longer eligible.
The INJEP query output already contains both the 6-13 backup and the existing data, so there is no need to merge anything.

In [None]:
import pandas as pd
from dotenv import load_dotenv
import os

# When True, intermediate Series are deleted as soon as they have been
# copied into the main dataframe.
optimize_memory = True

# Load environment variables through python-dotenv before reading them below.
load_dotenv()

# Both paths are mandatory: a missing variable raises KeyError right here.
injep_query_output_pathfile = os.environ[
    'INJEP_QUERY_OUTPUT_PATHFILE_2025'
]
injep_query_enriched_caf_output_pathfile = os.environ[
    'INJEP_QUERY_ENRICHED_CAF_OUTPUT_PATHFILE_2025'
]

In [None]:
# Load the INJEP query output (semicolon-separated, UTF-8) with explicit
# dtypes so that pandas does not infer them:
#   - identifiers, codes, dates and free text are kept as plain strings;
#   - flags use the nullable 'boolean' dtype, which tolerates missing values;
#   - 'age' is a plain integer (so it must not contain missing values).
df_existing = pd.read_csv(
    injep_query_output_pathfile,
    sep=';',
    encoding='utf-8',
    dtype={
        **dict.fromkeys(
            [
                'id', 'id_psp', 'dossier_id',
                'date_naissance', 'genre', 'situation',
                'email', 'telephone',
                'allocataire_prenom', 'allocataire_nom', 'allocataire_matricule',
                'pass_statut', 'date_recours',
                'code_insee', 'commune', 'code_commune',
                'region', 'code_region',
                'departement', 'code_departement',
                'millesime',
            ],
            str,
        ),
        **dict.fromkeys(['is_eligible', 'zrr', 'qpv', 'drom_com'], 'boolean'),
        'age': int,
    },
)

In [None]:
# Flag every row whose family (same allocataire first name, last name and
# matricule) holds at least one eligible child AND at least one non-eligible
# child. A family made of a single row can never be flagged here.
df_final_siblings_mixed_eligible = (
    df_existing
    .groupby(['allocataire_prenom', 'allocataire_nom', 'allocataire_matricule'])['is_eligible']
    .transform(lambda flags: flags.any() & (~flags).any())
)

# The result is assigned back as a column, so it must carry exactly the same
# index as the dataframe it was computed from.
assert df_final_siblings_mixed_eligible.index.equals(df_existing.index)

# Store the per-row flag on the main dataframe.
df_existing['fratrie_mixte'] = df_final_siblings_mixed_eligible

# The temporary Series is no longer needed once copied into the dataframe.
if optimize_memory:
    del df_final_siblings_mixed_eligible

In [None]:
# Flag every row whose family (same allocataire first name, last name and
# matricule) contains only eligible children.
df_final_siblings_all_eligible = (
    df_existing
    .groupby(['allocataire_prenom', 'allocataire_nom', 'allocataire_matricule'])['is_eligible']
    .transform(lambda flags: flags.all())
)

# The result is assigned back as a column, so it must carry exactly the same
# index as the dataframe it was computed from.
assert df_final_siblings_all_eligible.index.equals(df_existing.index)

# Store the per-row flag on the main dataframe.
df_existing['fratrie_avec_que_des_eligibles'] = df_final_siblings_all_eligible

# The temporary Series is no longer needed once copied into the dataframe.
if optimize_memory:
    del df_final_siblings_all_eligible

In [None]:
# Flag every row whose family (same allocataire first name, last name and
# matricule) contains only non-eligible children.
df_final_siblings_all_not_eligible = (
    df_existing
    .groupby(['allocataire_prenom', 'allocataire_nom', 'allocataire_matricule'])['is_eligible']
    .transform(lambda flags: (~flags).all())
)

# The result is assigned back as a column, so it must carry exactly the same
# index as the dataframe it was computed from.
assert df_final_siblings_all_not_eligible.index.equals(df_existing.index)

# Store the per-row flag on the main dataframe.
df_existing['fratrie_avec_que_des_non_eligibles'] = df_final_siblings_all_not_eligible

# The temporary Series is no longer needed once copied into the dataframe.
if optimize_memory:
    del df_final_siblings_all_not_eligible

In [None]:
# Order rows by allocataire so that children of the same family end up on
# consecutive rows; the dataframe is sorted in place.
df_existing.sort_values(
    ['allocataire_prenom', 'allocataire_nom', 'allocataire_matricule'],
    inplace=True,
)

In [None]:
# Distribution of the "mixed family" flag; missing values are left out of the count.
df_existing['fratrie_mixte'].value_counts(dropna=True)

In [None]:
# Distribution of the "only eligible children" flag; missing values are left out of the count.
df_existing['fratrie_avec_que_des_eligibles'].value_counts(dropna=True)

In [None]:
# Distribution of the "only non-eligible children" flag; missing values are left out of the count.
df_existing['fratrie_avec_que_des_non_eligibles'].value_counts(dropna=True)

In [None]:
# Remember the row count so the number of dropped rows can be reported.
original_length = len(df_existing)

# Rows for which no family flag could be computed hold a missing value in the
# flag column; keep only the rows that have a flag.
invalid_mask = df_existing['fratrie_avec_que_des_non_eligibles'].isna()
df_existing = df_existing.loc[~invalid_mask]

print(f"{original_length - len(df_existing)} removed values")

In [None]:
# Sanity check: the three family flags are meant to be mutually exclusive, so
# each row must have exactly one of them set.
#
# The comparison with 1 has to happen per row, BEFORE reducing with .all().
# The previous form, `(sum).all() == 1`, first collapsed the Series to a single
# bool ("every row has a non-zero sum") and then compared that bool to 1, so a
# row with two or three truthy flags was never detected.
assert (
    (
        df_existing['fratrie_mixte'].astype(int)
        + df_existing['fratrie_avec_que_des_eligibles'].astype(int)
        + df_existing['fratrie_avec_que_des_non_eligibles'].astype(int)
    ) == 1
).all(), "More than one column is truthy"

In [None]:
# Columns written to the enriched export, in output order.
final_cols_to_keep = [
    # Identifiers and child information
    'id', 'id_psp', 'dossier_id', 'millesime',
    'genre', 'situation', 'age', 'date_naissance',
    # Location
    'code_insee', 'commune', 'code_commune',
    'departement', 'code_departement',
    'region', 'code_region',
    'drom_com', 'qpv', 'zrr',
    # Pass status and contact details
    'pass_statut', 'date_recours', 'email', 'telephone',
    # Allocataire (the family grouping key)
    'allocataire_prenom', 'allocataire_nom', 'allocataire_matricule',
    # NOTE(review): 'type_contact' has no explicit dtype in the read_csv call,
    # so pandas infers it; confirm the column is present in the input file.
    'type_contact',
    # Family flags computed by this notebook
    'fratrie_mixte',
    'fratrie_avec_que_des_eligibles',
    'fratrie_avec_que_des_non_eligibles',
]

# Write the selected columns as a semicolon-separated CSV, without the index.
df_existing.loc[:, final_cols_to_keep].to_csv(
    injep_query_enriched_caf_output_pathfile,
    index=False,
    sep=';',
)