## Summary
Teasing emailing campaign, only to UNIQUE emails

## Process
- Load list of eligible benef from 2025
- JSON extraction for the allocataire field
- Data mapping and deletion of unusable data
- URL generation containing the code
- Output 4 files
  - Indirect benef
    - Family with one child
    - Family with more than one child
  - Direct benef
  - All eligible 14-18 years old benef (without grouping)

## Target audience
Initial filters:
- updated_at > 2023
- refuser = false
- a_valider = false
- situation != boursier

Audience:
- Jeunes : between 14 and 18 years old
- AAH : >= 16 years old

In [None]:
import pandas as pd
from dotenv import load_dotenv
import os
import json
import time
from data.utils.emailing_utils import format_allocataire_benef_names_in_place

# Load variables from a .env file into the process environment (python-dotenv)
load_dotenv()

# Wall-clock start; the last cell uses it to report the notebook execution time
start_time = time.time()

# Input: path of the beneficiaries parquet file (KeyError if the variable is not set)
benef_2024_pathfile = os.environ['BENEF_2024_TEASING_PARQUET_PATHFILE']

# Outputs: destination paths of the CSV files written by the last cell
output_all_eligible_families = os.environ['CAMPAIGN_TEASING_ALL_ELIGIBLE_FAMILIES_2025']
output_one_children= os.environ['CAMPAIGN_TEASING_FAMILY_ONE_CHILDREN_2025']
output_multiple_children = os.environ['CAMPAIGN_TEASING_FAMILY_MULTIPLE_CHILDREN_2025']
output_direct_beneficiaries = os.environ['CAMPAIGN_TEASING_DIRECT_BENEFICIARIES_2025']

In [None]:
# Read the beneficiaries parquet file into the main dataframe
df_main = pd.read_parquet(benef_2024_pathfile)

In [None]:
# Unwrap the JSON 'allocataire' column into flat 'allocataire_*' columns.
#
# The index is reset BEFORE normalizing and the parsed records are handed to
# json_normalize as a plain list: the result then always carries a positional
# 0..n-1 index, whatever index the parquet file came with and whether or not
# json_normalize keeps the index of a Series input, so the positional merge
# below always pairs each row with its own allocataire record.
df_main = df_main.reset_index(drop=True)

allocataire_records = df_main['allocataire'].apply(json.loads).tolist()
df_json_normalized = pd.json_normalize(allocataire_records).add_prefix('allocataire_')

# Row-by-row merge on the shared positional index
df_unwrapped_alloc = pd.merge(df_main, df_json_normalized, left_index=True, right_index=True)

print(f"Number of beneficiaries : {len(df_unwrapped_alloc)}")

In [None]:
# Drop every row targeted by a RGPD exclusion, either through its id or
# through its allocataire email (emails are compared case-insensitively)
rgpd_emails = os.environ['RGPD_EMAILS_TO_EXCLUDE_PATHFILE']
rgpd_codes = os.environ['RGPD_CODES_TO_EXCLUDE_PATHFILE']
df_rgpd_emails = pd.read_csv(rgpd_emails)
df_rgpd_codes = pd.read_csv(rgpd_codes)

print(f"Number of beneficiaries before excluding RGPD lines : {len(df_unwrapped_alloc)} ")

excluded_by_id = df_unwrapped_alloc['id'].isin(df_rgpd_codes['id'])
lowercase_emails = df_unwrapped_alloc['allocataire_courriel'].str.lower()
excluded_by_email = lowercase_emails.isin(df_rgpd_emails['email'].str.lower())
df_unwrapped_alloc = df_unwrapped_alloc.loc[~(excluded_by_id | excluded_by_email)]

print(f"Number of beneficiaries after excluding RGPD lines : {len(df_unwrapped_alloc)} ")

In [None]:
# Keep only the rows whose allocataire email is present (neither missing nor empty)
allocataire_emails = df_unwrapped_alloc['allocataire_courriel']
mask_not_existing_email = allocataire_emails.isna() | allocataire_emails.eq('')
df_unwrapped_alloc = df_unwrapped_alloc.loc[~mask_not_existing_email]

print(f"Number of beneficiaries with email : {len(df_unwrapped_alloc)}")

In [None]:
# Only take benef who have been updated in 2024 and beyond.
#
# Only the first 10 characters (YYYY-MM-DD) of 'updated_at' are parsed.
# The vectorized .str accessor propagates missing values instead of raising,
# so a row without 'updated_at' becomes NaT and is dropped by the year filter.
# assign() builds a new frame rather than writing into a filtered slice.
parsed_updated_at = pd.to_datetime(df_unwrapped_alloc['updated_at'].str[:10], format='%Y-%m-%d')
df_unwrapped_alloc = df_unwrapped_alloc.assign(updated_at=parsed_updated_at)
df_unwrapped_alloc = df_unwrapped_alloc[df_unwrapped_alloc['updated_at'].dt.year > 2023]

In [None]:
# Keep rows where 'refuser' and 'a_valider' are both False and whose
# 'situation' is not 'boursier' (compared case-insensitively)
refuser_is_false = df_unwrapped_alloc['refuser'].eq(False)
a_valider_is_false = df_unwrapped_alloc['a_valider'].eq(False)
not_boursier = df_unwrapped_alloc['situation'].str.lower().ne('boursier')
df_unwrapped_alloc = df_unwrapped_alloc.loc[refuser_is_false & a_valider_is_false & not_boursier]

In [None]:
# Display the number of rows left in df_unwrapped_alloc at this point
len(df_unwrapped_alloc)

In [None]:
from datetime import datetime

# Birth-date window of the "14 to 18 years old" audience:
# born between 2008-01-01 and 2011-12-31, both bounds included
start_date = datetime(2008, 1, 1).date()
end_date = datetime(2011, 12, 31).date()

# Parse the birth dates; values that cannot be parsed become NaT (errors='coerce').
# assign() builds a new frame rather than writing into a filtered slice.
df_unwrapped_alloc = df_unwrapped_alloc.assign(
    date_naissance=pd.to_datetime(df_unwrapped_alloc['date_naissance'], format='ISO8601', errors='coerce')
)

birth_dates = df_unwrapped_alloc['date_naissance'].dt.date
mask_14_to_18_years_old = (birth_dates >= start_date) & (birth_dates <= end_date)

# Explicit copy: this frame gets its columns renamed and is handed to an
# in-place formatting helper in later cells, so it must not be a slice
df_14_to_18_years_old = df_unwrapped_alloc.loc[mask_14_to_18_years_old].copy()

In [None]:
# Report the size of the filtered population and of its 14-18 subset
total_benef_count = len(df_unwrapped_alloc)
young_benef_count = len(df_14_to_18_years_old)
print(f"{total_benef_count} original benef length")
print(f"{young_benef_count} benef between 14 and 18 years old")

In [None]:
# Source column name -> name used in the campaign files.
# Identity entries are kept so the mapping also lists the columns of interest.
column_mapping = {
    'id': 'id',
    'genre': 'genre',
    'updated_at': 'updated_at',
    'organisme': 'organisme',
    'date_naissance': 'date_naissance',
    'situation': 'situation',
    'allocataire_courriel': 'email',
    'allocataire_qualite': 'allocataire_qualite',
    'allocataire_nom': 'allocataire_nom',
    'allocataire_prenom': 'allocataire_prenom',
    'allocataire_matricule': 'allocataire_matricule',
    'prenom': 'beneficiaire_prenom',
    'nom': 'beneficiaire_nom',
    'allocataire_telephone': 'telephone',
}

# Renamed copy of the full filtered population (not restricted to the 14-18 window)
original_df_unwrapped_alloc = df_unwrapped_alloc.rename(columns=column_mapping)

# Same renaming for the 14-18 subset
df_14_to_18_years_old = df_14_to_18_years_old.rename(columns=column_mapping)

In [None]:
# Format names on both frames (14-18 subset and full population).
# NOTE(review): the helper is project-local and its return value is ignored;
# presumably it mutates the dataframe it receives, as its "_in_place" name
# suggests — confirm in data.utils.emailing_utils.
format_allocataire_benef_names_in_place(df_14_to_18_years_old)
format_allocataire_benef_names_in_place(original_df_unwrapped_alloc)

In [None]:
# From the full population (not restricted to 14-18): count rows per email
# to tell apart emails attached to exactly one beneficiary
grouped_emails = original_df_unwrapped_alloc.groupby(['email']).size().reset_index(name="count")
single_families = grouped_emails[grouped_emails['count'] == 1]

# Rows where the allocataire and the beneficiary share the same first name (case-insensitive)
mask_alloc_eq_benef = original_df_unwrapped_alloc['beneficiaire_prenom'].str.lower() == original_df_unwrapped_alloc['allocataire_prenom'].str.lower()

# AAH birth-date window: born between 1995-01-01 and 2005-12-31, both bounds included.
# NOTE(review): the notebook header states "AAH : >= 16 years old" while these
# bounds are annotated 20 to 30 years old — confirm which audience is intended.
aah_start_date = datetime(1995, 1, 1).date() # 30 years old
aah_end_date = datetime(2005, 12, 31).date() # 20 years old

# Window test: start <= birth date <= end
aah_birth_dates = original_df_unwrapped_alloc['date_naissance'].dt.date
mask_aah = (aah_birth_dates >= aah_start_date) & (aah_birth_dates <= aah_end_date)
aah_df = original_df_unwrapped_alloc[mask_aah & mask_alloc_eq_benef]

# Unique AAH: keep only those whose email is attached to a single beneficiary
unique_aah_df = aah_df[aah_df['email'].isin(single_families['email'])]

print(f"{len(aah_df)} AAH benef")

In [None]:
# 14-18 subset: count beneficiaries per email to tell apart families with
# one child from families with more than one child
rows_per_email = df_14_to_18_years_old.groupby(['email']).size()
grouped_emails = rows_per_email.reset_index(name="count")
single_families = grouped_emails.loc[grouped_emails['count'].eq(1)]
families_more_than_one = grouped_emails.loc[grouped_emails['count'].gt(1)]

In [None]:
# Allocataire = beneficiary when both first names match (case-insensitive)
benef_first_names = df_14_to_18_years_old['beneficiaire_prenom'].str.lower()
alloc_first_names = df_14_to_18_years_old['allocataire_prenom'].str.lower()
mask_alloc_eq_benef = benef_first_names.eq(alloc_first_names)

# Direct benef: the email is attached to one beneficiary only AND the names match
has_single_email = df_14_to_18_years_old['email'].isin(single_families['email'])
direct_benef = df_14_to_18_years_old.loc[has_single_email & mask_alloc_eq_benef]

# Indirect benef: every row whose email does not belong to a direct benef
uses_direct_email = df_14_to_18_years_old['email'].isin(direct_benef['email'])
indirect_benef = df_14_to_18_years_old.loc[~uses_direct_email]

In [None]:
# Families with a single kid: email attached to exactly one beneficiary
# and not already used by a direct benef
in_single_family = df_14_to_18_years_old['email'].isin(single_families['email'])
in_direct_benef = df_14_to_18_years_old['email'].isin(direct_benef['email'])
indirect_benef_single_families = df_14_to_18_years_old.loc[in_single_family & ~in_direct_benef]

In [None]:
# Families with multiple kids: email attached to more than one beneficiary,
# used neither by a direct benef nor by a single-kid family
sibling_emails = df_14_to_18_years_old['email']
in_sibling_family = sibling_emails.isin(families_more_than_one['email'])
in_direct_benef = sibling_emails.isin(direct_benef['email'])
in_single_family = sibling_emails.isin(indirect_benef_single_families['email'])
indirect_benef_siblings_families = df_14_to_18_years_old.loc[in_sibling_family & ~in_direct_benef & ~in_single_family]

# Only a single row (hence a single email) per family
indirect_benef_siblings_families = indirect_benef_siblings_families.drop_duplicates(subset=['email'])

In [None]:
# Sanity check: no indirect benef email may belong to a unique AAH benef
assert not indirect_benef_single_families['email'].isin(unique_aah_df['email']).any()
assert not indirect_benef_siblings_families['email'].isin(unique_aah_df['email']).any()

In [None]:
# Same check through an inner join on email: the join must produce no row
assert pd.merge(unique_aah_df, indirect_benef_siblings_families, on='email', how='inner').empty
assert pd.merge(unique_aah_df, indirect_benef_single_families, on='email', how='inner').empty

In [None]:
# Single-kid families must not share any email with the direct benef
assert not indirect_benef_single_families['email'].isin(direct_benef['email']).any()

# Multiple-kid families must share no email with the direct benef
# nor with the single-kid families
assert not indirect_benef_siblings_families['email'].isin(direct_benef['email']).any()
assert not indirect_benef_siblings_families['email'].isin(indirect_benef_single_families['email']).any()

In [None]:
# Compare against the list of all 14-18 families:
# - every single-kid and multiple-kid family email must come from this list
#   (.all(): a subset check, which also holds when a group is empty)
# - no unique AAH benef email may appear in this list
assert indirect_benef_single_families['email'].isin(df_14_to_18_years_old['email']).all()
assert indirect_benef_siblings_families['email'].isin(df_14_to_18_years_old['email']).all()
assert not unique_aah_df['email'].isin(df_14_to_18_years_old['email']).any()

In [None]:
# Summary of the campaign audiences.
# Counts are formatted with "," as thousands separator, then the separator is
# swapped for a space (the labels themselves contain no comma).
single_families_count = len(indirect_benef_single_families)
siblings_families_count = len(indirect_benef_siblings_families)
aah_count = len(unique_aah_df)

print(f"Total families with only one child: {single_families_count:,}".replace(",", " "))
print(f"Total families with more than one child: {siblings_families_count:,}".replace(",", " "))
print(f"Total families with one kid and more than one child: {single_families_count + siblings_families_count:,}".replace(",", " "))
print(f"Total AAH direct beneficiaries: {aah_count:,}".replace(",", " "))
print(f"Total eligible families + AAH direct benef: {single_families_count + siblings_families_count + aah_count:,}".replace(",", " "))

print(f"Total number of benef from eligible 14-18 years old families (without grouping): {len(df_14_to_18_years_old):,}".replace(",", " "))

In [None]:
# Final check: no RGPD-excluded id may remain in any of the output frames
assert not df_14_to_18_years_old['id'].isin(df_rgpd_codes['id']).any()
assert not indirect_benef_single_families['id'].isin(df_rgpd_codes['id']).any()
assert not indirect_benef_siblings_families['id'].isin(df_rgpd_codes['id']).any()
assert not unique_aah_df['id'].isin(df_rgpd_codes['id']).any()

In [None]:
# Final check: no RGPD-excluded email may remain in any of the output frames.
# Emails are compared in lower case on both sides, like the exclusion step,
# so an address that differs only by letter case is still detected.
rgpd_emails_lowercase = df_rgpd_emails['email'].str.lower()

assert not df_14_to_18_years_old['email'].str.lower().isin(rgpd_emails_lowercase).any()
assert not indirect_benef_single_families['email'].str.lower().isin(rgpd_emails_lowercase).any()
assert not indirect_benef_siblings_families['email'].str.lower().isin(rgpd_emails_lowercase).any()
assert not unique_aah_df['email'].str.lower().isin(rgpd_emails_lowercase).any()

In [None]:
# Columns exported for the single-kid families, the AAH direct benef and the
# full 14-18 list. Not exported: genre, date_naissance, organisme, situation,
# allocataire_matricule.
columns_to_keep = ['id', 'email', 'beneficiaire_prenom', 'beneficiaire_nom', 'allocataire_nom', 'allocataire_prenom']

# Multiple-kid families get one row per email, so no beneficiary name is exported
columns_to_keep_siblings = ['id', 'email', 'allocataire_nom', 'allocataire_prenom']

# Write the four comma-separated campaign files, without the dataframe index
indirect_benef_single_families.loc[:, columns_to_keep].to_csv(output_one_children, sep=",", index=False)
indirect_benef_siblings_families.loc[:, columns_to_keep_siblings].to_csv(output_multiple_children, sep=",", index=False)
unique_aah_df.loc[:, columns_to_keep].to_csv(output_direct_beneficiaries, sep=",", index=False)
df_14_to_18_years_old.loc[:, columns_to_keep].to_csv(output_all_eligible_families, sep=",", index=False)

# Total wall-clock time since the setup cell
end_time = time.time()
print(f"Notebook executed in {end_time - start_time:.2f} seconds")