## Summary
Teasing email campaign, sent only to UNIQUE email addresses

## Process
- Load list of eligible beneficiaries from 2025
- JSON extraction for the allocataire field
- Data mapping and deletion of unusable data
- URL generation containing the code
- Output 3 files
  - Indirect beneficiaries
    - Families with one child
    - Families with more than one child
  - Direct beneficiaries



In [None]:
import pandas as pd
from dotenv import load_dotenv
import os
import json
import time
from data.utils.emailing_utils import format_allocataire_benef_names_in_place

# Load the variables declared in a .env file (python-dotenv) into os.environ.
load_dotenv()

# Reference timestamp; the last cell subtracts it from the end time to report
# how long the notebook took to run.
start_time = time.time()

# Input dataset and QR-code settings. Indexing os.environ directly means a
# missing variable fails right here with a KeyError.
benef_2025_pathfile = os.environ['BENEF_2025_PARQUET_PATHFILE']
qr_code_secret_key = os.environ['BENEF_2025_QR_CODE_URL_SECRET']
qr_code_base_url = os.environ['BENEF_2025_QR_CODE_BASE_URL']
pathfile_campaign_csv_output_b = os.environ['CAMPAIGN_CSV_OUTPUT_B']
pathfile_campaign_csv_output_b_and_a = os.environ['CAMPAIGN_CSV_OUTPUT_B_AND_A']

# Destination CSV paths, one per audience of the teasing campaign.
output_one_children = os.environ['CAMPAIGN_TEASING_FAMILY_ONE_CHILDREN_2025']
output_multiple_children = os.environ['CAMPAIGN_TEASING_FAMILY_MULTIPLE_CHILDREN_2025']
output_direct_beneficiaries = os.environ['CAMPAIGN_TEASING_DIRECT_BENEFICIARIES_2025']

In [None]:
# Load the 2025 eligible-beneficiaries dataset from the parquet file whose path
# is given by the BENEF_2025_PARQUET_PATHFILE environment variable.
df_main = pd.read_parquet(benef_2025_pathfile)

In [None]:
# Expand the JSON-encoded `allocataire` column into one flat column per key,
# each prefixed with `allocataire_`.
#
# The index is reset BEFORE the expansion and the parsed records are handed to
# json_normalize as a plain list, so both frames are guaranteed to share the
# same 0..n-1 positional index. Passing the Series itself would make the
# alignment depend on whether the installed pandas keeps the Series index in
# the normalized frame, which could mis-align or drop rows in the index-based
# merge below if the parquet file carries a non-default index.
df_main.index = pd.RangeIndex(len(df_main))
allocataire_records = [json.loads(raw) for raw in df_main['allocataire']]
df_json_normalized = pd.json_normalize(allocataire_records).add_prefix('allocataire_')

# Row i of both frames describes the same beneficiary, so join on the index.
df_unwrapped_alloc = pd.merge(df_main, df_json_normalized, left_index=True, right_index=True)

print(f"Number of beneficiaries : {len(df_unwrapped_alloc)}")

In [None]:
# Keep only the beneficiaries whose allocataire has an email address: rows where
# the address is missing (NaN/None) or an empty string are dropped.
allocataire_emails = df_unwrapped_alloc['allocataire_courriel']
mask_not_existing_email = allocataire_emails.isna() | allocataire_emails.eq('')

df_unwrapped_alloc = df_unwrapped_alloc.loc[~mask_not_existing_email]

print(f"Number of beneficiaries with email : {len(df_unwrapped_alloc)}")

In [None]:
# Source column name -> name used in the campaign files. Columns that are not
# listed here keep their current name.
column_mapping = {
    'id': 'id',
    'allocataire_courriel': 'email',
    'allocataire_qualite': 'allocataire_qualite',
    'allocataire_nom': 'allocataire_nom',
    'allocataire_prenom': 'allocataire_prenom',
    'prenom': 'beneficiaire_prenom',
    'nom': 'beneficiaire_nom',
    'allocataire_telephone': 'telephone',
}

df_unwrapped_alloc = df_unwrapped_alloc.rename(columns=column_mapping)

In [None]:
# Only keep the columns needed to build the campaign files.
campaign_columns = [
    'id',
    'email',
    'allocataire_nom',
    'allocataire_prenom',
    'beneficiaire_prenom',
    'beneficiaire_nom',
]

# Take an explicit copy so df_campaign is independent from df_unwrapped_alloc:
# the following cell hands it to a helper that formats names in place, and
# writing into a frame obtained by selection is what triggers pandas'
# SettingWithCopyWarning.
df_campaign = df_unwrapped_alloc[campaign_columns].copy()

In [None]:
# Capitalize first names / last names.
# NOTE(review): the helper lives in data.utils.emailing_utils and its body is
# not visible here; judging by its name it presumably rewrites the allocataire
# and beneficiary name columns of df_campaign in place (its return value is
# ignored) — confirm.
format_allocataire_benef_names_in_place(df_campaign)

In [None]:
# Split the campaign rows by comparing first names case-insensitively.
# NOTE(review): only the first name is compared (not the last name), and a
# missing first name on either side compares as "different" — confirm this is
# the intended rule.
benef_first_name = df_campaign['beneficiaire_prenom'].str.lower()
alloc_first_name = df_campaign['allocataire_prenom'].str.lower()

# allocataire != beneficiary -> indirect beneficiaries
mask_alloc_diff_benef = benef_first_name != alloc_first_name
df_alloc_diff_benef = df_campaign[mask_alloc_diff_benef]

# allocataire == beneficiary -> direct beneficiaries, excluding every email
# address that already appears among the indirect beneficiaries
mask_alloc_eq_benef = benef_first_name == alloc_first_name
mask_exclude_email_from_indirect_benef = ~df_campaign['email'].isin(df_alloc_diff_benef['email'])
df_alloc_eq_benef = df_campaign[mask_alloc_eq_benef & mask_exclude_email_from_indirect_benef]

# NOTE(review): df_alloc_eq_benef is not de-duplicated on 'email', whereas the
# family lists built further down are — confirm whether two direct
# beneficiaries sharing one address should both be kept.

print(f"{len(df_alloc_diff_benef)} indirect beneficiaries")
print(f"{len(df_alloc_eq_benef)} direct beneficiaries")

In [None]:
# Count the indirect beneficiaries attached to each email address so that
# families with exactly one child can be told apart from families with more
# than one child.
grouped = df_alloc_diff_benef.groupby(['email']).size().reset_index(name='count')
children_per_email = grouped['count']
grouped_family_of_one = grouped.loc[children_per_email.eq(1)]
grouped_family_of_more_than_one = grouped.loc[children_per_email.gt(1)]

In [None]:
# Summary of the audiences: one "family" per distinct email address among the
# indirect beneficiaries, plus the direct beneficiaries.
single_child_families = grouped_family_of_one.shape[0]
multi_child_families = grouped_family_of_more_than_one.shape[0]
direct_beneficiaries = len(df_alloc_eq_benef)
all_families = single_child_families + multi_child_families

print(f"{single_child_families} families with only one children")
print(f"{multi_child_families} families with more than one children")
print(f"{direct_beneficiaries} direct beneficiaries")
print(f"{all_families} families")
print(f"Total : {all_families + direct_beneficiaries}")

In [None]:
# Build one row per email address for each family audience: select every
# campaign row whose email belongs to the group, then keep the first row found
# for each address.
is_single_child_email = df_campaign['email'].isin(grouped_family_of_one['email'])
family_of_one_list = df_campaign[is_single_child_email].drop_duplicates(subset=['email'])

is_multi_child_email = df_campaign['email'].isin(grouped_family_of_more_than_one['email'])
family_of_more_than_one_list = df_campaign[is_multi_child_email].drop_duplicates(subset=['email'])

In [None]:
# Sizes of the three audiences that are about to be exported.
single_child_total = family_of_one_list.shape[0]
multi_child_total = family_of_more_than_one_list.shape[0]
direct_total = df_alloc_eq_benef.shape[0]

print(f"{single_child_total} families with only one children")
print(f"{multi_child_total} families with more than one children")
print(f"{direct_total} direct beneficiaries")

In [None]:
# Columns written to every campaign file (the beneficiary name columns are
# left out).
columns_to_keep = ['id', 'email', 'allocataire_nom', 'allocataire_prenom']

# (audience frame, destination path) pairs, written in this order:
# single-child families, multi-child families, direct beneficiaries.
exports = (
    (family_of_one_list, output_one_children),
    (family_of_more_than_one_list, output_multiple_children),
    (df_alloc_eq_benef, output_direct_beneficiaries),
)
for audience, destination in exports:
    audience[columns_to_keep].to_csv(destination, index=False)

# Report the elapsed time since start_time was recorded in the first cell.
end_time = time.time()
print(f"Notebook executed in {end_time - start_time:.2f} seconds")