# Link Mobility notebook to extract people who didn't activate their pass Sport
The process:
- Take the export of the Link Mobility hard-bounce SMS campaign of 12 September
- Extract and add two columns:
  - type of benef used (parent=1, beneficiaire_direct=2)
  - object received (controle=0, test=1)
- Cross-reference with the existing database to add the column "recours", which tells whether each beneficiary activated their pass Sport or not
- Calculate the age of allocataire & beneficiaire from existing data
- Add column code_postal from existing data
- Add geo columns zrr,qpv from existing data
- Output the final CSV file for DITP (all information should be anonymized)


In [None]:
import pandas as pd
from datetime import date

from dotenv import load_dotenv
import os

# Names of the columns this notebook adds to the Link Mobility export.
code_col = 'id_psp'
type_benef_col = 'type_benef' # 1=parent, 2=direct
obj_received_col = 'objet_recu' # 0=control, 1=test

# Run python-dotenv's loader before the os.environ lookups below.
load_dotenv()

# Input: campaign export that can be retrieved from the Link Mobility platform.
lm_pathfile = os.environ['CAMPAIGN_HARD_BOUNCE_SMS_12_SEPTEMBER_PATHFILE']

# Output: path of the CSV written at the end of the notebook.
lm_extracted_information_output_pathfile = os.environ['CAMPAIGN_HARD_BOUNCE_SMS_12_SEPTEMBER_EXTRACTED_INFORMATION_OUTPUT_PATHFILE']

# Input: existing database export that is cross-referenced on the pass code.
db_existing = os.environ['DB_CURRENT_WITH_PASS_STATUS']

In [None]:
# Load the Link Mobility export; every column is read as text.
df = pd.read_csv(filepath_or_buffer=lm_pathfile, dtype=str, sep=',')

In [None]:
# Snapshot the export's own column labels before any column is added, so the
# original columns can be told apart from the derived ones later on.
lm_initial_columns = df.columns

# Code layout: "24-" followed by two dash-separated groups of four uppercase
# letters; the character class leaves out "I" and "O".
pattern = r"24-[A-HJ-NP-Z]{4}-[A-HJ-NP-Z]{4}"

# Capture the first code found in each message body into its own column.
message_bodies = df['Message Content']
df[code_col] = message_bodies.str.extract(f'({pattern})')

In [None]:
import numpy as np

# Derive the two DITP flags from the campaign name (case-insensitive match).
# NOTE(review): rows with a missing 'Campaign Name' are not handled explicitly
# here — confirm the export never contains blanks in that column.
campaign_names = df['Campaign Name']
is_parents_campaign = campaign_names.str.contains('parents', case=False)
is_control_campaign = campaign_names.str.contains('contrôle', case=False)

# type_benef: 1 when the campaign name mentions "parents", otherwise 2.
df[type_benef_col] = np.where(is_parents_campaign, 1, 2)
# objet_recu: 0 when the campaign name mentions "contrôle", otherwise 1.
df[obj_received_col] = np.where(is_control_campaign, 0, 1)

In [None]:
# Read the existing database export (exhaustive info); all columns as text.
df_db_existing = pd.read_csv(filepath_or_buffer=db_existing, dtype=str, sep=',')

In [None]:
# Left join on the pass code: every row of the Link Mobility export is kept,
# whether or not its code exists in the database.
# NOTE(review): with both suffixes set to None, pandas is expected to refuse
# overlapping non-key column names instead of renaming them — confirm.
merged_df = df.merge(
  df_db_existing,
  how="left",
  on=code_col,
  suffixes=(None, None),
)

In [None]:
# Remove the columns that came from the raw Link Mobility export; only the
# derived columns and the database columns remain.
merged_df = merged_df.drop(labels=lm_initial_columns, axis='columns')

In [None]:
import json 

# Cleaning data process
# Unwrap the JSON-encoded "allocataire" column into flat "allocataire-*" columns.
# merged_df comes from a left join, so rows whose code has no match in the
# database hold a missing value here. json.loads() raises TypeError on anything
# that is not text, so such cells are parsed as an empty record and simply end
# up with missing values in the unwrapped columns.
allocataire_records = [
  json.loads(raw) if isinstance(raw, str) else {}
  for raw in merged_df['allocataire']
]
df_json_allocataire = pd.json_normalize(allocataire_records)
df_json_allocataire = df_json_allocataire.add_prefix('allocataire-')

# The two frames are joined by row position, so merged_df gets a plain
# 0..n-1 index matching the one of the normalized frame.
merged_df.index = pd.RangeIndex(start=0, stop=len(merged_df), step=1)

merged_db_unwrapped = pd.merge(
  merged_df,
  df_json_allocataire[
    ['allocataire-courriel', 'allocataire-qualite', 'allocataire-nom', 'allocataire-prenom', 'allocataire-telephone', 'allocataire-date_naissance']
  ],
  left_index=True,
  right_index=True
)

# The raw JSON column is no longer needed once it has been unwrapped.
merged_db_unwrapped = merged_db_unwrapped.drop(columns=['allocataire'])

In [None]:
# Unwrap the JSON-encoded "adresse_allocataire" column to get "code_postal".
# Cells that are not text (missing values) are parsed as an empty record
# instead of being handed to json.loads(), which raises TypeError on them;
# those rows get a missing code_postal.
adresse_records = [
  json.loads(raw) if isinstance(raw, str) else {}
  for raw in merged_db_unwrapped['adresse_allocataire']
]
df_json_adresse_allocataire = pd.json_normalize(adresse_records)

# Joined by row position: align on a plain 0..n-1 index.
merged_db_unwrapped.index = pd.RangeIndex(start=0, stop=len(merged_db_unwrapped), step=1)

merged_db_unwrapped = pd.merge(merged_db_unwrapped, df_json_adresse_allocataire[['code_postal']], left_index=True, right_index=True)
merged_db_unwrapped = merged_db_unwrapped.drop(columns=['adresse_allocataire'])

In [None]:
# Parse the beneficiary and allocataire birth dates into datetimes.
# Values that cannot be parsed become NaT (errors='coerce').
birth_date_columns = (
  ('beneficiaire_date_naissance', 'date_naissance'),
  ('allocataire_date_naissance', 'allocataire-date_naissance'),
)
for parsed_col, raw_col in birth_date_columns:
    merged_db_unwrapped[parsed_col] = pd.to_datetime(merged_db_unwrapped[raw_col], errors='coerce')

In [None]:
# Add column for allocataire gender: 'F' when the title is 'Mme', 'M' for any
# other title. A missing title stays missing instead of being counted as 'M',
# so unknown genders are not reported as male.
allocataire_qualite = merged_db_unwrapped['allocataire-qualite']
allocataire_genre = pd.Series(
  np.where(allocataire_qualite == 'Mme', 'F', 'M'),
  index=allocataire_qualite.index,
)
merged_db_unwrapped['allocataire_genre'] = allocataire_genre.where(allocataire_qualite.notna())

In [None]:
# age of beneficiaire + allocataire (if it exists)
def calculate_age(born, today=None):
    """Return the age in completed years of someone born on ``born``.

    Args:
        born: Birth date; any object exposing ``year``, ``month`` and ``day``
            (``datetime.date``, ``datetime.datetime``, ``pandas.Timestamp``).
            Missing values (``None``, ``NaN``, ``NaT``) are accepted.
        today: Reference date the age is computed at. Defaults to the current
            date, which makes the result depend on the day the code is run;
            pass a fixed date for reproducible output.

    Returns:
        The age as an ``int``, or ``float('nan')`` when ``born`` is missing.
    """
    # Missing birth date: report a missing age explicitly rather than relying
    # on NaN arithmetic and comparisons.
    if pd.isna(born):
        return float('nan')

    reference = date.today() if today is None else today
    # One year less while this year's birthday has not been reached yet.
    had_birthday = (reference.month, reference.day) >= (born.month, born.day)
    return reference.year - born.year - (0 if had_birthday else 1)

# Compute one age column per parsed birth-date column.
age_sources = (
  ('beneficiaire_age', 'beneficiaire_date_naissance'),
  ('allocataire_age', 'allocataire_date_naissance'),
)
for age_col, birth_col in age_sources:
    merged_db_unwrapped[age_col] = merged_db_unwrapped[birth_col].apply(calculate_age)

In [None]:
# Store ages with the nullable integer dtype so missing ages stay missing
# without turning the whole column into floats.
age_columns = ['beneficiaire_age', 'allocataire_age']
merged_db_unwrapped[age_columns] = merged_db_unwrapped[age_columns].astype('Int64')

In [None]:
# Column names of the CSV exported to DITP (value encodings as documented).
type_benef_col = 'type_benef' # 1=parent, 2=direct
obj_received_col = 'objet_recu' # 0=control, 1=test
mail_received_col = 'mail_recu' # 0=control, 1=test
activated_col = 'recours' # 0=non, 1=oui
# NOTE(review): the gender columns are documented as 1=Femme, 2=Homme, but the
# allocataire gender is built as 'F'/'M' earlier in this file — confirm which
# encoding DITP expects.
benef_gender_col = 'beneficiaire_genre' # 1=Femme, 2=Homme
benef_age_col = 'beneficiaire_age'
parents_gender_col = 'parents_genre' # 1=Femme, 2=Homme
parents_age_col = 'parents_age'
postal_code_col = 'code_postal'
child_age_col = 'age_enfant'
qpv_col = 'qpv'
zrr_col ='zrr'

# Internal column name -> name used in the export.
ditp_column_names = {
  'allocataire_genre': parents_gender_col,
  'allocataire_age': parents_age_col,
  'genre': benef_gender_col,
  'beneficiaire_age': benef_age_col,
  'pass_statut': activated_col,
}
merged_db_unwrapped = merged_db_unwrapped.rename(columns=ditp_column_names)

# Columns kept in the export, in output order.
columns_to_keep = [
  type_benef_col,
  obj_received_col,
  activated_col,
  parents_age_col,
  parents_gender_col,
  benef_gender_col,
  benef_age_col,
  postal_code_col,
  qpv_col,
  zrr_col,
]

In [None]:
# Keep only the export columns, in the order given by columns_to_keep.
merged_db_unwrapped = merged_db_unwrapped.loc[:, columns_to_keep]

In [None]:
# Write the final table to the configured output path, without the row index.
merged_db_unwrapped.to_csv(
  path_or_buf=lm_extracted_information_output_pathfile,
  index=False,
)