# DITP experimentation n°1
## Link Mobility notebook to extract people who didn't activate their pass Sport
The process:
- Take the export of the Link Mobility hard-bounce SMS campaign from 12 September
- Extract the pass Sport code from each message and add two columns
  - type of beneficiary (parent=1, beneficiaire_direct=2)
  - object received (controle=0, test=1)
- Cross-reference with the existing database to add the column "recours", which tells whether each beneficiary activated their pass Sport
- Calculate the age of allocataire & beneficiaire from existing data
- Add column code_postal from existing data
- Add geo columns zrr,qpv from existing data
- Output the final CSV file for DITP (all information must be anonymized)


In [None]:
import pandas as pd
from dotenv import load_dotenv
import os

# load_dotenv() runs before any os.environ lookup below
# (presumably the variables come from a local .env file - confirm).
load_dotenv()

# --- Input / output locations -------------------------------------------------
# Each lookup raises KeyError when the variable is not set.

# Campaign export, retrievable from the Link Mobility platform
lm_pathfile = os.environ['CAMPAIGN_HARD_BOUNCE_SMS_12_SEPTEMBER_PATHFILE']

# Destination of the CSV written at the end of the notebook
lm_extracted_information_output_pathfile = os.environ[
    'CAMPAIGN_HARD_BOUNCE_SMS_12_SEPTEMBER_EXTRACTED_INFORMATION_OUTPUT_PATHFILE'
]

# CSV of the existing database (read further down)
db_existing = os.environ['DB_CURRENT_WITH_PASS_STATUS']

# --- Column names used by the following cells ---------------------------------
code_col = 'id_psp'  # code extracted from the SMS text; merge key
type_benef_col = 'type_benef'  # 1=parent, 2=direct
obj_received_col = 'objet_recu'  # 0=control, 1=test
activation_date_col = 'date_recours'
residential_area_col = 'zone_habitation'

In [None]:
# Load the Link Mobility campaign export; every column is read as text.
df = pd.read_csv(filepath_or_buffer=lm_pathfile, dtype=str, sep=',')

In [None]:
# Remember the column labels of the raw export before anything is added to
# the frame; they are used later to drop the original campaign columns.
lm_initial_columns = df.columns

# Code format: "24-" followed by two dash-separated groups of four uppercase
# letters. The character classes leave out "O" and "I".
pattern = r"24-[A-HJ-NP-Z]{4}-[A-HJ-NP-Z]{4}"

# Wrapping the pattern in parentheses makes the whole code the capture group
# returned by str.extract (first match per message; no match gives a missing
# value).
df[code_col] = df['Message Content'].str.extract(f'({pattern})')

In [None]:
import numpy as np

# Derive the two DITP label columns from the campaign name.
#
# na=False matters: on an object-dtype column str.contains returns a missing
# value for rows without a campaign name, and np.where treats that missing
# value as truthy, so those rows used to be labelled parent (1) / control (0).
# With na=False a missing name behaves like any other name that does not
# match and falls into the default branch.
campaign_name = df['Campaign Name']
is_parent_campaign = campaign_name.str.contains('parents', case=False, na=False)
is_control_campaign = campaign_name.str.contains('contrôle', case=False, na=False)

df[type_benef_col] = np.where(is_parent_campaign, 1, 2)  # 1=parent, 2=direct
df[obj_received_col] = np.where(is_control_campaign, 0, 1)  # 0=control, 1=test

In [None]:
# Existing database with the exhaustive information; every column is read as text.
df_db_existing = pd.read_csv(filepath_or_buffer=db_existing, dtype=str, sep=',')

In [None]:
# Attach the existing-database columns to every campaign row, matched on the
# extracted code, then remove the raw Link Mobility columns.
#
# pandas matches missing merge keys with each other. A message in which no
# code was found (missing id) would therefore be joined to every database row
# whose id is missing too, attaching unrelated data and multiplying rows.
# Database rows without a key are left out of the right-hand side; campaign
# rows without a code are still kept by the left join, with empty database
# columns.
merged_df = pd.merge(
    df,
    df_db_existing.dropna(subset=[code_col]),
    on=code_col,
    how="left",
    suffixes=(None, '_new'),
)

# Only the columns added after loading the export and the database columns remain.
merged_df = merged_df.drop(columns=lm_initial_columns)

In [None]:
# Names of the columns of the CSV exported to DITP
type_benef_col = 'type_benef'  # 1=parent, 2=direct
obj_received_col = 'objet_recu'  # 0=control, 1=test
mail_received_col = 'mail_recu'  # 0=control, 1=test
activated_col = 'recours'  # 0=non, 1=oui
benef_gender_col = 'beneficiaire_genre'  # 1=Femme, 2=Homme
benef_age_col = 'beneficiaire_age'
parents_gender_col = 'parents_genre'  # 1=Femme, 2=Homme
parents_age_col = 'parents_age'
postal_code_col = 'code_postal'
child_age_col = 'age_enfant'
qpv_col = 'qpv'
zrr_col = 'zrr'

# Source column -> exported column. Labels absent from the frame are ignored
# by rename.
# NOTE(review): 'genre' and 'beneficiaire_genre' both map to the same exported
# name; if the database carries both, the result has two columns with that
# label - confirm against the database schema.
ditp_column_names = {
    'allocataire_genre': parents_gender_col,
    'allocataire_age': parents_age_col,
    'genre': benef_gender_col,
    'beneficiaire_age': benef_age_col,
    'beneficiaire_genre': benef_gender_col,
}

merged_df = merged_df.rename(columns=ditp_column_names)

In [None]:
# Parse the activation dates (day/month/year). Values that do not match the
# format become NaT instead of raising.
parsed_activation_dates = pd.to_datetime(
    merged_df[activation_date_col],
    errors='coerce',
    format='%d/%m/%Y',
)
merged_df[activation_date_col] = parsed_activation_dates

# Most recent activation first; rows without a date go to the end.
merged_db_unwrapped = merged_df.sort_values(
    by=activation_date_col,
    ascending=False,
    na_position='last',
)

# Turn the dates back into their initial, human-readable text form.
merged_db_unwrapped[activation_date_col] = (
    merged_db_unwrapped[activation_date_col].dt.strftime('%d/%m/%Y')
)

In [None]:
# Columns delivered to DITP, in output order. Everything else (including the
# code used as merge key) is left out of the final frame.
columns_to_keep = [
    type_benef_col, obj_received_col,
    activated_col, activation_date_col,
    parents_age_col, parents_gender_col,
    benef_gender_col, benef_age_col,
    postal_code_col, residential_area_col,
]

# Raises KeyError if any of the listed columns is absent from the merged data.
df_final = merged_db_unwrapped.loc[:, columns_to_keep]

In [None]:
# Write the final table to the configured output path, without the row index.
df_final.to_csv(path_or_buf=lm_extracted_information_output_pathfile, index=False)