# DITP experimentation n°2 - part 2
# Notebook for the Link Mobility (LM) campaign of 19 September, which starts on 26 September
Analysis deadlines: 10 October, 10 November, 31 December

The files are to be received from Link Mobility.
We need to fill in the beneficiary type / object received / mail received in the final consolidated file exported for DITP.

# Mapping table
| Type de bénéficiaire | Mail reçu    | Objet reçu   |
|----------------------|--------------|--------------|
| 1 (parent)           | 0 (contrôle) | 0 (contrôle) |
| 1 (parent)           | 0 (contrôle) | 1 (test)     |
| 1 (parent)           | 1 (test)     | 0 (contrôle) |
| 1 (parent)           | 1 (test)     | 1 (test)     |
| 2 (Ben direct)       | 0 (contrôle) | 0 (contrôle) |
| 2 (Ben direct)       | 0 (contrôle) | 1 (test)     |
| 2 (Ben direct)       | 1 (test)     | 0 (contrôle) |
| 2 (Ben direct)       | 1 (test)     | 1 (test)     |


In [None]:
from dotenv import load_dotenv
import os
import numpy as np

# Environment variables are read right after load_dotenv() has run
load_dotenv()

# Input / output file locations, all taken from the environment
db_current_path_file = os.environ['DB_CURRENT_WITH_PASS_STATUS']
consolidated_original_path_file = os.environ['CAMPAIGN_LINK_MOBILITY_26_SEPTEMBER_CONSOLIDATED_OUTPUT_PATHFILE']
ditp_analysis_output_path_file = os.environ['DITP_ANALYSIS_EXPORT_OUPUT_PATH_FILE']

# Column names used in the CSV exported to DITP
# -- experimental arm
type_benef_col = 'type_benef'  # 1=parent, 2=direct
object_type_col = 'objet_recu'  # 0=control, 1=test
mail_type_col = 'mail_recu'  # 0=control, 1=test
# -- outcome
activated_col = 'recours'  # 0=no, 1=yes
activation_date_col = 'date_recours'
# -- demographics
benef_gender_col = 'beneficiaire_genre'  # 1=Femme, 2=Homme
benef_age_col = 'beneficiaire_age'
parents_gender_col = 'parents_genre'  # 1=Femme, 2=Homme
parents_age_col = 'parents_age'
child_age_col = 'age_enfant'
# -- location
postal_code_col = 'code_postal'
qpv_col = 'qpv'
zrr_col = 'zrr'
residential_area_col = 'zone_habitation'

# Numeric codes written into the additional DITP columns
type_benef = dict(parent=1, benef=2)
object_type = dict(control=0, test=1)
mail_type = dict(control=0, test=1)

In [None]:
import pandas as pd

# Read the 8 files that were initially sent to Link Mobility, one dataframe per file.
def _load_campaign_part(part):
  """Read the campaign CSV whose path is stored in the env variable for `part`."""
  return pd.read_csv(os.environ[f'DITP_CAMPAIGN_FROM_26_SEPTEMBER_PART_{part}_PATHFILE'])


df_100 = _load_campaign_part('100')
df_101 = _load_campaign_part('101')
df_110 = _load_campaign_part('110')
df_111 = _load_campaign_part('111')

df_200 = _load_campaign_part('200')
df_201 = _load_campaign_part('201')
df_210 = _load_campaign_part('210')
df_211 = _load_campaign_part('211')

In [None]:
# Tag every row of each of the 8 files sent to Link Mobility with its experimental arm:
# beneficiary type, mail received and object received.
# One entry per file: (dataframe, beneficiary type, mail variant, object variant).
# A single table keeps the column order identical for all files; the copy-pasted version
# listed object/mail in a different order for the last file only.
campaign_arms = [
  (df_100, 'parent', 'control', 'control'),
  (df_101, 'parent', 'control', 'test'),
  (df_110, 'parent', 'test', 'control'),
  (df_111, 'parent', 'test', 'test'),
  (df_200, 'benef', 'control', 'control'),
  (df_201, 'benef', 'control', 'test'),
  (df_210, 'benef', 'test', 'control'),
  (df_211, 'benef', 'test', 'test'),
]

for arm_df, benef_key, mail_key, object_key in campaign_arms:
  # Scalar assignment broadcasts the code to every row of the file
  arm_df[type_benef_col] = type_benef[benef_key]
  arm_df[mail_type_col] = mail_type[mail_key]
  arm_df[object_type_col] = object_type[object_key]

In [None]:
# Stack all the files that were sent to Link Mobility into one dataframe.
# ignore_index=True already gives the result a fresh 0..n-1 index, so no extra
# reset_index() is needed (it only added a redundant 'index' column).
merged_df_to_update = pd.concat(
  [
    df_100,
    df_101,
    df_110,
    df_111,
    df_200,
    df_201,
    df_210,
    df_211,
  ],
  ignore_index=True,
)

In [None]:
# Consolidated source file holding the exhaustive information about
# beneficiaire & allocataire; column types are forced so that codes stay text
# and ages / flags can hold missing values.
consolidated_dtypes = {
  'email': 'str',
  'code_postal': 'str',
  'allocataire_qualite': 'str',
  'allocataire_genre': 'str',
  'allocataire_age': 'Int64',
  'beneficiaire_genre': 'str',
  'beneficiaire_age': 'Int64',
  'qpv': 'boolean',
  'zrr': 'boolean',
}

original_df = pd.read_csv(consolidated_original_path_file, dtype=consolidated_dtypes)

In [None]:
# Enrich the rows sent to Link Mobility with the additional information held in the
# consolidated file (beneficiary age, allocataire age, zrr, qpv, ...), matching on 'code'.
# Rows without a match are kept (left join); clashing column names coming from the
# consolidated file get the '_new' suffix.
merged_df_updated = merged_df_to_update.merge(
  original_df,
  on=['code'],
  how='left',
  suffixes=(None, '_new'),
)

In [None]:
# A missing qpv / zrr flag is treated as "not in that zone"
for zone_flag_col in ('qpv', 'zrr'):
  merged_df_updated[zone_flag_col] = merged_df_updated[zone_flag_col].fillna(False)

In [None]:
# Database export with the pass Sport activation status ('recours') of each person;
# identifiers and dates are kept as text, the status as a nullable integer.
db_export_dtypes = {
  'id': 'str',
  'id_psp': 'str',
  'date_recours': 'str',
  'recours': 'Int64',
}

df_db_existing = pd.read_csv(db_current_path_file, dtype=db_export_dtypes)

In [None]:
# Number of rows in the database export
df_db_existing.shape[0]

In [None]:
# Keep only the people whose pass Sport has been activated (recours == 1)
activated_mask = df_db_existing['recours'].eq(1)
df_db_existing_activated_only = df_db_existing[activated_mask]

In [None]:
# Number of activated rows
df_db_existing_activated_only.shape[0]

In [None]:
# Align the identifier column name with the campaign files so both can be joined on 'code'
df_db_existing_activated_only = df_db_existing_activated_only.rename(
  {'id_psp': 'code'},
  axis='columns',
)

In [None]:
# Attach the activation flag and activation date to the people who activated their pass Sport.
# Left join on 'code': people absent from the activated list keep empty values here.
activation_lookup = df_db_existing_activated_only[['code', activated_col, activation_date_col]]

merged_df_updated = merged_df_updated.merge(
  activation_lookup,
  on=['code'],
  how='left',
  suffixes=(None, '_new'),
)

In [None]:
# No activation record means "not activated": fill with 0, then store the flag as a plain integer
merged_df_updated[activated_col] = merged_df_updated[activated_col].fillna(0).astype(int)

In [None]:
# Map genders to the DITP coding: 1 = Femme ('F'), 2 = Homme (any other non-missing value).
# Missing genders (e.g. rows left without a match by the left join on the consolidated file)
# stay missing instead of being silently coded as 2, which would over-count men.
for gender_col in ('allocataire_genre', 'beneficiaire_genre'):
  raw_gender = merged_df_updated[gender_col]
  # Nullable integer dtype so that the codes are exported as 1 / 2 and gaps as empty cells
  coded_gender = pd.Series(
    np.where(raw_gender == 'F', 1, 2),
    index=raw_gender.index,
    dtype='Int64',
  )
  merged_df_updated[gender_col] = coded_gender.mask(raw_gender.isna())

In [None]:
# Order rows by activation date: most recent first, rows without a date last.
# The column holds day/month/year text, so it is parsed to real dates for the sort;
# values that do not match the format become NaT (errors='coerce').
parsed_activation_dates = pd.to_datetime(
  merged_df_updated[activation_date_col],
  errors='coerce',
  format='%d/%m/%Y',
)
merged_df_updated[activation_date_col] = parsed_activation_dates

merged_df_updated = merged_df_updated.sort_values(
  by=activation_date_col,
  ascending=False,
  na_position='last',
)

# Back to the initial human readable day/month/year text
sorted_activation_dates = merged_df_updated[activation_date_col]
merged_df_updated[activation_date_col] = sorted_activation_dates.dt.strftime('%d/%m/%Y')

In [None]:
# Build <residential_area_col>: 'qpv' or 'zrr' when exactly one flag is set, empty otherwise.
# Rows flagged as both qpv and zrr are left empty because that combination isn't reliable.
in_qpv = merged_df_updated['qpv']
in_zrr = merged_df_updated['zrr']

# Label as if the flags never overlapped (qpv takes precedence over zrr)...
single_zone_label = np.where(in_qpv, 'qpv', np.where(in_zrr, 'zrr', ''))

# ...then blank out the rows where both flags are True
merged_df_updated[residential_area_col] = np.where(in_qpv & in_zrr, '', single_zone_label)

In [None]:
# Exclude people who have not been delivered, based on the Link Mobility (LM) reports
from unidecode import unidecode

# One LM delivery report per file sent (same 8 arms as the campaign files)
lm_report_100_pathfile = os.environ['DITP_CAMPAIGN_FROM_26_SEPTEMBER_PART_100_LM_REPORT_PATHFILE']
lm_report_101_pathfile = os.environ['DITP_CAMPAIGN_FROM_26_SEPTEMBER_PART_101_LM_REPORT_PATHFILE']
lm_report_110_pathfile = os.environ['DITP_CAMPAIGN_FROM_26_SEPTEMBER_PART_110_LM_REPORT_PATHFILE']
lm_report_111_pathfile = os.environ['DITP_CAMPAIGN_FROM_26_SEPTEMBER_PART_111_LM_REPORT_PATHFILE']
lm_report_200_pathfile = os.environ['DITP_CAMPAIGN_FROM_26_SEPTEMBER_PART_200_LM_REPORT_PATHFILE']
lm_report_201_pathfile = os.environ['DITP_CAMPAIGN_FROM_26_SEPTEMBER_PART_201_LM_REPORT_PATHFILE']
lm_report_210_pathfile = os.environ['DITP_CAMPAIGN_FROM_26_SEPTEMBER_PART_210_LM_REPORT_PATHFILE']
lm_report_211_pathfile = os.environ['DITP_CAMPAIGN_FROM_26_SEPTEMBER_PART_211_LM_REPORT_PATHFILE']

# Reports are ';'-separated, MacRoman-encoded; every column is read as text
lm_report_100_df = pd.read_csv(lm_report_100_pathfile, sep=';', encoding='macroman', dtype=str)
lm_report_101_df = pd.read_csv(lm_report_101_pathfile, sep=';', encoding='macroman', dtype=str)
lm_report_110_df = pd.read_csv(lm_report_110_pathfile, sep=';', encoding='macroman', dtype=str)
lm_report_111_df = pd.read_csv(lm_report_111_pathfile, sep=';', encoding='macroman', dtype=str)
lm_report_200_df = pd.read_csv(lm_report_200_pathfile, sep=';', encoding='macroman', dtype=str)
lm_report_201_df = pd.read_csv(lm_report_201_pathfile, sep=';', encoding='macroman', dtype=str)
lm_report_210_df = pd.read_csv(lm_report_210_pathfile, sep=';', encoding='macroman', dtype=str)
lm_report_211_df = pd.read_csv(lm_report_211_pathfile, sep=';', encoding='macroman', dtype=str)

merged_report_df = pd.concat([
  lm_report_100_df,
  lm_report_101_df,
  lm_report_110_df,
  lm_report_111_df,
  lm_report_200_df,
  lm_report_201_df,
  lm_report_210_df,
  lm_report_211_df
], ignore_index=True)

# Report statuses (accent-stripped, lower-cased) that are treated as "delivered"
considered_as_delivered = [
  'open',
  'envoi ok',
  'open+clic',
  'clic',
  'open+view+clic',
  'open+view',
  'open+unsb',
  'unsb',
  'open+view+clic+unsb',
  'open+clic+unsb',
  'view',
  'clic+unsb',
  'open+view+unsb',
  'open+clic+abus',
  'view+clic+unsb',
  'open+unsb+abus',
  'view+clic',
  'clic+abuse',
  # NOTE(review): every other entry spells the suffix 'abus'; 'clic+abuse' above looks like
  # a typo, so the 'abus' spelling is accepted as well - confirm against a real LM report.
  'clic+abus',
  'open+view+clic+abus',
  'open+view+abus',
  'view+unsb',
  'unsb+abus'
]

# Normalise the status: strip accents, then lower-case.
# na_action='ignore' leaves rows without a status as missing instead of passing NaN
# (a float) to unidecode, which only accepts text; such rows are simply not "delivered".
merged_report_df['ERROR_NAME'] = (
  merged_report_df['ERROR_NAME']
  .map(unidecode, na_action='ignore')
  .str.lower()
)
merged_report_df_only_delivered = merged_report_df[merged_report_df['ERROR_NAME'].isin(considered_as_delivered)]

# Exclude not delivered users: keep only the codes that appear in a delivered report row
merged_df_updated_with_only_delivered = merged_df_updated[merged_df_updated['code'].isin(merged_report_df_only_delivered['CODE'])]

In [None]:
# Number of report rows per normalised status
status_counts = merged_report_df['ERROR_NAME'].value_counts()
status_counts

In [None]:
# Source column names -> names expected in the DITP export
ditp_column_names = {
  'allocataire_genre': parents_gender_col,
  'allocataire_age': parents_age_col,
  'beneficiaire_genre': benef_gender_col,
  'beneficiaire_age': benef_age_col,
}

# Columns of the DITP export, in output order
columns_to_keep = [
  type_benef_col,
  object_type_col,
  mail_type_col,
  activated_col,
  activation_date_col,
  parents_age_col,
  parents_gender_col,
  benef_gender_col,
  benef_age_col,
  postal_code_col,
  residential_area_col,
]

# Rename first, then keep only the export columns
final_df = merged_df_updated_with_only_delivered.rename(columns=ditp_column_names)[columns_to_keep]

In [None]:
# Write the final DITP file as CSV, without the dataframe index
final_df.to_csv(path_or_buf=ditp_analysis_output_path_file, index=False)