In [None]:
import os
from dotenv import load_dotenv
import pandas as pd

load_dotenv()

# --- Input / output file locations, all taken from environment variables ---
not_activated_pathfile = os.environ['NOT_ACTIVATED_PASS_SPORTS']
lm_report_pathfile = os.environ['LM_REPORT_FROM_26_SEPTEMBER_CAMPAIGN']
batch_mailing_campaign_pathfile = os.environ['BATCH_MAILING_CAMPAIGN_PATHFILE']
batch_sms_campaign_pathfile = os.environ['BATCH_SMS_CAMPAIGN_PATHFILE']
batch_people_to_not_contact_pathfile = os.environ['BATCH_PEOPLE_TO_NOT_CONTACT_PATHFILE']  # (for DITP)

# --- Number of people to draw for each batch ---
batch_size_sms_campaign = 20_000
batch_size_mailing_campaign = 20_000
batch_size_people_to_not_contact_campaign = 20_000  # (for DITP)

# Link Mobility report: ';'-separated, decoded as Windows-1252, every column kept as str.
# Malformed lines are skipped silently (on_bad_lines='skip').
# The 'CODE' column is exposed as 'code' for the rest of the notebook.
report_df = (
  pd.read_csv(
    lm_report_pathfile,
    sep=';',
    encoding='Windows-1252',
    dtype=str,
    engine="c",
    on_bad_lines='skip',
  )
  .rename(columns={'CODE': 'code'})
)

# Read with the pandas defaults (UTF-8 decoding, ',' separator, inferred dtypes).
# NOTE(review): presumably this is the CSV produced by step 0_cnous_dedupe.ipynb — confirm.
not_activated_df = pd.read_csv(not_activated_pathfile)

In [None]:
# Keep only the Link Mobility report rows that carry a phone number (MSISDN is not null)
report_df = report_df.dropna(subset=['MSISDN'])

In [None]:
# Drop report rows whose ERROR_NAME is 'adresse non valide' (compared case-insensitively).
# Rows with a missing ERROR_NAME compare as "not equal" and are therefore kept.
# NOTE(review): the previous comment described these rows as "invalid emails" — confirm
# what 'adresse non valide' refers to in the Link Mobility report.
keep_mask = report_df['ERROR_NAME'].str.lower().ne('adresse non valide')
report_df = report_df.loc[keep_mask]

In [None]:
# Baseline row count of the not-activated list, before any exclusion or filtering
print(f'{len(not_activated_df)} not activated, before excluding RGPD users and before filtering')

In [None]:
# Exclude users that are blacklisted for RGPD reasons.
# Tracking page:
# https://www.notion.so/Suivi-remont-s-utilisateurs-0bfd5c50ac67460a99ef651e3f8a0f45?pvs=4#cd6cbf85cbe6498c8ebbeda96ecba42d

# The blacklist location comes from the environment and is joined onto '../../'
pathfile_rgpd_users_blacklist = os.path.join(
  '../../',
  os.environ['RGPD_USERS_BLACKLIST_CSV_PATH_FILE'],
)

# Only the 'email' column is loaded, with the pandas 'string' dtype
df_rgpd = pd.read_csv(
  pathfile_rgpd_users_blacklist,
  dtype={ 'email': 'string' },
  usecols=['email'],
)

# NOTE(review): isin() is an exact match (case- and whitespace-sensitive) —
# confirm both files store emails in the same normalized form.
is_blacklisted = not_activated_df['email'].isin(df_rgpd['email'])
not_activated_excluding_rgpd_users_df = not_activated_df[~is_blacklisted]

In [None]:
# Row count once RGPD-blacklisted emails have been removed (report filter not applied yet)
print(f'{len(not_activated_excluding_rgpd_users_df)} not activated, after excluding RGPD users and before filtering')

In [None]:
# Keep only the not-activated people whose id_psp appears in the 'code' column
# of the Link Mobility report.
# NOTE(review): report codes are read as str while id_psp dtype is inferred by pandas —
# confirm id_psp is not parsed as a number, otherwise nothing would match.
is_in_report = not_activated_excluding_rgpd_users_df['id_psp'].isin(report_df['code'])
filtered_df = not_activated_excluding_rgpd_users_df.loc[is_in_report]

In [None]:
# Size of the population the batches are drawn from
print(f'{len(filtered_df)} people after filtering')

In [None]:
# Shuffle every row (frac=1) with a fixed seed so the order is reproducible,
# then renumber the index from 0
filtered_df_shuffle = (
  filtered_df
  .sample(frac=1, random_state=1)
  .reset_index(drop=True)
)

In [None]:
# Draw <batch_size_sms_campaign> people for the SMS campaign (fixed seed, fresh 0-based index)
sms_batch_df = (
  filtered_df_shuffle
  .sample(n=batch_size_sms_campaign, random_state=1)
  .reset_index(drop=True)
)

In [None]:
# Exclude people that were already picked for the SMS campaign
already_in_sms_batch = filtered_df_shuffle['id_psp'].isin(sms_batch_df['id_psp'])
filtered_df_shuffle_without_sms_batch = filtered_df_shuffle[~already_in_sms_batch]

# Picking <batch_size_mailing_campaign> for mailing campaign.
# The draw is sized by the mailing batch size, not the SMS one, so the two
# campaigns can be tuned independently.
mailing_batch_df = (
  filtered_df_shuffle_without_sms_batch
  .sample(n=batch_size_mailing_campaign, random_state=1)
  .reset_index(drop=True)
)

In [None]:
# Exclude people that were already picked for the SMS campaign and mailing campaign
already_in_sms_batch = filtered_df_shuffle['id_psp'].isin(sms_batch_df['id_psp'])
already_in_mailing_batch = filtered_df_shuffle['id_psp'].isin(mailing_batch_df['id_psp'])
filtered_df_shuffle_without_sms_and_mailing_batch = filtered_df_shuffle[
  ~already_in_sms_batch & ~already_in_mailing_batch
]

# Picking <batch_size_people_to_not_contact_campaign> for people to not contact campaign (for DITP).
# The draw is sized by its own batch-size setting rather than the SMS one.
people_to_not_contact_df = (
  filtered_df_shuffle_without_sms_and_mailing_batch
  .sample(n=batch_size_people_to_not_contact_campaign, random_state=1)
  .reset_index(drop=True)
)

In [None]:
# Sanity checks: the three batches must be pairwise disjoint on id_psp.

# Mailing vs SMS
mailing_rows_also_in_sms = mailing_batch_df['id_psp'].isin(sms_batch_df['id_psp'])
assert not mailing_rows_also_in_sms.any()

# People-to-not-contact vs mailing: number of shared id_psp rows
people_to_not_contact_overlap_count_with_mailing_batch = int(
  people_to_not_contact_df['id_psp'].isin(mailing_batch_df['id_psp']).sum()
)

# People-to-not-contact vs SMS: number of shared id_psp rows
people_to_not_contact_overlap_count_with_sms_batch = int(
  people_to_not_contact_df['id_psp'].isin(sms_batch_df['id_psp']).sum()
)

# Both counts are non-negative, so checking each against 0 is the same as checking their sum
assert people_to_not_contact_overlap_count_with_mailing_batch == 0
assert people_to_not_contact_overlap_count_with_sms_batch == 0

In [None]:
# Write each batch to its configured CSV path, in the order mailing, SMS, people-to-not-contact.
# DataFrame.to_csv defaults are used, so the 0-based index is written as the first column.
for batch_df, output_pathfile in (
  (mailing_batch_df, batch_mailing_campaign_pathfile),
  (sms_batch_df, batch_sms_campaign_pathfile),
  (people_to_not_contact_df, batch_people_to_not_contact_pathfile),
):
  batch_df.to_csv(output_pathfile)