# Purpose of this notebook
Extract a sample from a database export that already contains only unique emails and has already been filtered on age (from 6 to over 18).

# Sampling criteria
Sample 10% of the total base of approximately 4M unique emails, i.e. about 400k rows.

- 50% activated passes and 50% non-activated passes
- Age brackets, 25% each:
  - 6-10
  - 11-14
  - 15-18
  - Greater than 18

In [None]:
import os
from dotenv import load_dotenv
import pandas as pd

# Load variables declared in a local .env file into the process environment.
load_dotenv()

# Both paths are mandatory: indexing os.environ raises KeyError if one is unset.
sample_filepath, final_sample_output_filepath = (
    os.environ['SAMPLE_FILEPATH'],
    os.environ['FINAL_SAMPLE_OUTPUT_FILEPATH'],
)

# Full input extract; every later cell derives its frames from `df`.
df = pd.read_csv(filepath_or_buffer=sample_filepath)

In [None]:
# Activated passes: rows whose `pass_statut` equals 'utilise'.
activated_df = df.loc[df['pass_statut'].eq('utilise')]
# Display the number of activated rows as the cell output.
activated_df.shape[0]

In [None]:
# Non-activated passes: rows whose `pass_statut` equals 'non_utilise'.
not_activated_df = df.loc[lambda frame: frame['pass_statut'] == 'non_utilise']
# Display the number of non-activated rows as the cell output.
len(not_activated_df.index)

In [None]:
# The two status groups must account for every row of `df`. A row with any other
# (or missing) `pass_statut` would silently be left out of both groups, and
# therefore out of the final sample, so fail loudly and show the actual values.
assert len(activated_df) + len(not_activated_df) == len(df), (
    'Rows with an unexpected pass_statut value were found: '
    f"{df['pass_statut'].value_counts(dropna=False).to_dict()}"
)

In [None]:
# Split the activated passes into the four age brackets of the sampling plan.
# `between` is inclusive on both bounds, matching `>= low` and `<= high`.
activated_age = activated_df['age']

activated_bracket_6_10 = activated_df[activated_age.between(6, 10)]
activated_bracket_11_14 = activated_df[activated_age.between(11, 14)]
activated_bracket_15_18 = activated_df[activated_age.between(15, 18)]
activated_bracket_more_than_18 = activated_df[activated_age.gt(18)]


In [None]:
# Split the non-activated passes into the same four age brackets.
# Each mask is evaluated on `not_activated_df` itself through a callable, and
# `between` is inclusive on both bounds, matching `>= low` and `<= high`.
not_activated_bracket_6_10 = not_activated_df.loc[lambda frame: frame['age'].between(6, 10)]
not_activated_bracket_11_14 = not_activated_df.loc[lambda frame: frame['age'].between(11, 14)]
not_activated_bracket_15_18 = not_activated_df.loc[lambda frame: frame['age'].between(15, 18)]
not_activated_bracket_more_than_18 = not_activated_df.loc[lambda frame: frame['age'] > 18]

In [None]:
# Row count of each (activation status, age bracket) group, printed so the
# group sizes can be compared with the per-group sample size before sampling.
bracket_counts_report = f'''
  Activated bracket 6-10 : {activated_bracket_6_10.shape[0]}
  Activated bracket 11-14 : {activated_bracket_11_14.shape[0]}
  Activated bracket 15-18 : {activated_bracket_15_18.shape[0]}
  Activated bracket > 18 : {activated_bracket_more_than_18.shape[0]}

  Not activated bracket 6-10 : {not_activated_bracket_6_10.shape[0]}
  Not activated bracket 11-14 : {not_activated_bracket_11_14.shape[0]}
  Not activated bracket 15-18 : {not_activated_bracket_15_18.shape[0]}
  Not activated bracket > 18 : {not_activated_bracket_more_than_18.shape[0]}
  '''
print(bracket_counts_report)

In [None]:
# Target number of rows per (activation status, age bracket) group:
# 8 groups x 50k = 400k rows when every group is large enough.
sample_size = 50_000
# Fixed seed so that Restart & Run All draws exactly the same sample every time.
random_seed = 42


def sample_bracket(bracket, n=sample_size, seed=random_seed):
    """Return at most `n` randomly drawn rows of `bracket`.

    A bracket holding `n` rows or fewer is returned whole and in its original
    order, because `DataFrame.sample(n=...)` without replacement raises
    ValueError when asked for more rows than the frame contains.

    Args:
        bracket: DataFrame holding the rows of one group.
        n: maximum number of rows to keep.
        seed: value passed as `random_state` to make the draw reproducible.

    Returns:
        A DataFrame with `min(n, len(bracket))` rows.
    """
    if len(bracket) <= n:
        return bracket
    return bracket.sample(n=n, random_state=seed)


# NOTE: a group smaller than `sample_size` (the activated "> 18" bracket was
# known to be) contributes all of its rows, so the final sample can hold fewer
# than 8 * sample_size rows.
final_df = pd.concat(
    [
        sample_bracket(bracket)
        for bracket in (
            activated_bracket_6_10,
            activated_bracket_11_14,
            activated_bracket_15_18,
            activated_bracket_more_than_18,
            not_activated_bracket_6_10,
            not_activated_bracket_11_14,
            not_activated_bracket_15_18,
            not_activated_bracket_more_than_18,
        )
    ],
    ignore_index=True,
)

In [None]:
# Only these three columns are written to the output file. They hold personal
# data (names and email addresses), so handle the exported CSV accordingly.
export_columns = ['nom_allocataire', 'prenom_allocataire', 'courriel']

# index=False keeps the positional index out of the CSV.
final_df[export_columns].to_csv(final_sample_output_filepath, index=False)