In [1]:
import pandas as pd
import yaml
from functions import replace_column_names, trim_and_lower, replace_patterns, convert_data_type

In [2]:
# Load the project configuration. The remaining cells read their input and
# output paths from `config`, so a failure here must stop the notebook.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Catch only the missing-file case: a bare `except` also hid YAML syntax
    # errors and permission problems behind this message.
    print("The configuration file was not found!")
    # Re-raise so execution stops here instead of failing later with a
    # NameError on `config`.
    raise

***Notes on groups and clients***
- Both datasets are of equal length; however, 20109 rows in the `groups` dataset have no variation assigned.
- Once `dropna` is applied to the dataframe resulting from the merge of `clients` and `groups`, only 50487 rows remain.

In [3]:
# Read the raw clients CSV from the path stored in the config and pass it
# straight through the project's `trim_and_lower` helper.
clients_path = config['data']['raw']['raw_1']
clients = trim_and_lower(pd.read_csv(clients_path))

In [4]:
# Read the raw groups CSV from the path stored in the config and pass it
# straight through the project's `trim_and_lower` helper.
groups_path = config['data']['raw']['raw_2']
groups = trim_and_lower(pd.read_csv(groups_path))

In [5]:
# Count the missing values in each column of `clients`.
clients.isnull().sum()

client_id            0
clnt_tenure_yr      14
clnt_tenure_mnth    14
clnt_age            15
gendr               14
num_accts           14
bal                 14
calls_6_mnth        14
logons_6_mnth       14
dtype: int64

In [6]:
# Join `clients` and `groups` on their shared `client_id` column
# (pandas' default inner join).
df = clients.merge(groups, on='client_id')

In [7]:
# Keep only complete rows: any row with at least one null value is discarded
# because it is not relevant to the A/B test.
df = df.dropna(axis=0, how='any')

In [8]:
# Expand the abbreviations used in some column names into full words.
col_replacements = dict(
    clnt='client',
    gendr='gender',
    accts='accounts',
    bal='balance',
)

df = replace_column_names(df, col_replacements)

In [9]:
# Spell out the single-letter gender codes, then cast the float columns that
# hold whole-number quantities to integers.
gender_replacements = dict(m='male', f='female', u='unknown', x='non-binary')

df['gender'] = df['gender'].apply(
    lambda code: replace_patterns(code, gender_replacements)
)

float_to_int_cols = [
    'client_tenure_yr', 'client_tenure_mnth',
    'num_accounts',
    'calls_6_mnth', 'logons_6_mnth',
]

df = convert_data_type(df, float_to_int_cols, int)

In [10]:
# This cell derives categorical cohort columns from some numerical columns,
# which makes analysis across cohorts much easier.
#
# Every bin list ends with an open-ended (infinite) edge instead of the column
# maximum. pd.cut needs strictly increasing edges, so using the maximum raised
# a ValueError whenever it did not exceed the previous fixed edge. Values above
# the last fixed edge still land in the last bucket, so results are unchanged
# for data that previously worked.
open_end = float('inf')

# Reference year used to turn age into year of birth -- presumably the year
# the experiment ran; confirm.
exp_year = 2017
# astype(int) truncates the fractional part left by half-year ages.
df['yob'] = (exp_year - df.client_age).astype(int)

gen_bins = [1901, 1927, 1945, 1964, 1980, 1996, open_end]
gen_labels = ['greatest', 'silent', 'baby boomers', 'gen x', 'millennials', 'gen z']
df['generation'] = pd.cut(
    df.yob,
    bins=gen_bins,
    labels=gen_labels,
    include_lowest=True
)

tenure_bins = [0, 5, 10, 15, open_end]
tenure_labels = ['0-5 years', '6-10 years', '11-15 years', '15+ years']
df['tenure_group'] = pd.cut(
    df.client_tenure_yr,
    bins=tenure_bins,
    labels=tenure_labels,
    # Include the lower edge so a tenure of 0 years falls in '0-5 years'
    # instead of becoming NaN, matching the label and the other cuts here.
    include_lowest=True,
)

online_bins = [0, 3, 7, open_end]
online_labels = ['low', 'medium', 'high']
df['online_activity_level'] = pd.cut(
    df.logons_6_mnth,
    bins=online_bins,
    labels=online_labels,
    include_lowest=True
)

call_bins = [0, 2, 4, open_end]
call_labels = ['low', 'medium', 'high']
df['call_rate'] = pd.cut(
    df.calls_6_mnth,
    bins=call_bins,
    labels=call_labels,
    include_lowest=True
)

bal_bins = [0, 50000, 100000, 500000, open_end]
bal_labels = ["0-50,000", "50,001-100,000", "100,001-500,000", "500,001+"]
df['balance_group'] = pd.cut(
    df.balance,
    bins=bal_bins,
    right=True,
    labels=bal_labels,
    include_lowest=True,
)

df.head()

Unnamed: 0,client_id,client_tenure_yr,client_tenure_mnth,client_age,gender,num_accounts,balance,calls_6_mnth,logons_6_mnth,variation,yob,generation,tenure_group,online_activity_level,call_rate,balance_group
0,836976,6,73,60.5,unknown,2,45105.3,6,9,test,1956,baby boomers,6-10 years,high,high,"0-50,000"
1,2304905,7,94,58.0,unknown,2,110860.3,6,9,control,1959,baby boomers,6-10 years,high,high,"100,001-500,000"
2,1439522,5,64,32.0,unknown,2,52467.79,6,9,test,1985,millennials,0-5 years,high,high,"50,001-100,000"
3,1562045,16,198,49.0,male,2,67454.65,3,6,test,1968,gen x,15+ years,medium,medium,"50,001-100,000"
4,5126305,12,145,33.0,female,2,103671.75,0,3,control,1984,millennials,11-15 years,low,low,"100,001-500,000"


In [11]:
# Write the cleaned dataframe to the CSV path given in the config, leaving
# the index out of the file.
output_file_path = config['data']['clean']['clean_1']
df.to_csv(path_or_buf=output_file_path, index=False)