# Merge & Divide

- Combine all client information from every DataFrame into a single DataFrame.
- Parse `'gender'` column with four values (_Male_, _Female_, _Other_ and _Unknown_)
- Convert the `'step'` column to numerical values to avoid errors.
- Add `'error_count'`, `'stepped_back'` and `'variation'` columns

In [None]:
import pandas as pd

In [None]:
# import reusable functions from utils directory
import sys
sys.path.append('../../utils')
import functions

In [None]:
# Read the combined, cleaned CSV into a DataFrame and preview its first five rows.
combined_cleaned_path = '../../data/clean/combined_cleaned_data.csv'
combined_cleaned_df = pd.read_csv(combined_cleaned_path)
combined_cleaned_df.head(n=5)

In [None]:
# Read final_demo_df_clean.csv into a DataFrame and show it as the cell output.
final_demo_path = '../../data/clean/final_demo_df_clean.csv'
merged_demo_experiment_df = pd.read_csv(final_demo_path)
merged_demo_experiment_df

In [None]:
# Left join on 'client_id': every row of combined_cleaned_df is kept, and the
# columns of merged_demo_experiment_df are attached wherever the id matches.
merged_client_df = combined_cleaned_df.merge(
    merged_demo_experiment_df,
    how='left',
    on='client_id',
)

In [None]:

# Shorten the verbose column names (old name -> new name).
column_renames = {
    'process_step': 'step',
    'client_tenure_years': 'tenure_years',
    'client_tenure_months': 'tenure_months',
    'client_age': 'age',
    'num_accounts': 'accounts',
}
merged_client_df.rename(columns=column_renames, inplace=True)

# Run the shared floats_to_ints helper on each of these columns, in this order
# (presumably it casts whole-number floats to ints in place - confirm in utils/functions).
for int_column in (
    'tenure_years',
    'tenure_months',
    'age',
    'accounts',
    'calls_last_6_months',
    'logons_last_6_months',
):
    functions.floats_to_ints(merged_client_df, int_column)

merged_client_df

In [None]:
# Pass the 'step' column through the shared steps_to_numerical helper
# (presumably it maps step labels to numbers - confirm in utils/functions).
# A later cell calls .diff() on 'step', which needs numeric values.
functions.steps_to_numerical(merged_client_df, 'step')

# Add Error Count Column

In [None]:
# Flag each row whose step is lower than the previous row's step for the same
# client. The first row of every client has no predecessor (diff is NaN) and is
# therefore flagged False.
# NOTE(review): diff() follows the current row order - presumably the frame is
# already in chronological order per client; confirm upstream.
step_change = merged_client_df.groupby('client_id')['step'].diff()
merged_client_df['stepped_back'] = step_change.lt(0)

# Boolean mask of the rows flagged as backward steps (treated as errors here).
is_error = merged_client_df['stepped_back']

# Distinct clients with at least one flagged row.
clients_with_errors = merged_client_df.loc[is_error, 'client_id'].nunique()

# Flagged rows per client; clients without any flagged row do not appear.
errors_per_client = merged_client_df.loc[is_error].groupby('client_id').size()

# Flagged rows over the whole frame.
total_errors = is_error.sum()

print(f'Number of clients with at least one error: {clients_with_errors}')
print('Errors per client:')
print(errors_per_client)
print(f'Total number of errors across all clients: {total_errors}')

In [None]:
# Add column 'error_count': look up each row's client in errors_per_client;
# clients with no entry there get 0, and the result is stored as integers.
errors_for_row = merged_client_df['client_id'].map(errors_per_client)
merged_client_df['error_count'] = errors_for_row.fillna(0).astype(int)

In [None]:
# Display the frame as the cell output to inspect the columns added so far.
merged_client_df

In [None]:
# Run the shared check_unique_and_empty helper on the frame (presumably it reports
# unique and missing values per column - confirm in utils/functions).
functions.check_unique_and_empty(merged_client_df)

In [None]:
# Read merged_final_demo_final_experiment_clients_df.csv and show it as the cell output.
clients_csv_path = '../../data/clean/merged_final_demo_final_experiment_clients_df.csv'
merged_final_demo_final_experiment_clients_df = pd.read_csv(clients_csv_path)
merged_final_demo_final_experiment_clients_df

In [None]:
# Outer join on 'client_id' so that rows from both frames are kept.
# Column names that exist on both sides keep their plain name for the left
# frame and receive a '_merged' suffix for the right frame.
merged_client_df = merged_client_df.merge(
    merged_final_demo_final_experiment_clients_df,
    how='outer',
    on='client_id',
    suffixes=('', '_merged'),
)

# Keep only the first occurrence of any repeated column label.
# NOTE(review): because of the suffixes above, repeated labels are not expected
# here, so this filter is likely a no-op - confirm before relying on it.
first_occurrence = ~merged_client_df.columns.duplicated()
merged_client_df = merged_client_df.loc[:, first_occurrence]


In [None]:
# List the distinct values found in the 'variation' column.
variation_values = merged_client_df['variation'].unique()
print(variation_values)

In [None]:
# Remove every row that has a missing value in any column.
# NOTE(review): this runs while the '*_merged' columns are still present, so a
# gap in one of those columns also removes the row - confirm this is intended.
merged_client_df = merged_client_df.dropna(axis=0, how='any')

# Drop the right-hand copies of columns that already exist under their plain name.
redundant_columns = [
    'gender_merged',
    'balance_merged',
    'calls_last_6_months_merged',
    'logons_last_6_months_merged',
]
merged_client_df = merged_client_df.drop(columns=redundant_columns)

# Re-run the shared check on the reduced frame.
functions.check_unique_and_empty(merged_client_df)

In [None]:
# merged_client_df.to_csv('final_clean_client_df.csv', index=False)

In [None]:
# Split the frame into one subset per value of the 'variation' column.
variation_labels = merged_client_df['variation']
control_variation_df = merged_client_df.loc[variation_labels.eq('Control')]
test_variation_df = merged_client_df.loc[variation_labels.eq('Test')]
unknown_variation_df = merged_client_df.loc[variation_labels.eq('Unknown')]

# Print each subset under its heading.
for heading, subset in (
    ('Control DataFrame:', control_variation_df),
    ('\nTest DataFrame:', test_variation_df),
    ('\nUnknown DataFrame:', unknown_variation_df),
):
    print(heading)
    print(subset)

In [None]:
# control_variation_df.to_csv('control_variation_df.csv', index=False)
# test_variation_df.to_csv('test_variation_df.csv', index=False)
# unknown_variation_df.to_csv('unknown_variation_df.csv', index=False)