In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Functions

In [None]:
# inspect df
def inspect_dataframe(df):
    """
    Print a quick structural overview of a DataFrame.

    Four sections are written to stdout, in this order: the shape, the
    column index, the per-column dtypes and the per-column count of
    missing values. Nothing is returned.
    """
    # (heading, getter) pairs; the getters run lazily so that every heading
    # is printed before its value is computed.
    report_sections = (
        ('Check the shape (rows, columns):', lambda frame: frame.shape),
        ('\nColumn names:', lambda frame: frame.columns),
        ('\nData types:', lambda frame: frame.dtypes),
        ('\nMissing values:', lambda frame: frame.isnull().sum()),
    )

    for heading, getter in report_sections:
        print(heading)
        print(getter(df))

# clean column names
def clean_column_names(df):
    """
    Normalize the column labels of a DataFrame in place.

    Each label is converted to ``str``, lowercased, and then every character
    that is not ``a-z``, ``0-9`` or ``_`` (spaces included) is replaced with
    an underscore.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (it is mutated, not copied).
    """

    def clean_name(name):
        # str() keeps non-string labels (e.g. integer headers) from raising
        # AttributeError on .lower(). Spaces need no separate step: the
        # pattern below already maps them to '_'.
        return re.sub(r'[^a-z0-9_]', '_', str(name).lower())

    df.columns = [clean_name(col) for col in df.columns]
    return df

# check unique and empty values
def check_unique_and_empty(df):
    """
    Print a per-column summary of distinct and missing values.

    For every column of ``df`` the printed table shows the number of
    distinct non-null values and the number of missing values. The table is
    followed by a 50-character dashed separator. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize. A frame without columns prints an empty table
        instead of raising.
    """
    # Column-wise aggregations replace the explicit Python loop; both results
    # are indexed by the column labels, so they line up row by row.
    summary = pd.DataFrame({
        'Unique value count': df.nunique(),   # NaN is excluded by default
        'Empty value count': df.isna().sum(),
    }).rename_axis('Column')

    print('Summary of Unique and Empty Values:\n')
    print(summary)
    print('\n' + '-'*50 + '\n')

# convert floats to ints
def floats_to_ints(df, column_name):
    """
    Floor a numeric column and store it as pandas' nullable ``Int64`` dtype.

    Values are rounded toward negative infinity (``np.floor``), so ``2.9``
    becomes ``2`` and ``-0.5`` becomes ``-1``. Missing values are kept and
    appear as ``<NA>`` in the converted column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column_name : str
        Label of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # One vectorized pass instead of two element-wise Python ``apply`` calls.
    # to_numeric leaves numeric columns untouched and also covers
    # object-dtype columns that hold numbers.
    numeric_values = pd.to_numeric(df[column_name])
    # 'Int64' (capital I) is the nullable integer dtype, so NaN becomes <NA>
    # instead of forcing the column back to float.
    df[column_name] = np.floor(numeric_values).astype('Int64')
    return df

## Read and clean data

# Final Demo DF

In [None]:
# Read df_final_demo.txt with pandas' default CSV settings (the path is
# relative to the notebook's working directory) and preview the first rows.
final_demo_df = pd.read_csv('../../data/raw/df_final_demo.txt')
final_demo_df.head()

In [None]:
# clean column names (clean_column_names overwrites df.columns in place,
# so its return value does not need to be re-assigned)
clean_column_names(final_demo_df)

# inspect dataframe: prints shape, column names, dtypes and missing-value counts
inspect_dataframe(final_demo_df)

In [None]:
# change column names for standardization
# 'client_id' maps to itself; it is listed only so that the dict documents
# every column of the frame.
new_column_names_final_demo_df = {
    'client_id': 'client_id',
    'clnt_tenure_yr': 'client_tenure_years',
    'clnt_tenure_mnth': 'client_tenure_months',
    'clnt_age': 'client_age',
    'gendr': 'gender',
    'num_accts': 'num_accounts',
    'bal': 'balance',
    'calls_6_mnth': 'calls_last_6_months',
    'logons_6_mnth': 'logons_last_6_months'
}

# rename() returns a new frame; re-assigning it avoids inplace=True, which
# returns None and cannot be chained. Keys missing from the frame are
# ignored, so re-running this cell is a no-op.
final_demo_df = final_demo_df.rename(columns=new_column_names_final_demo_df)

In [None]:
# check unique values: prints, per column, the count of distinct non-null
# values and the count of missing values
check_unique_and_empty(final_demo_df)

In [None]:
# preview the first rows of the frame in its current state
final_demo_df.head()

In [None]:
# handle unique values in 'gender' column
# (the raw codes are printed first so the mapping below can be checked against them)
print(final_demo_df['gender'].unique())

# Raw code -> readable label; 'U', 'X' and missing values all collapse into
# 'Unspecified'.
replacement_dict_gender = {
    'U': 'Unspecified',
    'M': 'Male',
    'F': 'Female',
    'X': 'Unspecified',
    np.nan: 'Unspecified'
}

# Apply the mapping to replace the gender values; values that are not keys of
# the dict are left unchanged
final_demo_df['gender'] = final_demo_df['gender'].replace(replacement_dict_gender)

In [None]:
# convert floats to ints in several columns
# floats_to_ints floors the values and mutates final_demo_df in place, so a
# single loop replaces six copy-pasted calls (same columns, same order).
for int_column in (
    'client_tenure_years',
    'client_tenure_months',
    'num_accounts',
    'calls_last_6_months',
    'logons_last_6_months',
    'client_age',
):
    floats_to_ints(final_demo_df, int_column)

# re-check distinct / missing counts after the conversion
check_unique_and_empty(final_demo_df)

In [None]:
# drop empty values: keep only the rows that have no missing value in any column
final_demo_df = final_demo_df.dropna()
# check_unique_and_empty(final_demo_df)

# author's note from an earlier run (not re-verified here) --
# initial rows: 70609, rows after dropping empty values: 70594

final_demo_df.head()

In [None]:
# generate clean csv file (export is currently disabled; uncomment to write it)
# final_demo_df.to_csv('final_demo_df_clean.csv', index=False)
# structural check of the cleaned frame: shape, columns, dtypes, missing values
inspect_dataframe(final_demo_df)

# Final Experiment Clients DF

In [None]:
# Read df_final_experiment_clients.txt with pandas' default CSV settings (the
# path is relative to the notebook's working directory) and preview the first rows.
final_experiment_clients_df = pd.read_csv('../../data/raw/df_final_experiment_clients.txt')
final_experiment_clients_df.head()

In [None]:
# clean column names (clean_column_names overwrites df.columns in place)
clean_column_names(final_experiment_clients_df)

# inspect df: shape, column names, dtypes and missing-value counts.
# The unique/empty summary is printed by the following cell, so it is not
# repeated here (it used to be printed twice in a row).
inspect_dataframe(final_experiment_clients_df)

In [None]:
# check unique values: prints, per column, the count of distinct non-null
# values and the count of missing values
check_unique_and_empty(final_experiment_clients_df)

In [None]:
# replace empty values (NaN or empty string) with 'Unknown'
# The replacement runs cell by cell over every column, so a missing value in
# any column becomes the string 'Unknown'.
# NOTE(review): presumably only 'variation' is expected to contain gaps; confirm
# that no other column is meant to stay numeric.

final_experiment_clients_df = final_experiment_clients_df.apply(lambda col: col.apply(lambda x: 'Unknown' if pd.isna(x) or x == '' else x))

# the 'Empty value count' column of the summary should now be zero everywhere
check_unique_and_empty(final_experiment_clients_df)

In [None]:
# generate clean csv file; the relative file name means it is written to the
# current working directory, and index=False leaves the row index out
final_experiment_clients_df.to_csv('final_experiment_clients_df.csv', index=False)

### Variation Count

In [None]:
# relative frequency of each 'variation' value, scaled from a fraction to a percentage
variation_percentages = final_experiment_clients_df['variation'].value_counts(normalize=True) * 100

print(variation_percentages)

In [None]:
# Bar chart of the number of rows per 'variation' value, with each bar
# labelled by its share of the total.
variation_counts = final_experiment_clients_df['variation'].value_counts()

variation_percentages = variation_counts / variation_counts.sum() * 100

colors = ['coral', 'lightgreen', 'lightblue']

# Explicit figure/axes instead of the implicit pyplot "current axes" state,
# so every call below targets this chart and nothing else.
fig, ax = plt.subplots(figsize=(8, 6))
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; confirm against the installed version before changing this call.
sns.barplot(x=variation_counts.index, y=variation_counts.values, palette=colors, ax=ax)

ax.set_title('Test vs Control Variation', fontsize=16)
ax.set_xlabel('Variation', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

# annotate each bar with the percentage
# zip() pairs bars with percentages and stops at the shorter sequence, so an
# unexpected extra patch on the axes cannot raise IndexError.
for bar, percentage in zip(ax.patches, variation_percentages):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.1, f'{percentage:.1f}%',
            ha='center', va='bottom', fontsize=12, color='black')

plt.show()

## Questions:

1. Who are the primary clients using the online process?
2. Are the primary clients younger or older?
3. Are the primary clients newer or long-standing?

In [None]:
# clients using the online process: more than PRIMARY_CLIENT_LOGON_THRESHOLD
# logons in the last 6 months
PRIMARY_CLIENT_LOGON_THRESHOLD = 5
primary_clients = final_demo_df[final_demo_df['logons_last_6_months'] > PRIMARY_CLIENT_LOGON_THRESHOLD]

# Unrounded means drive the comparisons below. Comparing the rounded values
# made two different averages look equal and then reported the tie as
# 'Older' / 'Long-standing'.
mean_age_all_clients = final_demo_df['client_age'].mean()
mean_age_primary_clients = primary_clients['client_age'].mean()
mean_tenure_all_clients = final_demo_df['client_tenure_years'].mean()
mean_tenure_primary_clients = primary_clients['client_tenure_years'].mean()

# Whole-number averages, kept under their original names
avg_age_all_clients = round(mean_age_all_clients)
avg_age_primary_clients = round(mean_age_primary_clients)
avg_tenure_all_clients = round(mean_tenure_all_clients)
avg_tenure_primary_clients = round(mean_tenure_primary_clients)

# Verdicts, with an explicit branch for a genuine tie
if mean_age_primary_clients < mean_age_all_clients:
    age_verdict = 'Younger'
elif mean_age_primary_clients > mean_age_all_clients:
    age_verdict = 'Older'
else:
    age_verdict = 'About the same'

if mean_tenure_primary_clients < mean_tenure_all_clients:
    tenure_verdict = 'Newer'
elif mean_tenure_primary_clients > mean_tenure_all_clients:
    tenure_verdict = 'Long-standing'
else:
    tenure_verdict = 'About the same'

# Output the results
# Values are computed outside the f-strings: reusing the same quote character
# inside a replacement field is a SyntaxError before Python 3.12.
primary_clients_preview = primary_clients[['client_id', 'client_age', 'client_tenure_years', 'logons_last_6_months']]
print(f'Primary clients:\n{primary_clients_preview}\n')
# One decimal is shown so the printed averages agree with the verdicts.
print(f'Average age of all clients: {mean_age_all_clients:.1f}')
print(f'Average age of primary clients: {mean_age_primary_clients:.1f}')
print(f'Are primary clients younger or older? {age_verdict}\n')
print(f'Average tenure of all clients: {mean_tenure_all_clients:.1f}')
print(f'Average tenure of primary clients: {mean_tenure_primary_clients:.1f}')
print(f'Are primary clients newer or long-standing? {tenure_verdict}')