In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

# Functions

In [2]:
# inspect df
def inspect_dataframe(df):
    """
    Print a quick structural overview of a DataFrame.

    Four sections are written to stdout, in this order: the shape as a
    (rows, columns) tuple, the column index, the dtype of each column,
    and the number of missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        The report is printed, nothing is returned.
    """
    # Each entry pairs a heading with a zero-argument callable, so a value is
    # only computed right before it is printed, in the order listed here.
    sections = (
        ('Check the shape (rows, columns):', lambda: df.shape),
        ('\nColumn names:', lambda: df.columns),
        ('\nData types:', lambda: df.dtypes),
        ('\nMissing values:', lambda: df.isnull().sum()),
    )

    for heading, compute in sections:
        print(heading)
        print(compute())

# clean column names
def clean_column_names(df):
    """
    Normalise the column labels of a DataFrame in place.

    Each label is converted to text, lowercased, and every character outside
    ``a-z``, ``0-9`` and ``_`` (spaces included) is replaced by an underscore.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so the call can be used either
        for its side effect or in an assignment.

    Notes
    -----
    Distinct labels can collapse to the same cleaned name (for example
    ``"A B"`` and ``"a-b"`` both become ``"a_b"``); no de-duplication is done.
    """

    def clean_name(name):
        # str() lets non-string labels (e.g. the default integer labels of a
        # header-less frame) pass through instead of failing on .lower().
        name = str(name).lower()
        # A space is outside the allowed character set, so this single
        # substitution also covers the "spaces to underscores" rule.
        return re.sub(r'[^a-z0-9_]', '_', name)

    df.columns = [clean_name(col) for col in df.columns]
    return df

# check unique and empty values
def check_unique_and_empty(df):
    """
    Print a per-column summary of distinct and missing values.

    For every column of ``df`` the printed table (indexed by column label)
    shows:

    - ``Unique Values``: ``Series.nunique()`` of the column, which by default
      does not count missing entries as a value.
    - ``Empty Values``: how many entries are missing (``isna()``).
    - ``Empty Row Indices``: the index labels of the rows where the column is
      missing.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        The summary is printed, nothing is returned.
    """
    summary_columns = ['Column', 'Unique Values', 'Empty Values', 'Empty Row Indices']
    result = []

    # items() hands over each column as a Series by position, so columns that
    # share a label are each summarised on their own row.
    for column, values in df.items():
        is_empty = values.isna()
        result.append({
            'Column': column,
            'Unique Values': values.nunique(),
            'Empty Values': is_empty.sum(),
            'Empty Row Indices': values.index[is_empty.to_numpy()].tolist(),
        })

    # Naming the columns explicitly keeps set_index('Column') valid even when
    # df has no columns and `result` is therefore empty.
    result_df = pd.DataFrame(result, columns=summary_columns).set_index('Column')

    print(result_df)

# convert floats to ints
def floats_to_ints(df, column_name):
    """
    Convert the values of one column to integers, keeping missing entries
    missing.

    Every non-missing value is passed through ``int()``, so a fractional part
    is truncated toward zero. If the column contains missing values the result
    is stored with pandas' nullable ``Int64`` dtype and the missing entries
    become ``pd.NA``; otherwise the dtype is whatever pandas infers from the
    converted values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; the column is replaced in place.
    column_name : label
        Label of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    column = df[column_name]
    as_ints = column.apply(lambda x: int(x) if pd.notna(x) else pd.NA)

    if column.isna().any():
        # A plain mix of ints and NaN is inferred back to float64 by pandas,
        # which would silently undo the conversion; the nullable integer dtype
        # can hold integers and missing values side by side.
        as_ints = as_ints.astype('Int64')

    df[column_name] = as_ints
    return df

# Read and clean data

## Final Demo DF

In [10]:
# Load the raw client demographics file and preview its first five rows.
final_demo_df = pd.read_csv(filepath_or_buffer="../../data/raw/df_final_demo.txt")
final_demo_df.head(n=5)

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0
1,2304905,7.0,94.0,58.0,U,2.0,110860.3,6.0,9.0
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0


In [4]:
# Normalise the column labels; clean_column_names rewrites them on the frame
# it receives and returns that same frame.
final_demo_df = clean_column_names(final_demo_df)

# Print shape, column names, dtypes and per-column missing-value counts.
inspect_dataframe(df=final_demo_df)

Check the shape (rows, columns):
(70609, 9)

Column names:
Index(['client_id', 'clnt_tenure_yr', 'clnt_tenure_mnth', 'clnt_age', 'gendr',
       'num_accts', 'bal', 'calls_6_mnth', 'logons_6_mnth'],
      dtype='object')

Data types:
client_id             int64
clnt_tenure_yr      float64
clnt_tenure_mnth    float64
clnt_age            float64
gendr                object
num_accts           float64
bal                 float64
calls_6_mnth        float64
logons_6_mnth       float64
dtype: object

Missing values:
client_id            0
clnt_tenure_yr      14
clnt_tenure_mnth    14
clnt_age            15
gendr               14
num_accts           14
bal                 14
calls_6_mnth        14
logons_6_mnth       14
dtype: int64


In [5]:
# Map the abbreviated source labels to spelled-out, standardised names.
new_column_names_final_demo_df = {
    "client_id": "client_id",
    "clnt_tenure_yr": "client_tenure_years",
    "clnt_tenure_mnth": "client_tenure_months",
    "clnt_age": "client_age",
    "gendr": "gender",
    "num_accts": "num_accounts",
    "bal": "balance",
    "calls_6_mnth": "calls_last_6_months",
    "logons_6_mnth": "logons_last_6_months",
}

# Apply the mapping to the column axis, modifying the frame in place.
final_demo_df.rename(
    mapper=new_column_names_final_demo_df,
    axis="columns",
    inplace=True,
)

In [6]:
# Per column: number of distinct values, number of missing values, and the
# row indices where the value is missing.
check_unique_and_empty(df=final_demo_df)

                      Unique Values  Empty Values  \
Column                                              
client_id                     70609             0   
client_tenure_years              54            14   
client_tenure_months            482            14   
client_age                      165            15   
gender                            4            14   
num_accounts                      8            14   
balance                       70328            14   
calls_last_6_months               8            14   
logons_last_6_months              9            14   

                                                      Empty Row Indices  
Column                                                                   
client_id                                                            []  
client_tenure_years   [4164, 8316, 8677, 13444, 18066, 25961, 28432,...  
client_tenure_months  [4164, 8316, 8677, 13444, 18066, 25961, 28432,...  
client_age            [4164, 8316, 8677, 9583,

## Final Experiment Clients DF

In [7]:
# Load the raw experiment-assignment file and preview its first five rows.
final_experiment_clients_df = pd.read_csv(
    filepath_or_buffer="../../data/raw/df_final_experiment_clients.txt"
)
final_experiment_clients_df.head(n=5)

Unnamed: 0,client_id,Variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control


In [8]:
# Normalise the column labels; clean_column_names rewrites them on the frame
# it receives and returns that same frame.
final_experiment_clients_df = clean_column_names(final_experiment_clients_df)

# Print shape, column names, dtypes and per-column missing-value counts.
inspect_dataframe(df=final_experiment_clients_df)

Check the shape (rows, columns):
(70609, 2)

Column names:
Index(['client_id', 'variation'], dtype='object')

Data types:
client_id     int64
variation    object
dtype: object

Missing values:
client_id        0
variation    20109
dtype: int64


In [9]:
# Per column: number of distinct values, number of missing values, and the
# row indices where the value is missing.
check_unique_and_empty(df=final_experiment_clients_df)

           Unique Values  Empty Values  \
Column                                   
client_id          70609             0   
variation              2         20109   

                                           Empty Row Indices  
Column                                                        
client_id                                                 []  
variation  [50500, 50501, 50502, 50503, 50504, 50505, 505...  
