In [None]:
import os
import warnings
from pathlib import Path, PurePath

# Ignore UserWarning for QDA
warnings.simplefilter("ignore", UserWarning)

# Name of the github repository folder the notebook must run from
REPO_NAME = 'AA_Project_G6'


def find_repo_root(start, repo_name=REPO_NAME):
    """Return the closest ancestor of ``start`` whose folder name is ``repo_name``.

    ``start`` itself is considered first, then each parent going upwards.
    Path components are taken from ``pathlib`` rather than by splitting the
    string on a hard-coded separator, so the search works with both Windows
    and POSIX style paths.

    Parameters
    ----------
    start : str or pathlib.PurePath
        Directory from which to start walking upwards.
    repo_name : str
        Folder name to look for.

    Returns
    -------
    pathlib.PurePath or None
        The matching directory, or ``None`` when no component of ``start``
        is named ``repo_name``.
    """
    start = start if isinstance(start, PurePath) else PurePath(start)
    for candidate in (start, *start.parents):
        if candidate.name == repo_name:
            return candidate
    return None


# Set the github repository directory.
# When the notebook is not inside the repository the working directory is left untouched.
repo_root = find_repo_root(Path.cwd())
if repo_root is not None:
    os.chdir(repo_root)

print('Current directory:', os.getcwd())

# **Data Import**

In [None]:
import pandas as pd
import numpy as np

# Import data from .csv file
df = pd.read_csv(filepath_or_buffer='./data/raw/credit_card_churn.csv', delimiter=',')

df.info()

print(df.isna().sum())

# Drop rows with the same client ID.
# drop_duplicates returns a new dataframe, so the result has to be assigned
# back to df; otherwise the duplicated rows stay in df.
df = df.drop_duplicates(subset='CLIENTNUM')

# **Data Pre-processing**

In [None]:
# Drop CLIENTNUM and Naive Bayes related columns.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: on a second run the columns are already gone and an
# in-place drop would raise a KeyError.
cols_to_drop = [
    'CLIENTNUM',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
df = df.drop(columns=cols_to_drop, errors='ignore')

# Classify the types of features in the dataframe.
# The target column 'Attrition_Flag' is left out of both lists.
cat_feats = []
num_feats = []
for feat in df.columns:
    if feat == 'Attrition_Flag':
        continue
    if df[feat].dtype == 'O':
        cat_feats.append(feat)
    else:
        num_feats.append(feat)

print('There are {} categorical features: {}'.format(len(cat_feats), cat_feats))
print('There are {} numerical features: {}'.format(len(num_feats), num_feats))

# Columns to one-hot encode (target + categorical features); listed once so the
# encoding step and the drop step below cannot get out of sync.
encoded_cols = ['Attrition_Flag', 'Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']

# Encode the categorical columns (drop_first removes the first level of each column)
dummies = pd.get_dummies(df[encoded_cols], drop_first=True)

# Merge encoded columns with original dataframe (index-aligned join)
df2 = df.join(dummies)

# Drop categorical columns, now replaced by their dummy columns
df2 = df2.drop(columns=encoded_cols)

# **Numerical Features Visualization**

In [None]:
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import seaborn as sns

# Boxplots, histograms for numerical features.
# Every feature uses two adjacent panels (boxplot + histogram), so a 4-wide grid
# holds two features per row. The number of rows is derived from the number of
# features instead of being fixed at 7, which failed for more than 14 features.
n_grid_cols = 4
n_grid_rows = max(1, (2 * len(num_feats) + n_grid_cols - 1) // n_grid_cols)

# 40 / 7 inches per row reproduces the original 20x40 canvas when there are 7 rows
plt.figure(figsize=(20, 40 * n_grid_rows / 7))

for i, feat in enumerate(num_feats):
    # Boxplot (left panel of the pair)
    plt.subplot(n_grid_rows, n_grid_cols, 2 * i + 1)
    sns.boxplot(y=df[feat])
    plt.title('Boxplot')

    # Histogram (right panel of the pair)
    plt.subplot(n_grid_rows, n_grid_cols, 2 * i + 2)
    plt.hist(x=df[feat])
    plt.xlabel(feat)
    plt.title('Histogram')

plt.show()

# **Categorical Features Visualization**

In [None]:
# Bar charts for categorical features

for feat in cat_feats:
    # Create a cross-tabulation of the data.
    # normalize='index' turns each row into proportions, so every bar sums to 1.
    ctab = pd.crosstab(index=df[feat], columns=df['Attrition_Flag'], normalize='index')
    # stacked expects a boolean; the string 'True' only worked because any
    # non-empty string is truthy.
    ctab.plot(kind='bar', stacked=True, figsize=(5,5))

plt.show()

# **Correlations Heatmaps**

In [None]:
# Cells whose p-value is above this threshold are hidden in the heatmaps
P_VALUE_THRESHOLD = 0.1/100


def pearson_pvalue(x, y):
    """Return only the p-value of the Pearson correlation test between x and y."""
    return pearsonr(x, y)[1]


# Numerical features correlations heatmap
# (numerical features plus the encoded target column)
df_num = df2[num_feats].join(df2['Attrition_Flag_Existing Customer'])
df_num_corrs = df_num.corr(method='pearson', numeric_only=True)
# NOTE: pandas puts 1 on the diagonal of a callable-based corr, so the
# diagonal cells always exceed the threshold and end up masked.
pval = df_num.corr(method=pearson_pvalue, numeric_only=True)
mask = pval > P_VALUE_THRESHOLD
fig_num, ax_num = plt.subplots(figsize=(10,10))
heatmap = sns.heatmap(df_num_corrs, mask=mask, square=True, cmap='coolwarm', annot=True, ax=ax_num)

# Categorical features correlations heatmap
# (computed on the one-hot encoded columns)
df_cat = dummies
df_cat_corrs = df_cat.corr(method='pearson', numeric_only=False)
pval = df_cat.corr(method=pearson_pvalue, numeric_only=True)
mask = pval > P_VALUE_THRESHOLD
fig_cat, ax_cat = plt.subplots(figsize=(10,10))
heatmap = sns.heatmap(df_cat_corrs, mask=mask, square=True, cmap='coolwarm', annot=True, ax=ax_cat)