# 1. Data Preparation

## 1.1 Reading dataset

In [1]:
import pandas as pd 

In [2]:
churn_data = pd.read_csv('../DataFiles/Bankchurners.csv')
churn_data.head()

Unnamed: 0,CLIENTNUM,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,...,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Attrition_Flag
0,768805383,45,M,3,High School,Married,$60K - $80K,Blue,39,5,...,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,0
1,818770008,49,F,5,Graduate,Single,Less than $40K,Blue,44,6,...,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,0
2,713982108,51,M,3,Graduate,Married,$80K - $120K,Blue,36,4,...,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,0
3,769911858,40,F,4,High School,,Less than $40K,Blue,34,3,...,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0
4,709106358,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,...,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0,0


## 1.2 Splitting into Train, Test and Production Sets

In [3]:
from sklearn.model_selection import train_test_split

churn, churn_prod = train_test_split(churn_data, train_size=0.8, random_state= 4)
churn_train, churn_test = train_test_split(churn, train_size=0.8, random_state=4)

## 1.3 Data Pre Processing

### Missing Data

In [4]:
missing_count = churn_train.isnull().sum()/len(churn_data.index) * 100
missing_count

CLIENTNUM                   0.000000
Customer_Age                0.000000
Gender                      0.000000
Dependent_count             0.000000
Education_Level             9.677101
Marital_Status              4.927422
Income_Category             7.060334
Card_Category               0.000000
Months_on_book              0.000000
Total_Relationship_Count    0.000000
Months_Inactive_12_mon      0.000000
Contacts_Count_12_mon       0.000000
Credit_Limit                0.000000
Total_Revolving_Bal         0.000000
Avg_Open_To_Buy             0.000000
Total_Amt_Chng_Q4_Q1        0.000000
Total_Trans_Amt             0.000000
Total_Trans_Ct              0.000000
Total_Ct_Chng_Q4_Q1         0.000000
Avg_Utilization_Ratio       0.000000
Attrition_Flag              0.000000
dtype: float64

'Education_Level', 'Marital_Status' and 'Income_Category' were found to have missing values ! Imputation using kNN

### kNN Imputation

In [5]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

In [6]:
cat_cols = ['Education_Level','Marital_Status','Income_Category']

In [7]:
# Converting Categorical Variables to Ordinal for KNN Impution to function
encoder = OrdinalEncoder()
encoder.fit(churn_train[cat_cols])

OrdinalEncoder()

In [8]:
churn_train[cat_cols] = encoder.transform(churn_train[cat_cols])
churn_test[cat_cols] = encoder.transform(churn_test[cat_cols])
churn_prod[cat_cols] = encoder.transform(churn_prod[cat_cols])

In [9]:
# Imputation using KNNInputer
imputer = KNNImputer(n_neighbors=4, weights= 'distance')
imputer.fit(churn_train[cat_cols])

KNNImputer(n_neighbors=4, weights='distance')

In [10]:
churn_train[cat_cols] = imputer.transform(churn_train[cat_cols])
churn_test[cat_cols] = imputer.transform(churn_test[cat_cols])
churn_prod[cat_cols] = imputer.transform(churn_prod[cat_cols])

In [11]:
# Converting the imputed Ordinal back to Categorical Variables
churn_train[cat_cols] = encoder.inverse_transform(churn_train[cat_cols])
churn_test[cat_cols] = encoder.inverse_transform(churn_test[cat_cols])
churn_prod[cat_cols] = encoder.inverse_transform(churn_prod[cat_cols])

### Data Imbalance

In [12]:
churn_train['Attrition_Flag'].value_counts()

0    5437
1    1043
Name: Attrition_Flag, dtype: int64

SMOTE Sampling to remove class imbalance !

### SMOTE Sampling

In [13]:
from imblearn.over_sampling import SMOTENC

In [14]:
X = churn_train.drop(['CLIENTNUM','Attrition_Flag'],axis=1)
y = churn_train['Attrition_Flag']

In [15]:
oversample = SMOTENC(categorical_features = [1,3,4,5,6])
X_1,y_1 = oversample.fit_resample(X,y)

In [16]:
y_1.value_counts()

0    5437
1    5437
Name: Attrition_Flag, dtype: int64

## 1.4 Storing datasets in pickle files

In [17]:
churn_train = X_1.join(y_1)
churn_test = churn_test.drop('CLIENTNUM',axis = 1)
churn_prod = churn_prod.drop('CLIENTNUM',axis = 1)

In [18]:
churn_train.to_pickle('../DataFiles/churn_train.pkl')
churn_test.to_pickle('../DataFiles/churn_test.pkl')
churn_prod.to_pickle('../DataFiles/churn_prod.pkl')

## 1.5 Data Profiling 

In [19]:
from pandas_profiling import ProfileReport

profile = ProfileReport(churn_train, title = 'Credit Card Data Profiling Report')
profile.to_file("../Reports/credit_card_profile.html")

Summarize dataset:   0%|          | 0/34 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

  cmap.set_bad(cmap_bad)


Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]