## 1. Importing libraries and loading dataframe

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# Load the dataframe produced by the previous step and reset its index.
# The path uses forward slashes: the former '..\Dataset\...' literal contained
# invalid escape sequences ('\D', '\d') and only resolved as a path on Windows,
# whereas forward slashes work on Windows, Linux and macOS alike.
df_preprocessing = pd.read_csv('../Dataset/df_preprocessing.csv', index_col=0)
df_preprocessing = df_preprocessing.reset_index(drop=True)

## 2. Encoding features

In [3]:
# Count how many rows carry each Attrition_Flag label.
df_preprocessing['Attrition_Flag'].value_counts()

Existing Customer    7771
Attrited Customer    1536
Name: Attrition_Flag, dtype: int64

In [4]:
# Encode the target column as binary: Existing Customer -> 0, Attrited Customer -> 1.
attrition_codes = {'Existing Customer': 0, 'Attrited Customer': 1}
df_preprocessing['Attrition_Flag'] = df_preprocessing['Attrition_Flag'].map(attrition_codes)

In [5]:
# Separate the encoded target (y) from the remaining feature columns (X).
y = df_preprocessing['Attrition_Flag']
X = df_preprocessing.drop(columns='Attrition_Flag')

In [6]:
# Continue the encoding on a copy so that X keeps the original, un-encoded columns.
df_preprocessing = X.copy()

In [7]:
# Encode Gender as integers: M -> 1, F -> 2.
# .map is used (as for Attrition_Flag) instead of .replace, because .replace on an
# object column relies on silent downcasting that recent pandas versions deprecate.
# Note that .map turns any label missing from the dict into NaN.
dict_gender = {'M':1, 'F':2}
df_preprocessing['Gender'] = df_preprocessing['Gender'].map(dict_gender)

In [8]:
# List the columns that are still of object dtype.
df_preprocessing.select_dtypes('object').columns

Index(['Education', 'Marital_Status', 'Income', 'Card_Category'], dtype='object')

In [9]:
# Show the Education categories and how often each occurs.
df_preprocessing['Education'].value_counts()

Graduate         2887
High School      1836
Unknown          1391
Uneducated       1359
College           945
Post-Graduate     474
Doctorate         415
Name: Education, dtype: int64

In [10]:
# Encode Education as integer codes.
# .map is used (as for Attrition_Flag) instead of .replace, because .replace on an
# object column relies on silent downcasting that recent pandas versions deprecate.
# Note that .map turns any label missing from the dict into NaN.
# NOTE(review): the codes follow the frequency order shown by value_counts(), not
# the level of education — confirm that no ordinal meaning is expected downstream.
dict_education = {'Graduate':1, 'High School':2, 'Unknown':3, 'Uneducated':4, 'College':5, 'Post-Graduate':6, 'Doctorate':7}
df_preprocessing['Education'] = df_preprocessing['Education'].map(dict_education)

In [11]:
# Show the Marital_Status categories and how often each occurs.
df_preprocessing['Marital_Status'].value_counts()

Married     4247
Single      3684
Unknown      692
Divorced     684
Name: Marital_Status, dtype: int64

In [12]:
# Encode Marital_Status as integer codes.
# .map is used (as for Attrition_Flag) instead of .replace, because .replace on an
# object column relies on silent downcasting that recent pandas versions deprecate.
# Note that .map turns any label missing from the dict into NaN.
dict_maritalstatus = {'Married':1, 'Single':2, 'Unknown':3, 'Divorced':4}
df_preprocessing['Marital_Status'] = df_preprocessing['Marital_Status'].map(dict_maritalstatus)

In [13]:
# Show the Card_Category categories and how often each occurs.
df_preprocessing['Card_Category'].value_counts()

Blue        8726
Silver       470
Gold          94
Platinum      17
Name: Card_Category, dtype: int64

In [14]:
# Encode Card_Category as integers: Blue -> 1, Silver -> 2, Gold -> 3, Platinum -> 4.
# .map is used (as for Attrition_Flag) instead of .replace, because .replace on an
# object column relies on silent downcasting that recent pandas versions deprecate.
# Note that .map turns any label missing from the dict into NaN.
dict_cardcategory = {'Blue':1, 'Silver':2, 'Gold':3, 'Platinum':4}
df_preprocessing['Card_Category'] = df_preprocessing['Card_Category'].map(dict_cardcategory)

In [15]:
# Show the Income categories and how often each occurs.
df_preprocessing['Income'].value_counts()

Less than $40K    3323
$40K - $60K       1647
$80K - $120K      1391
$60K - $80K       1251
Unknown           1027
$120K +            668
Name: Income, dtype: int64

In [16]:
# Encode Income as integer codes.
# .map is used (as for Attrition_Flag) instead of .replace, because .replace on an
# object column relies on silent downcasting that recent pandas versions deprecate.
# Note that .map turns any label missing from the dict into NaN.
# NOTE(review): the codes follow the frequency order shown by value_counts(), not
# the natural order of the income bands ('$80K - $120K' is 3 while '$60K - $80K'
# is 4) — confirm that no ordinal meaning is expected downstream.
dict_income = {'Less than $40K':1, '$40K - $60K':2, '$80K - $120K':3, '$60K - $80K':4, 'Unknown':5, '$120K +':6}
df_preprocessing['Income'] = df_preprocessing['Income'].map(dict_income)

In [17]:
# Preview the first rows of the encoded feature frame.
df_preprocessing.head()

Unnamed: 0,Age,Gender,Dependent_Count,Education,Marital_Status,Income,Card_Category,Months_On_Book,Total_Relationship_Count,Months_Inactive,Contacts_Count,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,44.0,1,2,1,1,2,1,36,3,1.0,2.0,4010.0,1247,2763.0,1.376,1088.0,24.0,0.846,0.311
1,42.0,1,5,4,3,6,1,31,5,3.0,2.0,6748.0,1467,5281.0,0.831,1201.0,42.0,0.68,0.217
2,57.0,2,2,1,1,1,1,48,5,2.0,2.0,2436.0,680,1756.0,1.19,1570.0,29.0,0.611,0.279
3,45.0,2,2,1,1,5,1,37,6,1.0,2.0,14470.0,1157,13313.0,0.966,1207.0,21.0,0.909,0.08
4,47.0,1,1,7,4,4,1,42,5,2.0,0.0,20979.0,1800,19179.0,0.906,1178.0,27.0,0.929,0.086


## 3. Train/test split, scaling, and export

In [18]:
# Hold out 25% of the rows as the test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_preprocessing,
    y,
    test_size=0.25,
    random_state=123,
)

In [19]:
# Give every split a fresh 0..n-1 index, discarding the shuffled original row labels.
X_train, X_test, y_train, y_test = (
    split.reset_index(drop=True)
    for split in (X_train, X_test, y_train, y_test)
)

In [20]:
# Create the StandardScaler that will be applied to the numerical columns.
scaler = StandardScaler()

In [21]:
# Scale only the columns that are numeric in the original, un-encoded frame X;
# the integer-encoded categorical columns are therefore left unscaled.
numeric_cols = X.select_dtypes('number').columns

# Fit the scaler on the training split only, then reuse those statistics for the
# test split. Calling fit_transform on X_test would re-fit the scaler and scale
# the test data with its own mean/std, which leaks test information and makes
# the two splits inconsistent with each other.
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [22]:
# Preview the first rows of the scaled training features.
X_train.head()

Unnamed: 0,Age,Gender,Dependent_Count,Education,Marital_Status,Income,Card_Category,Months_On_Book,Total_Relationship_Count,Months_Inactive,Contacts_Count,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,-0.791543,1,0.498352,1,4,1,2,0.002708,-1.841984,0.745299,-0.41516,0.324106,0.617387,0.267968,1.339053,0.244997,-0.264551,0.056721,-0.480486
1,-0.03735,2,1.269124,4,1,1,2,0.002708,-0.550727,-1.400375,1.442415,0.274543,0.291116,0.247961,-0.444369,-0.043189,0.611758,0.2765,-0.545072
2,0.716844,2,-1.813965,1,2,1,1,0.002708,0.74053,-1.400375,-0.41516,-0.657427,-0.068149,-0.650666,0.711856,0.353397,0.934609,1.070698,0.5493
3,-1.420038,1,-0.27242,4,2,4,1,-0.753447,1.386159,0.745299,0.513627,1.462074,0.127369,1.449226,0.27009,-0.665639,0.473394,-1.07214,-0.792651
4,0.716844,2,0.498352,3,2,1,1,0.632838,-1.196355,0.745299,-0.41516,-0.778979,-0.446965,-0.737843,-0.777057,0.350375,1.165217,0.251525,0.961932


In [23]:
# Persist the four splits for the following modelling step.
# Forward slashes are used because the former '..\Dataset\...' literals contained
# invalid escape sequences ('\D', '\X', '\y') and, outside Windows, would be
# treated as a single file name rather than a path into ../Dataset.
X_train.to_csv('../Dataset/X_train1.csv')
X_test.to_csv('../Dataset/X_test1.csv')
y_train.to_csv('../Dataset/y_train1.csv')
y_test.to_csv('../Dataset/y_test1.csv')