# Capstone project two (Preprocessing)

## 1. Importing libraries and loading dataframe

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()
import os
from scipy import stats
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the dataframe saved by the previous step. The stored index column is
# read back and then discarded in favour of a fresh 0..n-1 index.
df_preprocessing = (
    pd.read_csv('df_preprocessing.csv', index_col=0)
    .reset_index(drop=True)
)

## 2. Preprocessing

In [3]:
# Class balance of the target before encoding (most frequent label first)
df_preprocessing['Attrition_Flag'].value_counts(sort=True, ascending=False)

Existing Customer    7771
Attrited Customer    1536
Name: Attrition_Flag, dtype: int64

In [4]:
# Encode the target as binary: Existing Customer -> 0, Attrited Customer -> 1.
# Series.map turns every label that is missing from the mapping into NaN, so an
# unexpected label in the data -- or re-running this cell once the column already
# holds 0/1 -- would silently wipe out the target. Validate first and fail loudly.
attrition_codes = {'Existing Customer': 0, 'Attrited Customer': 1}
unexpected_labels = set(df_preprocessing['Attrition_Flag'].dropna().unique()) - set(attrition_codes)
if unexpected_labels:
    raise ValueError(
        f"Unexpected 'Attrition_Flag' values (was this cell already run?): {unexpected_labels}"
    )
df_preprocessing['Attrition_Flag'] = df_preprocessing['Attrition_Flag'].map(attrition_codes)

In [5]:
# Store the encoded target as object dtype so that the dtype-based split further
# down (select_dtypes('number') / select_dtypes('object')) places it with the
# categorical columns, then display the resulting dtype.
df_preprocessing = df_preprocessing.astype({'Attrition_Flag': 'object'})
df_preprocessing['Attrition_Flag'].dtype

dtype('O')

In [6]:
# Preview the first five rows after encoding the target
df_preprocessing.head(n=5)

Unnamed: 0,Attrition_Flag,Age,Gender,Dependent_Count,Education,Marital_Status,Income,Card_Category,Months_On_Book,Total_Relationship_Count,Months_Inactive,Contacts_Count,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,0,44.0,M,2,Graduate,Married,$40K - $60K,Blue,36,3,1.0,2.0,4010.0,1247,2763.0,1.376,1088.0,24.0,0.846,0.311
1,0,42.0,M,5,Uneducated,Unknown,$120K +,Blue,31,5,3.0,2.0,6748.0,1467,5281.0,0.831,1201.0,42.0,0.68,0.217
2,0,57.0,F,2,Graduate,Married,Less than $40K,Blue,48,5,2.0,2.0,2436.0,680,1756.0,1.19,1570.0,29.0,0.611,0.279
3,0,45.0,F,2,Graduate,Married,Unknown,Blue,37,6,1.0,2.0,14470.0,1157,13313.0,0.966,1207.0,21.0,0.909,0.08
4,0,47.0,M,1,Doctorate,Divorced,$60K - $80K,Blue,42,5,2.0,0.0,20979.0,1800,19179.0,0.906,1178.0,27.0,0.929,0.086


In [7]:
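# Creating new attributes (currently disabled: the two derived percentage features below are kept for reference only and are not added to the dataframe)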
# df_preprocessing['Percent_Open_To_Buy'] = (df_preprocessing['Avg_Open_To_Buy']*100)/df_preprocessing['Credit_Limit']
# df_preprocessing['Percent_Revolving_Bal'] = (df_preprocessing['Total_Revolving_Bal']*100)/df_preprocessing['Credit_Limit']

## 3. Scaling and encoding the dataframe

In [8]:
# Split the dataframe by dtype into two dataframes:
# numeric columns (to be scaled) and object columns (to be one-hot encoded).
df_numeric = df_preprocessing.select_dtypes(include='number')
df_categorical = df_preprocessing.select_dtypes(include='object')

In [9]:
# Preview the numeric columns that will be scaled
df_numeric.head(n=5)

Unnamed: 0,Age,Dependent_Count,Months_On_Book,Total_Relationship_Count,Months_Inactive,Contacts_Count,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,44.0,2,36,3,1.0,2.0,4010.0,1247,2763.0,1.376,1088.0,24.0,0.846,0.311
1,42.0,5,31,5,3.0,2.0,6748.0,1467,5281.0,0.831,1201.0,42.0,0.68,0.217
2,57.0,2,48,5,2.0,2.0,2436.0,680,1756.0,1.19,1570.0,29.0,0.611,0.279
3,45.0,2,37,6,1.0,2.0,14470.0,1157,13313.0,0.966,1207.0,21.0,0.909,0.08
4,47.0,1,42,5,2.0,0.0,20979.0,1800,19179.0,0.906,1178.0,27.0,0.929,0.086


In [10]:
# Preview the object-typed columns (includes the encoded target)
df_categorical.head(n=5)

Unnamed: 0,Attrition_Flag,Gender,Education,Marital_Status,Income,Card_Category
0,0,M,Graduate,Married,$40K - $60K,Blue
1,0,M,Uneducated,Unknown,$120K +,Blue
2,0,F,Graduate,Married,Less than $40K,Blue
3,0,F,Graduate,Married,Unknown,Blue
4,0,M,Doctorate,Divorced,$60K - $80K,Blue


In [11]:
# Standard scaling of the numerical columns.
# fit_transform returns a bare array, so both the column labels and the row index
# are re-attached. Keeping the original index matters because the later
# pd.concat(axis=1) aligns rows on index labels; without it the merge is only
# correct when df_numeric happens to carry a default 0..n-1 index.
# NOTE(review): the scaler is fit on every row. If a train/test split is made in
# a later step, consider fitting on the training rows only -- confirm downstream.
scaler = StandardScaler()
df_numeric_scaled = pd.DataFrame(
    scaler.fit_transform(df_numeric),
    columns=df_numeric.columns,
    index=df_numeric.index,
)

In [12]:
# Preview the standardised numeric columns
df_numeric_scaled.head(n=5)

Unnamed: 0,Age,Dependent_Count,Months_On_Book,Total_Relationship_Count,Months_Inactive,Contacts_Count,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,-0.29577,-0.271926,0.004744,-0.552026,-1.395182,-0.408007,-0.490538,0.118049,-0.500932,3.449765,-1.114706,-1.839357,0.754786,0.115815
1,-0.547035,2.042137,-0.626022,0.73918,0.751624,-0.408007,-0.18519,0.387672,-0.220314,0.47641,-1.071557,-1.003931,-0.078532,-0.221903
2,1.337453,-0.271926,1.518584,0.73918,-0.321779,-0.408007,-0.666074,-0.576842,-0.613157,2.435005,-0.930655,-1.607294,-0.424912,0.000847
3,-0.170138,-0.271926,0.130897,1.384783,-1.395182,-0.408007,0.675986,0.007749,0.674812,1.212929,-1.069266,-1.978594,1.071045,-0.714108
4,0.081127,-1.04328,0.761664,0.73918,-0.321779,-2.260358,1.401885,0.795782,1.328548,0.885587,-1.08034,-1.700119,1.171445,-0.692552


In [13]:
# Put the scaled numeric columns and the categorical columns back together,
# side by side (rows are matched on their index labels).
frames_to_merge = [df_numeric_scaled, df_categorical]
df_preprocessing = pd.concat(frames_to_merge, axis='columns')

In [14]:
# Sanity check after the merge: the row count should be unchanged and the
# column count should equal numeric + categorical columns.
df_preprocessing.shape

(9307, 20)

In [15]:
# One-hot encode the object-typed columns, dropping the first level of each to
# avoid redundant dummy columns.
# dtype is pinned to uint8 so the dummies are written as 0/1 on every pandas
# version; from pandas 2.0 the default is bool, which would yield True/False.
# Note: 'Attrition_Flag' was cast to object earlier, so it is encoded here as
# well; with drop_first=True the target ends up in the column 'Attrition_Flag_1'.
df_preprocessing = pd.get_dummies(df_preprocessing, drop_first=True, dtype='uint8')

In [16]:
# Preview the fully scaled and one-hot encoded dataframe
df_preprocessing.head(n=5)

Unnamed: 0,Age,Dependent_Count,Months_On_Book,Total_Relationship_Count,Months_Inactive,Contacts_Count,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,...,Marital_Status_Single,Marital_Status_Unknown,Income_$40K - $60K,Income_$60K - $80K,Income_$80K - $120K,Income_Less than $40K,Income_Unknown,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
0,-0.29577,-0.271926,0.004744,-0.552026,-1.395182,-0.408007,-0.490538,0.118049,-0.500932,3.449765,...,0,0,1,0,0,0,0,0,0,0
1,-0.547035,2.042137,-0.626022,0.73918,0.751624,-0.408007,-0.18519,0.387672,-0.220314,0.47641,...,0,1,0,0,0,0,0,0,0,0
2,1.337453,-0.271926,1.518584,0.73918,-0.321779,-0.408007,-0.666074,-0.576842,-0.613157,2.435005,...,0,0,0,0,0,1,0,0,0,0
3,-0.170138,-0.271926,0.130897,1.384783,-1.395182,-0.408007,0.675986,0.007749,0.674812,1.212929,...,0,0,0,0,0,0,1,0,0,0
4,0.081127,-1.04328,0.761664,0.73918,-0.321779,-2.260358,1.401885,0.795782,1.328548,0.885587,...,0,0,0,1,0,0,0,0,0,0


## 4. Save the final dataframe

In [17]:
# Persist the model-ready dataframe for the next step. The index is written as
# the first CSV column (to_csv default).
df_preprocessing.to_csv(path_or_buf='df_modeling.csv')