# Capstone project two (Preprocessing)

## 1. Importing libraries and loading dataframe

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataframe produced by the previous step. The first CSV column is
# read as the index and then discarded in favour of a fresh 0..n-1 index.
# The relative path uses forward slashes so it resolves on Windows, macOS and
# Linux alike; the backslash form only works on Windows and depends on
# invalid string escape sequences that newer Python versions warn about.
df_preprocessing = pd.read_csv('../Dataset/df_preprocessing.csv', index_col=0)
df_preprocessing = df_preprocessing.reset_index(drop=True)

In [3]:
# Creating new features (currently disabled: the derived feature below is commented out)
# df_preprocessing['Percent_Revolving_Bal'] = (df_preprocessing['Total_Revolving_Bal']*100)/df_preprocessing['Credit_Limit']

## 2. Encoding target feature

In [4]:
# Count the rows per target label before encoding, to see which labels are
# present and how balanced the classes are.
df_preprocessing['Attrition_Flag'].value_counts()

Existing Customer    7771
Attrited Customer    1536
Name: Attrition_Flag, dtype: int64

In [5]:
# Encode the target as binary: Existing Customer -> 0, Attrited Customer -> 1.
# Series.map() turns every value that is missing from the mapping into NaN.
# That is also what happens when this cell is re-run on the already-encoded
# column, so the result is validated before it replaces the original column
# instead of silently wiping out the target.
attrition_codes = {'Existing Customer': 0, 'Attrited Customer': 1}
attrition_encoded = df_preprocessing['Attrition_Flag'].map(attrition_codes)
if attrition_encoded.isna().any():
    raise ValueError(
        "'Attrition_Flag' contains values other than "
        f"{list(attrition_codes)}; was this cell already executed?"
    )
df_preprocessing['Attrition_Flag'] = attrition_encoded

In [6]:
# Preview the first rows to confirm that Attrition_Flag now holds the encoded values.
df_preprocessing.head()

Unnamed: 0,Attrition_Flag,Age,Gender,Dependent_Count,Education,Marital_Status,Income,Card_Category,Months_On_Book,Total_Relationship_Count,Months_Inactive,Contacts_Count,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,0,44.0,M,2,Graduate,Married,$40K - $60K,Blue,36,3,1.0,2.0,4010.0,1247,2763.0,1.376,1088.0,24.0,0.846,0.311
1,0,42.0,M,5,Uneducated,Unknown,$120K +,Blue,31,5,3.0,2.0,6748.0,1467,5281.0,0.831,1201.0,42.0,0.68,0.217
2,0,57.0,F,2,Graduate,Married,Less than $40K,Blue,48,5,2.0,2.0,2436.0,680,1756.0,1.19,1570.0,29.0,0.611,0.279
3,0,45.0,F,2,Graduate,Married,Unknown,Blue,37,6,1.0,2.0,14470.0,1157,13313.0,0.966,1207.0,21.0,0.909,0.08
4,0,47.0,M,1,Doctorate,Divorced,$60K - $80K,Blue,42,5,2.0,0.0,20979.0,1800,19179.0,0.906,1178.0,27.0,0.929,0.086


## 3. Separating the target feature from the predictor features

In [7]:
# Target vector: the encoded Attrition_Flag column.
y = df_preprocessing['Attrition_Flag']
# Predictor matrix: every remaining column.
X = df_preprocessing.drop(columns=['Attrition_Flag'])

In [8]:
# Hold out 25% of the rows as the test split; the fixed random_state keeps the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,
)

In [9]:
# Give each of the four splits a fresh 0..n-1 index, dropping the shuffled
# row labels inherited from the original dataframe.
X_train, X_test, y_train, y_test = (
    split.reset_index(drop=True)
    for split in (X_train, X_test, y_train, y_test)
)

In [10]:
# Report (rows, columns) of the predictor splits.
print(f'X_train.shape: {X_train.shape}, X_test.shape: {X_test.shape}')

X_train.shape: (6980, 19), X_test.shape: (2327, 19)


In [11]:
# Report the lengths of the target splits; they should match the row counts of
# the corresponding predictor splits.
print(f'y_train.shape: {y_train.shape}, y_test.shape: {y_test.shape}')

y_train.shape: (6980,), y_test.shape: (2327,)


In [12]:
# Preview the training predictors after the index reset (not yet scaled or encoded).
X_train.head()

Unnamed: 0,Age,Gender,Dependent_Count,Education,Marital_Status,Income,Card_Category,Months_On_Book,Total_Relationship_Count,Months_Inactive,Contacts_Count,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,40.0,M,3,Graduate,Divorced,Less than $40K,Silver,36,1,3.0,2.0,11409.0,1658,9751.0,0.989,4677.0,58.0,0.706,0.145
1,46.0,F,4,Uneducated,Married,Less than $40K,Silver,36,3,1.0,4.0,10961.0,1391,9570.0,0.662,3914.0,77.0,0.75,0.127
2,52.0,F,0,Graduate,Single,Less than $40K,Blue,36,5,1.0,2.0,2537.0,1097,1440.0,0.874,4964.0,84.0,0.909,0.432
3,35.0,M,2,Uneducated,Single,$60K - $80K,Blue,30,6,3.0,3.0,21695.0,1257,20438.0,0.793,2266.0,74.0,0.48,0.058
4,52.0,F,3,Unknown,Single,Less than $40K,Blue,41,2,3.0,2.0,1438.3,787,651.3,0.601,4956.0,89.0,0.745,0.547


In [13]:
# Preview the training target after the index reset.
y_train.head()

0    0
1    0
2    0
3    0
4    0
Name: Attrition_Flag, dtype: int64

## 5. Scaling the numeric columns of the X_train and X_test dataframe

In [14]:
# Scaler used to standardise the numeric columns: centre each column on its
# mean and divide by its standard deviation (both are the defaults, spelled
# out here for clarity).
scaler = StandardScaler(with_mean=True, with_std=True)

In [15]:
# Standardise the numeric predictors. The scaler learns each column's mean and
# standard deviation from the training split only and re-uses them for the
# test split. Refitting on X_test would scale the two splits with different
# parameters and leak test-set statistics into preprocessing.
numeric_cols = X_train.select_dtypes('number').columns
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

## 6. Encoding Categorical columns

In [16]:
# One-hot encode the categorical columns of the training split; the first
# level of each category is dropped so it acts as the implicit baseline.
X_train = pd.get_dummies(data=X_train, drop_first=True)

In [17]:
# One-hot encode the test split against the columns of the already-encoded
# training split. Encoding the test split independently with drop_first=True
# can yield different columns (or drop a different baseline level) whenever a
# category is absent from one split. Here every level is encoded, then the
# frame is aligned to X_train's columns: dummies unknown to the training split
# are discarded and any that are missing are added as all-zero columns.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

In [18]:
# Preview the training predictors after scaling and one-hot encoding.
X_train.head()

Unnamed: 0,Age,Dependent_Count,Months_On_Book,Total_Relationship_Count,Months_Inactive,Contacts_Count,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,...,Marital_Status_Single,Marital_Status_Unknown,Income_$40K - $60K,Income_$60K - $80K,Income_$80K - $120K,Income_Less than $40K,Income_Unknown,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
0,-0.791543,0.498352,0.002708,-1.841984,0.745299,-0.41516,0.324106,0.617387,0.267968,1.339053,...,0,0,0,0,0,1,0,0,0,1
1,-0.03735,1.269124,0.002708,-0.550727,-1.400375,1.442415,0.274543,0.291116,0.247961,-0.444369,...,0,0,0,0,0,1,0,0,0,1
2,0.716844,-1.813965,0.002708,0.74053,-1.400375,-0.41516,-0.657427,-0.068149,-0.650666,0.711856,...,1,0,0,0,0,1,0,0,0,0
3,-1.420038,-0.27242,-0.753447,1.386159,0.745299,0.513627,1.462074,0.127369,1.449226,0.27009,...,1,0,0,1,0,0,0,0,0,0
4,0.716844,0.498352,0.632838,-1.196355,0.745299,-0.41516,-0.778979,-0.446965,-0.737843,-0.777057,...,1,0,0,0,0,1,0,0,0,0


In [19]:
# Final (rows, columns) of the training predictors.
X_train.shape

(6980, 32)

In [20]:
# Final (rows, columns) of the test predictors; the column count must match X_train.
X_test.shape

(2327, 32)

In [21]:
# Length of the training target; must equal the number of rows in X_train.
y_train.shape

(6980,)

In [22]:
# Length of the test target; must equal the number of rows in X_test.
y_test.shape

(2327,)

## 8. Save the final dataframes for modeling

In [23]:
# Persist the four splits for the modelling step. The index is written as the
# first CSV column (the to_csv default), so readers should load these files
# with index_col=0. Forward slashes keep the relative paths portable: with
# backslashes, non-Windows systems would create a file literally named
# '..\Dataset\...' in the working directory, and the literals depend on
# invalid string escape sequences.
X_train.to_csv('../Dataset/X_train.csv')
X_test.to_csv('../Dataset/X_test.csv')
y_train.to_csv('../Dataset/y_train.csv')
y_test.to_csv('../Dataset/y_test.csv')