# Capstone project two (Preprocessing)

## 1. Importing libraries and loading dataframe

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Import the dataframe produced by the previous step and reset its index.
# Forward slashes keep the relative path portable: the backslash form only
# resolves on Windows and depends on unrecognised escape sequences such as '\\D'.
df_preprocessing = (
    pd.read_csv('../Dataset/df_preprocessing.csv', index_col=0)
    .reset_index(drop=True)  # discard the stored index, renumber rows from 0
)

In [3]:
# Creating new features
# NOTE: the feature below is currently disabled (commented out), so this cell adds no columns.
# df_preprocessing['Percent_Revolving_Bal'] = (df_preprocessing['Total_Revolving_Bal']*100)/df_preprocessing['Credit_Limit']

## 2. Encoding target feature

In [4]:
# Count how many rows carry each target label before encoding
df_preprocessing['Attrition_Flag'].value_counts()

Existing Customer    7871
Attrited Customer    1607
Name: Attrition_Flag, dtype: int64

Encode the target attribute 'Attrition_Flag' as a binary value: Existing Customer = 0 and Attrited Customer = 1.

In [5]:
# Encode the target labels as integers: Existing Customer -> 0, Attrited Customer -> 1.
attrition_codes = {'Existing Customer': 0, 'Attrited Customer': 1}
encoded_flag = df_preprocessing['Attrition_Flag'].map(attrition_codes)

# Series.map turns every value missing from the mapping into NaN. Validate before
# assigning so that an unexpected label (or re-running this cell on the already
# encoded column) fails loudly instead of silently corrupting the target.
unmapped = encoded_flag.isna()
if unmapped.any():
    unexpected = df_preprocessing.loc[unmapped, 'Attrition_Flag'].unique()
    raise ValueError(f'Unexpected Attrition_Flag values, cannot encode: {unexpected!r}')

df_preprocessing['Attrition_Flag'] = encoded_flag.astype('int64')

In [6]:
# Preview the first rows of the dataframe after encoding the target column
df_preprocessing.head()

Unnamed: 0,Attrition_Flag,Age,Gender,Dependent_Count,Education,Marital_Status,Income,Card_Category,Months_On_Book,Total_Relationship_Count,Months_Inactive,Contacts_Count,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,0,44.0,M,2,Graduate,Married,$40K - $60K,Blue,36,3,1,2,4010.0,1247,2763.0,1.376,1088.0,24.0,0.846,0.311
1,0,42.0,M,5,Uneducated,Unknown,$120K +,Blue,31,5,3,2,6748.0,1467,5281.0,0.831,1201.0,42.0,0.68,0.217
2,0,57.0,F,2,Graduate,Married,Less than $40K,Blue,48,5,2,2,2436.0,680,1756.0,1.19,1570.0,29.0,0.611,0.279
3,0,45.0,F,2,Graduate,Married,Unknown,Blue,37,6,1,2,14470.0,1157,13313.0,0.966,1207.0,21.0,0.909,0.08
4,0,47.0,M,1,Doctorate,Divorced,$60K - $80K,Blue,42,5,2,0,20979.0,1800,19179.0,0.906,1178.0,27.0,0.929,0.086


## 3. Separating target feature and predictor features

In [7]:
# Predictors are every column except the target; the target is kept as its own Series.
target_col = 'Attrition_Flag'
y = df_preprocessing[target_col]
X = df_preprocessing.drop(columns=[target_col])

In [8]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible.
# NOTE(review): the target classes are imbalanced (see the value counts above) and the
# split is not stratified — consider whether stratify=y is wanted.
split_parts = train_test_split(X, y, random_state=123, test_size=0.25)
X_train, X_test, y_train, y_test = split_parts

In [9]:
# Renumber every split from 0 so the shuffled row labels left by the split are discarded.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in (X_train, X_test, y_train, y_test)
)

In [10]:
# Report the (rows, columns) of the predictor splits
print(f'X_train.shape: {X_train.shape}, X_test.shape: {X_test.shape}')

X_train.shape: (7108, 19), X_test.shape: (2370, 19)


In [11]:
# Report the length of the target splits; they should match the predictor row counts
print(f'y_train.shape: {y_train.shape}, y_test.shape: {y_test.shape}')

y_train.shape: (7108,), y_test.shape: (2370,)


In [12]:
# Preview the first rows of the training predictors (index freshly reset above)
X_train.head()

Unnamed: 0,Age,Gender,Dependent_Count,Education,Marital_Status,Income,Card_Category,Months_On_Book,Total_Relationship_Count,Months_Inactive,Contacts_Count,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,52.0,F,1,High School,Married,Less than $40K,Blue,36,2,2,2,2253.0,1981,272.0,0.55,4758.0,81.0,0.723,0.879
1,33.0,M,3,High School,Married,$80K - $120K,Blue,22,5,3,2,9094.0,0,9094.0,0.631,4203.0,76.0,0.727,0.0
2,52.0,F,3,Graduate,Married,Less than $40K,Blue,36,5,3,3,2566.0,1079,1487.0,0.554,2340.0,36.0,0.5,0.42
3,48.0,F,4,Doctorate,Unknown,Unknown,Blue,24,4,2,3,1438.3,0,1438.3,0.738,2306.0,46.0,0.484,0.0
4,48.0,M,4,Graduate,Married,$60K - $80K,Blue,37,2,2,1,18477.0,0,18477.0,0.771,8695.0,95.0,0.583,0.0


In [13]:
# Preview the first encoded target values of the training split
y_train.head()

0    0
1    0
2    1
3    1
4    0
Name: Attrition_Flag, dtype: int64

## 5. Scaling the numeric columns of the X_train and X_test dataframes

In [14]:
# Scaler used to standardise the numerical columns (zero mean, unit variance).
# The arguments spelled out here are StandardScaler's defaults.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [15]:
# Standardise the numeric columns. The scaler is fitted on the training split ONLY and
# the learned mean/std are then applied to the test split. Re-fitting on X_test (as
# fit_transform would) scales the two splits with different statistics and leaks
# test-set information into preprocessing.
numeric_cols = X_train.select_dtypes('number').columns
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

## 6. Encoding Categorical columns

In [16]:
# One-hot encode the categorical columns of the training predictors.
# drop_first=True removes the first level of each category to avoid redundant columns.
X_train = pd.get_dummies(data=X_train, drop_first=True)

In [17]:
# One-hot encode the test predictors and align them to the (already encoded) training
# columns. Encoding the test split independently with drop_first=True can yield a
# different column set or order when a category level is missing from, or unique to,
# the test data. Here every level is encoded, then exactly the training columns are
# kept in training order; levels absent from the test split are filled with 0.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

In [18]:
# Preview the training predictors after scaling and one-hot encoding
X_train.head()

Unnamed: 0,Age,Dependent_Count,Months_On_Book,Total_Relationship_Count,Months_Inactive,Contacts_Count,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,...,Marital_Status_Single,Marital_Status_Unknown,Income_$40K - $60K,Income_$60K - $80K,Income_$80K - $120K,Income_Less than $40K,Income_Unknown,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
0,0.710852,-1.042934,0.009833,-1.183174,-0.3361,-0.418276,-0.681429,1.014035,-0.773185,-1.055874,...,0,0,0,0,0,1,0,0,0,0
1,-1.669559,0.501317,-1.743667,0.745873,0.654364,-0.418276,0.080113,-1.407099,0.208114,-0.616482,...,0,0,0,0,1,0,0,0,0,0
2,0.710852,0.501317,0.009833,0.745873,0.654364,0.485129,-0.646585,-0.08837,-0.638037,-1.034175,...,0,0,0,0,0,1,0,0,0,0
3,0.209713,1.273442,-1.493167,0.102857,-0.3361,0.485129,-0.772121,-1.407099,-0.643454,-0.03605,...,0,1,0,0,0,0,1,0,0,0
4,0.209713,1.273442,0.135082,-1.183174,-0.3361,-1.321682,1.12463,-1.407099,1.251814,0.142962,...,0,0,0,1,0,0,0,0,0,0


In [19]:
# Final (rows, columns) of the training predictors
X_train.shape

(7108, 32)

In [20]:
# Final (rows, columns) of the test predictors; the column count should match X_train
X_test.shape

(2370, 32)

In [21]:
# Length of the training target; should equal the number of rows in X_train
y_train.shape

(7108,)

In [22]:
# Length of the test target; should equal the number of rows in X_test
y_test.shape

(2370,)

## 8. Save the final dataframes for modeling

In [23]:
# Persist the four splits for the modeling step. Forward slashes keep the relative
# paths portable: the backslash form only resolves on Windows and depends on
# unrecognised escape sequences such as '\\D' and '\\X'. The index is still written,
# exactly as before, so downstream readers see the same file layout.
splits_to_save = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for split_name, split_data in splits_to_save.items():
    split_data.to_csv(f'../Dataset/{split_name}.csv')