### 3. Preprocessing and Training Data Development

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the customer data from the EDA-stage CSV.
eda_csv_path = '../Data/c_data_EDA.csv'
c_data = pd.read_csv(eda_csv_path)

In [3]:
# Preview the first five rows (the default for head()) of the loaded frame.
c_data.head()

Unnamed: 0,Education,Income,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,...,AcceptedCmp2,Complain,Response,Total_Expenses,Children,Age,Relationship Status,TotalAcceptedCmp,NumTotalPurchases,Customer_Day
0,Undergraduate,58138.0,2012-04-09,58,635,88,546,172,88,88,...,0,0,1,1617,0,64,Single,1,22,971
1,Undergraduate,46344.0,2014-08-03,38,11,1,6,2,1,6,...,0,0,0,27,2,67,Single,0,4,125
2,Undergraduate,71613.0,2013-08-21,26,426,49,127,111,21,42,...,0,0,0,776,0,56,Couple,0,20,472
3,Undergraduate,26646.0,2014-10-02,26,11,4,20,10,3,5,...,0,0,0,53,1,37,Couple,0,6,65
4,PhD,58293.0,2014-01-19,94,173,43,118,46,27,15,...,0,0,0,422,1,40,Couple,0,14,321


### 3.1. Creating Dummies for Categorical Variables

In [4]:
# Dropping Dt_Customer because we have already generated Customer_Day column.
# errors='ignore' keeps this cell re-runnable: c_data is rebound to the result,
# so a second execution would otherwise raise KeyError on the missing column.
c_data = c_data.drop(columns='Dt_Customer', errors='ignore')

In [5]:
# Check column dtypes and non-null counts after dropping Dt_Customer.
c_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2236 entries, 0 to 2235
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Education            2236 non-null   object 
 1   Income               2236 non-null   float64
 2   Recency              2236 non-null   int64  
 3   MntWines             2236 non-null   int64  
 4   MntFruits            2236 non-null   int64  
 5   MntMeatProducts      2236 non-null   int64  
 6   MntFishProducts      2236 non-null   int64  
 7   MntSweetProducts     2236 non-null   int64  
 8   MntGoldProds         2236 non-null   int64  
 9   NumDealsPurchases    2236 non-null   int64  
 10  NumWebPurchases      2236 non-null   int64  
 11  NumCatalogPurchases  2236 non-null   int64  
 12  NumStorePurchases    2236 non-null   int64  
 13  NumWebVisitsMonth    2236 non-null   int64  
 14  AcceptedCmp3         2236 non-null   int64  
 15  AcceptedCmp4         2236 non-null   i

In [6]:
# Set the object-dtype columns aside; they are dummy-encoded further below.
dfo = c_data.select_dtypes(include='object')

In [7]:
# Re-check the frame now that Dt_Customer has been removed.
c_data.head()

Unnamed: 0,Education,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,...,AcceptedCmp2,Complain,Response,Total_Expenses,Children,Age,Relationship Status,TotalAcceptedCmp,NumTotalPurchases,Customer_Day
0,Undergraduate,58138.0,58,635,88,546,172,88,88,3,...,0,0,1,1617,0,64,Single,1,22,971
1,Undergraduate,46344.0,38,11,1,6,2,1,6,2,...,0,0,0,27,2,67,Single,0,4,125
2,Undergraduate,71613.0,26,426,49,127,111,21,42,1,...,0,0,0,776,0,56,Couple,0,20,472
3,Undergraduate,26646.0,26,11,4,20,10,3,5,2,...,0,0,0,53,1,37,Couple,0,6,65
4,PhD,58293.0,94,173,43,118,46,27,15,5,...,0,0,0,422,1,40,Couple,0,14,321


### 3.2. Standardize the Magnitude of Numeric Features

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
# Scaler with default settings; it is fitted further below on d_data.
scaler = StandardScaler()

In [10]:
# Numeric-only frame used as the scaler input. The columns to exclude are taken
# from dfo (the object-dtype selection made earlier) instead of being listed by
# hand, so the scaled columns and the dummy-encoded columns cannot drift apart.
d_data = c_data.drop(columns=dfo.columns)

In [11]:
# Display the frame that will be scaled (the rendered output elides the middle rows).
d_data

Unnamed: 0,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,...,AcceptedCmp1,AcceptedCmp2,Complain,Response,Total_Expenses,Children,Age,TotalAcceptedCmp,NumTotalPurchases,Customer_Day
0,58138.0,58,635,88,546,172,88,88,3,8,...,0,0,0,1,1617,0,64,1,22,971
1,46344.0,38,11,1,6,2,1,6,2,1,...,0,0,0,0,27,2,67,0,4,125
2,71613.0,26,426,49,127,111,21,42,1,8,...,0,0,0,0,776,0,56,0,20,472
3,26646.0,26,11,4,20,10,3,5,2,2,...,0,0,0,0,53,1,37,0,6,65
4,58293.0,94,173,43,118,46,27,15,5,5,...,0,0,0,0,422,1,40,0,14,321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2231,61223.0,46,709,43,182,42,118,247,2,9,...,0,0,0,0,1341,1,54,0,16,541
2232,64014.0,56,406,0,30,0,0,8,7,8,...,1,0,0,0,444,3,75,1,15,61
2233,56981.0,91,908,48,217,32,12,24,1,2,...,0,0,0,0,1241,0,40,1,18,315
2234,69245.0,8,428,30,214,80,30,61,2,6,...,0,0,0,0,843,1,65,0,21,316


In [12]:
# Fit the scaler on every column of d_data and transform it in the same call.
# NOTE(review): d_data still contains Response and the AcceptedCmp*/Complain
# columns (which appear to be 0/1 indicators), and no train/test split is
# visible in this notebook section before fitting -- confirm both are intended
# for the downstream modelling step.
scaled_data = scaler.fit_transform(d_data)

In [13]:
# Verifying mean of each feature = 0
# (expect values on the order of 1e-16, i.e. zero up to floating-point rounding)
scaled_data.mean(axis = 0)

array([ 2.24030693e-16, -7.14991572e-17, -7.94435080e-18, -5.56104556e-18,
       -3.17774032e-17,  4.28994943e-17,  2.06553121e-17,  4.76661048e-18,
        1.27109613e-17,  0.00000000e+00, -7.30880274e-17, -1.32273441e-16,
       -1.52531535e-16,  1.74775718e-17, -5.24327153e-17, -4.44883645e-17,
       -2.22441822e-17, -2.06553121e-17,  1.58887016e-17, -2.22441822e-17,
        4.76661048e-18,  7.94435080e-17,  6.51436766e-17,  4.92549750e-17,
        8.73878588e-18,  6.35548064e-17])

In [14]:
# Verifying std of each feature = 1
# (on a NumPy array, std() defaults to ddof=0, the population standard
# deviation, which is the estimator StandardScaler divides by)
scaled_data.std(axis = 0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [15]:
# Inspect the scaled values; at this point scaled_data is still the array
# returned by fit_transform, without column labels.
scaled_data

array([[ 0.2885133 ,  0.30685572,  0.98322818, ...,  0.62045628,
         1.31207969,  1.97546695],
       [-0.26243786, -0.38397129, -0.87106406, ..., -0.50198729,
        -1.18619751, -1.6670165 ],
       [ 0.91799157, -0.7984675 ,  0.36215914, ..., -0.50198729,
         1.03449334, -0.17299551],
       ...,
       [ 0.23446459,  1.44672029,  1.79448103, ...,  0.62045628,
         0.75690698, -0.84896466],
       [ 0.80737157, -1.42021181,  0.36810239, ..., -0.50198729,
         1.17328652, -0.84465913],
       [ 0.04237444, -0.31488859, -0.65413564, ...,  0.62045628,
        -0.6310248 ,  1.16172065]])

In [16]:
# Wrap the scaled values in a DataFrame that reuses d_data's column labels and
# row index. Carrying the index over matters because the scaled frame is later
# concatenated along axis=1, which aligns rows by index label, not by position.
scaled_data = pd.DataFrame(scaled_data, index=d_data.index, columns=d_data.columns)

In [17]:
# Getting dummies for Education and Relationship Status columns and appending
# them to the scaled numeric features.
# dtype=int pins the indicators to 0/1: pandas >= 2.0 defaults to bool, which
# would write True/False into the exported CSV instead.
scaled_c_data = pd.concat([scaled_data, pd.get_dummies(dfo, dtype=int)], axis=1)

In [18]:
# Preview the first three rows of the combined frame: the scaled numeric
# columns followed by the Education / Relationship Status dummy columns.
scaled_c_data.head(3)

Unnamed: 0,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,...,Age,TotalAcceptedCmp,NumTotalPurchases,Customer_Day,Education_Highschool,Education_Master,Education_PhD,Education_Undergraduate,Relationship Status_Couple,Relationship Status_Single
0,0.288513,0.306856,0.983228,1.55417,1.679746,2.461068,1.475396,0.845669,0.348738,1.407639,...,1.016868,0.620456,1.31208,1.975467,0,0,0,1,0,1
1,-0.262438,-0.383971,-0.871064,-0.636431,-0.713455,-0.650414,-0.631638,-0.729742,-0.1687,-1.110921,...,1.273264,-0.501987,-1.186198,-1.667017,0,0,0,1,0,1
2,0.917992,-0.798467,0.362159,0.572177,-0.177201,1.344595,-0.147262,-0.038098,-0.686137,1.407639,...,0.333146,-0.501987,1.034493,-0.172996,0,0,0,1,1,0


In [23]:
# Write both frames to disk without the index column: the unscaled data
# (Dt_Customer removed) and the scaled + dummy-encoded version.
exports = {
    '../Data/c_data_Preprocess.csv': c_data,
    '../Data/c_data_scaled.csv': scaled_c_data,
}
for csv_path, frame in exports.items():
    frame.to_csv(csv_path, index=False)