# Scalling (for numerical data)

In [2]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')

In [3]:
data = pd.read_csv('no_outliers/data_nooutliers1.csv')
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,...,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,0,0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,...,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,1,1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,...,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,2,2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,...,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,3,3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,...,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,4,4,200004,0,32.0,Self Enquiry,1,8.0,Small Business,Male,...,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [4]:
data.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'CustomerID', 'ProdTaken', 'Age',
       'TypeofContact', 'CityTier', 'DurationOfPitch', 'Occupation', 'Gender',
       'NumberOfPersonVisiting', 'NumberOfFollowups', 'ProductPitched',
       'PreferredPropertyStar', 'MaritalStatus', 'NumberOfTrips', 'Passport',
       'PitchSatisfactionScore', 'OwnCar', 'NumberOfChildrenVisiting',
       'Designation', 'MonthlyIncome'],
      dtype='object')

In [5]:
data.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'],axis=1,inplace=True)

In [6]:
data

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,32.0,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4874,204883,1,49.0,Self Enquiry,3,9.0,Small Business,Male,3,5.0,Deluxe,4.0,Unmarried,2.0,1,1,1,1.0,Manager,26576.0
4875,204884,1,28.0,Company Invited,1,31.0,Salaried,Male,4,5.0,Basic,3.0,Single,3.0,1,3,1,2.0,Executive,21212.0
4876,204885,1,52.0,Self Enquiry,3,17.0,Salaried,Female,4,4.0,Standard,4.0,Married,7.0,0,1,1,3.0,Senior Manager,31820.0
4877,204886,1,19.0,Self Enquiry,3,16.0,Small Business,Male,3,4.0,Basic,3.0,Single,3.0,0,5,0,2.0,Executive,20289.0


## Scaling Methods:
1. Absolute Maximum Scaling
2. Min-Max Scaling
3. Normalization
4. Standardization
5. Robust Scaling

In [7]:
data['CityTier'] = data['CityTier'].astype('O')
data['NumberOfPersonVisiting'] = data['CityTier'].astype('O')
data['PreferredPropertyStar'] = data['CityTier'].astype('O')
data['Passport'] = data['CityTier'].astype('O')
data['PitchSatisfactionScore'] = data['CityTier'].astype('O')
data['OwnCar'] = data['CityTier'].astype('O')
data['ProdTaken'] = data['CityTier'].astype('O')

In [8]:
num_data=data[[fea for fea in data.columns if data[fea].dtype != 'O']]

In [9]:
num_data.head()

Unnamed: 0,CustomerID,Age,DurationOfPitch,NumberOfFollowups,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
0,200000,41.0,6.0,3.0,1.0,0.0,20993.0
1,200001,49.0,14.0,4.0,2.0,2.0,20130.0
2,200002,37.0,8.0,4.0,7.0,0.0,17090.0
3,200003,33.0,9.0,3.0,2.0,1.0,17909.0
4,200004,32.0,8.0,3.0,1.0,0.0,18468.0


In [10]:
num_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4879 entries, 0 to 4878
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CustomerID                4879 non-null   int64  
 1   Age                       4879 non-null   float64
 2   DurationOfPitch           4879 non-null   float64
 3   NumberOfFollowups         4879 non-null   float64
 4   NumberOfTrips             4879 non-null   float64
 5   NumberOfChildrenVisiting  4879 non-null   float64
 6   MonthlyIncome             4879 non-null   float64
dtypes: float64(6), int64(1)
memory usage: 266.9 KB


## Absolute maximum scalling

In [24]:
from sklearn.preprocessing import MaxAbsScaler as absmax
from sklearn.preprocessing import StandardScaler as ss

In [126]:
num_absmax = absmax().fit_transform(X=num_data.iloc[:,1:])

In [132]:
num_absmax

array([[0.67213115, 0.16666667, 0.5       , 0.125     , 0.        ,
        0.54277736],
       [0.80327869, 0.38888889, 0.66666667, 0.25      , 0.66666667,
        0.52046436],
       [0.60655738, 0.22222222, 0.66666667, 0.875     , 0.        ,
        0.44186467],
       ...,
       [0.85245902, 0.47222222, 0.66666667, 0.875     , 1.        ,
        0.82271117],
       [0.31147541, 0.44444444, 0.66666667, 0.375     , 0.66666667,
        0.52457533],
       [0.59016393, 0.38888889, 0.66666667, 0.375     , 0.66666667,
        0.62158389]])

In [135]:
num_data_absmax = pd.DataFrame(num_absmax,columns=num_data.iloc[:,1:].columns)

In [136]:
num_data_absmax['CustomerID'] = num_data['CustomerID']
num_data_absmax.set_index('CustomerID',inplace=True)
num_data_absmax

Unnamed: 0_level_0,Age,DurationOfPitch,NumberOfFollowups,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
200000,0.672131,0.166667,0.500000,0.125,0.000000,0.542777
200001,0.803279,0.388889,0.666667,0.250,0.666667,0.520464
200002,0.606557,0.222222,0.666667,0.875,0.000000,0.441865
200003,0.540984,0.250000,0.500000,0.250,0.333333,0.463040
200004,0.524590,0.222222,0.500000,0.125,0.000000,0.477493
...,...,...,...,...,...,...
204883,0.803279,0.250000,0.833333,0.250,0.333333,0.687127
204884,0.459016,0.861111,0.833333,0.375,0.666667,0.548440
204885,0.852459,0.472222,0.666667,0.875,1.000000,0.822711
204886,0.311475,0.444444,0.666667,0.375,0.666667,0.524575


## Min-Max scalling

In [37]:
from sklearn.preprocessing import minmax_scale

In [141]:
num_minmax = minmax_scale(num_data.iloc[:,1:],feature_range=(0, 1))

In [144]:
num_data_minmax = pd.DataFrame(num_minmax,columns=num_data.iloc[:,1:].columns)

In [146]:
num_data_minmax['CustomerID'] = num_data['CustomerID']
num_data_minmax.set_index('CustomerID',inplace=True)
num_data_minmax

Unnamed: 0_level_0,Age,DurationOfPitch,NumberOfFollowups,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome,CustoomerID
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
200000,0.534884,0.032258,0.4,0.000000,0.000000,0.479867,200000
200001,0.720930,0.290323,0.6,0.142857,0.666667,0.454484,200001
200002,0.441860,0.096774,0.6,0.857143,0.000000,0.365070,200002
200003,0.348837,0.129032,0.4,0.142857,0.333333,0.389159,200003
200004,0.325581,0.096774,0.4,0.000000,0.000000,0.405600,200004
...,...,...,...,...,...,...,...
204883,0.720930,0.129032,0.8,0.142857,0.333333,0.644078,204883
204884,0.232558,0.838710,0.8,0.285714,0.666667,0.486308,204884
204885,0.790698,0.387097,0.6,0.857143,1.000000,0.798318,204885
204886,0.023256,0.354839,0.6,0.285714,0.666667,0.459161,204886


## Normalization

In [82]:
from sklearn import preprocessing

In [86]:
a = num_data
normalized = preprocessing.normalize(a)

In [90]:
normalized_num_data = pd.DataFrame(normalized,columns=num_data.columns)

In [91]:
normalized_num_data

Unnamed: 0,CustomerID,Age,DurationOfPitch,NumberOfFollowups,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
0,0.994536,0.000204,0.000030,0.000015,0.000005,0.000000,0.104391
1,0.994973,0.000244,0.000070,0.000020,0.000010,0.000010,0.100144
2,0.996369,0.000184,0.000040,0.000020,0.000035,0.000000,0.085139
3,0.996015,0.000164,0.000045,0.000015,0.000010,0.000005,0.089187
4,0.995764,0.000159,0.000040,0.000015,0.000005,0.000000,0.091947
...,...,...,...,...,...,...,...
4874,0.991692,0.000237,0.000044,0.000024,0.000010,0.000005,0.128635
4875,0.994683,0.000136,0.000151,0.000024,0.000015,0.000010,0.102981
4876,0.988154,0.000251,0.000082,0.000019,0.000034,0.000014,0.153467
4877,0.995133,0.000092,0.000078,0.000019,0.000015,0.000010,0.098544


In [92]:
normalized_num_data.drop(columns='CustomerID',axis=1,inplace=True)

In [93]:
normalized_num_data['CustomerID'] = num_data['CustomerID']

In [96]:
normalized_num_data.set_index('CustomerID',inplace=True)

In [97]:
normalized_num_data

Unnamed: 0_level_0,Age,DurationOfPitch,NumberOfFollowups,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
200000,0.000204,0.000030,0.000015,0.000005,0.000000,0.104391
200001,0.000244,0.000070,0.000020,0.000010,0.000010,0.100144
200002,0.000184,0.000040,0.000020,0.000035,0.000000,0.085139
200003,0.000164,0.000045,0.000015,0.000010,0.000005,0.089187
200004,0.000159,0.000040,0.000015,0.000005,0.000000,0.091947
...,...,...,...,...,...,...
204883,0.000237,0.000044,0.000024,0.000010,0.000005,0.128635
204884,0.000136,0.000151,0.000024,0.000015,0.000010,0.102981
204885,0.000251,0.000082,0.000019,0.000034,0.000014,0.153467
204886,0.000092,0.000078,0.000019,0.000015,0.000010,0.098544


## Standardization

In [102]:
num_data_new = (num_data-num_data.mean())/num_data.std()

In [104]:
num_data_new.drop(columns='CustomerID',axis=1,inplace=True)

In [105]:
num_data_new['CustomerID'] = num_data['CustomerID']

In [106]:
num_data_new.set_index('CustomerID',inplace=True)

In [107]:
num_data_new

Unnamed: 0_level_0,Age,DurationOfPitch,NumberOfFollowups,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
200000,0.358290,-1.149469,-0.707330,-1.256315,-1.386915,-0.491907
200001,1.217465,-0.173080,0.291241,-0.693143,0.941632,-0.659379
200002,-0.071297,-0.905372,0.291241,2.122715,-1.386915,-1.249315
200003,-0.500884,-0.783323,-0.707330,-0.693143,-0.222641,-1.090382
200004,-0.608281,-0.905372,-0.707330,-1.256315,-1.386915,-0.981903
...,...,...,...,...,...,...
204883,1.217465,-0.783323,1.289812,-0.693143,-0.222641,0.591518
204884,-1.037868,1.901748,1.289812,-0.129972,0.941632,-0.449408
204885,1.539655,0.193067,0.291241,2.122715,2.105906,1.609158
204886,-2.004440,0.071018,0.291241,-0.129972,0.941632,-0.628524


## Robust Scalling

In [108]:
from sklearn.preprocessing import RobustScaler as robsc

In [111]:
robsc_transf = robsc().fit(num_data.iloc[:,1:])

In [115]:
robsc_num_data = pd.DataFrame(robsc_transf.transform(num_data.iloc[:,1:]),columns=num_data.iloc[:,1:].columns)

In [116]:
robsc_num_data['CustomerID'] = num_data['CustomerID']

In [120]:
robsc_num_data.set_index('CustomerID',inplace=True)

In [121]:
robsc_num_data

Unnamed: 0_level_0,Age,DurationOfPitch,NumberOfFollowups,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
200000,0.384615,-0.636364,-1.0,-1.0,-1.0,-0.237492
200001,1.000000,0.090909,0.0,-0.5,1.0,-0.399512
200002,0.076923,-0.454545,0.0,2.0,-1.0,-0.970243
200003,-0.230769,-0.363636,-1.0,-0.5,0.0,-0.816484
200004,-0.307692,-0.454545,-1.0,-1.0,-1.0,-0.711537
...,...,...,...,...,...,...
204883,1.000000,-0.363636,1.0,-0.5,0.0,0.810664
204884,-0.615385,1.636364,1.0,0.0,1.0,-0.196377
204885,1.230769,0.363636,0.0,2.0,2.0,1.795175
204886,-1.307692,0.272727,0.0,0.0,1.0,-0.369661


# Generating CSVs

In [147]:
os.makedirs('Scalling data')

In [148]:
num_data_absmax.to_csv('Scalling data/num_data_absmax.csv')

In [149]:
num_data_minmax.to_csv('Scalling data/num_data_minmax.csv')

In [150]:
normalized_num_data.to_csv('Scalling data/normalized_num_data.csv')

In [151]:
num_data_new.to_csv('Scalling data/num_data_new.csv')

In [152]:
robsc_num_data.to_csv('Scalling data/robsc_num_data.csv')