In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import sklearn

In [5]:
data=pd.read_csv('Travel.csv')

In [6]:
num_col=[fea for fea in data.columns if (data[fea].dtype !='O' and data[fea].dtype !=int)]
#segregating categorical and numerical variables 
num_col

['Age',
 'DurationOfPitch',
 'NumberOfFollowups',
 'PreferredPropertyStar',
 'NumberOfTrips',
 'NumberOfChildrenVisiting',
 'MonthlyIncome']

In [12]:
X=data[num_col]


Method 1: Standardizing Features

Standardization is the process of centering the variable at zero and standizing the variance to 1.To standardize the features
we subtrsct the mean from each observation and then divide the result by standard deviation:

z=(x-mean(x))/std(x)

In [11]:
# Performing Mean Imputation

for var in num_col:
    value=X[var].mean()
    X[var]=X[var].fillna(value)

In [18]:
# using sklearn
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X_scaled=scaler.transform(X)
X_scaled_final=pd.DataFrame(X_scaled)
X_scaled_final

Unnamed: 0,0,1,2,3,4,5,6
0,3.712822e-01,-1.143871,-0.710021,-0.730127,-1.227404,-1.393568,-0.500322
1,1.250646e+00,-0.179681,0.292203,0.526467,-0.678603,0.953955,-0.664693
2,-6.839967e-02,-0.902823,0.292203,-0.730127,2.065400,-1.393568,-1.243704
3,-5.080815e-01,-0.782299,-0.710021,-0.730127,-0.678603,-0.219807,-1.087714
4,-7.810318e-16,-0.902823,-0.710021,0.526467,-1.227404,-1.393568,-0.981245
...,...,...,...,...,...,...,...
4883,1.250646e+00,-0.782299,1.294428,0.526467,-0.678603,-0.219807,0.563041
4884,-1.057684e+00,1.869222,1.294428,-0.730127,-0.129803,0.953955,-0.458610
4885,1.580407e+00,0.181890,0.292203,0.526467,2.065400,2.127717,1.561836
4886,-2.046968e+00,0.061367,0.292203,-0.730127,-0.129803,0.953955,-0.634409


Method 2: Performing mean normalization

In mean normaliztion ,we center the variable at zero and rescale the distribution to the value range.
This procedure involves subtracting from the mean from each observation and then dividing the result by the difference
between the minimum and maximum values.


x_scaled=(x-mean(x))/(max(x)-min(x))


The transformation results in a distribution centered around 0 ,with min and max values within the range of -1 to 1



In [24]:
from sklearn.preprocessing import StandardScaler,RobustScaler
scaler_mean=StandardScaler(with_mean=True,with_std=False)# No division by standard deviation
scaler_minmax=RobustScaler(with_centering=False,with_scaling=True,quantile_range=(0,100))
scaler_mean.fit(X)
scaler_minmax.fit(X)
X_scaled=scaler_minmax.transform(X)
X_scaled_final=pd.DataFrame(X_scaled)
X_scaled_final


Unnamed: 0,0,1,2,3,4,5,6
0,0.953488,0.049180,0.6,1.5,0.047619,0.000000,0.214920
1,1.139535,0.114754,0.8,2.0,0.095238,0.666667,0.206085
2,0.860465,0.065574,0.8,1.5,0.333333,0.000000,0.174963
3,0.767442,0.073770,0.6,1.5,0.095238,0.333333,0.183347
4,0.874936,0.065574,0.6,2.0,0.047619,0.000000,0.189070
...,...,...,...,...,...,...,...
4883,1.139535,0.073770,1.0,2.0,0.095238,0.333333,0.272078
4884,0.651163,0.254098,1.0,1.5,0.142857,0.666667,0.217163
4885,1.209302,0.139344,0.8,2.0,0.333333,1.000000,0.325764
4886,0.441860,0.131148,0.8,1.5,0.142857,0.666667,0.207713


Method 3: Scaling to the maximum and minimum values

Scaling to the minimum and minimum values squeezes the values of the variables between 0 and 1.

xscaled=(x-min(x))/(max(x)-min(x))

In [26]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(X)
X_scaled=scaler.transform(X)
X_scaled_final=pd.DataFrame(X_scaled)
X_scaled_final


Unnamed: 0,0,1,2,3,4,5,6
0,0.534884,0.008197,0.4,0.0,0.000000,0.000000,0.204683
1,0.720930,0.073770,0.6,0.5,0.047619,0.666667,0.195848
2,0.441860,0.024590,0.6,0.0,0.285714,0.000000,0.164725
3,0.348837,0.032787,0.4,0.0,0.047619,0.333333,0.173110
4,0.456332,0.024590,0.4,0.5,0.000000,0.000000,0.178832
...,...,...,...,...,...,...,...
4883,0.720930,0.032787,0.8,0.5,0.047619,0.333333,0.261840
4884,0.232558,0.213115,0.8,0.0,0.095238,0.666667,0.206925
4885,0.790698,0.098361,0.6,0.5,0.285714,1.000000,0.315527
4886,0.023256,0.090164,0.6,0.0,0.095238,0.666667,0.197475


Method 4: Implementing Maximum absolute scaling


xscaled=x/max(x)

In [None]:
from sklearn.preprocessing import MaxAbsScaler
scaler=MaxAbsScaler()
scaler.fit(X)
X_scaled=scaler.transform(X)
X_scaled_final=pd.DataFrame(X_scaled)
X_scaled_final

Method 5: Scaling with median and Quantiles

xscaled= (x-median(x))/(q3(x)-q1(x))
    


In [28]:
scaler=RobustScaler()
scaler.fit(X)
X_scaled=scaler.transform(X)
X_scaled_final=pd.DataFrame(X_scaled)
X_scaled_final

Unnamed: 0,0,1,2,3,4,5,6
0,0.333333,-0.8,-1.0,0.0,-1.0,-1.0,-0.336454
1,1.000000,0.0,0.0,1.0,-0.5,1.0,-0.511159
2,0.000000,-0.6,0.0,0.0,2.0,-1.0,-1.126575
3,-0.333333,-0.5,-1.0,0.0,-0.5,0.0,-0.960777
4,0.051855,-0.6,-1.0,1.0,-1.0,-1.0,-0.847614
...,...,...,...,...,...,...,...
4883,1.000000,-0.5,1.0,1.0,-0.5,0.0,0.793765
4884,-0.750000,1.7,1.0,0.0,0.0,1.0,-0.292120
4885,1.250000,0.3,0.0,1.0,2.0,2.0,1.855357
4886,-1.500000,0.2,0.0,0.0,0.0,1.0,-0.478972


Method 6: Scaling to vector unit length

 xscaled =x/norm
 
 where norm may be either Manhattan distance or Euclidean Distance

In [29]:
from sklearn.preprocessing import Normalizer
scaler=Normalizer(norm='l2')# l2 for Euclidean Distance
scaler.fit(X)
X_scaled=scaler.transform(X)
X_scaled_final=pd.DataFrame(X_scaled)
X_scaled_final

Unnamed: 0,0,1,2,3,4,5,6
0,0.001953,0.000286,0.000143,0.000143,0.000048,0.000000,0.999998
1,0.002434,0.000695,0.000199,0.000199,0.000099,0.000099,0.999997
2,0.002165,0.000468,0.000234,0.000176,0.000410,0.000000,0.999997
3,0.001843,0.000503,0.000168,0.000168,0.000112,0.000056,0.999998
4,0.002037,0.000433,0.000162,0.000217,0.000054,0.000000,0.999998
...,...,...,...,...,...,...,...
4883,0.001844,0.000339,0.000188,0.000151,0.000075,0.000038,0.999998
4884,0.001320,0.001461,0.000236,0.000141,0.000141,0.000094,0.999998
4885,0.001634,0.000534,0.000126,0.000126,0.000220,0.000094,0.999998
4886,0.000936,0.000789,0.000197,0.000148,0.000148,0.000099,0.999999
