# Data Preprocessing ( Data Normalization )

In [1]:
import pandas as  pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
housing_df = pd.read_csv('WestRoxbury.csv')

In [3]:
housing_df.head()

Unnamed: 0,TOTAL VALUE,TAX,LOT SQFT,YR BUILT,GROSS AREA,LIVING AREA,FLOORS,ROOMS,BEDROOMS,FULL BATH,HALF BATH,KITCHEN,FIREPLACE,REMODEL
0,344.2,4330,9965,1880,2436,1352,2.0,6,3,1,1,1,0,
1,412.6,5190,6590,1945,3108,1976,2.0,10,4,2,1,1,0,Recent
2,330.1,4152,7500,1890,2294,1371,2.0,8,4,1,1,1,0,
3,498.6,6272,13773,1957,5032,2608,1.0,9,5,1,1,1,1,
4,331.5,4170,5000,1910,2370,1438,2.0,7,3,2,0,1,0,


In [4]:
housing_df.columns = [s.strip().replace(' ','_') for s in housing_df.columns]
housing_df

Unnamed: 0,TOTAL_VALUE,TAX,LOT_SQFT,YR_BUILT,GROSS_AREA,LIVING_AREA,FLOORS,ROOMS,BEDROOMS,FULL_BATH,HALF_BATH,KITCHEN,FIREPLACE,REMODEL
0,344.2,4330,9965,1880,2436,1352,2.0,6,3,1,1,1,0,
1,412.6,5190,6590,1945,3108,1976,2.0,10,4,2,1,1,0,Recent
2,330.1,4152,7500,1890,2294,1371,2.0,8,4,1,1,1,0,
3,498.6,6272,13773,1957,5032,2608,1.0,9,5,1,1,1,1,
4,331.5,4170,5000,1910,2370,1438,2.0,7,3,2,0,1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5797,404.8,5092,6762,1938,2594,1714,2.0,9,3,2,1,1,1,Recent
5798,407.9,5131,9408,1950,2414,1333,2.0,6,3,1,1,1,1,
5799,406.5,5113,7198,1987,2480,1674,2.0,7,3,1,1,1,1,
5800,308.7,3883,6890,1946,2000,1000,1.0,5,2,1,0,1,0,


In [5]:
housing_df.dtypes

TOTAL_VALUE    float64
TAX              int64
LOT_SQFT         int64
YR_BUILT         int64
GROSS_AREA       int64
LIVING_AREA      int64
FLOORS         float64
ROOMS            int64
BEDROOMS         int64
FULL_BATH        int64
HALF_BATH        int64
KITCHEN          int64
FIREPLACE        int64
REMODEL         object
dtype: object

In [7]:
intcol = [c for c in housing_df.columns if housing_df[c].dtype == 'int64']
housing_df[intcol] = housing_df[intcol].astype('float64')

housing_df = housing_df.drop(['REMODEL'], axis=1)

In [8]:
housing_df.dtypes

TOTAL_VALUE    float64
TAX            float64
LOT_SQFT       float64
YR_BUILT       float64
GROSS_AREA     float64
LIVING_AREA    float64
FLOORS         float64
ROOMS          float64
BEDROOMS       float64
FULL_BATH      float64
HALF_BATH      float64
KITCHEN        float64
FIREPLACE      float64
dtype: object

# Table - Scaling Data 

In [9]:
housing_df

Unnamed: 0,TOTAL_VALUE,TAX,LOT_SQFT,YR_BUILT,GROSS_AREA,LIVING_AREA,FLOORS,ROOMS,BEDROOMS,FULL_BATH,HALF_BATH,KITCHEN,FIREPLACE
0,344.2,4330.0,9965.0,1880.0,2436.0,1352.0,2.0,6.0,3.0,1.0,1.0,1.0,0.0
1,412.6,5190.0,6590.0,1945.0,3108.0,1976.0,2.0,10.0,4.0,2.0,1.0,1.0,0.0
2,330.1,4152.0,7500.0,1890.0,2294.0,1371.0,2.0,8.0,4.0,1.0,1.0,1.0,0.0
3,498.6,6272.0,13773.0,1957.0,5032.0,2608.0,1.0,9.0,5.0,1.0,1.0,1.0,1.0
4,331.5,4170.0,5000.0,1910.0,2370.0,1438.0,2.0,7.0,3.0,2.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5797,404.8,5092.0,6762.0,1938.0,2594.0,1714.0,2.0,9.0,3.0,2.0,1.0,1.0,1.0
5798,407.9,5131.0,9408.0,1950.0,2414.0,1333.0,2.0,6.0,3.0,1.0,1.0,1.0,1.0
5799,406.5,5113.0,7198.0,1987.0,2480.0,1674.0,2.0,7.0,3.0,1.0,1.0,1.0,1.0
5800,308.7,3883.0,6890.0,1946.0,2000.0,1000.0,1.0,5.0,2.0,1.0,0.0,1.0,0.0


In [10]:
housing_df.describe()

Unnamed: 0,TOTAL_VALUE,TAX,LOT_SQFT,YR_BUILT,GROSS_AREA,LIVING_AREA,FLOORS,ROOMS,BEDROOMS,FULL_BATH,HALF_BATH,KITCHEN,FIREPLACE
count,5802.0,5802.0,5802.0,5802.0,5802.0,5802.0,5802.0,5802.0,5802.0,5802.0,5802.0,5802.0,5802.0
mean,392.685715,4939.485867,6278.083764,1936.744916,2924.842123,1657.065322,1.68373,6.994829,3.230093,1.296794,0.613926,1.01534,0.739917
std,99.177414,1247.649118,2669.707974,35.98991,883.984726,540.456726,0.444884,1.437657,0.846607,0.52204,0.533839,0.12291,0.565108
min,105.0,1320.0,997.0,0.0,821.0,504.0,1.0,3.0,1.0,1.0,0.0,1.0,0.0
25%,325.125,4089.5,4772.0,1920.0,2347.0,1308.0,1.0,6.0,3.0,1.0,0.0,1.0,0.0
50%,375.9,4728.0,5683.0,1935.0,2700.0,1548.5,2.0,7.0,3.0,1.0,1.0,1.0,1.0
75%,438.775,5519.5,7022.25,1955.0,3239.0,1873.75,2.0,8.0,4.0,2.0,1.0,1.0,1.0
max,1217.8,15319.0,46411.0,2011.0,8154.0,5289.0,3.0,14.0,9.0,5.0,3.0,2.0,4.0


In [11]:
df = housing_df.copy()

# Z-score normalization of the data
norm_df = (housing_df - housing_df.mean()) / housing_df.std()
norm_df

Unnamed: 0,TOTAL_VALUE,TAX,LOT_SQFT,YR_BUILT,GROSS_AREA,LIVING_AREA,FLOORS,ROOMS,BEDROOMS,FULL_BATH,HALF_BATH,KITCHEN,FIREPLACE
0,-0.488879,-0.488507,1.381019,-1.576690,-0.552998,-0.564458,0.710905,-0.691980,-0.271783,-0.568528,0.723202,-0.124803,-1.309337
1,0.200795,0.200789,0.116835,0.229372,0.207196,0.590121,0.710905,2.090325,0.909403,1.347035,0.723202,-0.124803,-1.309337
2,-0.631048,-0.631176,0.457697,-1.298834,-0.713635,-0.529303,0.710905,0.699173,0.909403,-0.568528,0.723202,-0.124803,-1.309337
3,1.067927,1.068020,2.807392,0.562799,2.383704,1.759502,-1.536872,1.394749,2.090589,-0.568528,0.723202,-0.124803,0.460235
4,-0.616932,-0.616749,-0.478735,-0.743123,-0.627660,-0.405334,0.710905,0.003597,-0.271783,1.347035,-1.150021,-0.124803,-1.309337
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5797,0.122148,0.122241,0.181262,0.034873,-0.374262,0.105345,0.710905,1.394749,-0.271783,1.347035,0.723202,-0.124803,0.460235
5798,0.153405,0.153500,1.172381,0.368300,-0.577886,-0.599614,0.710905,-0.691980,-0.271783,-0.568528,0.723202,-0.124803,0.460235
5799,0.139289,0.139073,0.344576,1.396366,-0.503224,0.031334,0.710905,0.003597,-0.271783,-0.568528,0.723202,-0.124803,0.460235
5800,-0.846823,-0.846781,0.229207,0.257158,-1.046220,-1.215759,-1.536872,-1.387556,-1.452969,-0.568528,-1.150021,-0.124803,-1.309337


In [12]:
scaler = StandardScaler()
norm_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
norm_df

Unnamed: 0,TOTAL_VALUE,TAX,LOT_SQFT,YR_BUILT,GROSS_AREA,LIVING_AREA,FLOORS,ROOMS,BEDROOMS,FULL_BATH,HALF_BATH,KITCHEN,FIREPLACE
0,-0.488921,-0.488550,1.381138,-1.576825,-0.553046,-0.564507,0.710966,-0.692039,-0.271806,-0.568577,0.723264,-0.124814,-1.309450
1,0.200812,0.200806,0.116845,0.229392,0.207214,0.590172,0.710966,2.090505,0.909482,1.347151,0.723264,-0.124814,-1.309450
2,-0.631102,-0.631230,0.457736,-1.298946,-0.713696,-0.529349,0.710966,0.699233,0.909482,-0.568577,0.723264,-0.124814,-1.309450
3,1.068020,1.068112,2.807634,0.562847,2.383909,1.759654,-1.537005,1.394869,2.090769,-0.568577,0.723264,-0.124814,0.460275
4,-0.616985,-0.616802,-0.478777,-0.743187,-0.627714,-0.405369,0.710966,0.003597,-0.271806,1.347151,-1.150120,-0.124814,-1.309450
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5797,0.122158,0.122252,0.181277,0.034876,-0.374295,0.105355,0.710966,1.394869,-0.271806,1.347151,0.723264,-0.124814,0.460275
5798,0.153418,0.153513,1.172483,0.368332,-0.577935,-0.599666,0.710966,-0.692039,-0.271806,-0.568577,0.723264,-0.124814,0.460275
5799,0.139301,0.139085,0.344605,1.396486,-0.503267,0.031337,0.710966,0.003597,-0.271806,-0.568577,0.723264,-0.124814,0.460275
5800,-0.846896,-0.846854,0.229227,0.257180,-1.046310,-1.215864,-1.537005,-1.387675,-1.453094,-0.568577,-1.150120,-0.124814,-1.309450


In [13]:
rescaler = (housing_df - housing_df.min()) / (housing_df.max() -  housing_df.min())
rescaler

Unnamed: 0,TOTAL_VALUE,TAX,LOT_SQFT,YR_BUILT,GROSS_AREA,LIVING_AREA,FLOORS,ROOMS,BEDROOMS,FULL_BATH,HALF_BATH,KITCHEN,FIREPLACE
0,0.214953,0.215015,0.197472,0.934858,0.220237,0.177220,0.5,0.272727,0.250,0.00,0.333333,0.0,0.00
1,0.276420,0.276448,0.123156,0.967181,0.311878,0.307628,0.5,0.636364,0.375,0.25,0.333333,0.0,0.00
2,0.202283,0.202300,0.143194,0.939831,0.200873,0.181191,0.5,0.454545,0.375,0.00,0.333333,0.0,0.00
3,0.353702,0.353740,0.281323,0.973148,0.574253,0.439707,0.0,0.545455,0.500,0.00,0.333333,0.0,0.25
4,0.203541,0.203586,0.088145,0.949776,0.211237,0.195193,0.5,0.363636,0.250,0.25,0.000000,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5797,0.269410,0.269448,0.126943,0.963700,0.241784,0.252874,0.5,0.545455,0.250,0.25,0.333333,0.0,0.25
5798,0.272196,0.272234,0.185207,0.969667,0.217237,0.173250,0.5,0.272727,0.250,0.00,0.333333,0.0,0.25
5799,0.270938,0.270948,0.136544,0.988066,0.226238,0.244514,0.5,0.363636,0.250,0.00,0.333333,0.0,0.25
5800,0.183052,0.183085,0.129762,0.967678,0.160780,0.103657,0.0,0.181818,0.125,0.00,0.000000,0.0,0.00


In [14]:
scaler = MinMaxScaler()
rescale = pd.DataFrame(scaler.fit_transform(housing_df), index=housing_df.index, columns=housing_df.columns)
rescale

Unnamed: 0,TOTAL_VALUE,TAX,LOT_SQFT,YR_BUILT,GROSS_AREA,LIVING_AREA,FLOORS,ROOMS,BEDROOMS,FULL_BATH,HALF_BATH,KITCHEN,FIREPLACE
0,0.214953,0.215015,0.197472,0.934858,0.220237,0.177220,0.5,0.272727,0.250,0.00,0.333333,0.0,0.00
1,0.276420,0.276448,0.123156,0.967181,0.311878,0.307628,0.5,0.636364,0.375,0.25,0.333333,0.0,0.00
2,0.202283,0.202300,0.143194,0.939831,0.200873,0.181191,0.5,0.454545,0.375,0.00,0.333333,0.0,0.00
3,0.353702,0.353740,0.281323,0.973148,0.574253,0.439707,0.0,0.545455,0.500,0.00,0.333333,0.0,0.25
4,0.203541,0.203586,0.088145,0.949776,0.211237,0.195193,0.5,0.363636,0.250,0.25,0.000000,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5797,0.269410,0.269448,0.126943,0.963700,0.241784,0.252874,0.5,0.545455,0.250,0.25,0.333333,0.0,0.25
5798,0.272196,0.272234,0.185207,0.969667,0.217237,0.173250,0.5,0.272727,0.250,0.00,0.333333,0.0,0.25
5799,0.270938,0.270948,0.136544,0.988066,0.226238,0.244514,0.5,0.363636,0.250,0.00,0.333333,0.0,0.25
5800,0.183052,0.183085,0.129762,0.967678,0.160780,0.103657,0.0,0.181818,0.125,0.00,0.000000,0.0,0.00


In [15]:
sigmoid = 1 / (1+np.exp(-housing_df))
sigmoid

Unnamed: 0,TOTAL_VALUE,TAX,LOT_SQFT,YR_BUILT,GROSS_AREA,LIVING_AREA,FLOORS,ROOMS,BEDROOMS,FULL_BATH,HALF_BATH,KITCHEN,FIREPLACE
0,1.0,1.0,1.0,1.0,1.0,1.0,0.880797,0.997527,0.952574,0.731059,0.731059,0.731059,0.500000
1,1.0,1.0,1.0,1.0,1.0,1.0,0.880797,0.999955,0.982014,0.880797,0.731059,0.731059,0.500000
2,1.0,1.0,1.0,1.0,1.0,1.0,0.880797,0.999665,0.982014,0.731059,0.731059,0.731059,0.500000
3,1.0,1.0,1.0,1.0,1.0,1.0,0.731059,0.999877,0.993307,0.731059,0.731059,0.731059,0.731059
4,1.0,1.0,1.0,1.0,1.0,1.0,0.880797,0.999089,0.952574,0.880797,0.500000,0.731059,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5797,1.0,1.0,1.0,1.0,1.0,1.0,0.880797,0.999877,0.952574,0.880797,0.731059,0.731059,0.731059
5798,1.0,1.0,1.0,1.0,1.0,1.0,0.880797,0.997527,0.952574,0.731059,0.731059,0.731059,0.731059
5799,1.0,1.0,1.0,1.0,1.0,1.0,0.880797,0.999089,0.952574,0.731059,0.731059,0.731059,0.731059
5800,1.0,1.0,1.0,1.0,1.0,1.0,0.731059,0.993307,0.880797,0.731059,0.500000,0.731059,0.500000


from sklearn.

In [16]:
from sklearn.preprocessing import Normalizer

sigmoid_scaler = Normalizer()

sigmoid_df = pd.DataFrame(sigmoid_scaler.fit_transform(housing_df), index=housing_df.index, columns=housing_df.columns)
sigmoid_df

Unnamed: 0,TOTAL_VALUE,TAX,LOT_SQFT,YR_BUILT,GROSS_AREA,LIVING_AREA,FLOORS,ROOMS,BEDROOMS,FULL_BATH,HALF_BATH,KITCHEN,FIREPLACE
0,0.030251,0.380550,0.875792,0.165227,0.214092,0.118823,0.000176,0.000527,0.000264,0.000088,0.000088,0.000088,0.000000
1,0.044013,0.553628,0.702969,0.207477,0.331537,0.210784,0.000213,0.001067,0.000427,0.000213,0.000107,0.000107,0.000000
2,0.035950,0.452180,0.816800,0.205834,0.249832,0.149311,0.000218,0.000871,0.000436,0.000109,0.000109,0.000109,0.000000
3,0.030615,0.385115,0.845694,0.120164,0.308976,0.160137,0.000061,0.000553,0.000307,0.000061,0.000061,0.000061,0.000061
4,0.045182,0.568351,0.681477,0.260324,0.323020,0.195993,0.000273,0.000954,0.000409,0.000273,0.000000,0.000136,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5797,0.043845,0.551528,0.732411,0.209910,0.280963,0.185648,0.000217,0.000975,0.000325,0.000217,0.000108,0.000108,0.000108
5798,0.036280,0.456362,0.836768,0.173437,0.214706,0.118560,0.000178,0.000534,0.000267,0.000089,0.000089,0.000089,0.000089
5799,0.042608,0.535929,0.754472,0.208271,0.259946,0.175463,0.000210,0.000734,0.000314,0.000105,0.000105,0.000105,0.000105
5800,0.036525,0.459432,0.815216,0.230248,0.236637,0.118319,0.000118,0.000592,0.000237,0.000118,0.000000,0.000118,0.000000
