### This notebook normalizes the original and bootstrapped training data and applies the respective normalization transformations to the testing data for each. 

In [2]:
import sklearn as sk
import pandas as pd
import numpy as np

from sklearn import preprocessing

### Normalize original training data and apply to testing data

In [31]:
# import training data from relative filepath
data = pd.read_csv("../train.csv")

# shuffles the data by taking a random sample without replacement
# and sampling 100% of the original dataframe
data = data.sample(frac = 1)

# extract labels from training data
data_labels = data.loc[:, "Cover_Type"]

# extract ids from training data
data_ids = data.loc[:, "Id"]

data.drop(columns = ["Id", "Cover_Type"], inplace = True)

data.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
10210,3367,121,18,732,157,633,247,223,97,67,...,0,0,0,0,0,0,0,1,0,0
11224,2579,353,22,85,34,1442,178,197,154,591,...,0,0,0,0,0,0,0,0,0,0
13089,3221,38,21,649,-2,902,215,188,99,1950,...,0,0,0,0,0,0,0,0,1,0
13516,3415,282,32,30,22,2969,116,222,237,2161,...,0,0,0,0,0,0,0,0,1,0
10605,3293,125,28,108,-37,633,253,208,63,3728,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# create scaler object to do the normalization
scaler = preprocessing.StandardScaler().fit(data)

# create new normalized df
data_normalized = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)

# show that means = 0
print("Average Mean: \n" + str(np.mean(data_normalized.mean())))

# show variances = 1
print("Avergage Variance: \n" + str(np.mean(data_normalized.std()**2)))

Average Mean: 
3.000027354439726e-16
Avergage Variance: 
0.9630266552020016


In [33]:
# re-attach labels
data_normalized["Cover_Type"] = data_labels
data_normalized["Id"] = data_ids

data_normalized

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,Id
10210,1.478885,-0.324091,0.177250,2.403048,1.729719,-0.815853,1.122231,0.176938,-0.830005,-1.312980,...,-0.206085,-0.038173,-0.082413,-0.025726,-0.047474,4.446260,-0.213134,-0.176939,7,10211
11224,-0.407798,1.783426,0.650419,-0.676902,-0.278858,-0.205297,-1.135602,-0.963353,0.411996,-0.836573,...,-0.206085,-0.038173,-0.082413,-0.025726,-0.047474,-0.224908,-0.213134,-0.176939,3,11225
13089,1.129322,-1.078073,0.532127,2.007938,-0.866734,-0.612837,0.075120,-1.358069,-0.786426,0.398993,...,-0.206085,-0.038173,-0.082413,-0.025726,-0.047474,-0.224908,4.691876,-0.176939,7,13090
13516,1.593809,1.138453,1.833340,-0.938722,-0.474817,0.947136,-3.164379,0.133080,2.220524,0.590829,...,-0.206085,-0.038173,-0.082413,-0.025726,-0.047474,-0.224908,4.691876,-0.176939,1,13517
10605,1.301709,-0.287755,1.360172,-0.567414,-1.438280,-0.815853,1.318565,-0.480922,-1.570848,2.015504,...,-0.206085,-0.038173,-0.082413,-0.025726,-0.047474,-0.224908,-0.213134,-0.176939,2,10606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4787,-1.250580,-0.205998,1.833340,-0.762588,-0.295188,-0.636988,1.220398,-0.568637,-1.701585,-0.081960,...,-0.206085,-0.038173,-0.082413,-0.025726,-0.047474,-0.224908,-0.213134,-0.176939,4,4788
3503,-1.011153,-0.551194,0.295543,1.374811,-0.295188,-0.238504,1.122231,-0.437065,-1.113269,-0.490178,...,-0.206085,-0.038173,-0.082413,-0.025726,-0.047474,-0.224908,-0.213134,-0.176939,4,3504
2103,0.068661,1.311051,1.596756,0.684560,2.742173,-0.341144,-2.902601,-0.349350,1.871892,-0.528364,...,4.852366,-0.038173,-0.082413,-0.025726,-0.047474,-0.224908,-0.213134,-0.176939,6,2104
2039,-1.779713,-0.705624,-0.059334,-1.081532,-0.834074,-1.148677,0.893176,-0.437065,-0.873584,-1.065685,...,-0.206085,-0.038173,-0.082413,-0.025726,-0.047474,-0.224908,-0.213134,-0.176939,4,2040


In [36]:
# import testing data from relative filepath
test = pd.read_csv("../test.csv")

# extract ids from testing data
test_ids = test.loc[:, "Id"]

# drop id column from testing data
test.drop(columns = ["Id"], inplace = True)

# create new normalized df
test_data_normalized = pd.DataFrame(scaler.fit_transform(test), columns=test.columns, index=test.index)

# re attach ids
test_data_normalized["Id"] = test_ids

test_data_normalized

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Id
0,-1.043274,1.771765,-0.005320,-1.273208,-0.795308,0.202921,-0.605045,-0.479714,0.349015,3.504738,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,15121
1,-1.032291,-1.390021,-0.139546,-1.273208,-0.795308,0.183708,-0.417506,-0.378026,0.243835,3.527338,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,15122
2,-0.922464,-1.247115,0.128906,-1.273208,-0.795308,0.392484,-0.229966,-0.784777,-0.150590,3.277978,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,15123
3,-0.937108,-1.175663,0.397357,-1.273208,-0.795308,0.373272,-0.154950,-1.140685,-0.466130,3.300578,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,15124
4,-0.948091,-1.131005,0.665808,-1.273208,-0.795308,0.354059,-0.079934,-1.445748,-0.729080,3.323179,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,15125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565887,-2.082970,-0.023486,0.800033,-0.873208,-0.503259,-1.446795,1.045303,0.689696,-0.650195,-0.870747,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,581008
565888,-2.101275,-0.032418,0.665808,-0.957914,-0.589156,-1.455121,1.045303,0.689696,-0.623900,-0.864720,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,581009
565889,-2.119579,0.030103,0.397357,-0.990855,-0.675052,-1.458323,0.895271,0.893072,-0.334655,-0.857940,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,581010
565890,-2.126901,0.128351,0.128906,-0.990855,-0.709411,-1.458323,0.670224,1.096448,0.007180,-0.850407,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,581011


In [40]:
# outputs to csv file
data_normalized.to_csv("normalized_training_data.csv", index = False)
test_data_normalized.to_csv("normalized_test_data.csv", index = False)

### Normalize bootstrapped data and apply to testing data

In [44]:
# import training data from relative filepath
boot_data = pd.read_csv("../Bootstrapped Data/bootstrapped_data.csv")

# shuffles the data by taking a random sample without replacement
# and sampling 100% of the original dataframe
boot_data = boot_data.sample(frac = 1)

# extract labels from training data
boot_data_labels = boot_data.loc[:, "Cover_Type"]

# extract ids from training data
boot_data_ids = boot_data.loc[:, "Id"]

boot_data.drop(columns = ["Id", "Cover_Type"], inplace = True)

boot_data.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
35952,3374,78,7,192,25,2724,229,228,132,1655,...,0,0,0,0,0,0,0,1,0,0
24406,2870,280,13,0,0,2093,183,241,198,977,...,0,0,0,0,0,0,0,0,0,0
23491,2791,57,10,0,0,1595,227,219,125,1724,...,0,0,0,0,0,0,0,0,0,0
5982,2480,96,11,571,124,607,238,224,115,1006,...,0,0,0,0,0,0,0,0,0,0
38082,2843,189,24,366,116,1711,213,249,157,2192,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# create scaler object to do the normalization
boot_scaler = preprocessing.StandardScaler().fit(boot_data)

# create new normalized df
boot_data_normalized = pd.DataFrame(boot_scaler.fit_transform(boot_data), 
                                    columns=boot_data.columns, index=boot_data.index)

# show that means = 0
print("Average Mean: \n" + str(np.mean(boot_data_normalized.mean())))

# show variances = 1
print("Avergage Variance: \n" + str(np.mean(boot_data_normalized.std()**2)))

Average Mean: 
-2.2310613438973064e-16
Avergage Variance: 
0.9629841927731588


In [47]:
# re-attach labels
boot_data_normalized["Cover_Type"] = boot_data_labels
boot_data_normalized["Id"] = boot_data_ids

boot_data_normalized

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,Id
35952,1.395906,-0.693804,-1.012598,-0.252699,-0.396372,0.459139,0.522597,0.327637,-0.090492,-0.102954,...,-0.235944,-0.047005,-0.081595,-0.023483,-0.036996,4.524059,-0.204159,-0.175476,7,7271
24406,0.024550,1.165126,-0.275095,-1.149829,-0.816364,0.037003,-1.145260,0.922046,1.437227,-0.645003,...,-0.235944,-0.047005,-0.081595,-0.023483,-0.036996,-0.221040,-0.204159,-0.175476,5,12318
23491,-0.190404,-0.887060,-0.643846,-1.149829,-0.816364,-0.296158,0.450081,-0.083876,-0.252523,-0.047789,...,-0.235944,-0.047005,-0.081595,-0.023483,-0.036996,-0.221040,-0.204159,-0.175476,5,7135
5982,-1.036618,-0.528157,-0.520929,1.518199,1.266800,-0.957126,0.848917,0.144742,-0.483996,-0.621818,...,-0.235944,-0.047005,-0.081595,-0.023483,-0.036996,-0.221040,-0.204159,-0.175476,6,6350
38082,-0.048915,0.327687,1.076994,0.560326,1.132402,-0.218554,-0.057527,1.287836,0.488189,0.326368,...,-0.235944,-0.047005,-0.081595,-0.023483,-0.036996,-0.221040,-0.204159,-0.175476,5,8721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44804,0.995927,0.346092,-1.258432,-0.836768,-0.782765,-0.029228,0.196277,1.059217,0.557631,-1.326961,...,-0.235944,-0.047005,12.255611,-0.023483,-0.036996,-0.221040,-0.204159,-0.175476,7,9853
17026,1.006811,-0.997491,-1.381349,1.877986,0.174818,1.428516,0.232535,0.464809,0.256717,0.109709,...,-0.235944,-0.047005,-0.081595,-0.023483,-0.036996,-0.221040,-0.204159,-0.175476,1,202
36354,1.491139,1.763296,1.445745,0.560326,1.938788,-0.733013,-2.051704,-1.501312,0.603926,-0.280439,...,4.238303,-0.047005,-0.081595,-0.023483,-0.036996,-0.221040,-0.204159,-0.175476,7,10416
12303,-1.831134,-1.218354,2.552000,0.102415,1.048403,-1.141769,-1.507837,-3.604603,-1.201561,-0.897639,...,-0.235944,-0.047005,-0.081595,-0.023483,-0.036996,-0.221040,-0.204159,-0.175476,3,3044


In [48]:
# import testing data from relative filepath
boot_test = pd.read_csv("../test.csv")

# extract ids from testing data
boot_test_ids = boot_test.loc[:, "Id"]

# drop id column from testing data
boot_test.drop(columns = ["Id"], inplace = True)

# create new normalized df
boot_test_data_normalized = pd.DataFrame(boot_scaler.fit_transform(boot_test), 
                                         columns=boot_test.columns, index=boot_test.index)

# re attach ids
boot_test_data_normalized["Id"] = boot_test_ids

boot_test_data_normalized

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Id
0,-1.043274,1.771765,-0.005320,-1.273208,-0.795308,0.202921,-0.605045,-0.479714,0.349015,3.504738,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,15121
1,-1.032291,-1.390021,-0.139546,-1.273208,-0.795308,0.183708,-0.417506,-0.378026,0.243835,3.527338,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,15122
2,-0.922464,-1.247115,0.128906,-1.273208,-0.795308,0.392484,-0.229966,-0.784777,-0.150590,3.277978,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,15123
3,-0.937108,-1.175663,0.397357,-1.273208,-0.795308,0.373272,-0.154950,-1.140685,-0.466130,3.300578,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,15124
4,-0.948091,-1.131005,0.665808,-1.273208,-0.795308,0.354059,-0.079934,-1.445748,-0.729080,3.323179,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,15125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565887,-2.082970,-0.023486,0.800033,-0.873208,-0.503259,-1.446795,1.045303,0.689696,-0.650195,-0.870747,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,581008
565888,-2.101275,-0.032418,0.665808,-0.957914,-0.589156,-1.455121,1.045303,0.689696,-0.623900,-0.864720,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,581009
565889,-2.119579,0.030103,0.397357,-0.990855,-0.675052,-1.458323,0.895271,0.893072,-0.334655,-0.857940,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,581010
565890,-2.126901,0.128351,0.128906,-0.990855,-0.709411,-1.458323,0.670224,1.096448,0.007180,-0.850407,...,-0.317525,-0.29228,-0.053065,-0.056315,-0.01388,-0.021604,-0.164133,-0.154236,-0.121939,581011


In [49]:
# outputs to csv file
boot_data_normalized.to_csv("boot_normalized_training_data.csv", index = False)
boot_test_data_normalized.to_csv("boot_normalized_test_data.csv", index = False)