### This notebook implements bootstrapping on the original training data, also described as sampling with replacement, in order to build a training data set with more examples of underrepresented samples.

In [2]:
import sklearn as sk
import pandas as pd
import numpy as np

In [3]:
# import training data from relative filepath
data = pd.read_csv("../train.csv")

# shuffles the data by taking a random sample without replacement
# and sampling 100% of the original dataframe
data = data.sample(frac = 1)

data.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
12571,12572,2362,268,22,350,109,384,159,243,220,...,0,0,0,0,0,0,0,0,0,3
6248,6249,3302,100,24,323,104,1723,251,201,65,...,0,0,0,0,0,0,0,1,0,1
9039,9040,3381,169,7,330,69,1387,226,243,151,...,0,0,0,0,0,0,1,0,0,7
9785,9786,3421,300,22,1073,293,4143,154,226,209,...,0,0,0,0,0,0,0,0,0,7
8692,8693,2665,324,18,85,33,2265,173,218,184,...,0,0,0,0,0,0,0,0,0,6


In [4]:
# shows number of entries of each type in the training data
data.groupby("Cover_Type").size()

Cover_Type
1    2160
2    2160
3    2160
4    2160
5    2160
6    2160
7    2160
dtype: int64

In [5]:
# double checks the above numbers by displaying data where cover type is 4
data.loc[data.Cover_Type == 4]

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
3801,3802,2187,138,19,134,62,1173,246,229,103,...,0,0,0,0,0,0,0,0,0,4
2941,2942,2067,45,14,30,-4,1116,223,210,118,...,0,0,0,0,0,0,0,0,0,4
2052,2053,2010,70,23,0,0,212,238,188,69,...,0,0,0,0,0,0,0,0,0,4
5283,5284,2194,93,18,0,0,1471,245,211,89,...,0,0,0,0,0,0,0,0,0,4
12591,12592,2129,342,1,30,1,1340,217,237,157,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4332,4333,2211,128,8,0,0,778,234,235,133,...,0,0,0,0,0,0,0,0,0,4
12545,12546,2280,128,16,497,122,1403,244,229,109,...,0,0,0,0,0,0,0,0,0,4
4986,4987,2190,351,19,0,0,1657,183,205,157,...,0,0,0,0,0,0,0,0,0,4
6010,6011,2388,65,21,0,0,722,234,193,81,...,0,0,0,0,0,0,0,0,0,4


In [6]:
# list of output classes to sample for bootstrapping
cover_types = [1, 2, 3, 4, 5, 6, 7]

# relative weights of each class to represent in bootstrapped sample
weights = [4, 5, 1, 1, 1, 1, 1]

# create series to hold the probability weights for each sample
weight_series = pd.Series(range(1, len(data)+2))

# loop over each weight and
# assign to appropriate place in series
for weight, cover_type in zip(weights, cover_types):
    
    # assign appropriate weight
    weight_series[data.loc[data.Cover_Type == cover_type]["Id"]] = weight

In [7]:
# create bootstrapped data
bootstrapped_data = data.sample(frac = 3, replace = True, 
                                weights = weight_series, random_state = 1)

bootstrapped_data.reset_index(drop = True, inplace = True)

In [8]:
bootstrapped_data

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,7623,2539,315,24,150,81,684,149,212,200,...,0,0,0,0,0,0,0,0,0,6
1,11480,2647,108,22,85,31,1320,251,210,76,...,0,0,0,0,0,0,0,0,0,5
2,9786,3421,300,22,1073,293,4143,154,226,209,...,0,0,0,0,0,0,0,0,0,7
3,150,2536,34,5,242,16,1242,219,228,146,...,0,0,0,0,0,0,0,0,0,2
4,10229,3141,245,12,272,32,127,194,249,192,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45355,9106,3338,253,9,150,16,3798,201,246,184,...,0,0,0,0,0,0,1,0,0,7
45356,14356,3353,124,9,234,31,1848,235,234,130,...,0,0,0,0,0,0,1,0,0,7
45357,11620,2867,211,8,85,12,2830,213,248,170,...,0,0,0,0,0,0,0,0,0,1
45358,342,2978,27,4,192,13,4780,218,230,149,...,0,0,0,0,0,0,0,0,0,2


In [18]:
# outputs to csv file
bootstrapped_data.to_csv("bootstrapped_data.csv", index = False)

In [9]:
# create bootstrapped data subsampled from total population
subsampled_data = data.sample(frac = 0.8, replace = False, 
                                weights = weight_series, random_state = 1)

subsampled_data.reset_index(drop = True, inplace = True)

In [10]:
subsampled_data

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,7623,2539,315,24,150,81,684,149,212,200,...,0,0,0,0,0,0,0,0,0,6
1,11480,2647,108,22,85,31,1320,251,210,76,...,0,0,0,0,0,0,0,0,0,5
2,9786,3421,300,22,1073,293,4143,154,226,209,...,0,0,0,0,0,0,0,0,0,7
3,150,2536,34,5,242,16,1242,219,228,146,...,0,0,0,0,0,0,0,0,0,2
4,10229,3141,245,12,272,32,127,194,249,192,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12091,7696,2537,99,14,127,37,1150,242,221,106,...,0,0,0,0,0,0,0,0,0,3
12092,13032,2389,304,24,134,30,1190,148,220,209,...,0,0,0,0,0,0,0,0,0,6
12093,2734,2747,283,8,0,0,2015,199,241,181,...,0,0,0,0,0,0,0,0,0,5
12094,10977,2515,24,14,277,47,700,212,209,132,...,0,0,0,0,0,0,0,0,0,6


In [11]:
# outputs to csv file
subsampled_data.to_csv("subsampled_data.csv", index = False)