### This notebook implements bootstrapping on the original training data, also described as sampling with replacement, in order to build a training data set with more examples of underrepresented samples.

In [1]:
import sklearn as sk
import pandas as pd
import numpy as np

In [2]:
# import training data from relative filepath
data = pd.read_csv("../train.csv")

# shuffles the data by taking a random sample without replacement
# and sampling 100% of the original dataframe
data = data.sample(frac = 1)

data.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
14259,14260,3108,246,11,309,-30,2588,197,249,190,...,1,0,0,0,0,0,0,0,0,1
9910,9911,3404,81,10,120,0,2889,233,223,121,...,0,0,0,1,0,0,0,0,0,7
959,960,2948,310,8,60,-1,4761,197,234,177,...,0,0,0,0,0,0,0,0,0,5
8572,8573,3152,155,5,30,0,3416,226,240,149,...,0,0,0,0,0,0,0,0,0,2
4551,4552,2190,142,25,30,10,1124,247,225,91,...,0,0,0,0,0,0,0,0,0,4


In [3]:
# shows number of entries of each type in the training data
data.groupby("Cover_Type").size()

Cover_Type
1    2160
2    2160
3    2160
4    2160
5    2160
6    2160
7    2160
dtype: int64

In [4]:
# double checks the above numbers by displaying data where cover type is 4
data.loc[data.Cover_Type == 4]

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
4551,4552,2190,142,25,30,10,1124,247,225,91,...,0,0,0,0,0,0,0,0,0,4
13017,13018,2220,63,17,85,13,684,233,204,98,...,0,0,0,0,0,0,0,0,0,4
5140,5141,2266,105,23,150,45,573,251,206,71,...,0,0,0,0,0,0,0,0,0,4
4348,4349,2258,156,19,212,77,1110,238,238,121,...,0,0,0,0,0,0,0,0,0,4
12724,12725,2213,248,5,0,0,1170,209,244,173,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4495,4496,2169,53,2,30,-13,1233,221,234,150,...,0,0,0,0,0,0,0,0,0,4
4695,4696,2207,121,21,42,14,976,250,218,85,...,0,0,0,0,0,0,0,0,0,4
3948,3949,2131,271,15,60,7,1063,179,244,204,...,0,0,0,0,0,0,0,0,0,4
12584,12585,2115,289,11,30,0,1179,190,239,189,...,0,0,0,0,0,0,0,0,0,4


In [5]:
# list of output classes to sample for bootstrapping
cover_types = [1, 2, 3, 4, 5, 6, 7]

# relative weights of each class to represent in bootstrapped sample
weights = [4, 5, 1, 1, 1, 1, 1]

# create series to hold the probability weights for each sample
weight_series = pd.Series(range(1, len(data)+2))

# loop over each weight and
# assign to appropriate place in series
for weight, cover_type in zip(weights, cover_types):
    
    # assign appropriate weight
    weight_series[data.loc[data.Cover_Type == cover_type]["Id"]] = weight

In [24]:
# create bootstrapped data
bootstrapped_data = data.sample(frac = 3, replace = True, 
                                weights = weight_series, random_state = 1)

bootstrapped_data.reset_index(drop = True, inplace = True)

In [25]:
bootstrapped_data

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,6805,2510,309,20,301,96,127,163,223,199,...,0,0,0,0,0,0,0,0,0,3
1,8260,3151,60,8,0,0,1812,226,224,132,...,0,0,0,0,0,0,0,0,0,1
2,14260,3108,246,11,309,-30,2588,197,249,190,...,1,0,0,0,0,0,0,0,0,1
3,3315,2932,177,26,150,24,277,222,244,138,...,0,0,0,0,0,0,0,0,0,5
4,13968,3372,180,5,30,0,2768,222,243,156,...,0,0,0,0,0,0,1,0,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45355,10736,2483,139,22,0,0,616,247,227,96,...,0,0,0,0,0,0,0,0,0,6
45356,10945,2682,145,10,342,152,2329,234,239,136,...,0,0,0,0,0,0,0,0,0,3
45357,2509,2792,81,9,0,0,658,232,225,125,...,0,0,0,0,0,0,0,0,0,5
45358,9738,3232,286,19,509,132,3475,164,236,210,...,0,1,0,0,0,0,0,0,0,2


In [26]:
# outputs to csv file
bootstrapped_data.to_csv("bootstrapped_data.csv")