### This notebook implements bootstrapping on the original training data, also described as sampling with replacement, in order to build a training data set with more examples of underrepresented samples.

In [1]:
import sklearn as sk
import pandas as pd
import numpy as np

In [2]:
# import training data from relative filepath
data = pd.read_csv("../train.csv")

# shuffles the data by taking a random sample without replacement
# and sampling 100% of the original dataframe
data = data.sample(frac = 1)

data.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
15026,15027,2428,138,12,90,34,503,239,236,125,...,0,0,0,0,0,0,0,0,0,3
14028,14029,3313,216,13,60,-5,2620,207,252,179,...,0,0,0,0,0,0,1,0,0,7
12550,12551,2221,150,33,170,101,1082,241,219,80,...,0,0,0,0,0,0,0,0,0,4
14793,14794,3477,10,18,376,200,1173,197,201,137,...,0,0,0,0,0,0,0,0,1,7
13134,13135,2400,13,16,277,130,841,203,206,137,...,0,0,0,0,0,0,0,0,0,6


In [3]:
# shows number of entries of each type in the training data
data.groupby("Cover_Type").size()

Cover_Type
1    2160
2    2160
3    2160
4    2160
5    2160
6    2160
7    2160
dtype: int64

In [4]:
# double checks the above numbers by displaying data where cover type is 4
data.loc[data.Cover_Type == 4]

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
12550,12551,2221,150,33,170,101,1082,241,219,80,...,0,0,0,0,0,0,0,0,0,4
1997,1998,2003,355,6,0,0,127,211,230,157,...,0,0,0,0,0,0,0,0,0,4
4011,4012,2250,313,12,0,0,1155,189,231,182,...,0,0,0,0,0,0,0,0,0,4
4180,4181,2320,102,24,361,139,1110,252,200,62,...,0,0,0,0,0,0,0,0,0,4
4969,4970,2350,100,25,309,129,582,251,198,59,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5724,5725,2308,59,13,0,0,1006,229,213,115,...,0,0,0,0,0,0,0,0,0,4
3097,3098,2078,90,27,67,25,488,249,187,50,...,0,0,0,0,0,0,0,0,0,4
5112,5113,2203,337,11,0,0,1642,196,225,168,...,0,0,0,0,0,0,0,0,0,4
4300,4301,2258,132,27,127,75,1231,251,214,72,...,0,0,0,0,0,0,0,0,0,4


In [36]:
# list of output classes to sample for bootstrapping
cover_types = [1, 2, 3, 4, 5, 6, 7]

# relative weights of each class to represent in bootstrapped sample
weights = [1, 6, 1, 1, 1, 1, 1]

# create series to hold the probability weights for each sample
weight_series = pd.Series(range(1, len(data)+2))

# loop over each weight and
# assign to appropriate place in series
for weight, cover_type in zip(weights, cover_types):
    
    # assign appropriate weight
    weight_series[data.loc[data.Cover_Type == cover_type]["Id"]] = weight

In [37]:
# create bootstrapped data
bootstrapped_data = data.sample(frac = 1, replace = True, 
                                weights = weight_series, random_state = 1)

In [38]:
bootstrapped_data

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
14187,14188,2902,99,11,180,15,2112,238,225,117,...,0,0,0,0,0,0,0,0,0,2
3599,3600,2330,35,8,360,106,760,219,224,140,...,0,0,0,0,0,0,0,0,0,3
12550,12551,2221,150,33,170,101,1082,241,219,80,...,0,0,0,0,0,0,0,0,0,4
11499,11500,2931,36,7,272,27,4559,220,225,141,...,0,0,0,0,0,0,0,0,0,2
5824,5825,2381,38,16,216,91,932,218,203,115,...,0,0,0,0,0,0,0,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5802,5803,2343,43,12,0,0,930,222,213,123,...,0,0,0,0,0,0,0,0,0,4
14842,14843,3139,149,13,150,40,108,236,239,130,...,1,0,0,0,0,0,0,0,0,1
617,618,2662,138,23,30,8,2173,248,225,93,...,0,0,0,0,0,0,0,0,0,2
3062,3063,2092,106,19,0,0,450,248,214,86,...,0,0,0,0,0,0,0,0,0,4


In [39]:
# outputs to csv file
bootstrapped_data.to_csv("bootstrapped_data.csv")