### This notebook implements bootstrapping on the original training data, also described as sampling with replacement, in order to build a training data set with more examples of underrepresented samples.

In [1]:
import sklearn as sk
import pandas as pd
import numpy as np

In [2]:
# import training data from relative filepath
data = pd.read_csv("../train.csv")

# shuffles the data by taking a random sample without replacement
# and sampling 100% of the original dataframe
data = data.sample(frac = 1)

data.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
2434,2435,2796,130,16,60,9,630,244,229,109,...,0,0,0,0,0,0,0,0,0,5
1033,1034,3105,350,7,90,12,5245,208,228,159,...,0,0,0,0,0,0,0,0,0,1
11632,11633,2906,76,14,268,42,3037,236,214,106,...,0,0,0,0,0,0,0,0,0,2
6369,6370,2398,58,19,0,0,732,230,197,92,...,0,0,0,0,0,0,0,0,0,4
59,60,2489,42,6,162,13,810,221,227,141,...,0,0,0,0,0,0,0,0,0,5


In [3]:
# shows number of entries of each type in the training data
data.groupby("Cover_Type").size()

Cover_Type
1    2160
2    2160
3    2160
4    2160
5    2160
6    2160
7    2160
dtype: int64

In [4]:
# double checks the above numbers by displaying data where cover type is 4
data.loc[data.Cover_Type == 4]

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
6369,6370,2398,58,19,0,0,732,230,197,92,...,0,0,0,0,0,0,0,0,0,4
4969,4970,2350,100,25,309,129,582,251,198,59,...,0,0,0,0,0,0,0,0,0,4
3733,3734,2214,173,22,242,88,1176,228,244,137,...,0,0,0,0,0,0,0,0,0,4
3961,3962,2194,128,28,127,58,1307,252,209,64,...,0,0,0,0,0,0,0,0,0,4
2228,2229,2044,92,15,30,8,480,242,215,98,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12588,12589,2152,357,20,0,0,990,185,201,150,...,0,0,0,0,0,0,0,0,0,4
3153,3154,2352,0,5,201,58,942,213,231,156,...,0,0,0,0,0,0,0,0,0,4
12404,12405,2359,120,34,272,65,1045,254,189,33,...,0,0,0,0,0,0,0,0,0,4
12762,12763,2209,32,6,0,0,1230,219,226,144,...,0,0,0,0,0,0,0,0,0,4


In [15]:
# list of output classes to sample for bootstrapping
cover_types = [1, 2, 3, 4, 5, 6, 7]

# relative weights of each class to represent in bootstrapped sample
weights = [4, 5, 1, 1, 1, 1, 1]

# create series to hold the probability weights for each sample
weight_series = pd.Series(range(1, len(data)+2))

# loop over each weight and
# assign to appropriate place in series
for weight, cover_type in zip(weights, cover_types):
    
    # assign appropriate weight
    weight_series[data.loc[data.Cover_Type == cover_type]["Id"]] = weight

In [16]:
# create bootstrapped data
bootstrapped_data = data.sample(frac = 3, replace = True, 
                                weights = weight_series, random_state = 1)

bootstrapped_data.reset_index(drop = True, inplace = True)

In [17]:
bootstrapped_data

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,182,2755,320,4,30,-1,2890,209,236,165,...,0,0,0,0,0,0,0,0,0,2
1,1472,2747,56,30,67,-6,150,224,162,51,...,0,0,0,0,0,0,0,0,0,5
2,1034,3105,350,7,90,12,5245,208,228,159,...,0,0,0,0,0,0,0,0,0,1
3,3469,2305,119,18,458,11,1332,248,222,96,...,0,0,0,0,0,0,0,0,0,4
4,99,2562,59,3,0,0,1116,221,233,148,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45355,6264,2313,258,10,0,0,600,197,246,188,...,0,0,0,0,0,0,0,0,0,4
45356,8804,3190,354,14,323,53,2205,196,215,156,...,1,0,0,0,0,0,0,0,0,1
45357,4409,2170,328,16,30,2,1341,180,220,179,...,0,0,0,0,0,0,0,0,0,3
45358,6475,2777,36,9,525,43,180,220,221,136,...,0,0,0,0,0,0,0,0,0,5


In [18]:
# outputs to csv file
bootstrapped_data.to_csv("bootstrapped_data.csv", index = False)