### This notebook implements bootstrapping on the original training data, also described as sampling with replacement, in order to build a training data set with more examples of underrepresented samples.

In [1]:
import sklearn as sk
import pandas as pd
import numpy as np

In [2]:
# import training data from relative filepath
data = pd.read_csv("../train.csv")

# shuffles the data by taking a random sample without replacement
# and sampling 100% of the original dataframe
data = data.sample(frac = 1)

data.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
565,566,2932,54,16,283,119,3578,228,203,104,...,0,0,0,0,0,0,0,0,0,2
9845,9846,3515,158,27,120,-37,3221,237,232,106,...,0,0,0,1,0,0,0,0,0,7
12200,12201,2725,353,23,42,9,1116,174,193,152,...,0,1,0,0,0,0,0,0,0,2
1129,1130,3080,315,16,150,22,4795,176,226,189,...,0,0,0,0,0,0,0,0,0,1
15020,15021,2483,358,18,180,55,842,191,205,150,...,0,0,0,0,0,0,0,0,0,3


In [3]:
# shows number of entries of each type in the training data
data.groupby("Cover_Type").size()

Cover_Type
1    2160
2    2160
3    2160
4    2160
5    2160
6    2160
7    2160
dtype: int64

In [4]:
# double checks the above numbers by displaying data where cover type is 4
data.loc[data.Cover_Type == 4]

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
3767,3768,2180,141,34,108,60,1116,246,209,61,...,0,0,0,0,0,0,0,0,0,4
13017,13018,2220,63,17,85,13,684,233,204,98,...,0,0,0,0,0,0,0,0,0,4
3052,3053,2198,95,29,210,147,499,251,182,38,...,0,0,0,0,0,0,0,0,0,4
12714,12715,2297,183,20,319,116,1190,221,249,151,...,0,0,0,0,0,0,0,0,0,4
3513,3514,2296,234,29,319,176,1041,159,250,216,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5368,5369,2358,130,29,212,117,255,252,209,62,...,0,0,0,0,0,0,0,0,0,4
3339,3340,2232,120,32,234,132,799,254,194,40,...,0,0,0,0,0,0,0,0,0,4
4030,4031,2165,309,19,0,0,1138,164,224,198,...,0,0,0,0,0,0,0,0,0,4
12765,12766,2270,137,25,150,87,1211,249,221,84,...,0,0,0,0,0,0,0,0,0,4


In [5]:
# list of output classes to sample for bootstrapping
cover_types = [1, 2, 3, 4, 5, 6, 7]

# relative weights of each class to represent in bootstrapped sample
weights = [4, 5, 1, 1, 1, 1, 1]

# create series to hold the probability weights for each sample
weight_series = pd.Series(range(1, len(data)+2))

# loop over each weight and
# assign to appropriate place in series
for weight, cover_type in zip(weights, cover_types):
    
    # assign appropriate weight
    weight_series[data.loc[data.Cover_Type == cover_type]["Id"]] = weight

In [6]:
# create bootstrapped data
bootstrapped_data = data.sample(frac = 3, replace = True, 
                                weights = weight_series, random_state = 1)

bootstrapped_data.reset_index(drop = True, inplace = True)

In [7]:
bootstrapped_data

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,14719,2893,240,19,42,10,2916,182,252,204,...,0,0,0,0,0,0,0,0,0,1
1,10416,3409,345,27,366,164,942,158,188,162,...,0,1,0,0,0,0,0,0,0,7
2,566,2932,54,16,283,119,3578,228,203,104,...,0,0,0,0,0,0,0,0,0,2
3,9857,3277,333,9,234,32,450,200,229,168,...,1,0,0,0,0,0,0,0,0,2
4,1475,2741,63,31,108,-12,124,229,160,40,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45355,4759,3283,92,8,124,18,3670,233,228,128,...,0,0,0,0,0,0,1,0,0,7
45356,9321,3007,152,9,60,-1,384,232,240,140,...,0,0,0,0,0,0,0,0,0,1
45357,537,2932,18,2,485,13,4326,217,234,154,...,0,0,0,0,0,0,0,0,0,2
45358,355,3008,281,10,780,110,5864,194,241,187,...,0,0,0,0,0,0,0,0,0,2


In [8]:
# outputs to csv file
bootstrapped_data.to_csv("bootstrapped_data.csv", index = False)