In [1]:
import pandas as pd
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [148]:
train = pd.read_csv("train_values.csv")
train_labels = pd.read_csv("train_labels.csv")
test = pd.read_csv("test_values.csv")

In [149]:
train.columns, train_labels.columns

(Index(['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
        'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
        'land_surface_condition', 'foundation_type', 'roof_type',
        'ground_floor_type', 'other_floor_type', 'position',
        'plan_configuration', 'has_superstructure_adobe_mud',
        'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
        'has_superstructure_cement_mortar_stone',
        'has_superstructure_mud_mortar_brick',
        'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
        'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
        'has_superstructure_rc_engineered', 'has_superstructure_other',
        'legal_ownership_status', 'count_families', 'has_secondary_use',
        'has_secondary_use_agriculture', 'has_secondary_use_hotel',
        'has_secondary_use_rental', 'has_secondary_use_institution',
        'has_secondary_use_school', 'has_

In [150]:
train.shape, test.shape

((260601, 39), (86868, 39))

In [151]:
# one hot encode column

def one_hot(df, col):
    unique = df[col].unique()
    keys = {}
    for i, k in enumerate(unique):
        code = np.zeros(len(unique))
        code[i] = 1
        keys[k] = code

    hot_col = []
    for i in range(len(df[col])):
        k = df[col].loc[i]
        hot_col.append(keys[k])

    return np.array(hot_col)


one_hot(train, "foundation_type")

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.]])

In [152]:
# apply encoding to multiple columns

def apply_one_hot(df, columns):
    for col in columns:
       new_col = one_hot(df, col)
       for i in range(new_col.shape[1]): # make each encoding its own column
           df[col+"_"+str(i)] = new_col[:,i]
       df.drop([col], inplace = True, axis=1)
    return df

In [153]:
# list of all columns to be encoded
cols_to_use = [
        'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
        'land_surface_condition', 'foundation_type', 'roof_type',
        'ground_floor_type', 'other_floor_type', 'position',
        'plan_configuration', 'has_superstructure_adobe_mud',
        'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
        'has_superstructure_cement_mortar_stone',
        'has_superstructure_mud_mortar_brick',
        'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
        'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
        'has_superstructure_rc_engineered', 'has_superstructure_other',
       ]
cols_to_update = ["land_surface_condition", "foundation_type", "roof_type", "ground_floor_type", "other_floor_type", "position", "plan_configuration"]

train = train[cols_to_use]
test = test[cols_to_use]

In [154]:
train_hot = apply_one_hot(train, cols_to_update)
test_hot =  apply_one_hot(test, cols_to_update)

In [141]:
train_hot.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,plan_configuration_4,plan_configuration_5,plan_configuration_6,plan_configuration_7,plan_configuration_8,plan_configuration_9,legal_ownership_status_0,legal_ownership_status_1,legal_ownership_status_2,legal_ownership_status_3
0,802906,6,487,12198,2,30,6,5,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,28830,8,900,2812,2,10,8,7,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,94947,21,363,8973,2,10,5,5,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,590882,22,418,10694,2,10,6,5,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,201944,11,131,1488,3,30,8,9,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [136]:
# sanity check all categories are present in train and test set
len(train_hot.columns) == len(test_hot.columns)

True

In [155]:
x = train_hot.values
x_test = test_hot.values

In [156]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import RandomForestClassifier

# for combining the preprocess with model training
from sklearn.pipeline import make_pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

In [157]:
pipe = make_pipeline(StandardScaler(), 
                     RandomForestClassifier(random_state=42))
pipe

In [158]:
param_grid = {'randomforestclassifier__n_estimators': [50, 100],
              'randomforestclassifier__min_samples_leaf': [1, 5]}
gs = GridSearchCV(pipe, param_grid, cv=5)

In [161]:
gs.fit(x, train_labels.damage_grade.values.ravel())

In [162]:
gs.best_params_


{'randomforestclassifier__min_samples_leaf': 5,
 'randomforestclassifier__n_estimators': 100}

In [163]:

from sklearn.metrics import f1_score

in_sample_preds = gs.predict(x)
f1_score(train_labels.damage_grade.values, in_sample_preds, average='micro')

0.6442991392972398

In [None]:
predictions = gs.predict(x_test)