# Start

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training feature table and the training label table; both are
# indexed by building_id so they can later be joined on that key.
train_values, train_labels = (
    pd.read_csv(csv_name, index_col='building_id')
    for csv_name in ('train_values.csv', 'train_labels.csv')
)

In [None]:
# Show the dtype of every feature column (as the cell's displayed output).
train_values.dtypes

In [None]:
# Count how many buildings fall into each damage grade, order the counts by
# grade value, and draw them as a bar chart.
grade_counts = train_labels['damage_grade'].value_counts().sort_index()
grade_counts.plot.bar(title="Number of Buildings with Each Damage Grade")

# Feature Selection

In [None]:
# Hand-picked feature columns used for the rest of the notebook.
selected_features = [
    'foundation_type',
    'area_percentage',
    'height_percentage',
    'count_floors_pre_eq',
    'land_surface_condition',
    'has_superstructure_cement_mortar_stone',
]

# Keep only the selected columns of the training features.
train_values_subset = train_values.loc[:, selected_features]

# Attach the labels (joined on the shared index) and draw pairwise plots of
# the selected features, coloured by damage_grade.
labelled_subset = train_values_subset.join(train_labels)
sns.pairplot(data=labelled_subset, hue='damage_grade')

In [None]:
# One-hot encode the selected features with get_dummies, replacing the
# previous subset, then display the encoded frame.
train_values_subset = pd.get_dummies(data=train_values_subset)
train_values_subset

# Model Evaluation

In [None]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import RandomForestClassifier

# for combining the preprocess with model training
from sklearn.pipeline import make_pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Two-step pipeline: standardize the features, then fit a random forest.
# The forest's random_state is fixed so repeated runs use the same seed.
scaler = StandardScaler()
forest = RandomForestClassifier(random_state=2018)
pipe = make_pipeline(scaler, forest)
pipe

In [None]:
# Hyperparameter grid for the search. Keys follow the pipeline convention
# "<step name>__<parameter name>"; the step name is the one make_pipeline
# assigns to the random forest step.
param_grid = {
    'randomforestclassifier__n_estimators': [50, 100],
    'randomforestclassifier__min_samples_leaf': [1, 5],
}

# Cross-validated (cv=5) grid search over every combination in param_grid.
gs = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

In [None]:
# Fitting works on this frame because pd.get_dummies() above expanded the
# categorical columns into {0,1} indicator columns.
# The label frame is flattened to a 1-D array before being passed as y.
y_train = train_labels.to_numpy().ravel()
gs.fit(train_values_subset, y_train)

In [None]:
# Show the hyperparameter combination chosen by the grid search.
gs.best_params_

In [None]:
from sklearn.metrics import f1_score

# Predict on the same rows the search was fitted on, so this is an in-sample
# score rather than a held-out estimate.
in_sample_preds = gs.predict(train_values_subset)

# Micro-averaged F1 between the true labels and the in-sample predictions;
# a perfect score is 1.
f1_score(y_true=train_labels, y_pred=in_sample_preds, average='micro')

# Making Predictions

In [None]:
# Read the test feature CSV, indexed by building_id like the training tables.
test_values = pd.read_csv('test_values.csv', index_col='building_id')

In [None]:
# Use the same selected features on test_values as on train_values.
#
# get_dummies() only creates indicator columns for the categories that occur
# in the frame it is given, so encoding the test set on its own can produce a
# different set (or order) of columns than the training encoding. Re-index to
# the training columns so the fitted pipeline receives exactly the features it
# was trained on: indicator columns missing from the test encoding are added
# and filled with 0, and columns that were not in the training encoding are
# dropped.
test_values_subset = (
    pd.get_dummies(test_values[selected_features])
      .reindex(columns=train_values_subset.columns, fill_value=0)
)

In [None]:
# Predict a label for every row of the encoded test features using the
# fitted grid search object.
predictions = gs.predict(test_values_subset)

In [None]:
# Read the submission template, indexed by building_id; its index and columns
# are reused below to shape the submission frame.
submission_format = pd.read_csv('submission_format.csv', index_col='building_id')

In [None]:
# Build the submission frame in the layout of submission_format.
#
# The predictions are first labelled with the building_ids of the test rows
# they were computed from, and only then re-ordered to the submission
# template's index. Pairing predictions with submission_format.index purely by
# position would silently attach predictions to the wrong buildings if the two
# CSVs list buildings in a different order.
my_submission = (
    pd.DataFrame(data=predictions,
                 columns=submission_format.columns,
                 index=test_values_subset.index)
      .reindex(submission_format.index)
)

# reindex() leaves NaN for any building_id that has no prediction; fail loudly
# instead of writing an incomplete submission file.
if my_submission.isna().any().any():
    raise ValueError("submission_format contains building_ids with no prediction")

In [None]:
# Preview the first rows of the submission frame.
my_submission.head()

In [None]:
# Create the CSV file that will be submitted to DrivenData.
# NOTE(review): the file name says "lgb", but the model fitted in this
# notebook is a random forest pipeline — confirm the intended name.
my_submission.to_csv('submission_lgb1.csv')

In [None]:
# Preview the first lines of the submission files with the shell `head` command.
# The numbers after `#` look like scores recorded for each submission — TODO confirm.
# NOTE(review): submission_lgb2.csv is not written anywhere in the visible
# notebook; confirm it exists before relying on the second line.
!head submission_lgb1.csv #0.6368
!head submission_lgb2.csv #0.7426