# Start

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the training features and the matching labels. Both tables are
# indexed by building_id so they can be joined / aligned row-by-row later.
train_values = pd.read_csv(
    'csv_original/train_values.csv',
    index_col='building_id',
)
train_labels = pd.read_csv(
    'csv_original/train_labels.csv',
    index_col='building_id',
)

# Feature Selection (Original)

In [None]:
# seaborn is required for the pair plot below but is not imported by any
# earlier cell, so on a fresh kernel this cell failed with
# "NameError: name 'sns' is not defined".
import seaborn as sns

# Hand-picked columns used for the first exploratory look at the data.
selected_features = [
    'foundation_type',
    'area_percentage',
    'height_percentage',
    'count_floors_pre_eq',
    'land_surface_condition',
    'has_superstructure_cement_mortar_stone',
]

# Restrict the training features to the selected columns.
train_values_subset = train_values[selected_features]

# Pairwise plots of the selected features; the labels are joined on the
# shared building_id index so each point can be coloured by damage_grade.
sns.pairplot(train_values_subset.join(train_labels), hue='damage_grade')

In [None]:
# One-hot encode the categorical columns of the selected subset
# (pd.get_dummies), then show the encoded frame as the cell output.
train_values_subset = pd.get_dummies(data=train_values_subset)
train_values_subset

# Extra-Trees Ensemble (ET)

In [None]:
# TRY 2 (disabled): 4 features from Paper 1
# selected_features = ['area_percentage',
#                      'height_percentage',
#                      'count_floors_pre_eq',
#                      'age']
# train_values_subset = train_values[selected_features]

# Active setting (TRY 3, "all features"): one-hot encode every training
# column. This overwrites the train_values_subset built in the
# feature-selection section above.
train_values_subset = pd.get_dummies(train_values)

In [None]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import ExtraTreesClassifier

# for combining the preprocess with model training
from sklearn.pipeline import make_pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Two-step pipeline: standardise the features, then classify with Extra-Trees.
# random_state=2018 was deliberately removed, so the classifier is unseeded.
pipe = make_pipeline(
    StandardScaler(),
    ExtraTreesClassifier(),
)
pipe

In [None]:
# TRY 1 (disabled): tuning hyperparameters, same as the RF example.
# The ET paper uses three (n_estimators, max_features, min_samples_split)
# but says there was no significant improvement in accuracy.
# For Nepal, hyperparameters are not so different but based on other earthquakes.
# param_grid = {'extratreesclassifier__n_estimators': [50, 100], #more trees the better it seems
#               'extratreesclassifier__max_features' : [4, 6], #instead of using all the selected_features, let the estimator decide max_features
#               'extratreesclassifier__min_samples_leaf': [1, 5]} #4-6 sample size per split seems ideal
# gs = GridSearchCV(pipe, param_grid, cv=5) #default 5-fold cross validation

# TRY 2: using Shivam's hyperparameter search, 4 selected features from Paper 1
# TRY 3: using Shivam's hyperparameter search, all features
#
# Keys follow the '<pipeline step>__<parameter>' convention.
# NOTE(review): each list is a set of candidate values, not a range, so only
# the two listed values per hyperparameter are tried (2 x 2 x 2 combinations).
param_grid = {
    'extratreesclassifier__n_estimators': [1, 125],
    'extratreesclassifier__max_depth': [1, 15],
    'extratreesclassifier__min_samples_leaf': [1, 2],
}

# Exhaustive search over param_grid with 5-fold cross validation.
gs = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

In [None]:
# List every parameter name the search object exposes; useful for spelling
# the '<step>__<param>' keys of param_grid correctly.
gs.get_params(deep=True).keys()

In [None]:
# Reminder: when trying different features, redo the feature selection
# (including get_dummies) and reassign train_values_subset before this cell.
# The label frame is flattened to a 1-D array before fitting.
gs.fit(X=train_values_subset, y=np.ravel(train_labels.values))

In [None]:
# Show the hyperparameter combination that scored best in the grid search.
gs.best_params_
# Results recorded by the author for each experiment:
#BLOG: min_samples_leaf = 5, n_estimators = 100
#BASE: min_samples_leaf = 1 and n_estimators = 50
#TRY1: leaf = 5, estimators = 100, features = 6
#TRY2: leaf = 2, estimators = 125, depth = 15
#TRY3: leaf = 2, estimators = 125, depth = 15

In [None]:
from sklearn.metrics import f1_score

# Predict on the same rows the search was fitted on, so the score below is an
# in-sample figure, not a hold-out estimate.
in_sample_preds = gs.predict(X=train_values_subset)

# Micro-averaged F1 (perfect score = 1).
f1_score(y_true=train_labels, y_pred=in_sample_preds, average='micro')

# Scores recorded by the author:
#BLOG: 0.5894183 for RF
#BASE: 0.59438 for ET
#TRY1: 0.5840538 with max_features
#TRY2: 0.5804736 with Shivam's hyperparameters and 4 selected features from Paper 1
#TRY3: 0.64818247 with Shivam's hyperparameters but all features

# LightGBM lgb1, lgb2

In [None]:
import lightgbm as lgb #pip3 install lightgbm && brew install libomp

In [None]:
# One-hot encode every categorical column of the full training feature table.
train_values_df = pd.get_dummies(data=train_values)

In [None]:
# Wrap the encoded features and their labels in a LightGBM Dataset, the
# container type lgb.train / lgb.cv operate on.
train_data = lgb.Dataset(data=train_values_df, label=train_labels)

In [None]:
# LightGBM training parameters (lgb1 configuration is active).
param = dict(
    objective='multiclass',
    num_class=4,       # since we have values 1, 2, and 3 (3 will only allow 0, 1, 2)
    max_leaves=2,      # lgb1
    max_depth=15,      # lgb1
    n_estimators=125,  # lgb1
    # lgb2 alternative:
    #   max_leaves=131072 (no limit), max_depth=10, n_estimators=200
)

In [None]:
# Fit a LightGBM booster on the full training Dataset with the parameters above.
bst = lgb.train(params=param, train_set=train_data)

In [None]:
# Persist the trained booster to a text file in the working directory.
bst.save_model(filename='model.txt')

In [None]:
# 5-fold cross-validation of the CURRENT `param` settings.
# Note: lgb.cv does not search for hyperparameters; it returns the
# per-iteration evaluation history for the given parameters, which is
# displayed below.
eval_hist = lgb.cv(param, train_data, nfold=5)
eval_hist

In [None]:
# Run the booster on the training features. Each row of the result is
# reduced to a single class with argmax in the next cell.
ypred = bst.predict(data=train_values_df)
ypred

In [None]:
# Collapse each prediction row to the index of its largest entry, giving one
# class id per building (intended to be one of 1, 2, 3).
ypred_arg = list(map(np.argmax, ypred))
ypred_arg

In [None]:
# In-sample micro-averaged F1 of the LightGBM predictions (perfect score = 1).
from sklearn.metrics import f1_score

f1_score(y_true=train_labels, y_pred=ypred_arg, average='micro')
# Recorded results -> lgb1: 0.6387, lgb2: 0.81

In [None]:
# Read the test features, indexed by building_id like the training tables.
# The path now points into csv_original/, the directory the training CSVs
# and the later test-set load in this notebook are read from; the bare
# 'test_values.csv' only worked if a copy also sat in the working directory.
test_values = pd.read_csv('csv_original/test_values.csv', index_col='building_id')

In [None]:
# Encode the test set exactly like the training set.
# Alternative (hand-picked columns only):
#test_values_subset = pd.get_dummies(test_values[selected_features])
#
# get_dummies only creates columns for the categories present in the frame it
# is given, so the encoded test frame can differ from train_values_df in
# column set or order. Reindexing on the training columns guarantees the model
# receives the feature layout it was trained on: categories missing from the
# test set become all-zero columns and test-only categories are dropped.
test_values_subset = pd.get_dummies(test_values).reindex(
    columns=train_values_df.columns, fill_value=0
)

In [None]:
# Run the booster on the encoded test features; each row is reduced to one
# class with argmax in the next cell.
predictions = bst.predict(data=test_values_subset)
predictions

In [None]:
# Collapse each prediction row to the index of its largest entry, giving one
# class id per test building (intended to be one of 1, 2, 3).
predictions_arg = list(map(np.argmax, predictions))
predictions_arg

In [None]:
# Template for the submission file: supplies the building_id index and the
# column name(s) the predictions must be written under. Read from
# csv_original/, consistent with the other input files of this notebook.
submission_format = pd.read_csv('csv_original/submission_format.csv', index_col='building_id')

In [None]:
# Put the predicted classes into a frame shaped like the submission template
# (same building_id index, same column names).
my_submission = pd.DataFrame(
    index=submission_format.index,
    columns=submission_format.columns,
    data=predictions_arg,
)

In [None]:
# Quick visual check of the first five submission rows.
my_submission.head(n=5)

In [None]:
# Write the submission file for DrivenData (lgb1 run) to the working directory.
my_submission.to_csv(path_or_buf='submission_lgb1.csv')

In [None]:
# Preview the first lines of the submission files via the shell.
# The number after each '#' is a score noted by the author (presumably the
# DrivenData leaderboard score -- TODO confirm).
# NOTE(review): with the parameters currently active only submission_lgb1.csv
# is written by this notebook; submission_lgb2.csv must already exist from an
# earlier run with the lgb2 settings.
!head submission_lgb1.csv #0.6368
!head submission_lgb2.csv #0.7426

# LightGBM (with GridSearch) lgb3

In [None]:
# One-hot encode the full training feature table and report how many
# feature columns the encoding produced.
train_values_df = pd.get_dummies(data=train_values)
train_values_df.shape[1]

In [None]:
# convert dataframes (or numpy array) to lgb dataset
# train_data = lgb.Dataset(train_values_df, label=train_labels)

In [None]:
import lightgbm as lgb #pip3 install lightgbm && brew install libomp

# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# for combining the preprocess with model training
from sklearn.pipeline import Pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Two-step pipeline: standardise the features, then a seeded LightGBM
# classifier. (max_leaves=0 was tried previously and is left disabled.)
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lgb', lgb.LGBMClassifier(random_state=2018, max_depth=10, n_estimators=200)),
])
pipe

In [None]:
# Grid for the LightGBM pipeline. Every candidate list is currently disabled,
# so the grid is empty and the search evaluates only the pipeline's own
# settings (max_depth=10, n_estimators=200; max_leaves=0 was considered).
param_grid = dict(
    # 'lgb__num_leaves'   : [2, 5, 10, 15, 20, 25, 31],  default=31, where > 1 (max_leaves is not a feature)
    # 'lgb__max_depth'    : [0, 2, 5, 10],               default=-1, where <=0 means no limit
    # 'lgb__n_estimators' : [100, 150, 200],             default=100
)

In [None]:
# 5-fold cross-validated search over param_grid, fitted on the one-hot
# encoded training features with the labels flattened to 1-D.
# (gbm.get_params lists the tunable parameter names if needed.)
gbm = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
gbm.fit(X=train_values_df, y=np.ravel(train_labels.values))

In [None]:
# Best parameter combination and its mean cross-validated score, one per line.
print(gbm.best_params_, gbm.best_score_, sep='\n')

In [None]:
from sklearn.metrics import f1_score

# In-sample check: predict on the rows the model was fitted on and compute
# the micro-averaged F1.
in_sample_preds = gbm.predict(X=train_values_df)
f1_score(y_true=train_labels, y_pred=in_sample_preds, average='micro')  # recorded: 0.733

In [None]:
# Read the test features, indexed by building_id like the training tables.
# The path now points into csv_original/, the directory the training CSVs
# and the later test-set load in this notebook are read from; the bare
# 'test_values.csv' only worked if a copy also sat in the working directory.
test_values = pd.read_csv('csv_original/test_values.csv', index_col='building_id')

In [None]:
# Encode the test set exactly like the training set.
# Alternative (hand-picked columns only):
#test_values_subset = pd.get_dummies(test_values[selected_features])
#
# The fitted pipeline expects the same columns, in the same order, as
# train_values_df. get_dummies only creates columns for categories present in
# the frame it is given, so the encoded test frame is reindexed on the
# training columns: categories missing from the test set become all-zero
# columns and test-only categories are dropped.
test_values_subset = pd.get_dummies(test_values).reindex(
    columns=train_values_df.columns, fill_value=0
)

In [None]:
# Predict the test set with the best estimator found by the grid search.
predictions = gbm.predict(X=test_values_subset)

In [None]:
# Put the predictions into a frame shaped like the submission template
# (same building_id index, same column names).
my_submission = pd.DataFrame(
    index=submission_format.index,
    columns=submission_format.columns,
    data=predictions,
)

In [None]:
# Quick visual check of the first five submission rows.
my_submission.head(n=5)

In [None]:
# Write the submission file for DrivenData (lgb3 run) to the working directory.
my_submission.to_csv(path_or_buf='submission_lgb3.csv')

In [None]:
# Preview the first lines of the file just written (shell command).
!head submission_lgb3.csv

# Catboost (with GridSearch) cbc1

In [None]:
# One-hot encode the full training feature table and report how many
# feature columns the encoding produced.
train_values_df = pd.get_dummies(data=train_values)
train_values_df.shape[1]

In [None]:
from catboost import CatBoostClassifier

# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# for combining the preprocess with model training
from sklearn.pipeline import Pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Two-step pipeline: standardise the features, then a seeded CatBoost
# classifier. (max_leaves=0 was tried previously and is left disabled.)
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('cbc', CatBoostClassifier(random_state=2018, max_depth=10, n_estimators=200)),
])
pipe

In [None]:
# Grid for the CatBoost pipeline. Every candidate list is currently disabled,
# so the grid is empty and the search evaluates only the pipeline's own settings.
param_grid = dict(
    # 'cbc__max_leaves'   : list(range(2, 31)),     default=31, not recommended to use above 64
    # 'cbc__max_depth'    : list(range(1, 10)),     default=6
    # 'cbc__n_estimators' : [100, 150, 200, 1000],  default=1000
)

In [None]:
# 5-fold cross-validated search over param_grid, fitted on the one-hot
# encoded training features with the labels flattened to 1-D.
cbc = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
cbc.fit(X=train_values_df, y=np.ravel(train_labels.values))

In [None]:
from sklearn.metrics import f1_score

# In-sample check: predict on the rows the model was fitted on and compute
# the micro-averaged F1.
in_sample_preds = cbc.predict(X=train_values_df)
f1_score(y_true=train_labels, y_pred=in_sample_preds, average='micro')  # recorded: 0.7774

In [None]:
# Read the test features, indexed by building_id like the training tables.
# The path points into csv_original/, the directory the other input files of
# this notebook are read from.
test_values = pd.read_csv('csv_original/test_values.csv', index_col='building_id')

# Alternative (hand-picked columns only):
#test_values_subset = pd.get_dummies(test_values[selected_features])
#
# Encode the test set and align it with the training layout: the fitted
# pipeline expects the same columns, in the same order, as train_values_df.
# Categories missing from the test set become all-zero columns and test-only
# categories are dropped.
test_values_subset = pd.get_dummies(test_values).reindex(
    columns=train_values_df.columns, fill_value=0
)

In [None]:
# Predict the test set with the CatBoost search result and put the
# predictions into a frame shaped like the submission template.
predictions = cbc.predict(X=test_values_subset)
my_submission = pd.DataFrame(
    index=submission_format.index,
    columns=submission_format.columns,
    data=predictions,
)

In [None]:
# Write the submission file for DrivenData (cbc1 run) to the working directory.
my_submission.to_csv(path_or_buf='submission_cbc1.csv')

In [None]:
# Preview the first lines of the file just written (shell command).
!head submission_cbc1.csv

# Catboost cbc2 / cbc3

In [None]:
import catboost as cb

In [None]:
# Positional indices of the non-numeric columns of train_values; these are
# handed to CatBoost as cat_features so it encodes them itself.
#
# The previous test `train_values.dtypes != int` depended on what the Python
# `int` type maps to in NumPy (int32 on Windows with NumPy < 2, where every
# int64 column would be reported as categorical) and also flagged float
# columns. Testing for "not numeric" selects the same string/object columns
# on every platform.
categorical_features_indices = np.where(
    [not pd.api.types.is_numeric_dtype(dtype) for dtype in train_values.dtypes]
)[0]
categorical_features_indices

In [38]:
# Create a CatBoost classifier with default settings; its hyperparameters
# are configured via set_params in the next cell.
model = cb.CatBoostClassifier()

In [39]:
# Earlier configuration, kept for reference:
#model.set_params(n_estimators=200, max_depth=10) #max_leaves no limit

# Active configuration.
model.set_params(
    n_estimators=325,
    max_depth=16,  # max is 16 in catboost
    eta=0.060000000000000005,
    random_state=42,
    colsample_bylevel=1.0,
    reg_lambda=1.4200000000000002,
)
# Author's notes on options that were not carried over:
# subsample = 0.8 -> bayesian doesn't support taken fraction option
# objective = "multiclass" loss not supported
# not used: tree_method/task_type, max_leaves, colsample_bytree, min_child_weight, gamma, reg_alpha, num_class/classes_count

<catboost.core.CatBoostClassifier at 0x123679a60>

In [40]:
# Fit CatBoost on the raw (not one-hot encoded) training table; the
# categorical columns are identified by position via cat_features.
model.fit(X=train_values, y=train_labels, cat_features=categorical_features_indices)

0:	learn: 1.0616970	total: 36.8s	remaining: 3h 18m 38s
1:	learn: 1.0275927	total: 1m 24s	remaining: 3h 48m 38s
2:	learn: 0.9998627	total: 1m 26s	remaining: 2h 34m 44s
3:	learn: 0.9728956	total: 2m 18s	remaining: 3h 5m 15s
4:	learn: 0.9485669	total: 3m 13s	remaining: 3h 26m 33s
5:	learn: 0.9274250	total: 3m 59s	remaining: 3h 32m 6s
6:	learn: 0.9081345	total: 4m 44s	remaining: 3h 35m 31s
7:	learn: 0.8905140	total: 5m 27s	remaining: 3h 36m
8:	learn: 0.8740088	total: 6m 8s	remaining: 3h 35m 31s
9:	learn: 0.8589200	total: 6m 51s	remaining: 3h 36m 7s
10:	learn: 0.8451202	total: 7m 28s	remaining: 3h 33m 14s
11:	learn: 0.8324369	total: 8m 7s	remaining: 3h 32m 8s
12:	learn: 0.8212526	total: 8m 49s	remaining: 3h 31m 45s
13:	learn: 0.8104305	total: 9m 25s	remaining: 3h 29m 19s
14:	learn: 0.8011117	total: 10m 2s	remaining: 3h 27m 23s
15:	learn: 0.7919241	total: 10m 38s	remaining: 3h 25m 28s
16:	learn: 0.7839507	total: 11m 12s	remaining: 3h 23m 7s
17:	learn: 0.7763554	total: 11m 45s	remaining: 3h 2

<catboost.core.CatBoostClassifier at 0x123679a60>

In [42]:
# Predict the class of every training row (used for the in-sample score).
predictions = model.predict(data=train_values)
predictions

array([[2],
       [2],
       [3],
       ...,
       [3],
       [2],
       [3]])

In [43]:
# In-sample micro-averaged F1 of the CatBoost predictions (perfect score = 1).
from sklearn.metrics import f1_score

f1_score(y_true=train_labels, y_pred=predictions, average='micro')
# Recorded results:
#cbc2: 0.77
#cbc3: 0.81

0.8092102486176186

In [45]:
# Read the test features, indexed by building_id like the training tables.
test_values = pd.read_csv(
    'csv_original/test_values.csv',
    index_col='building_id',
)

In [46]:
# Predict the class of every test row with the fitted CatBoost model.
predictions_test = model.predict(data=test_values)
predictions_test

array([[3],
       [2],
       [2],
       ...,
       [2],
       [2],
       [2]])

In [54]:
# Template for the submission file: supplies the building_id index and the
# column name(s) the predictions must be written under.
submission_format = pd.read_csv(
    'csv_original/submission_format.csv',
    index_col='building_id',
)

In [55]:
# Put the test predictions into a frame shaped like the submission template
# (same building_id index, same column names).
my_submission = pd.DataFrame(
    index=submission_format.index,
    columns=submission_format.columns,
    data=predictions_test,
)

In [56]:
# Quick visual check of the first five submission rows.
my_submission.head(n=5)

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,2
745817,2
421793,3


In [57]:
# Write the submission file for DrivenData (cbc3 run).
# Previous run, kept for reference:
#my_submission.to_csv('submission_cbc2.csv')
my_submission.to_csv(path_or_buf='csv_brian/submission_cbc3.csv')

In [58]:
# Preview the first lines of the cbc3 submission file (shell command).
# The cbc2 preview from the previous run is left disabled.
#!head submission_cbc2.csv
!head csv_brian/submission_cbc3.csv

building_id,damage_grade
300051,3
99355,2
890251,2
745817,2
421793,3
871976,2
691228,2
896100,3
343471,2
