# Start

In [74]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

In [75]:
# Load the raw training features, training labels and test features.
# Each table is indexed by its `building_id` column.
train_values = pd.read_csv('csv_original/train_values.csv', index_col='building_id')
train_labels = pd.read_csv('csv_original/train_labels.csv', index_col='building_id')
test_values = pd.read_csv('csv_original/test_values.csv', index_col='building_id')

In [76]:
# Split the feature columns by dtype.
# - int_columns: every integer-typed column. np.integer matches any integer
#   width, whereas the builtin `int` resolves to a platform-dependent width.
# - cat_columns: the string (object) columns holding categorical levels.
int_columns = train_values.select_dtypes(include=[np.integer]).columns
cat_columns = train_values.select_dtypes(include=['object']).columns

# Positional indices of the categorical columns, derived from cat_columns so
# the two always describe the same set of columns (used later as CatBoost's
# cat_features argument).
cat_indices = np.flatnonzero(train_values.columns.isin(cat_columns))

print(int_columns)
print(cat_columns)

Index(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone',
       'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'count_families', 'has_secondary_use', 'has_secondary_use_agriculture',
       'has_secondary_use_hotel', 'has_secondary_use_rental',
       'has_secondary_use_institution', 'has_secondary_use_school',
       'has_secondary_use_industry', 'has_secondary_use_health_post',
       'has_secondary_use_gov_office', 'has_secondary_use_use_police',
       'has_secondary_use_other'],
      dtype='object')
Index(['land_surf

## Converting Category to Integer Codes

In [77]:
# Cast every string-valued column to pandas' categorical dtype in one pass.
train_values = train_values.astype({name: 'category' for name in cat_columns})

In [78]:
# Replace each categorical column with its integer category codes
# (label encoding rather than one-hot encoding).
for name in cat_columns:
    train_values[name] = train_values[name].cat.codes
train_values[cat_columns]

Unnamed: 0_level_0,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,legal_ownership_status
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
802906,2,2,0,0,1,3,2,2
28830,1,2,0,3,1,2,2,2
94947,2,2,0,0,3,3,2,2
590882,2,2,0,0,3,2,2,2
201944,2,2,0,0,3,2,2,2
...,...,...,...,...,...,...,...,...
688636,0,2,0,0,0,2,7,2
669485,2,2,0,0,1,2,2,2
602512,2,2,1,0,1,2,2,2
151409,2,2,2,2,2,0,2,2


## Converting Category to One-Hot Encoding

In [None]:
# One-hot encode the categorical columns and join the indicator columns onto
# the features. pd.concat returns a new frame, so its result has to be bound
# to a name. It is kept separate from train_values so the classifiers further
# down keep training on the same columns that test_values provides.
# `columns=` forces encoding of integer-coded categoricals and prefixes every
# indicator with its source column name, which keeps the new names unique.
one_hot = pd.get_dummies(train_values[cat_columns], columns=list(cat_columns))
train_values_onehot = pd.concat([train_values, one_hot], axis=1)

In [None]:
# The original categorical columns are still present in train_values.
train_values[cat_columns]

In [None]:
# Full feature frame: what CatBoost will use, with cat_indices marking the
# categorical columns.
train_values

In [None]:
# Integer-only subset of the features: what LightGBM and XGBoost will use.
train_values_int = train_values.loc[:, int_columns]
train_values_int

## Scaling (skip scaling for the new method)

In [79]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# NOTE(review): fit_transform returns the scaled values as a new array. The
# return value is discarded here, so train_values itself stays unscaled (the
# frame displayed below still shows the raw integers). That matches the
# "skip scaling" note in the section heading — confirm this is intended.
scaler.fit_transform(train_values)
train_values

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,2,2,0,...,0,0,0,0,0,0,0,0,0,0
28830,8,900,2812,2,10,8,7,1,2,0,...,0,0,0,0,0,0,0,0,0,0
94947,21,363,8973,2,10,5,5,2,2,0,...,0,0,0,0,0,0,0,0,0,0
590882,22,418,10694,2,10,6,5,2,2,0,...,0,0,0,0,0,0,0,0,0,0
201944,11,131,1488,3,30,8,9,2,2,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688636,25,1335,1621,1,55,6,3,0,2,0,...,0,0,0,0,0,0,0,0,0,0
669485,17,715,2060,2,0,6,5,2,2,0,...,0,0,0,0,0,0,0,0,0,0
602512,17,51,8163,3,55,6,7,2,2,1,...,0,0,0,0,0,0,0,0,0,0
151409,26,39,1851,2,10,14,6,2,2,2,...,0,0,0,0,0,0,0,0,0,0


## 3 Classifiers

In [80]:
# Map the damage grades [1, 2, 3] onto [0, 1, 2] for the classifiers. The
# fitted encoder `le` is reused later to inverse_transform the predictions.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# The labels are loaded as a one-column table; flatten them to 1-D so the
# encoder does not have to coerce a column vector (which triggers a
# DataConversionWarning).
train_labels = le.fit_transform(np.ravel(train_labels))
train_labels

  y = column_or_1d(y, warn=True)


array([2, 1, 2, ..., 2, 1, 2])

In [81]:
# Three gradient-boosting classifiers sharing the same depth and tree count.
shared_params = {'max_depth': 10, 'n_estimators': 200}
clf1 = xgb.XGBClassifier(**shared_params)
clf2 = lgb.LGBMClassifier(**shared_params)  # lgb2
clf3 = cb.CatBoostClassifier(**shared_params)  # cbc2

In [82]:
# TODO: wrap in a pipeline that selects only the integer columns of
# train_values. For now XGBoost is fitted on the full feature frame.
clf1.fit(train_values, train_labels)

In [83]:
# TODO: wrap in a pipeline that selects only the integer columns of
# train_values. For now LightGBM is fitted on the full feature frame.
clf2.fit(train_values, train_labels)

In [84]:
# Fit CatBoost on the full feature frame; cat_indices names the columns it
# should treat as categorical.
clf3.fit(train_values, train_labels, cat_features=cat_indices)

Learning rate set to 0.409419
0:	learn: 0.8961130	total: 734ms	remaining: 2m 26s
1:	learn: 0.8147929	total: 1.63s	remaining: 2m 41s
2:	learn: 0.7714987	total: 2.49s	remaining: 2m 43s
3:	learn: 0.7444884	total: 3.5s	remaining: 2m 51s
4:	learn: 0.7274157	total: 4.39s	remaining: 2m 51s
5:	learn: 0.7145754	total: 5.29s	remaining: 2m 50s
6:	learn: 0.7046370	total: 6.09s	remaining: 2m 48s
7:	learn: 0.6989232	total: 6.99s	remaining: 2m 47s
8:	learn: 0.6948007	total: 7.94s	remaining: 2m 48s
9:	learn: 0.6914349	total: 8.77s	remaining: 2m 46s
10:	learn: 0.6858429	total: 9.49s	remaining: 2m 43s
11:	learn: 0.6819614	total: 10.4s	remaining: 2m 42s
12:	learn: 0.6789370	total: 11.1s	remaining: 2m 39s
13:	learn: 0.6762138	total: 12s	remaining: 2m 39s
14:	learn: 0.6742180	total: 12.9s	remaining: 2m 39s
15:	learn: 0.6718933	total: 13.9s	remaining: 2m 39s
16:	learn: 0.6687387	total: 14.6s	remaining: 2m 36s
17:	learn: 0.6670812	total: 15.5s	remaining: 2m 37s
18:	learn: 0.6651494	total: 16.4s	remaining: 2m

<catboost.core.CatBoostClassifier at 0x137acc970>

In [85]:
from mlxtend.classifier import EnsembleVoteClassifier
# Equal-weight voting ensemble over the three classifiers built above.
# NOTE(review): fit_base_estimators=False is presumably meant to reuse the
# already-fitted clf1-clf3 instead of refitting them — verify against the
# installed mlxtend version.
eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], weights=[1,1,1], fit_base_estimators=False)
# Alternative kept for reference: use column-selecting pipelines for the first two models.
# eclf = EnsembleVoteClassifier(clfs=[pipe1, pipe2, clf3], weights=[1,1,1], fit_base_estimators=False)
eclf.fit(train_values, train_labels)



In [87]:
# XGBoost predictions on the training features, decoded back to the
# original grade values through the label encoder.
preds_clf1 = le.inverse_transform(clf1.predict(train_values))
preds_clf1

array([3, 2, 3, ..., 3, 2, 3])

In [88]:
# LightGBM predictions on the training features, decoded back to the
# original grade values through the label encoder.
preds_clf2 = le.inverse_transform(clf2.predict(train_values))
preds_clf2

array([2, 2, 3, ..., 3, 2, 3])

In [89]:
# CatBoost predictions on the training features, decoded back to the
# original grade values through the label encoder.
# The raw predictions are flattened to 1-D first: handing the encoder a column
# vector makes it emit a DataConversionWarning (seen only for this model).
preds_clf3 = le.inverse_transform(np.ravel(clf3.predict(train_values)))
preds_clf3

  y = column_or_1d(y, warn=True)


array([3, 2, 3, ..., 3, 2, 3])

In [90]:
# Voting-ensemble predictions on the training features, decoded back to the
# original grade values through the label encoder.
preds_eclf = le.inverse_transform(eclf.predict(train_values))
preds_eclf

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([3, 2, 3, ..., 3, 2, 3])

In [91]:
# Restore train_labels to the original grade values by reversing the label
# encoding, so they are comparable with the decoded predictions.
# NOTE(review): this overwrites the encoded labels under the same name, so
# re-running the cell would feed already-decoded labels into inverse_transform.
train_labels = le.inverse_transform(train_labels)
train_labels

array([3, 2, 3, ..., 3, 2, 3])

In [92]:
from sklearn.metrics import f1_score
# Micro-averaged F1 of each model against the true grades, printed in the
# order clf1, clf2, clf3, eclf. The predictions were made on train_values,
# so these are in-sample scores.
# Recorded run: clf1 0.85786, clf2 0.73362, clf3 0.77006, eclf 0.78572
for model_preds in (preds_clf1, preds_clf2, preds_clf3, preds_eclf):
    print(f1_score(train_labels, model_preds, average='micro'))

0.8578631701336525
0.7336234319899003
0.7700622791163503
0.785726071657438


## Submission (File Pre-Processing)

In [93]:
# Read the test features from CSV, indexed by building_id, so preprocessing
# starts from the raw file.
test_values = pd.read_csv('csv_original/test_values.csv', index_col='building_id')

In [96]:
# Cast the test categoricals using the TRAINING category levels. Casting the
# test columns on their own numbers the levels by whatever values occur in the
# test file, so a level missing from (or only present in) the test set would
# shift the integer codes away from the ones the models were trained on.
# train_values has already been overwritten with codes, so the levels are
# re-read from the raw training file.
train_levels = pd.read_csv('csv_original/train_values.csv', usecols=list(cat_columns))
for col in cat_columns:
    levels = train_levels[col].astype('category').cat.categories
    # Values never seen in training become NaN here (code -1 once encoded).
    test_values[col] = test_values[col].astype(pd.CategoricalDtype(categories=levels))

In [98]:
# Replace each categorical test column with its integer category codes
# (label encoding rather than one-hot encoding).
for name in cat_columns:
    test_values[name] = test_values[name].cat.codes
test_values[cat_columns]

Unnamed: 0_level_0,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,legal_ownership_status
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
300051,2,2,0,0,1,2,2,2
99355,2,2,0,0,1,2,2,2
890251,2,2,0,0,1,2,2,2
745817,2,2,2,2,0,3,2,2
421793,2,2,1,0,1,3,2,2
...,...,...,...,...,...,...,...,...
310028,2,2,1,0,1,3,2,3
663567,0,2,0,0,1,2,2,2
1049160,2,2,0,0,0,2,2,2
442785,2,2,0,0,1,2,2,0


In [99]:
# NOTE(review): a fresh scaler is fitted on the test data here, and the array
# returned by fit_transform is discarded, so test_values stays unscaled (the
# frame displayed below still shows the raw integers). If scaling is ever
# enabled, the scaler fitted on the training data should be reused instead.
scaler = MinMaxScaler()
scaler.fit_transform(test_values)
test_values

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,17,596,11307,3,20,7,6,2,2,0,...,0,0,0,0,0,0,0,0,0,0
99355,6,141,11987,2,25,13,5,2,2,0,...,1,0,0,0,0,0,0,0,0,0
890251,22,19,10044,2,5,4,5,2,2,0,...,0,0,0,0,0,0,0,0,0,0
745817,26,39,633,1,0,19,3,2,2,2,...,0,0,1,0,0,0,0,0,0,0
421793,17,289,7970,3,15,8,7,2,2,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310028,4,605,3623,3,70,20,6,2,2,1,...,1,0,0,0,0,0,0,0,0,0
663567,10,1407,11907,3,25,6,7,0,2,0,...,0,0,0,0,0,0,0,0,0,0
1049160,22,1136,7712,1,50,3,3,2,2,0,...,0,0,0,0,0,0,0,0,0,0
442785,6,1041,912,2,5,9,5,2,2,0,...,0,0,0,0,0,0,0,0,0,0


## Submission (Prediction)

In [100]:
# Voting-ensemble predictions for the test features, decoded back to the
# original grade values through the label encoder.
preds_test = le.inverse_transform(eclf.predict(test_values))
preds_test

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([3, 2, 2, ..., 2, 2, 2])

In [101]:
# Load the submission template; its index and columns are reused to shape the
# submission frame.
submission_format = pd.read_csv('csv_original/submission_format.csv', index_col='building_id')

In [102]:
# Build the submission frame: test predictions laid out with the template's
# column names and building_id index.
# NOTE(review): assumes the template rows are in the same order as
# test_values — confirm.
my_submission = pd.DataFrame(data=preds_test,
                             columns=submission_format.columns,
                             index=submission_format.index)

In [103]:
# Preview the first rows of the submission.
my_submission.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,2
745817,2
421793,3


In [104]:
# Write the CSV file that will be submitted to DrivenData.
# (An earlier run wrote to 'submission_cbc2.csv' instead.)
my_submission.to_csv('csv_brian/submission_mlxtend1.csv')

In [105]:
# Check the written file by printing its first lines with the shell `head`
# command.
!head csv_brian/submission_mlxtend1.csv

building_id,damage_grade
300051,3
99355,2
890251,2
745817,2
421793,3
871976,2
691228,1
896100,3
343471,2
