# Richter's Predictor

Initial code is a copy of the example found here: http://drivendata.co/blog/richters-predictor-benchmark/

We'll then use an XGBoost model to get a better estimate, and also will look at engineering some features using the geocode.

The intention is to try to find a way to use the fact that some areas (geo locations) will have suffered more damage than others.  There seem to be too many level 3 geolocations for a tree-based algorithm to deal with effectively, but if we can somehow uncover information about the geolocation and encode it in a way that is easier for the tree to deal with then it may improve our scores

In [53]:
%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# for preprocessing the data
from sklearn.preprocessing import StandardScaler, LabelEncoder
from collections import defaultdict
from sklearn.model_selection import train_test_split

# the model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.utils import class_weight

# for combining the preprocess with model training
from sklearn.pipeline import make_pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

import itertools

In [2]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import xgboost as xgb

In [3]:
DATA_DIR = Path('.', 'data')

In [4]:
train_values = pd.read_csv(DATA_DIR / 'train_values.csv', index_col='building_id')
train_labels = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col='building_id')


In [5]:
test_values  = pd.read_csv(DATA_DIR / 'test_values.csv', index_col='building_id')

#### For some feature engineering we want to use the damage grades, so we'll join them here

In [6]:
train = train_values.join(train_labels)

#### For others we also want to include test values (eg. when encoding categorical) 

In [7]:
test_and_train_values = pd.concat([train_values,test_values])

In [8]:
train_labels.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
802906,3
28830,2
94947,3
590882,2
201944,3


In [9]:
train_values.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,0
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
590882,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
201944,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,0


In [10]:
test_and_train_values.dtypes

geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
count_floors_pre_eq                        int64
age                                        int64
area_percentage                            int64
height_percentage                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_cement_mortar_stone     int64
has_superstructure_mud_mortar_brick        int64
has_superstructure_cement_mortar_brick     int64
has_superstructure_t

## Data exploration

In [None]:
(train_labels.damage_grade
             .value_counts()
             .sort_index()
             .plot.bar(title="Number of Buildings with Each Damage Grade"))

In [None]:
selected_features = ['foundation_type', 
                     'area_percentage', 
                     'height_percentage',
                     'count_floors_pre_eq',
                     'land_surface_condition',
                     'has_superstructure_cement_mortar_stone']

train_values_subset = train_values[selected_features]

In [None]:
sns.pairplot(train_values_subset.join(train_labels), 
             hue='damage_grade')

In [None]:
secondary_uses = [
'has_secondary_use',
'has_secondary_use_agriculture',
'has_secondary_use_hotel',
'has_secondary_use_rental',
'has_secondary_use_institution',
'has_secondary_use_school',
'has_secondary_use_industry',
'has_secondary_use_health_post',
'has_secondary_use_gov_office',
'has_secondary_use_use_police',
'has_secondary_use_other'
]

structure = [
'has_superstructure_adobe_mud',
'has_superstructure_mud_mortar_stone',
'has_superstructure_stone_flag',
'has_superstructure_cement_mortar_stone',
'has_superstructure_mud_mortar_brick',
'has_superstructure_cement_mortar_brick',
'has_superstructure_timber',
'has_superstructure_bamboo',
'has_superstructure_rc_non_engineered',
'has_superstructure_rc_engineered',
'has_superstructure_other'
]



In [None]:
for use in secondary_uses:
    print(use, train[train[use]==1]['damage_grade'].mean())

In [None]:
for s in structure:
    print(s, train[train[s]==1]['damage_grade'].mean())

In [None]:
biggest_geo3 = train['geo_level_3_id'].value_counts().head(30).index.values

In [None]:
for location in biggest_geo3:
    print('Geo3 id:',location)
    for s in structure:
        s_filter = (train['geo_level_3_id'] == location) & (train[s] == 1)
        print(s, train.loc[s_filter]['damage_grade'].count(), train.loc[s_filter]['damage_grade'].mean())

### There are similarities in damage between the mortar types (mud/cement) and the reinforced concrete types (non-eng, engineered) so for the sake of our geoid indicator we'll group them

In [11]:

train['mud'] = train['has_superstructure_adobe_mud'] | train['has_superstructure_mud_mortar_stone'] | train['has_superstructure_mud_mortar_brick']
train['cement'] = train['has_superstructure_cement_mortar_stone'] | train['has_superstructure_cement_mortar_brick'] 
train['concrete'] = train['has_superstructure_rc_non_engineered'] | train['has_superstructure_rc_engineered'] 
train['natural'] = train['has_superstructure_timber'] | train['has_superstructure_bamboo'] 

test_and_train_values['mud'] = test_and_train_values['has_superstructure_adobe_mud'] | test_and_train_values['has_superstructure_mud_mortar_stone'] | test_and_train_values['has_superstructure_mud_mortar_brick']
test_and_train_values['cement'] = test_and_train_values['has_superstructure_cement_mortar_stone'] | test_and_train_values['has_superstructure_cement_mortar_brick'] 
test_and_train_values['concrete'] = test_and_train_values['has_superstructure_rc_non_engineered'] | test_and_train_values['has_superstructure_rc_engineered'] 
test_and_train_values['natural'] = test_and_train_values['has_superstructure_timber'] | test_and_train_values['has_superstructure_bamboo'] 

test_values['mud'] = test_values['has_superstructure_adobe_mud'] | test_values['has_superstructure_mud_mortar_stone'] | test_values['has_superstructure_mud_mortar_brick']
test_values['cement'] = test_values['has_superstructure_cement_mortar_stone'] | test_values['has_superstructure_cement_mortar_brick'] 
test_values['concrete'] = test_values['has_superstructure_rc_non_engineered'] | test_values['has_superstructure_rc_engineered'] 
test_values['natural'] = test_values['has_superstructure_timber'] | test_values['has_superstructure_bamboo'] 

train['n_struc_types'] = train['mud'] + train['cement'] + train['concrete'] + train['natural']
test_and_train_values['n_struc_types'] = test_and_train_values['mud'] + test_and_train_values['cement'] + test_and_train_values['concrete'] + test_and_train_values['natural']
test_values['n_struc_types'] = test_values['mud'] + test_values['cement'] + test_values['concrete'] + test_values['natural']

train['concrete_only'] = (train['concrete']==True) & (train['n_struc_types']==1)
train['cement_only'] = (train['cement']==True) & (train['n_struc_types']==1)

test_and_train_values['concrete_only'] = (test_and_train_values['concrete']==True) & (test_and_train_values['n_struc_types']==1)
test_and_train_values['cement_only'] = (test_and_train_values['cement']==True) & (test_and_train_values['n_struc_types']==1)

test_values['concrete_only'] = (test_values['concrete']==True) & (test_values['n_struc_types']==1)
test_values['cement_only'] = (test_values['cement']==True) & (test_values['n_struc_types']==1)

# This next one is just a helper column for use later to get averages of damage for each region
train['no-mud'] = abs(train['mud']-1)
test_and_train_values['no-mud'] = abs(test_and_train_values['mud']-1)
test_values['no-mud'] = abs(test_values['mud']-1)


In [None]:
concrete_only = (train['concrete']==True) & (train['n_struc_types']==1)
mud_only = (train['mud']==True) & (train['n_struc_types']==1)
cement_only = (train['cement']==True) & (train['n_struc_types']==1)
natural_only = (train['natural']==True) & (train['n_struc_types']==1)


Observations about the building types:
* If building has some mud, damage is at least 2
* The distribution of damage isn't materially different whether the building is all mud or just has some mud.
* All-concrete buildings suffer very little damage
* All-cement are also strong, but not quite as good as all-concrete
* 

In [None]:
(train[train['mud']==True].damage_grade
             .value_counts()
             .sort_index()
             .plot.bar(title="Number of mud Buildings with Each Damage Grade"))

In [None]:
(train[mud_only].damage_grade
             .value_counts()
             .sort_index()
             .plot.bar(title="Number of mud only Buildings with Each Damage Grade"))

In [None]:
(train[train['mud']==False].damage_grade
             .value_counts()
             .sort_index()
             .plot.bar(title="Number of non-mud Buildings with Each Damage Grade"))

In [None]:
(train[train['natural']==True].damage_grade
             .value_counts()
             .sort_index()
             .plot.bar(title="Number of natural Buildings with Each Damage Grade"))

In [None]:
(train[natural_only].damage_grade
             .value_counts()
             .sort_index()
             .plot.bar(title="Number of natural only Buildings with Each Damage Grade"))

In [None]:
(train[train['cement']==True].damage_grade
             .value_counts()
             .sort_index()
             .plot.bar(title="Number of cement Buildings with Each Damage Grade"))

In [None]:
(train[cement_only].damage_grade
             .value_counts()
             .sort_index()
             .plot.bar(title="Number of cement only Buildings with Each Damage Grade"))

In [None]:
(train[(train['concrete']==True)].damage_grade
             .value_counts()
             .sort_index()
             .plot.bar(title="Number of concrete Buildings with Each Damage Grade"))

In [None]:

(train[concrete_only].damage_grade
             .value_counts()
             .sort_index()
             .plot.bar(title="Number of concrete only Buildings with Each Damage Grade"))

In [None]:
struc_types = ['mud','natural','cement','concrete']
combs = []
for i in range(len(struc_types)):
    combs.extend([list(t) for t in [k for k in itertools.combinations(struc_types,i+1)]])
    
combs

In [None]:
filters = [train[t]==True for t in combs[12]]
train[np.logical_and.reduce(filters)].damage_grade.mean()

In [None]:
np.logical_and.reduce(filters)

In [None]:
for c in combs:
    filters = [train[t]==True for t in c]
    print(c)
    print(train[np.logical_and.reduce(filters)].damage_grade
             .mean())

In [None]:
[list(t) for t in [k for k in itertools.combinations(struc_types,2)]]

### Some geolocations might only be in the test set, so if we are going to build a universal lookup then we need to include test as well so we can get a complete list

In [13]:
test_and_train_values.shape

(347469, 46)

In [14]:
#test_and_train.head()
test_and_train_values.tail()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_use_police,has_secondary_use_other,mud,cement,concrete,natural,n_struc_types,concrete_only,cement_only,no-mud
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
310028,4,605,3623,3,70,20,6,t,r,q,...,0,0,1,0,0,1,2,False,False,0
663567,10,1407,11907,3,25,6,7,n,r,n,...,0,0,1,0,0,0,1,False,False,0
1049160,22,1136,7712,1,50,3,3,t,r,n,...,0,0,1,0,0,1,2,False,False,0
442785,6,1041,912,2,5,9,5,t,r,n,...,0,0,1,0,0,0,1,False,False,0
501372,26,36,6436,2,10,11,4,t,r,q,...,0,0,0,1,0,0,1,False,True,1


In [15]:
geo_lookup = test_and_train_values[['geo_level_3_id','geo_level_2_id','geo_level_1_id']].groupby(['geo_level_3_id']).first().reset_index()

In [16]:
geo_lookup.head()

Unnamed: 0,geo_level_3_id,geo_level_2_id,geo_level_1_id
0,0,179,12
1,1,194,15
2,2,657,0
3,3,73,30
4,4,1061,5


#### OK, now we can calcualte averages for each geoid level and construction type, and build our lookup table

First I'm going to double check what the frequency is like for the different structure types, because I'm worried that we won't have many examples of concrete and if that is an issue then we need to account for it somehow.

In [None]:
levels = ['1','2','3']
structure_cats = ['mud','no-mud','cement_only', 'concrete_only']


# plot
f, axes = plt.subplots(1, 4, figsize=(14, 4), sharex=True, sharey=True)

# Change the x-axis because it has a really long tail.  First attempt was to make it log, second just trims
#axes[0,0].set(xscale="log")
axes[0].set_xlim(right=20)

graph_colours = ['skyblue','olive', 'gold', 'teal']
ax = [axes[0],axes[1],axes[2],axes[3]]

for i,s in enumerate(structure_cats):
    sns.distplot( geo_lookup[s+'1_n'].fillna(0) , color=graph_colours[i], ax=ax[i], bins=300)

#################################
f, axes = plt.subplots(1, 4, figsize=(14, 4), sharex=True, sharey=True)

# Change the x-axis because it has a really long tail.  First attempt was to make it log, second just trims
#axes[0,0].set(xscale="log")
axes[0].set_xlim(right=20)

graph_colours = ['skyblue','olive', 'gold', 'teal']
ax = [axes[0],axes[1],axes[2],axes[3]]

for i,s in enumerate(structure_cats):
    sns.distplot( geo_lookup[s+'2_n'].fillna(0) , color=graph_colours[i], ax=ax[i], bins=300)


#################################
f, axes = plt.subplots(2, 2, figsize=(7, 7), sharex=True, sharey=True)

# Change the x-axis because it has a really long tail.  First attempt was to make it log, second just trims
#axes[0,0].set(xscale="log")
axes[0,0].set_xlim(right=20)

graph_colours = ['skyblue','olive', 'gold', 'teal']
ax = [axes[0, 0],axes[0, 1],axes[1, 0],axes[1, 1]]


for i,s in enumerate(structure_cats):
    sns.distplot( geo_lookup[s+'3_n'].fillna(0) , color=graph_colours[i], ax=ax[i], bins=300)


In [None]:
structure_cats = ['mud','natural', 'cement', 'concrete']

for location in biggest_geo3:
    print('Geo3 id:',location)
    for s in structure_cats:
        s_filter = (train['geo_level_3_id'] == location) & (train[s] == 1)
        print(s, train.loc[s_filter]['damage_grade'].count(), train.loc[s_filter]['damage_grade'].mean())

In [None]:
averages = {}
levels = ['1','2','3']
for level in levels:
    for s in structure_cats:
        s_filter = train[s] == 1
        averages[s+level] = train[s_filter].groupby('geo_level_'+level+'_id')['damage_grade'].agg({s+level+'_n':'count', 
                                     s+level+'_mean':'mean'})

In [None]:
averages['mud1'].head()

In [None]:
train.merge(averages['mud2'].reset_index(), how='left',on='geo_level_2_id').head(2)

#### Everything is looking OK, let's do it again but this time merge inline rather than saving to a dictionary first
We use train here because that df has got the damage values in it

In [17]:
levels = ['1','2','3']
structure_cats = ['mud','no-mud','cement_only', 'concrete_only']


for level in levels:
    print('averaging level',level)
    averages_list = []

    # Work out normalised damage grades for each structure type
    for s in structure_cats:
        s_filter = train[s] == 1
        averages = train[s_filter].groupby('geo_level_'+level+'_id')['damage_grade'].agg({s+level+'_n':'count', 
                                     s+level+'_mean':'mean'})
        col_to_norm = averages[s+level+'_mean']
        averages[s+level+'_mean_norm']=(col_to_norm-col_to_norm.min())/(col_to_norm.max()-col_to_norm.min())
        #print(averages.head(2))
        averages_list.append(averages)

    # Concat the averages into one dataframe
    averages = pd.concat(averages_list, axis=1)
    #print(averages.head())
    #print(geo_lookup.shape)
    
    # Now we have those, we can also calculate a weighted avergage across the structure types for that geoid
    
    cols = [s+level+'_mean_norm' for s in structure_cats]
    weights = [s+level+'_n' for s in structure_cats]

    norms_np = averages[cols].values
    weights_np = averages[weights].values

    norm_mask = np.isnan(norms_np)
    weights_mask = np.isnan(weights_np)

    norms_np = np.ma.masked_array(norms_np, mask=norm_mask)
    weights_np = np.ma.masked_array(weights_np, mask=weights_mask)

    wa_norm = np.ma.average(norms_np, weights=weights_np, axis=1)
    wa_norm.fill_value = -1
    averages['level'+level+'norm_damage'] = wa_norm.filled()
    #geo_lookup['level'+level+'_wa_norm_damage'] = wa_norm.filled()
    #print(averages.head())
    geo_lookup = geo_lookup.merge(averages['level'+level+'norm_damage'].reset_index(), how='left',on='geo_level_'+level+'_id')

print('Done')

averaging level 1


is deprecated and will be removed in a future version
  del sys.path[0]


averaging level 2
averaging level 3
Done


### Some  geoids have missing values though, presumably because the only examples are in the training set.  So we can use the next available level up

In [18]:
geo_lookup.loc[8313]

geo_level_3_id       8800.000000
geo_level_2_id        295.000000
geo_level_1_id         28.000000
level1norm_damage       0.549266
level2norm_damage            NaN
level3norm_damage            NaN
Name: 8313, dtype: float64

In [19]:
empty_level = geo_lookup['level2norm_damage'].isnull()
geo_lookup.loc[empty_level,'level2norm_damage'] = geo_lookup.loc[empty_level,'level1norm_damage']

empty_level = geo_lookup['level3norm_damage'].isnull()
geo_lookup.loc[empty_level,'level3norm_damage'] = geo_lookup.loc[empty_level,'level2norm_damage']



In [20]:
geo_lookup.loc[8313]

geo_level_3_id       8800.000000
geo_level_2_id        295.000000
geo_level_1_id         28.000000
level1norm_damage       0.549266
level2norm_damage       0.549266
level3norm_damage       0.549266
Name: 8313, dtype: float64

### Finally, join the lookup table with the test/train values

In [21]:
test_and_train_values = test_and_train_values.merge(geo_lookup[['geo_level_3_id','level3norm_damage']], on=['geo_level_3_id'])

In [22]:
test_and_train_values[22011:22015]

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_other,mud,cement,concrete,natural,n_struc_types,concrete_only,cement_only,no-mud,level3norm_damage
22011,8,600,3238,2,5,12,8,t,r,n,...,0,1,0,0,0,1,False,False,0,0.875
22012,8,600,3238,2,30,6,8,t,r,n,...,0,1,0,0,0,1,False,False,0,0.875
22013,17,1149,4221,3,30,5,7,t,r,n,...,0,1,0,0,1,2,False,False,0,0.989899
22014,17,1149,4221,2,10,7,5,t,r,n,...,0,1,0,0,1,2,False,False,0,0.989899


In [None]:
abs()

In [None]:
test_and_train_values.index

In [None]:
test_values.tail()

### Encode the categorical into numeric

This code taken from https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn

categorical_encoder = defaultdict(LabelEncoder)

'# Encoding the variable

fit = df.apply(lambda x: categorical_encoder[x.name].fit_transform(x))

'# Inverse the encoded

fit.apply(lambda x: categorical_encoder[x.name].inverse_transform(x))

'# Using the dictionary to label future data

df.apply(lambda x: categorical_encoder[x.name].transform(x))


In [23]:
categorical_columns = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status'
]

In [24]:
categorical_encoder = defaultdict(LabelEncoder)

### Fit the encoder on the combined dataset

In [25]:
test_and_train_values.loc[:,categorical_columns] = test_and_train_values.loc[:,categorical_columns].apply(lambda x: categorical_encoder[x.name].fit_transform(x))

In [26]:
test_and_train_values.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_other,mud,cement,concrete,natural,n_struc_types,concrete_only,cement_only,no-mud,level3norm_damage
0,6,487,12198,2,30,6,5,2,2,0,...,0,1,0,0,0,1,False,False,0,0.918919
1,6,487,12198,2,15,3,5,2,2,0,...,0,1,0,0,0,1,False,False,0,0.918919
2,6,487,12198,2,80,6,5,2,2,0,...,0,1,0,0,0,1,False,False,0,0.918919
3,6,487,12198,2,20,10,5,2,2,0,...,0,1,0,0,0,1,False,False,0,0.918919
4,6,487,12198,2,20,3,5,2,2,0,...,0,1,0,0,0,1,False,False,0,0.918919


### Apply the encoder to transform the train data

In [27]:
train_values.loc[:,categorical_columns] = train_values.loc[:,categorical_columns].apply(lambda x: categorical_encoder[x.name].transform(x))

In [28]:
train_values.dtypes

geo_level_1_id                            int64
geo_level_2_id                            int64
geo_level_3_id                            int64
count_floors_pre_eq                       int64
age                                       int64
area_percentage                           int64
height_percentage                         int64
land_surface_condition                    int32
foundation_type                           int32
roof_type                                 int32
ground_floor_type                         int32
other_floor_type                          int32
position                                  int32
plan_configuration                        int32
has_superstructure_adobe_mud              int64
has_superstructure_mud_mortar_stone       int64
has_superstructure_stone_flag             int64
has_superstructure_cement_mortar_stone    int64
has_superstructure_mud_mortar_brick       int64
has_superstructure_cement_mortar_brick    int64
has_superstructure_timber               

In [29]:
train.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_other,damage_grade,mud,cement,concrete,natural,n_struc_types,concrete_only,cement_only,no-mud
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,3,1,0,0,0,1,False,False,0
28830,8,900,2812,2,10,8,7,o,r,n,...,0,2,1,0,0,0,1,False,False,0
94947,21,363,8973,2,10,5,5,t,r,n,...,0,3,1,0,0,0,1,False,False,0
590882,22,418,10694,2,10,6,5,t,r,n,...,0,2,1,0,0,1,2,False,False,0
201944,11,131,1488,3,30,8,9,t,r,n,...,0,3,1,0,0,0,1,False,False,0


### All features are now categorical so we could use them on a classifier now, but...let's engineer some features first

In [30]:
train = train.merge(geo_lookup[['geo_level_3_id','level3norm_damage']], on=['geo_level_3_id'], how='left')

In [31]:
train.loc[:,categorical_columns] = train.loc[:,categorical_columns].apply(lambda x: categorical_encoder[x.name].transform(x))

In [32]:
train.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,damage_grade,mud,cement,concrete,natural,n_struc_types,concrete_only,cement_only,no-mud,level3norm_damage
0,6,487,12198,2,30,6,5,2,2,0,...,3,1,0,0,0,1,False,False,0,0.918919
1,8,900,2812,2,10,8,7,1,2,0,...,2,1,0,0,0,1,False,False,0,0.5
2,21,363,8973,2,10,5,5,2,2,0,...,3,1,0,0,0,1,False,False,0,0.771127
3,22,418,10694,2,10,6,5,2,2,0,...,2,1,0,0,1,2,False,False,0,0.548387
4,11,131,1488,3,30,8,9,2,2,0,...,3,1,0,0,0,1,False,False,0,0.68


### Prep the test data too

In [33]:
test_values = test_values.merge(geo_lookup[['geo_level_3_id','level3norm_damage']], on=['geo_level_3_id'], how='left')
test_values.loc[:,categorical_columns] = test_values.loc[:,categorical_columns].apply(lambda x: categorical_encoder[x.name].transform(x))

In [34]:
test_values.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_other,mud,cement,concrete,natural,n_struc_types,concrete_only,cement_only,no-mud,level3norm_damage
0,17,596,11307,3,20,7,6,2,2,0,...,0,1,0,0,0,1,False,False,0,0.775
1,6,141,11987,2,25,13,5,2,2,0,...,0,1,0,0,0,1,False,False,0,0.5
2,22,19,10044,2,5,4,5,2,2,0,...,0,1,0,0,0,1,False,False,0,1.0
3,26,39,633,1,0,19,3,2,2,2,...,0,0,1,0,0,1,False,True,1,0.211715
4,17,289,7970,3,15,8,7,2,2,1,...,0,1,0,0,0,1,False,False,0,0.911765


### Damage measures for each geolocation

The idae here is that I want a normalised measure of average damage per geolocation.  The easiest way would be to take the average damage value for the geoid but that wouldn't take into consideration the different mix of building types.  If some geolocations had sturdier buildings then it's average damage might be artifically low.

So instead I'll take the average damage for each building type and/or some sort of adjustment for the building type - for example wooden buildings seem to have less damage so perhaps we can work out some normalised values for each building type and then combine them to get a single normalised damage value for each geoid.

The last factor I want to allow for is that some geoids have only one data point, and that isn't gong to be of much use, so instead my intial approach will be to take the average of the next geolevel up if the count of datapoints is below a certain threshold

## Train Test Split

In [46]:
train_ex_geo = train.drop(['geo_level_1_id','geo_level_2_id','geo_level_3_id', 'damage_grade'], axis=1)


In [47]:
X_train, X_test, y_train, y_test = train_test_split(train_ex_geo, train['damage_grade'], test_size=0.2)


In [54]:
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)

## Try random forest classifier

In [60]:
clf = RandomForestClassifier(n_estimators=500, max_depth=10, class_weight='balanced')
%time clf.fit(X_train, y_train) 

Wall time: 1min 40s


RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=500, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [61]:
print(list(zip(X_train.columns,clf.feature_importances_)))

[('level3norm_damage', 0.6377843397118964), ('mud', 0.14696777455712237), ('cement_only', 0.01165088121606112), ('concrete_only', 0.013895335787116912), ('count_floors_pre_eq', 0.01955341387964623), ('age', 0.04823275767610806), ('foundation_type', 0.06212476746661701), ('roof_type', 0.05979072970543198)]


In [62]:

y_pred = clf.predict(X_test)
f1_score(y_test, y_pred, average='micro')

0.6912760691467931

## Select only a few of the columns

In [48]:
cols_to_use = [
    'level3norm_damage',
    'mud',
    'cement_only',
    'concrete_only',
    'count_floors_pre_eq',
    'age',
    'foundation_type',
    'roof_type'
    
]

train_select_cols = train[cols_to_use]

X_train, X_test, y_train, y_test = train_test_split(train_select_cols, train['damage_grade'], test_size=0.2)


In [57]:
clf = RandomForestClassifier(n_estimators=500, max_depth=10, class_weight='balanced')
%time clf.fit(X_train, y_train) 

Wall time: 1min 42s


RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=500, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [58]:
print(list(zip(X_train.columns,clf.feature_importances_)))

[('level3norm_damage', 0.6414288404434845), ('mud', 0.14719573302455086), ('cement_only', 0.013263387959962392), ('concrete_only', 0.014115023943922446), ('count_floors_pre_eq', 0.0196091954428837), ('age', 0.047876391357494816), ('foundation_type', 0.062119129951735456), ('roof_type', 0.054392297875965674)]


In [59]:

y_pred = clf.predict(X_test)
f1_score(y_test, y_pred, average='micro')

0.6917749083862551

## OK, now let's try that with XGBoost

In [None]:
# read in data
dtrain = xgb.DMatrix(X_train, label=y_train-1)
dtest = xgb.DMatrix(X_test, label=y_test-1)


In [None]:
# specify parameters via map
param = {'max_depth':10, 'eta':0.3, 'subsample':1, 'objective':'multi:softmax', 'num_class':3 }
num_round = 100


In [None]:
bst = xgb.train(param, dtrain, num_round)


In [None]:
# make prediction
y_pred = bst.predict(dtest) + 1

In [None]:
f1_score(y_test, y_pred, average='micro')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_ex_geo, train['damage_grade'], test_size=0)

In [None]:
dtrain = xgb.DMatrix(X_train, label=y_train-1)


In [None]:
bst = xgb.train(param, dtrain, num_round)


## Make a submission

In [None]:
test_ex_geo = test_values.drop(['geo_level_1_id','geo_level_2_id','geo_level_3_id'], axis=1)

In [None]:
dtest = xgb.DMatrix(test_ex_geo)


In [None]:
y_pred = bst.predict(dtest) + 1

In [None]:
submission_format = pd.read_csv(DATA_DIR / 'submission_format.csv', index_col='building_id')


In [None]:
my_submission = pd.DataFrame(data=y_pred,
                             columns=submission_format.columns,
                             index=submission_format.index)


In [None]:
my_submission.head()

In [None]:
my_submission.damage_grade = my_submission.damage_grade.astype(int)

In [None]:
my_submission.head()

In [None]:
my_submission.to_csv('submission.csv')

In [None]:
!head submission.csv