# 60-minute Models

* [Competition](https://www.drivendata.org/competitions/57/nepal-earthquake/page/136/)

## Import Data

In [1]:
import pandas as pd

In [2]:
# Read the three geo-level id columns as strings so they load as object
# columns (later cells treat them as categoricals and encode them).
dtype = dict.fromkeys(
    ('geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'),
    str,
)

X = pd.read_csv(
    'data/train_values.csv',
    index_col='building_id',
    dtype=dtype,
)

In [3]:
# Preview the first rows to sanity-check the load.
X.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,0
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
590882,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
201944,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Column dtypes and non-null counts for the full feature table.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 802906 to 747594
Data columns (total 38 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   geo_level_1_id                          260601 non-null  object
 1   geo_level_2_id                          260601 non-null  object
 2   geo_level_3_id                          260601 non-null  object
 3   count_floors_pre_eq                     260601 non-null  int64 
 4   age                                     260601 non-null  int64 
 5   area_percentage                         260601 non-null  int64 
 6   height_percentage                       260601 non-null  int64 
 7   land_surface_condition                  260601 non-null  object
 8   foundation_type                         260601 non-null  object
 9   roof_type                               260601 non-null  object
 10  ground_floor_type                       260601 non-

In [5]:
# Frequency of each foundation_type category.
X.foundation_type.value_counts()

r    219196
w     15118
u     14260
i     10579
h      1448
Name: foundation_type, dtype: int64

In [6]:
# Preview the columns whose name contains 'has_super' (the superstructure flags).
X.filter(like='has_super', axis=1).head()

Unnamed: 0_level_0,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
802906,1,1,0,0,0,0,0,0,0,0,0
28830,0,1,0,0,0,0,0,0,0,0,0
94947,0,1,0,0,0,0,0,0,0,0,0
590882,0,1,0,0,0,0,1,1,0,0,0
201944,1,0,0,0,0,0,0,0,0,0,0


In [7]:
# Distribution of one superstructure flag; the recorded output contains only 0 and 1.
X.has_superstructure_adobe_mud.value_counts()

0    237500
1     23101
Name: has_superstructure_adobe_mud, dtype: int64

In [8]:
# Load the target and align it to X's row order. train_test_split pairs X and y
# by position, not by building_id, so both must list the same buildings in the
# same order. .loc[X.index] enforces that and raises KeyError if a building in X
# has no label.
y = pd.read_csv('data/train_labels.csv', index_col='building_id')['damage_grade'].loc[X.index]

## Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical across reruns.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Baseline

In [11]:
# Share of the most frequent damage grade in the training labels, i.e. the
# accuracy obtained by always predicting that grade.
baseline_acc = y_train.value_counts(normalize=True).max()
print('Baseline Accuracy:', baseline_acc)

Baseline Accuracy: 0.5697045280122793


## Build Model (MVP)

In [12]:
from sklearn.linear_model import LogisticRegression

In [22]:
# Instantiate and train the minimum viable model on a single feature, 'age'
# (fit returns the estimator itself, so the two steps can be chained).
model_mvp = LogisticRegression().fit(X_train[['age']], y_train)

# Show the fitted estimator as the cell output
model_mvp

LogisticRegression()

In [23]:
# Check metrics: mean accuracy of the MVP model on each split
acc_train = model_mvp.score(X_train[['age']], y_train)
acc_val = model_mvp.score(X_val[['age']], y_val)

print('Training Accuracy:', acc_train)
print('Validation Accuracy:', acc_val)

Training Accuracy: 0.5697045280122793
Validation Accuracy: 0.5657412559237159


In [24]:
# Check competition metric: micro-averaged F1 on the validation split.
# In the recorded run it equals the validation accuracy printed above.
from sklearn.metrics import f1_score

y_val_pred_mvp = model_mvp.predict(X_val[['age']])
f1_score(y_val, y_val_pred_mvp, average='micro')

0.5657412559237159

## Submission Function

In [34]:
def make_submission(model, features=None):
    """Predict on the competition test set and write a timestamped submission CSV.

    Parameters
    ----------
    model : fitted estimator or pipeline
        Any object with a ``predict`` method.
    features : sequence of str, optional
        Test-set columns to pass to ``model.predict``. When omitted, the
        model's ``feature_names_in_`` attribute is used if it has one;
        otherwise every column of the test file is passed. This lets a model
        fit on a column subset (e.g. only ``'age'``) be submitted without
        failing on the full test frame.

    Returns
    -------
    str
        Path of the CSV file that was written.
    """
    import os

    # `dtype` is the notebook-level mapping used to load the training values,
    # so the test set gets the same column types.
    X_test = pd.read_csv('data/test_values.csv', index_col='building_id', dtype=dtype)

    # Restrict the frame to the columns the model was trained on, if known.
    if features is None:
        features = getattr(model, 'feature_names_in_', None)
    if features is not None:
        X_test = X_test[list(features)]

    y_pred = pd.DataFrame(model.predict(X_test), columns=['damage_grade'], index=X_test.index)

    # to_csv does not create missing directories.
    os.makedirs('submissions', exist_ok=True)
    timestamp = pd.Timestamp.now().strftime('%Y-%m-%d_%H%M')
    filename = f'submissions/{timestamp}_submission.csv'
    y_pred.to_csv(filename)
    return filename

In [33]:
# NOTE(review): model_mvp was fit on the single column 'age'. Verify that
# make_submission feeds predict only that column; otherwise this call fails
# on the full test frame when the notebook is run top to bottom.
make_submission(model_mvp)

## Build Model: More Features

In [35]:
# Re-inspect the training split before adding more features.
X_train.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
739427,13,257,8352,3,25,2,5,n,r,n,...,0,0,0,0,0,0,0,0,0,0
17201,6,1076,9202,3,40,10,8,o,r,n,...,0,0,1,0,0,0,0,0,0,0
723805,7,838,10723,2,45,8,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
891512,7,555,2763,2,30,7,4,t,r,n,...,0,0,0,0,0,0,0,0,0,0
484350,17,682,1039,3,30,6,7,t,r,q,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# dtypes of the training split; the object columns are encoded in the pipeline cells below.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 208480 entries, 739427 to 246461
Data columns (total 38 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   geo_level_1_id                          208480 non-null  object
 1   geo_level_2_id                          208480 non-null  object
 2   geo_level_3_id                          208480 non-null  object
 3   count_floors_pre_eq                     208480 non-null  int64 
 4   age                                     208480 non-null  int64 
 5   area_percentage                         208480 non-null  int64 
 6   height_percentage                       208480 non-null  int64 
 7   land_surface_condition                  208480 non-null  object
 8   foundation_type                         208480 non-null  object
 9   roof_type                               208480 non-null  object
 10  ground_floor_type                       208480 non-

In [43]:
# List the object-dtype columns, i.e. the candidates for categorical encoding.
X_train.select_dtypes(include='object').columns

Index(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'legal_ownership_status'],
      dtype='object')

In [45]:
from sklearn.pipeline import make_pipeline
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

In [44]:
# Object columns to one-hot encode. geo_level_2_id and geo_level_3_id are
# left out on purpose: the pipeline ordinal-encodes those two instead.
cat_cols = [
    'geo_level_1_id',
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]

In [47]:
# Ordinal-encode the two high-cardinality geo ids, one-hot encode the
# remaining categorical columns, then fit a random forest.
pipeline = make_pipeline(
    OrdinalEncoder(cols=['geo_level_2_id', 'geo_level_3_id']),
    OneHotEncoder(cols=cat_cols),
    # Seeded with the same value as the train/validation split so reruns
    # reproduce the reported scores; n_jobs=-1 builds the trees on all cores.
    RandomForestClassifier(random_state=42, n_jobs=-1)
)

In [48]:
# Fit the encoders and the forest on the training split.
pipeline.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['geo_level_2_id', 'geo_level_3_id'],
                                mapping=[{'col': 'geo_level_2_id',
                                          'data_type': dtype('O'),
                                          'mapping': 257        1
1076       2
838        3
555        4
682        5
        ... 
771     1408
1163    1409
1016    1410
361     1411
NaN       -2
Length: 1412, dtype: int64},
                                         {'col': 'geo_level_3_id',
                                          'data_type': dtype('O'),
                                          'mapping': 8352         1
9202         2
10723        3
2763         4
1039         5
         ...  
9515     11355
11049    11356
2433     11357
4959     11358
NaN         -2
Length: 11359, dtype: int64}])),
                ('onehotencoder',
                 OneHotEncoder(cols=['geo_level_1_id', 'land_surface_condition',
                               

In [55]:
# Inspect the fitted forest; make_pipeline names each step after its lowercased class name.
pipeline.named_steps['randomforestclassifier']

In [50]:
# Score the pipeline on both splits, plus micro-averaged F1 (the competition
# metric) on the validation predictions.
train_acc = pipeline.score(X_train, y_train)
val_acc = pipeline.score(X_val, y_val)
val_f1 = f1_score(y_val, pipeline.predict(X_val), average='micro')

print('Training Accuracy:', train_acc)
print('Validation Accuracy:', val_acc)
print('Validation F1:', val_f1)

Training Accuracy: 0.9867613200306984
Validation Accuracy: 0.7183860631990944
Validation F1: 0.7183860631990944


In [51]:
# Write a submission file from the random-forest pipeline's test-set predictions.
make_submission(pipeline)

## Tune Model

In [52]:
from sklearn.model_selection import RandomizedSearchCV

In [61]:
# Search space for the forest's hyperparameters.
params = {'n_estimators': range(50, 350, 50),
          'max_depth': range(3, 25),
          'max_features': ['sqrt', 'log2', None]}

# 20 sampled candidates x 3 folds = 60 forest fits. Both the forest and the
# search are seeded so the sampled candidates, the chosen best_params_ and the
# reported scores are the same on every rerun.
rscv = RandomizedSearchCV(RandomForestClassifier(random_state=42),
                          param_distributions=params,
                          n_iter=20,
                          cv=3,
                          verbose=1,
                          n_jobs=6,
                          random_state=42)

# The search is the final pipeline step: the encoders are fit once on the
# training data and only the forest is cross-validated.
model = make_pipeline(
    OrdinalEncoder(cols=['geo_level_2_id', 'geo_level_3_id']),
    OneHotEncoder(cols=cat_cols),
    rscv
)

In [62]:
# Run the randomized search inside the pipeline (60 fits, about 12 minutes in the recorded run).
model.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  6.1min
[Parallel(n_jobs=6)]: Done  60 out of  60 | elapsed: 11.7min finished


Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['geo_level_2_id', 'geo_level_3_id'],
                                mapping=[{'col': 'geo_level_2_id',
                                          'data_type': dtype('O'),
                                          'mapping': 257        1
1076       2
838        3
555        4
682        5
        ... 
771     1408
1163    1409
1016    1410
361     1411
NaN       -2
Length: 1412, dtype: int64},
                                         {'col': 'geo_level_3_id',
                                          'data_type': dtype('O'),
                                          'mapping': 8352         1
9202         2
10723        3
2763         4
1039         5
         ...  
9515     11355
11049    11356
2433     1...
                                     'foundation_type', 'roof_type',
                                     'ground_floor_type', 'other_floor_type',
                                     'position', 'plan_configur

In [63]:
# Score the tuned model on both splits, plus micro-averaged F1 (the
# competition metric) on the validation predictions.
tuned_train_acc = model.score(X_train, y_train)
tuned_val_acc = model.score(X_val, y_val)
tuned_val_f1 = f1_score(y_val, model.predict(X_val), average='micro')

print('Training Accuracy:', tuned_train_acc)
print('Validation Accuracy:', tuned_val_acc)
print('Validation F1:', tuned_val_f1)

Training Accuracy: 0.804336147352264
Validation Accuracy: 0.7278064503750887
Validation F1: 0.7278064503750887


In [64]:
# Hyperparameters of the best candidate found by the search.
rscv.best_params_

{'n_estimators': 150, 'max_features': None, 'max_depth': 18}

In [65]:
# Write a submission file from the tuned model's test-set predictions.
make_submission(model)