In [16]:
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import widgets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Download Data from DrivenData

In [17]:
# Fetch the three competition CSVs into ./nepal/.
# -nc (no-clobber) skips a file that already exists locally; -P sets the target directory.
!wget https://s3.amazonaws.com/drivendata/data/57/public/train_values.csv -nc -P ./nepal/
!wget https://s3.amazonaws.com/drivendata/data/57/public/train_labels.csv -nc -P ./nepal/
!wget https://s3.amazonaws.com/drivendata/data/57/public/test_values.csv -nc -P ./nepal/

File ‘./nepal/train_values.csv’ already there; not retrieving.

File ‘./nepal/train_labels.csv’ already there; not retrieving.

File ‘./nepal/test_values.csv’ already there; not retrieving.



# Import Data

In [51]:
# Training features, keyed by building_id. The three geo-level id columns are
# loaded with object dtype instead of being parsed as integers
# (presumably because they are identifiers, not quantities).
X = pd.read_csv(
    './nepal/train_values.csv',
    index_col='building_id',
    dtype=dict.fromkeys(
        ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'],
        'object',
    ),
)

In [19]:
# Preview the first rows of the training features.
X.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,0
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
590882,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
201944,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,0


In [52]:
# Summarise column dtypes and non-null counts for the training features.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260601 entries, 802906 to 747594
Data columns (total 38 columns):
geo_level_1_id                            260601 non-null object
geo_level_2_id                            260601 non-null object
geo_level_3_id                            260601 non-null object
count_floors_pre_eq                       260601 non-null int64
age                                       260601 non-null int64
area_percentage                           260601 non-null int64
height_percentage                         260601 non-null int64
land_surface_condition                    260601 non-null object
foundation_type                           260601 non-null object
roof_type                                 260601 non-null object
ground_floor_type                         260601 non-null object
other_floor_type                          260601 non-null object
position                                  260601 non-null object
plan_configuration                        2

In [21]:
# Target vector: the damage_grade column of the label file, keyed by building_id.
y = (
    pd.read_csv('nepal/train_labels.csv', index_col='building_id')
    .loc[:, 'damage_grade']
)

# Attempt 1: Model w/ One Feature

In [22]:
def housing_plot(X, y):
    """Build a one-argument plotting callback for ``widgets.interact``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; one of its columns is plotted per call.
    y : pandas.Series
        Target values. Assumed to share ``X``'s index, because the row mask
        computed from ``X`` is applied to ``y`` as well.

    Returns
    -------
    callable
        ``plotter(column)``, which scatters ``y`` against ``X[column]`` on the
        current pyplot figure.
    """
    def plotter(column):
        # Only plot rows where the selected feature is present.
        valid_rows = X[column].notna()
        plt.plot(X.loc[valid_rows, column], y.loc[valid_rows], '.', color='k')
        # Name the feature on the x-axis so the figure is readable on its own
        # (the dropdown selection is not part of the rendered plot).
        plt.xlabel(column)
        plt.ylabel('Damage Level')
        plt.yticks([1,2,3])

    return plotter

# Offer every feature column in the dropdown, in alphabetical order.
dropdown_values = sorted(X.columns)
# interact calls the plotter again whenever the selection changes; the trailing
# semicolon keeps the cell from echoing interact's return value.
widgets.interact(housing_plot(X, y), column=dropdown_values);

interactive(children=(Dropdown(description='column', options=('age', 'area_percentage', 'count_families', 'cou…

In [23]:
# Single-feature design matrix. The list selector keeps the result a
# one-column DataFrame rather than a Series.
X_height = X.loc[:, ['height_percentage']]
X_height.head()

Unnamed: 0_level_0,height_percentage
building_id,Unnamed: 1_level_1
802906,5
28830,7
94947,5
590882,5
201944,9


## Train-test split

In [24]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X_height, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=42)

In [25]:
# Baseline: logistic regression fitted on the single-feature training split.
one_feat_model = LogisticRegression(solver='lbfgs', multi_class='auto')
one_feat_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
# Predictions on the rows the model was fitted on (used for the in-sample score).
y_train_pred = one_feat_model.predict(X_train)

## Compare in-sample and out-of-sample metrics (F1 score)

In [27]:
# Micro-averaged F1 on the training split.
print('In-sample f1 score:')
f1_score(y_true=y_train, y_pred=y_train_pred, average='micro')

In-sample f1 score:


0.5699779355333845

In [28]:
# Score the held-out split with the same metric as the training split.
y_test_pred = one_feat_model.predict(X_test)

print('Out-sample f1 score:')
f1_score(y_true=y_test, y_pred=y_test_pred, average='micro')

Out-sample f1 score:


0.5660290477926364

## Create submission

In [29]:
# Competition test features, keyed by building_id. The geo-level id columns are
# read with the same object dtype as the training frame, so both frames agree
# on column dtypes.
X_comp_test = pd.read_csv('nepal/test_values.csv',
                          index_col='building_id',
                          dtype={'geo_level_1_id': 'object',
                                 'geo_level_2_id': 'object',
                                 'geo_level_3_id': 'object'})
X_comp_test.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,17,596,11307,3,20,7,6,t,r,n,...,0,0,0,0,0,0,0,0,0,0
99355,6,141,11987,2,25,13,5,t,r,n,...,1,0,0,0,0,0,0,0,0,0
890251,22,19,10044,2,5,4,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
745817,26,39,633,1,0,19,3,t,r,x,...,0,0,1,0,0,0,0,0,0,0
421793,17,289,7970,3,15,8,7,t,r,q,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Predict on the competition set using only the feature the model was fitted on.
y_comp_pred = one_feat_model.predict(X_comp_test.loc[:, ['height_percentage']])

In [31]:
# One-column frame of predictions, indexed by the test set's building_id.
y_submission = pd.Series(
    y_comp_pred,
    index=X_comp_test.index,
    name='damage_grade',
).to_frame()

In [32]:
# Write the predictions (index included) to a fixed, date-stamped file name.
y_submission.to_csv('nepal/2019-11-11_submission.csv')

**Score: 0.56**

# Create Functions for Repetitive Tasks

In [38]:
def create_submission(model, X_cols=None,
                      test_path='nepal/test_values.csv',
                      output_dir='nepal'):
    """Predict on the competition test set and write a timestamped CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_cols : list-like of str, optional
        Columns of the test frame to pass to ``model``. ``None`` (default)
        passes every column. Lists, ``pd.Index`` objects and arrays are
        accepted.
    test_path : str, default 'nepal/test_values.csv'
        CSV holding the test features; must contain a ``building_id`` column.
    output_dir : str, default 'nepal'
        Existing directory that receives the submission file.

    Returns
    -------
    None
        The result is written to
        ``<output_dir>/<YYYY-mm-dd_HHMM>_submission.csv`` (UTC timestamp).
    """
    # Read the geo-level ids with the same dtype as the training frame so that
    # models fitted on all columns see consistent dtypes at predict time.
    X = pd.read_csv(test_path,
                    index_col='building_id',
                    dtype={'geo_level_1_id': 'object',
                           'geo_level_2_id': 'object',
                           'geo_level_3_id': 'object'})
    # Identity check: `!= None` is evaluated element-wise for pd.Index/ndarray
    # arguments and makes the `if` raise "truth value is ambiguous".
    if X_cols is not None:
        X = X[X_cols]
    y_pred = model.predict(X)
    submission = pd.DataFrame(y_pred, index=X.index,
                              columns=['damage_grade'])
    # Timestamp.utcnow() is deprecated in recent pandas; now(tz='UTC') is the
    # supported equivalent and yields the same formatted string.
    date_string = pd.Timestamp.now(tz='UTC').strftime('%Y-%m-%d_%H%M_')
    submission.to_csv(f'{output_dir}/{date_string}submission.csv')

In [39]:
# Re-create the single-feature submission through the helper.
create_submission(one_feat_model, ['height_percentage'])

# Attempt 2: Model w/ All Numerical Features

In [41]:
# Look at the training features again before choosing the numeric columns.
X.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,0
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
590882,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
201944,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# Names of all int64 columns, in frame order.
numerical_features = X.select_dtypes(include='int64').columns.tolist()
# Another way: list(X.describe().columns)

In [59]:
# Split again, this time on the full feature frame, with the same test_size
# and random_state as the single-feature split.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the
# single-feature split held in those names.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42)

In [66]:
# Logistic regression on every int64 feature; max_iter is raised to 1000 from
# the default.
num_feat_model = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    multi_class='auto',
).fit(X_train[numerical_features], y_train)
y_train_pred = num_feat_model.predict(X_train[numerical_features])

print('In-sample f1 score:')
f1_score(y_true=y_train, y_pred=y_train_pred, average='micro')

In-sample f1 score:




0.5759976976208749

In [65]:
# Held-out score for the numeric-feature logistic regression.
y_test_pred = num_feat_model.predict(X_test[numerical_features])

print('Out-sample f1 score:')
f1_score(y_true=y_test, y_pred=y_test_pred, average='micro')

Out-sample f1 score:


0.5741064062470022

# Attempt 3: Numerical Features w/ a Gradient Boosting Classifier

In [67]:
from sklearn.ensemble import GradientBoostingClassifier

# Same numeric features, different estimator. The booster is seeded with the
# same value as the train/test split so a rerun reproduces the fitted model.
xg_num_model = GradientBoostingClassifier(random_state=42)
xg_num_model.fit(X_train[numerical_features], y_train)
y_train_pred = xg_num_model.predict(X_train[numerical_features])

print('In-sample f1 score:')
f1_score(y_train, y_train_pred, average='micro')

In-sample f1 score:


0.5924501151189563

In [68]:
# Held-out score for the gradient boosting model.
y_test_pred = xg_num_model.predict(X_test[numerical_features])

print('Out-sample f1 score:')
f1_score(y_true=y_test, y_pred=y_test_pred, average='micro')

Out-sample f1 score:


0.5903954260278966

In [69]:
# Write a submission from the gradient boosting model, restricted to the
# numeric columns it was fitted on.
create_submission(xg_num_model, numerical_features)

# Attempt 4: Use all features (numeric and categorical)