# 60-minute Models

* [Competition](https://www.drivendata.org/competitions/57/nepal-earthquake/page/136/)

## Import Data

In [None]:
import pandas as pd

In [None]:
# Column -> dtype overrides passed to read_csv: the three geo-level ID columns
# are read as strings (presumably because they are identifiers, not quantities).
dtype = dict.fromkeys(
    ('geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'),
    str,
)

# Training features, one row per building (indexed by building_id),
# with the dtype overrides defined above applied while parsing.
X = pd.read_csv(
    'data/train_values.csv',
    index_col='building_id',
    dtype=dtype,
)

In [None]:
# Preview the first rows of the feature table
X.head()

In [None]:
# Print a summary of the columns: dtypes and non-null counts
X.info()

In [None]:
# Frequency of each distinct value in the foundation_type column
X.foundation_type.value_counts()

In [None]:
# Preview only the columns whose names contain 'has_super'
X.filter(like='has_super', axis=1).head()

In [None]:
# Frequency of each distinct value in the has_superstructure_adobe_mud column
X.has_superstructure_adobe_mud.value_counts()

In [None]:
# Target vector: the damage_grade column of the labels file, indexed by building_id.
y = pd.read_csv('data/train_labels.csv', index_col='building_id')['damage_grade']

# X and y come from separate files and are later split positionally, so align
# the labels to X by building_id instead of relying on both CSVs sharing the
# same row order. .loc raises a KeyError if any building in X has no label.
y = y.loc[X.index]

## Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Baseline

In [None]:
# Baseline: the share of the most frequent damage grade in the training labels,
# i.e. the accuracy of always predicting that single class.
class_shares = y_train.value_counts(normalize=True)
print('Baseline Accuracy:', class_shares.max())

## Build Model (MVP)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Minimum viable model: a default logistic regression that sees only the
# building's age.
mvp_features = ['age']

model_mvp = LogisticRegression()
model_mvp.fit(X_train[mvp_features], y_train)

In [None]:
# Report the MVP model's accuracy on the training and validation splits
for label, features, target in (
    ('Training Accuracy:', X_train, y_train),
    ('Validation Accuracy:', X_val, y_val),
):
    print(label, model_mvp.score(features[['age']], target))

In [None]:
# Competition metric: micro-averaged F1 of the MVP model on the validation split
from sklearn.metrics import f1_score

val_pred_mvp = model_mvp.predict(X_val[['age']])
f1_score(y_val, val_pred_mvp, average='micro')

In [None]:
# Read the test features the same way as the training features
# (same index column and dtype overrides), then preview them.
X_test = pd.read_csv(
    'data/test_values.csv',
    index_col='building_id',
    dtype=dtype,
)
X_test.head()

In [None]:
# Predict damage grades for the test set from 'age' alone, the same single
# feature model_mvp was trained on
y_pred = model_mvp.predict(X_test[['age']])

In [None]:
# Wrap the predictions in a one-column DataFrame ('damage_grade') that reuses
# the test set's building_id index, ready to be written out as a submission
y_pred = pd.DataFrame(y_pred, columns=['damage_grade'], index=X_test.index)

In [None]:
# Write the submission file. DataFrame.to_csv does not create missing parent
# directories, so make sure the output folder exists first; otherwise a fresh
# checkout without a 'submissions' folder fails at this final step.
import os

os.makedirs('submissions', exist_ok=True)
y_pred.to_csv('submissions/2020-08-07_submission.csv')

## Build Model: More Features

In [None]:
# Second model: a default logistic regression trained on every numeric column.
# Non-numeric columns are left out by select_dtypes, including the
# geo_level_*_id columns, which were read as strings.
numeric_train = X_train.select_dtypes('number')

model_nums = LogisticRegression()
model_nums.fit(numeric_train, y_train)

In [None]:
# Micro-averaged F1 of model_nums on the validation split, using the same
# numeric-column selection the model was trained on
f1_score(y_val, model_nums.predict(X_val.select_dtypes('number')), average='micro')