In [1]:
import sys
sys.path.append('../src')
sys.path.append('../submissions')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
# from clean import remove_missing_values
from evaluate import evaluate_model
from utils import save_submission, load_train_data, load_test_data

pd.set_option('display.max_columns', None)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, TargetEncoder
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [3]:
# Load training data
train_data = load_train_data(local=True)

In [4]:

cols_numerical = ['count_floors_pre_eq', 'age', 'area_percentage', 
                  'height_percentage', 'count_families']

cols_categorical_geo = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

cols_categorical_type = ['foundation_type', 'roof_type', 'ground_floor_type', 
                         'other_floor_type']

cols_categorical_other = ['land_surface_condition', 'position', 
                          'plan_configuration', 'legal_ownership_status']

cols_dummy_superstructure = ['has_superstructure_adobe_mud',
                             'has_superstructure_mud_mortar_stone', 
                             'has_superstructure_stone_flag',
                             'has_superstructure_cement_mortar_stone',
                             'has_superstructure_mud_mortar_brick',
                             'has_superstructure_cement_mortar_brick', 
                             'has_superstructure_timber',
                             'has_superstructure_bamboo', 
                             'has_superstructure_rc_non_engineered',
                             'has_superstructure_rc_engineered',
                             'has_superstructure_other']

cols_dummy_use = ['has_secondary_use', 'has_secondary_use_agriculture',
                  'has_secondary_use_hotel', 'has_secondary_use_rental',
                  'has_secondary_use_institution', 'has_secondary_use_school', 
                  'has_secondary_use_industry', 'has_secondary_use_health_post',
                  'has_secondary_use_gov_office', 'has_secondary_use_use_police', 
                  'has_secondary_use_other']

# Sanity check: all features in the dataset
len(cols_numerical + cols_categorical_geo + cols_categorical_type + \
    cols_categorical_other + cols_dummy_superstructure + cols_dummy_use) == (train_data.shape[1] - 1)

True

In [5]:
from clean import remove_age_na

# Remove entries where building age is 995
train_data = remove_age_na(train_data, na_value=995)
train_data.shape

(259211, 39)

In [6]:
# from clean import remove_age_old

# # Remove entries where building age is older than 100 years
# train_data = remove_age_old(train_data, age_limit=100)
# train_data.shape

In [7]:
# from clean import remove_high_floors
# # Remove entries where buildings have more than 3 floors (before earthquake)
# train_data = remove_high_floors(train_data)
# train_data.shape

In [8]:
# Subset of columns to keep for fitting the model
cols_to_keep = ['count_floors_pre_eq', 'age', 'area_percentage', 
                'height_percentage',
                'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
                'foundation_type', 'roof_type', 'ground_floor_type', 
                'other_floor_type',
                'land_surface_condition', 'position', 
                'plan_configuration', 'legal_ownership_status',
                # 'has_superstructure_mud_mortar_stone', 
                # 'has_superstructure_cement_mortar_brick', 
                # 'has_superstructure_timber',
                # 'has_secondary_use', 'has_secondary_use_agriculture',
                # 'has_secondary_use_hotel', 'has_secondary_use_rental',
                # 'has_secondary_use_institution', 'has_secondary_use_school', 
                # 'has_secondary_use_industry', 'has_secondary_use_health_post',
                # 'has_secondary_use_gov_office', 'has_secondary_use_use_police', 
                # 'has_secondary_use_other',
                'damage_grade']

train_data = train_data[cols_to_keep].copy()

In [9]:
# Prepare data for preprocessing and modelling
TARGET = 'damage_grade'

X = train_data.copy()
y = train_data.pop(TARGET)

In [10]:
# Split into train and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2,
                                                      stratify=y,
                                                      random_state=42)

In [11]:
# Define list of columns for each pre-processing step
cols_ohe = cols_categorical_type + cols_categorical_other
cols_target_enc = cols_categorical_geo # ['geo_level_1_id', 'geo_level_2_id'] # ['geo_level_1_id']  
cols_log_transf = ['age', 'area_percentage', 'height_percentage']
cols_to_scale = ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage']

# Create a pipeline for preprocessing data
preprocess = ColumnTransformer([
    ('onehot_encoder', OneHotEncoder(), cols_ohe),
    ('target_encoder', TargetEncoder(target_type="multiclass"), cols_target_enc),
    ('log_transform', FunctionTransformer(np.log1p), cols_log_transf),
    ('min_max_scaler', MinMaxScaler(), cols_to_scale),
])

In [14]:
# Evaluate performance of Logistic Regression model
log_reg = LogisticRegression(penalty='l2', multi_class='multinomial', 
                             max_iter=1000, random_state=42)  #  class_weight='balanced',
                            

base_pipe = Pipeline(steps=[
    ('preprocess', preprocess),
    ('base_model', log_reg)
])

base_score_valid, base_score_train = evaluate_model(base_pipe, 
                                                    X_train, X_valid, 
                                                    y_train, y_valid)

print(f"F1-score of LogisticRegression model (valid): {base_score_valid :.3f}")

F1-score of LogisticRegression model (valid): 0.691


In [None]:
# Evaluate performance of XGBoostClassifier model
xgboost = XGBClassifier(learning_rate=0.1, n_estimators=100, random_state=42)

xgb_pipe = Pipeline(steps=[
    ('preprocess', preprocess),
    ('xgb_model', xgboost)
])

xgb_score_valid, xgb_score_train = evaluate_model(xgb_pipe, 
                                                  X_train, X_valid,
                                                  y_train-1, y_valid-1)

print(f"F1-score of XGBoost model (valid): {xgb_score_valid :.3f}")

F1-score of XGBoost model (valid): 0.740


In [None]:
# Evaluate performance of GradientBoostingClassifier model
# gboostc = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, random_state=42)

# gbc_pipe = Pipeline(steps=[
#     ('preprocess', preprocess),
#     ('gbc_model', gboostc)
# ])

# gbc_score_valid, gbc_score_train = evaluate_model(gbc_pipe, 
#                                                   X_train, X_valid,
#                                                   y_train, y_valid)

# print(f"F1-score of GradBoostClassifier model (valid): {gbc_score_valid :.3f}")

In [None]:
# Load test data and save predictions into a file for submission
# test_data = load_test_data(local=True)
# save_submission(base_pipe, test_data)