In [1]:
from datetime import datetime
from pathlib import Path
import sys
sys.path.append('../src')
sys.path.append('../submissions')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from utils import save_submission, save_model, load_model
from encoding import freq_encode, get_house_volume

# Lift pandas' column display limit so wide DataFrames are shown in full
pd.set_option('display.max_columns', None)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, TargetEncoder, FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

In [3]:
# Load training data
# NOTE(review): `load_train_data` is not imported in any cell shown in this
# notebook (only save_submission, save_model and load_model come from utils),
# so this line fails on a fresh kernel -- confirm its module and import it.
train_data = load_train_data(local=True)

In [4]:
# Prepare data for preprocessing and modelling
TARGET = 'damage_grade'

# Build features and target without mutating `train_data`:
#  - X must not contain the target column (copying first and popping afterwards
#    left TARGET inside X, which risks target leakage),
#  - leaving `train_data` intact keeps this cell safe to re-run.
X = train_data.drop(columns=[TARGET])
y = train_data[TARGET]

In [5]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed random_state makes the
# split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


In [7]:
# First we run an XGB Classifier with default parameters, plugged into the
# saved baseline pipeline in place of its `model` step.

base_line_pipeline = load_model('../models/model_baseline.pickle')

new_model = XGBClassifier()

# NOTE: set_params returns the object it was called on (assuming a
# scikit-learn estimator/pipeline), so `new_pipeline` is an alias of
# `base_line_pipeline`, not an independent copy.
new_pipeline = base_line_pipeline.set_params(model=new_model)

# Labels used for fitting/scoring. For XGBoost models they are shifted down by
# one (XGBClassifier expects classes numbered from 0; presumably damage_grade
# starts at 1 -- TODO confirm). The originals y_train / y_valid stay untouched.
if isinstance(new_model, XGBClassifier):
    y_train_ = y_train.sub(1).astype(int)
    y_valid_ = y_valid.sub(1).astype(int)
else:
    y_train_ = y_train.copy()
    y_valid_ = y_valid.copy()

# Evaluate model performance
score_valid, score_train = evaluate_model(new_pipeline, X_train, X_valid, y_train_, y_valid_)
print(f"{new_model}:")
print(f"F1-score (validation): {score_valid :.3f}")
print(f"F1-score (training): {score_train :.3f}")
print("________________________________________")

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              objective='multi:softprob', predictor=None, ...):
F1-score (validation): 0.745
F1-score (training): 0.765
________________________________________


In [8]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Search space for the hyperparameter optimisation.
# hp.randint entries are sampled as integers and hp.uniform entries as floats
# within the given bounds; 'seed' is a fixed constant, not a sampled dimension.
space = {
    'max_depth': hp.randint("max_depth", 3, 18),
    'gamma': hp.uniform('gamma', 1, 9),
    'reg_alpha': hp.randint('reg_alpha', 40, 180),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.randint('min_child_weight', 0, 10),
    'n_estimators': hp.randint('n_estimators', 50, 200),
    'seed': 0,
}

In [9]:
# Define objective of search: the value that matters is the returned "loss"
def objective(space):
    """Score one sampled hyperparameter configuration.

    Parameters
    ----------
    space : dict
        One sample from the search space; must provide the keys 'seed',
        'max_depth', 'gamma', 'reg_alpha', 'reg_lambda', 'colsample_bytree',
        'min_child_weight' and 'n_estimators'.

    Returns
    -------
    dict
        {'loss': 1 - validation score, 'status': STATUS_OK}, so that a higher
        validation score yields a lower loss.

    Notes
    -----
    The XGBClassifier built here replaces the `model` step of the notebook-level
    `base_line_pipeline` via set_params, and the evaluation uses the
    notebook-level X_train / X_valid / y_train_ / y_valid_.
    """
    param_names = (
        'seed',
        'max_depth',
        'gamma',
        'reg_alpha',
        'reg_lambda',
        'colsample_bytree',
        'min_child_weight',
        'n_estimators',
    )
    candidate = XGBClassifier(**{name: space[name] for name in param_names})

    pipeline = base_line_pipeline.set_params(model=candidate)

    # Only the validation score feeds the loss; the training score is ignored.
    valid_score, _ = evaluate_model(pipeline, X_train, X_valid, y_train_, y_valid_)

    return {'loss': 1 - valid_score, 'status': STATUS_OK}


In [10]:
# Perform hyperparameter search
# `trials` records every evaluated configuration; `best_hyperparams` receives
# the sampled values of the configuration with the lowest loss.
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 10,
                        trials = trials,
                        # Fixed random state so the search is reproducible on
                        # re-run. hyperopt >= 0.2.7 expects a numpy Generator;
                        # older releases need np.random.RandomState(42) instead.
                        rstate = np.random.default_rng(42))


100%|██████████| 1/1 [00:39<00:00, 39.52s/trial, best loss: 0.25743941981159224]


In [11]:
# Run the model with the best parameters found by the search.
# `best_hyperparams` comes from fmin and is not guaranteed to carry the constant
# 'seed' entry of `space`, so the seed is added explicitly to build the model
# with the same configuration as the trials in `objective`.
best_params = {'seed': space['seed'], **best_hyperparams}
new_model = XGBClassifier(**best_params)
clf = base_line_pipeline.set_params(model=new_model)

score_valid, score_training = evaluate_model(clf, X_train, X_valid, y_train_, y_valid_)

print(f"F1-score (validation): {score_valid :.3f}")
print(f"F1-score (training): {score_training :.3f}")

In [12]:
# Load test data and save predictions into a file for submission
# NOTE(review): `load_test_data` is not imported in any cell shown in this
# notebook -- confirm its module and import it.
test_data = load_test_data(local=True)

# Create timestamp for filenames of model and submission files
# (datetime.timestamp() returns a float number of seconds since the epoch)
timestamp =  datetime.now().timestamp()

# Save submission file
# NOTE(review): `clf` was evaluated with labels shifted by -1 (y_train_ /
# y_valid_). If save_submission writes the classifier's predictions as-is, the
# submitted damage grades may be off by one -- confirm in utils.save_submission.
submission_fpath = save_submission(clf, test_data, timestamp)

# Save model
model_fpath = save_model(clf, timestamp)