In [1]:
import sys

# Put the project's helper directories on the module search path so that the
# local imports further down in this cell can be resolved from the notebook.
sys.path.extend(['../src', '../submissions'])

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from load import load_train_data, load_test_data
from evaluate import evaluate_model
from utils import save_submission, save_model, load_model
from encoding import freq_encode, get_house_volume
from datetime import datetime

# Show every column when a DataFrame is displayed (None lifts the column cap).
pd.options.display.max_columns = None

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, TargetEncoder, FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

In [3]:
# Load training data
# NOTE(review): `local=True` presumably reads a local copy of the dataset instead of
# fetching it remotely — confirm in the `load` module.
train_data = load_train_data(local=True)

In [4]:
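# Drop geo_level columns (currently disabled: the geo_level_*_id columns are consumed by the target and frequency encoders in the pipeline cell)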
#cols_to_drop = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
#train_data = train_data.drop(columns=cols_to_drop)

In [5]:
# Prepare data for preprocessing and modelling
TARGET = 'damage_grade'

# Separate the features from the label without mutating `train_data`.
# `drop` returns a new frame, so X is guaranteed not to contain the target column,
# `train_data` stays intact, and this cell can be re-run safely.
X = train_data.drop(columns=TARGET)
y = train_data[TARGET]

In [6]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# Create a simple pipeline: per-column preprocessing followed by logistic regression.

# Numeric columns, rescaled to [0, 1] by the MinMaxScaler below.
cols_numerical = ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage']

# Categorical columns expanded into one indicator column per category.
cols_one_hot_encode = ['land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'legal_ownership_status']

# Geographic ids replaced by target statistics.
cols_target_encode = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

# WARNING: If changed, also change this variable in src/encoding.py
# TODO: Change this dirty hack :D
cols_frequency_encode = ['geo_level_2_id', 'geo_level_3_id']

# Columns not listed in any transformer are dropped (ColumnTransformer's default
# `remainder='drop'`).
# NOTE(review): OneHotEncoder() defaults to handle_unknown='error', so a category that
# was not seen during fit raises at transform time — consider handle_unknown='ignore'
# if the validation/test data may contain rare categories.
preprocess = ColumnTransformer([
    ('onehot_encoder', OneHotEncoder(), cols_one_hot_encode),
    ('target_encoder', TargetEncoder(target_type="multiclass"), cols_target_encode),
    ('frequency_encoder', FunctionTransformer(freq_encode), cols_frequency_encode),
    ('min_max_scaler', MinMaxScaler(), cols_numerical),
    #('f_e_house_volume', FunctionTransformer(get_house_volume, validate=False), ['area_percentage', 'height_percentage'])
])

# `multi_class` is deprecated since scikit-learn 1.5 and scheduled for removal. With the
# default lbfgs solver and a multiclass target the multinomial formulation is already
# used, so leaving the argument out fits the same model without the FutureWarning.
log_reg = LogisticRegression(max_iter=500, random_state=42)

simple_pipe = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', log_reg)
])

In [8]:
# Evaluate model performance on the hold-out split.
# NOTE(review): evaluate_model presumably fits the pipeline on the training split and
# returns the F1-score on the validation split — confirm in the `evaluate` module.
score = evaluate_model(
    simple_pipe,
    X_train,
    X_valid,
    y_train,
    y_valid,
)
print(f"F1-score: {score :.3f}")

F1-score: 0.733


In [9]:
# Load test data and save predictions into a file for submission
test_data = load_test_data(local=True)

# Create a timestamp for the filenames of the model and submission files.
# datetime.timestamp() returns a float (seconds since the epoch); the same value is
# passed to both helpers so the two saved artefacts can be matched to each other.
timestamp =  datetime.now().timestamp()
# NOTE(review): assumes simple_pipe has already been fitted by the evaluate_model call
# in the previous cell — confirm before relying on the saved predictions.
save_submission(simple_pipe, test_data, timestamp)
save_model(simple_pipe, timestamp)

In [10]:
# Here is an example: load the baseline pipeline and only change the model.
# NOTE(review): the `.pickle` extension suggests load_model unpickles the file — only
# load model files that come from a trusted source.
base_line_pipeline = load_model('../models/model_baseline.pickle')

# Seeded like the train/validation split and the logistic regression so that repeated
# runs of the notebook give the same model.
new_model = GradientBoostingClassifier(random_state=42)

# set_params replaces the step named 'model' in place and returns the same estimator,
# so `new_pipeline` and `base_line_pipeline` refer to one and the same pipeline object.
# Assumes the saved baseline pipeline has a step called 'model'.
new_pipeline = base_line_pipeline.set_params(model=new_model)
new_pipeline