In [1]:
import sys
sys.path.append('../src')
sys.path.append('../submissions')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from clean import remove_missing_values
from evaluate import evaluate_model
from utils import save_submission, load_train_data, load_test_data

# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [2]:
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder, TargetEncoder, FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

In [3]:
# Load training data
# local=True is forwarded to the project helper load_train_data
# (presumably selects a local copy of the data — confirm in utils).
train_data = load_train_data(local=True)

In [4]:
# Remove entries where building age is 995
# NOTE(review): the 995 rule lives inside clean.remove_missing_values and is not
# visible in this notebook — confirm there. The name train_data is rebound to the
# helper's result, so the unfiltered frame is no longer reachable under this name.
train_data = remove_missing_values(train_data)

In [5]:
# Feature engineering on building type
# Collect every column whose name contains 'type'. 'all_type' itself is excluded so
# that re-running this cell does not feed the previously encoded counts back into
# the combination (the original filter picked it up on a second run).
cols_building_type = [col for col in train_data.columns
                      if 'type' in col and col != 'all_type']

# Row-wise sum of the type columns gives one combined value per building
# (presumably a concatenation of category codes if these are string columns — confirm).
train_data['all_type'] = train_data[cols_building_type].sum(axis=1)

# NOTE: the original cell called train_data.drop(columns=cols_building_type) and
# discarded the result, which is a no-op because DataFrame.drop returns a new frame.
# The call is removed rather than assigned: the individual type columns are still
# one-hot encoded by the preprocessing ColumnTransformer, so they must stay.

# Replace each combination by how often it occurs; combinations rarer than
# min_group_size are pooled together by CountEncoder.
transf_type = ColumnTransformer([
    ('transf_type', ce.CountEncoder(min_group_size=1700), 'all_type')
])

train_data['all_type'] = transf_type.fit_transform(train_data).flatten()
np.sort(train_data['all_type'].unique())

array([  1927,   1938,   2041,   2227,   2296,   2647,   2653,   2942,
         3095,   3167,   4093,   6342,   6446,   9715,  18679,  24791,
        25720,  36012, 100439])

In [6]:
# Prepare data for preprocessing and modelling
TARGET = 'damage_grade'

# Separate features from the label without mutating train_data.
# The original copied the frame *before* popping the target, so X still carried the
# label column while train_data lost it (a second run of the cell raised KeyError).
X = train_data.drop(columns=[TARGET])
y = train_data[TARGET]

In [7]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [8]:
# Create a pipeline for preprocessing data
# Column groups, one list per preprocessing step.

# Numeric columns that get min-max scaled.
cols_numerical = [
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
]

# Categorical columns that get one-hot encoded.
cols_one_hot_encode = [
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'land_surface_condition',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]

# Geographic id columns that get target encoded.
cols_target_encode = [
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
]

# Geographic id columns that additionally get frequency encoded.
cols_frequency_encode = [
    'geo_level_2_id',
    'geo_level_3_id',
]

# Numeric columns that additionally get a log1p transform.
cols_log_transform = [
    'age',
    'area_percentage',
    'height_percentage',
]

def freq_encode(df):
    """Replace every value with its relative frequency within its own column.

    The columns to encode are taken from ``df`` itself rather than from a
    module-level list, so the function encodes exactly what the surrounding
    ColumnTransformer hands over and keeps working if that selection changes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should all be frequency encoded.

    Returns
    -------
    pandas.DataFrame
        Same index and column names as ``df``; each cell holds the share of
        rows in ``df`` that carry the same value in that column. Missing
        values stay missing because ``value_counts`` does not count them.

    Notes
    -----
    Frequencies are computed from the frame passed in, not from data seen
    during fitting, so the same value can be encoded differently for
    different input frames.
    """
    return pd.concat(
        [df[col].map(df[col].value_counts(normalize=True)) for col in df.columns],
        axis=1,
    )

def get_house_volume(df):
    """Return a copy of ``df`` with an added ``house_volume`` column.

    ``house_volume`` is the element-wise product of ``area_percentage`` and
    ``height_percentage``. The input frame is left untouched: the original
    implementation wrote the new column into the caller's frame, which
    mutates data outside the function.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``area_percentage`` and ``height_percentage``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``house_volume`` appended as the last column.
    """
    result = df.copy()
    result["house_volume"] = df["area_percentage"] * df["height_percentage"]
    return result

# Column-wise preprocessing. Columns not named in any step are dropped
# (ColumnTransformer's default remainder behaviour).
preprocess = ColumnTransformer([
    # handle_unknown='ignore': a category that was absent when the encoder was fitted
    # (e.g. a rare value that only appears in validation/test data) is encoded as
    # all zeros instead of raising at transform time.
    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore'), cols_one_hot_encode),
    # TargetEncoder shuffles its internal cross-fitting folds in fit_transform; seeding
    # it keeps the encodings (and therefore the scores) reproducible between runs.
    ('target_encoder', TargetEncoder(target_type="multiclass", random_state=42), cols_target_encode),
    ('frequency_encoder', FunctionTransformer(freq_encode), cols_frequency_encode),
    # The log1p outputs are emitted alongside the min-max scaled originals below,
    # so age / area_percentage / height_percentage each appear twice in the output.
    ('log_transform', FunctionTransformer(np.log1p), cols_log_transform),
    ('min_max_scaler', MinMaxScaler(), cols_numerical),
    # ('f_e_house_volume', FunctionTransformer(get_house_volume, validate=False), ['area_percentage', 'height_percentage'])
])

In [9]:
# Evaluate performance of base model
# log_reg = LogisticRegression(multi_class='multinomial', max_iter=1000, random_state=42)

# base_pipe = Pipeline(steps=[
#     ('preprocess', preprocess),
#     ('base_model', log_reg)
# ])

# base_score_valid, base_score_train = evaluate_model(base_pipe, X_train, X_valid, y_train, y_valid)
# print(f"F1-score (valid): {base_score_valid :.3f}")

In [10]:
# Evaluate performance of the XGBoost model
xgboost = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
)

# Preprocessing and classifier chained so both are fitted together.
xgb_pipe = Pipeline([
    ('preprocess', preprocess),
    ('xgb_model', xgboost),
])

# Labels are shifted down by one before being handed to evaluate_model
# (presumably because damage_grade starts at 1 while XGBClassifier expects
# classes starting at 0 — confirm).
xgb_score_valid, xgb_score_train = evaluate_model(
    xgb_pipe, X_train, X_valid, y_train - 1, y_valid - 1
)
print(f"F1-score: {xgb_score_valid :.3f}")

F1-score: 0.741


In [11]:
# Evaluate performance of the scikit-learn GradientBoostingClassifier (currently disabled)
# gboostc = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, random_state=42)

# gbc_pipe = Pipeline(steps=[
#     ('preprocess', preprocess),
#     ('gbc_model', gboostc)
# ])

# gbc_score = evaluate_model(gbc_pipe, X_train, X_valid, y_train, y_valid)
# print(f"F1-score: {gbc_score :.3f}")

In [12]:
# Load test data and save predictions into a file for submission
# test_data = load_test_data(local=True)
# save_submission(base_pipe, test_data)