In [1]:
from datetime import datetime
from pathlib import Path
import sys

# Make the project-local packages importable. The entries are relative paths, so
# they resolve against the kernel's working directory (the notebook's folder).
sys.path.extend(['../src', '../submissions'])

from utils import Data, Model, Submission

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


# Remove the column limit so wide DataFrames are displayed without truncation.
pd.options.display.max_columns = None

In [2]:
# Load training data
# `Data` is the project helper imported from `utils`. The returned object is used
# as a pandas DataFrame by the following cells (drop / pop / select_dtypes).
train_data = Data.load_train_data()

In [3]:
# Discard the three geo_level identifier columns; they are not used as features
# by the baseline model.
# NOTE: train_data is rebound here, so re-running this cell without reloading
# the data raises a KeyError (the columns are already gone).
cols_to_drop = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
train_data = train_data.drop(cols_to_drop, axis='columns')

In [4]:

# Prepare data for preprocessing and modelling
TARGET = 'damage_grade'

# Remove the target from train_data *before* copying the features.
# DataFrame.pop() mutates train_data in place, so taking the copy first would
# leave the target column inside X (and therefore inside X_train / X_valid).
# NOTE: because pop() is destructive, re-running this cell without reloading
# train_data raises a KeyError.
y = train_data.pop(TARGET)
X = train_data.copy()

# Label encode target variable: maps the original class labels to 0..n_classes-1.
# label_enc is kept so predictions can be mapped back with inverse_transform().
label_enc = LabelEncoder()
y = label_enc.fit_transform(y)

In [5]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions approximately equal in both parts, and the fixed random_state
# makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [6]:
# Create a basic preprocessing transformer
# Numerical features are listed explicitly; categorical features are every
# object-dtype column that remains in train_data at this point.
cols_numerical = ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage']
cols_categorical = train_data.select_dtypes(include='object').columns

# Columns that belong to neither group are discarded, because ColumnTransformer
# defaults to remainder='drop'.
base_preprocess = ColumnTransformer([
    # handle_unknown='ignore': a category level that was not present when the
    # encoder was fitted is encoded as all zeros instead of raising an error
    # when the validation or test rows are transformed.
    ('onehot', OneHotEncoder(handle_unknown='ignore'), cols_categorical),
    # Rescale each numerical feature to the [0, 1] range seen during fit.
    ('scale', MinMaxScaler(), cols_numerical)
])

In [7]:
# Baseline classifier: multinomial logistic regression with a fixed seed and an
# iteration cap of 1000 (raised from scikit-learn's default of 100).
# NOTE(review): the `multi_class` argument is deprecated since scikit-learn 1.5;
# it is kept here so the behaviour matches the recorded run.
logistic_regression = LogisticRegression(
    multi_class='multinomial',
    max_iter=1000,
    random_state=42,
)

# Chain the preprocessing transformer and the classifier so both are fitted and
# applied together.
base_pipe = Pipeline([
    ('preprocess', base_preprocess),
    ('base_model', logistic_regression),
])

In [8]:
# Wrap the pipeline in the project's Model helper and score the logistic
# regression baseline. evaluate_model yields two values, unpacked here as the
# validation score followed by the training score.
base_model = Model(base_pipe)
base_score_valid, base_score_train = base_model.evaluate_model(
    X_train, X_valid, y_train, y_valid
)

# Only the validation score is reported.
print(f"F1-score of the base model: {base_score_valid :.3f}")

F1-score of the base model: 0.576


In [9]:
# Load test data (predictions are generated and saved in the next cell)
# NOTE(review): unlike train_data, the geo_level columns are not dropped here.
# Presumably that is harmless because the pipeline's ColumnTransformer selects
# its input columns by name - confirm against Submission in utils.
test_data = Data.load_test_data()

In [10]:
# One timestamp (seconds since the epoch, as a float) identifies this run; it is
# reused when the model is saved so both artifacts can be matched up.
timestamp = datetime.now().timestamp()

# Create and save a submission file in submissions/
submission = Submission(base_model, test_data)
submission.save_submission(timestamp)

PosixPath('../submissions/submission1706959851.csv')

In [11]:
# Save model as .pickle file in models/
# Uses the same `timestamp` that was passed to save_submission, so the model file
# and the submission file from this run share one identifier.
base_model.save_model(timestamp)

PosixPath('../models/model_1706959851.pickle')