In [1]:
import sys
sys.path.append('../src')
sys.path.append('../submissions')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from load import load_train_data, load_test_data
from evaluate import evaluate_model
from submit import save_submission

# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [3]:
# Load training data through the project's `load` module.
# NOTE(review): `local=True` presumably reads a local copy of the data rather
# than fetching it remotely — confirm against load.py.
train_data = load_train_data(local=True)

In [4]:
# Remove the three geographic-level identifier columns (geo_level_1_id .. geo_level_3_id)
cols_to_drop = [f'geo_level_{level}_id' for level in (1, 2, 3)]
train_data = train_data.drop(cols_to_drop, axis='columns')

In [5]:
# Prepare data for preprocessing and modelling
TARGET = 'damage_grade'

# Order matters: pop the label out of `train_data` first, then copy what is
# left into X. Copying before popping would leave the target column inside
# the feature matrix (and inside X_train / X_valid after the split).
# After this cell `train_data` no longer contains TARGET.
y = train_data.pop(TARGET)
X = train_data.copy()

In [6]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# Create a basic preprocessing transformer
cols_numerical = ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage']
# Every object-dtype column of the training frame is treated as categorical
cols_categorical = train_data.select_dtypes(include='object').columns

# Only the columns listed below reach the model: ColumnTransformer's default
# `remainder='drop'` discards every column that is not named in a transformer.
base_preprocess = ColumnTransformer([
    # handle_unknown='ignore' encodes a category that was not seen during fit
    # as an all-zero row instead of raising at predict time (the default
    # 'error' would abort prediction on validation/test data).
    ('onehot', OneHotEncoder(handle_unknown='ignore'), cols_categorical),
    # Rescale the numeric features to the [0, 1] range
    ('scale', MinMaxScaler(), cols_numerical)
])

In [8]:
# Baseline model: multinomial logistic regression chained after the
# preprocessing transformer in a single pipeline.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases
# (1.5+) — confirm the installed version before upgrading.
logistic_regression = LogisticRegression(
    multi_class='multinomial',
    max_iter=500,
    random_state=42,
)

base_pipe = Pipeline([
    ('preprocess', base_preprocess),
    ('base_model', logistic_regression),
])

# Score the baseline with the project's evaluation helper.
# NOTE(review): presumably fits on the train split and returns the F1-score
# on the validation split — confirm in evaluate.py.
base_score = evaluate_model(base_pipe, X_train, X_valid, y_train, y_valid)
print(f"F1-score of the base model: {base_score :.3f}")

F1-score of the base model: 0.576


In [9]:
# Load the test set and pass it, with the baseline pipeline, to the submission
# helper, which saves the predictions into a file for submission.
# NOTE(review): assumes `base_pipe` was already fitted by evaluate_model — confirm.
test_data = load_test_data(local=True)
save_submission(base_pipe, test_data)