In [13]:
# Extend the module search path so the project's local helper modules can be imported.
# NOTE(review): relative entries are resolved against the kernel's working directory —
# confirm the notebook is always launched from its own folder.
import sys
sys.path.append('../src')
sys.path.append('../submissions')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
# Local helper modules; these imports must stay below the sys.path.append calls above.
# NOTE(review): presumably found in ../src and ../submissions — confirm.
from load import load_train_data, load_test_data
from evaluate import evaluate_model
from submit import save_submission

# Display every column of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [55]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, TargetEncoder, FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [15]:
# Load training data
# NOTE(review): `local=True` presumably reads from a local copy rather than a remote
# source — confirm against load.load_train_data. Later cells expect the returned frame
# to contain the 'damage_grade' target column alongside the features.
train_data = load_train_data(local=True)

In [16]:
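# Drop geo_level columns -- intentionally disabled: the geo_level_* ids are kept
# because the preprocessing pipeline below target-encodes them.
#cols_to_drop = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
#train_data = train_data.drop(columns=cols_to_drop)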

In [17]:
# Prepare data for preprocessing and modelling
TARGET = 'damage_grade'

# Separate features from the target without mutating train_data:
# - X must not contain the target column (no label hiding among the features);
# - drop()/column selection return new objects, so this cell is idempotent and
#   can be re-run without a KeyError (unlike DataFrame.pop, which removes the
#   column from train_data in place).
X = train_data.drop(columns=[TARGET])
y = train_data[TARGET]

In [18]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [19]:
# Names of the object-dtype columns in train_data
train_data.select_dtypes(include=['object']).columns

Index(['land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'legal_ownership_status'],
      dtype='object')

In [73]:
# Create a simple baseline pipeline: per-column preprocessing followed by a
# logistic-regression classifier. Columns not listed in any of the groups below
# are discarded by ColumnTransformer (its default is remainder='drop').
cols_numerical = ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage']

cols_one_hot_encode = ['land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'legal_ownership_status']

cols_target_encode = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']


preprocess = ColumnTransformer([
    # handle_unknown='ignore': a category that was not seen during fit is encoded
    # as all zeros instead of raising when validation/test rows are transformed.
    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore'), cols_one_hot_encode),
    # TargetEncoder.fit_transform uses shuffled internal cross-fitting; seeding it
    # makes the encoded features (and therefore the score) reproducible.
    ('target_encoder',
     TargetEncoder(target_type="multiclass", random_state=42),
     cols_target_encode),
    ('min_max_scaler', MinMaxScaler(), cols_numerical)
])

# `multi_class` is deprecated since scikit-learn 1.5 and is therefore not passed;
# with the default lbfgs solver a multinomial model is fitted for >2 classes.
# NOTE(review): assumes the target has more than two classes, as implied by
# target_type="multiclass" above.
log_reg = LogisticRegression(max_iter=500, random_state=42)

simple_pipe = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', log_reg)
])

In [74]:
# Evaluate model performance
# NOTE(review): evaluate_model presumably fits simple_pipe on the training split and
# returns an F1 score computed on the validation split — confirm the fitting behaviour
# and the averaging mode in evaluate.evaluate_model.
score = evaluate_model(simple_pipe, X_train, X_valid, y_train, y_valid)
print(f"F1-score: {score :.3f}")

F1-score: 0.733


In [75]:
# Load test data and save predictions into a file for submission
# NOTE(review): this relies on simple_pipe already being fitted (presumably by
# evaluate_model in the previous cell, i.e. on the training split only) — confirm, or
# refit on the full training data before predicting.
test_data = load_test_data(local=True)
save_submission(simple_pipe, test_data, 'submission_0.csv')