In [17]:
import sys
sys.path.append('../src')
sys.path.append('../submissions')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import encoding

pd.set_option('display.max_columns', None)

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


In [3]:
DATA_DIR = Path('../data')
TARGET = 'damage_grade'

# Load data
train_values = pd.read_csv(DATA_DIR / 'train_values.csv', index_col='building_id')
train_labels = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col='building_id')
test_values = pd.read_csv(DATA_DIR / 'test_values.csv', index_col='building_id')

# Join training features and labels into one DataFrame
train_data = train_values.join(train_labels)

train_data.shape
#TODO: put in a function, add to a .py file in src/

(260601, 39)

In [4]:
# Sanity check for no missing values in our dataset
na_count = train_data.isna().sum()
na_count[na_count > 0]

Series([], dtype: int64)

In [6]:
TARGET = 'damage_grade'

X = train_data.copy()
y = train_data.pop(TARGET)

In [7]:
# Pre-process target variable
# A bit of an overkill in our case since it only does a subtraction (minus 1)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_labels.to_numpy().ravel())

In [22]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2,
                                                      stratify=y,
                                                      random_state=42)

In [23]:
# Create a simple pipeline
cols_building_type = [col for col in train_data.columns if '_type' in col]
cols_categorical = train_data.select_dtypes(include='object').columns

preprocess = ColumnTransformer([
    ('onehot', OneHotEncoder(), cols_categorical)
])

log_reg = LogisticRegression(multi_class='multinomial', random_state=42)

simple_pipe = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', log_reg)
])

simple_pipe

In [19]:
def evaluate_model(pipe, X_train, X_valid, y_train, y_valid):
    pipe.fit(X_train, y_train)
    preds_valid = pipe.predict(X_valid)
    score = f1_score(y_valid, preds_valid, average='micro')
    return score, pipe

In [24]:
evaluate_model(simple_pipe, X_train, X_valid, y_train, y_valid)

0.5774447919264788

In [36]:
pd.DataFrame.from_dict(simple_pipe.named_steps)

ValueError: If using all scalar values, you must pass an index

In [40]:
from submit import save_submission
save_submission(simple_pipe, test_values, 'submission_0.csv')