In [3]:
import pandas as pd
pd.options.plotting.backend = "plotly"
pd.set_option('display.max_columns', None)

import numpy as np

# Load data

In [4]:
X_train = pd.read_csv('../data/train_values.csv', index_col='building_id')
y_train = pd.read_csv('../data/train_labels.csv', index_col='building_id')

X_test = pd.read_csv('../data/test_values.csv', index_col='building_id')

# Explore features

In [5]:
X_train[['geo_level_1_id','geo_level_2_id','geo_level_3_id']] = X_train[['geo_level_1_id','geo_level_2_id','geo_level_3_id']].astype(str)
X_test[['geo_level_1_id','geo_level_2_id','geo_level_3_id']] = X_test[['geo_level_1_id','geo_level_2_id','geo_level_3_id']].astype(str)

X_train['volume_percentage']=X_train['area_percentage'] * X_train['height_percentage']
X_test['volume_percentage']=X_test['area_percentage'] * X_test['height_percentage']

# Categorical columns 
categorical_columns = [c for c in X_train.select_dtypes(include=['object'])]

## Split the data

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, train_size=0.8, test_size=0.2, random_state=0)

In [15]:
from sklearn.utils import resample

def upsample(X_train, y_train):
    data = pd.concat([X_train, y_train], axis=1)

    # Separate classes
    damage_3 = data[data['damage_grade']==3]
    damage_2 = data[data['damage_grade']==2]
    damage_1  = data[data['damage_grade']==1]
 
    # Upsample miniroty class, damage=1
    damage_1_upsampled = resample(damage_1, 
                                  replace=True,     # sample with replacement
                                  n_samples=damage_2.shape[0]) # reproducible results
    
    # Upsample miniroty class, damage=3
    damage_3_upsampled = resample(damage_3, 
                                  replace=True,     # sample with replacement
                                  n_samples=damage_2.shape[0]) # reproducible results

    # Combine majority class with upsampled minority classes
    data_upsampled = pd.concat([damage_1_upsampled, damage_2, damage_3_upsampled])

    y_train_upsampled = data_upsampled[['damage_grade']]
    X_train_upsampled = data_upsampled.drop(['damage_grade'], axis=1)

    return (X_train_upsampled, y_train_upsampled)

X_train_new, y_train_new = upsample(X_train=X_train, y_train=y_train)

All possible values from all categorical columns are present in the train set

## Encode categorical values

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import TargetEncoder

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('target', TargetEncoder(target_type="continuous"))
])

# Bundle preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_columns)
    ])



# Modell training

In [17]:
# for preprocessing the data
#from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=57)

clf = Pipeline(
    steps=[('preprocessor', preprocessor),
           ('model', model)])

clf.fit(X_train, y_train['damage_grade'])


## Show quality metric 

In [18]:
from sklearn.metrics import f1_score

pred_valid = clf.predict(X_valid)

my_f1_score = f1_score(y_valid, pred_valid, average='micro')

print(f"F1 score: {my_f1_score}")

F1 score: 0.7288808733523916


# Prediction on validation data

In [28]:
preds = clf.predict(X_test)

my_submission = pd.DataFrame(data=preds,
                             columns=['damage_grade'],
                             index=X_test.index)

my_submission.to_csv('submission.csv')