In [1]:
import pandas as pd
import numpy as np

In [41]:
random_state = 42

# Load data

In [2]:
train_values = pd.read_csv('../data/train_values.csv', index_col='building_id')
train_values.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,0
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
590882,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
201944,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,0


In [3]:
train_labels = pd.read_csv('../data/train_labels.csv', index_col='building_id')
train_labels.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
802906,3
28830,2
94947,3
590882,2
201944,3


# Explore features

In [4]:
# Categorical columns = 
categorical_columns = [c for c in train_values.select_dtypes(include=['object'])]
for c in categorical_columns:
    print(c)

land_surface_condition
foundation_type
roof_type
ground_floor_type
other_floor_type
position
plan_configuration
legal_ownership_status


In [5]:
numerical_columns = list(set(train_values.columns) - set(categorical_columns))
numerical_columns

['has_superstructure_bamboo',
 'has_superstructure_mud_mortar_stone',
 'has_secondary_use_school',
 'has_superstructure_cement_mortar_brick',
 'has_secondary_use_institution',
 'has_secondary_use_gov_office',
 'has_secondary_use',
 'has_secondary_use_hotel',
 'height_percentage',
 'count_floors_pre_eq',
 'has_secondary_use_other',
 'area_percentage',
 'has_secondary_use_health_post',
 'count_families',
 'age',
 'has_secondary_use_agriculture',
 'has_secondary_use_rental',
 'has_superstructure_rc_engineered',
 'has_secondary_use_industry',
 'has_superstructure_adobe_mud',
 'geo_level_2_id',
 'has_secondary_use_use_police',
 'geo_level_3_id',
 'geo_level_1_id',
 'has_superstructure_timber',
 'has_superstructure_mud_mortar_brick',
 'has_superstructure_other',
 'has_superstructure_stone_flag',
 'has_superstructure_rc_non_engineered',
 'has_superstructure_cement_mortar_stone']

In [6]:
train_values.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,0
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
590882,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
201944,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,0


### Drop duplicates

In [7]:
duplicate_index_mask = train_values.index.duplicated(keep='first')
train_values = train_values[~duplicate_index_mask]
train_labels = train_labels[~duplicate_index_mask]

### Drop the index column

In [8]:
# Drop building_id (index) from X and y
train_values.reset_index(drop=True, inplace=True)
train_labels.reset_index(drop=True, inplace=True)
# Convert y to a Series instead of a DataFrame
train_labels = train_labels['damage_grade']


In [9]:
# convert geo data to categorical
def fixup_geo_data(data):
    geo_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
    for c in geo_cols:
        data[c] = data[c].astype('object')
fixup_geo_data(train_values)

## Split the data

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(train_values, train_labels, train_size=0.8, test_size=0.2, random_state=random_state)

All possible values from all categorical columns are present in the train set

## Encode categorical values

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, TargetEncoder

In [12]:
def build_preprocessor_drop_cat():
    return ColumnTransformer(
    transformers=[
        ('cat', 'drop', categorical_columns),
        ('numerical', 'passthrough', numerical_columns),
    ])

def build_preprocessor_ordinal_enc():
    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    # Bundle preprocessing pipeline
    return ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_columns),
            ("numerical", "passthrough", numerical_columns),
        ])

def build_preprocessor_target_enc():
    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('target', TargetEncoder(target_type='continuous'))
    ])

    # Bundle preprocessing pipeline
    return ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_columns),
            ("numerical", "passthrough", numerical_columns),
        ])

In [13]:
preprocessors = dict(
    #drop_cat=build_preprocessor_drop_cat(),
    #ordinal_enc_cat=build_preprocessor_ordinal_enc(),
    target_enc_cat=build_preprocessor_target_enc())

# Bayesian search for best hyperparameters for XGBoost

In [49]:
# for preprocessing the data
#from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import GradientBoostingClassifier
from skopt import BayesSearchCV

#random_forest = GradientBoostingClassifier(n_estimators=100, random_state=57)

# Define the hyperparameter search space
search_space = {"n_estimators": (250, 350),
                "max_depth": (5, 30),
                "learning_rate": (0.02, 1.0, 'log-uniform'),
                "min_samples_split": (2, 10),
                "min_samples_leaf": (1, 10)}

# Initialize the GradientBoostingClassifier
gbc = GradientBoostingClassifier(random_state=random_state)

# Initialize the BayesSearchCV
opt = BayesSearchCV(gbc, search_space, n_iter=50, cv=None, n_jobs=-1, verbose=3)

In [15]:
from dataclasses import dataclass

@dataclass 
class EvaluatedModel:
    name: str
    pipeline: Pipeline
    score: float

In [16]:
# Monkey-patch deprecated Numpy functions (still used by skopt)
np.int = np.int_

In [19]:
from skopt.callbacks import DeadlineStopper, DeltaYStopper
overdone_control = DeltaYStopper(delta=0.001)                    # We stop if the gain of the optimization becomes too small

In [50]:
def fit_model_via_pipeline(name, pipeline, model):
    clf = Pipeline(
       steps=[('preprocessor', pipeline),
              ('model', model)])

    X = X_train.copy()
    y = y_train.copy()

    #kwargs = {"callback": overdone_control}
    kwargs = {"X":X_valid, "y":y_valid, "model__callback": overdone_control}
    clf.fit(**kwargs)
    score = clf.score(X=X_valid, y=y_valid)
    print(f"Evaluated model score: {score}")
    # Print the best parameters and score
    print("Best parameters found: ", clf.best_params_)
    print("Best score found: ", clf.best_score_)
    return EvaluatedModel(name=name, pipeline=clf, score=score)

In [51]:
fitted_pipelines = []
for key in preprocessors:
    model = fit_model_via_pipeline(key, preprocessors[key], opt)
    fitted_pipelines.append(model)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 3/5] END learning_rate=0.7859523653146779, max_depth=12, min_samples_leaf=4, min_samples_split=9, n_estimators=344;, score=0.690 total time= 3.2min
[CV 1/5] END learning_rate=0.7859523653146779, max_depth=12, min_samples_leaf=4, min_samples_split=9, n_estimators=344;, score=0.677 total time= 3.6min
[CV 2/5] END learning_rate=0.7859523653146779, max_depth=12, min_samples_leaf=4, min_samples_split=9, n_estimators=344;, score=0.679 total time= 3.6min
[CV 5/5] END learning_rate=0.7859523653146779, max_depth=12, min_samples_leaf=4, min_samples_split=9, n_estimators=344;, score=0.676 total time= 3.6min
[CV 4/5] END learning_rate=0.7859523653146779, max_depth=12, min_samples_leaf=4, min_samples_split=9, n_estimators=344;, score=0.678 total time= 3.7min
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 4/5] END learning_rate=0.768730400864134, max_depth=25, min_samples_leaf=1, min_samples_split=6, n_estimators=345;, sc

KeyboardInterrupt: 

In [None]:
best_pipeline = max(fitted_pipelines, key=lambda x:x.score)
#best_preprocessor_key = 'target_enc_cat'
print(f"The best preprocessor: {best_pipeline.name}, score: {best_pipeline.score}")

The best preprocessor: target_enc_cat, score: 0.6809347479902534


# Prediction on validation data

In [None]:
clf = best_pipeline.pipeline
preds_valid = clf.predict(X_valid)

In [None]:
from sklearn.metrics import f1_score
print(f"Score on validation data: {f1_score(y_valid, preds_valid, average='micro')}")

Score on validation data: 0.6809347479902534


# Prediction on test data

In [None]:
X_test = pd.read_csv('../data/test_values.csv', index_col='building_id')
# Drop building_id (index)
X_test.reset_index(drop=True, inplace=True)
fixup_geo_data(X_test)

In [None]:
preds_test = clf.predict(X_test)

# Results submission

In [None]:
submission_format = pd.read_csv('../data/submission_format.csv', index_col='building_id')
my_submission = pd.DataFrame(data=preds_test,
                             columns=submission_format.columns,
                             index=submission_format.index)
my_submission.head()

Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,3
99355,2
890251,2
745817,1
421793,3


In [None]:
#my_submission.to_csv('submission.csv')