In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

from pathlib import Path
# Route DataFrame/Series .plot() calls through plotly instead of matplotlib.
pd.options.plotting.backend = "plotly"
# Do not truncate wide frames: show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)


import matplotlib.pyplot as plt
import seaborn as sns

# Load data

In [2]:
# Read the three competition files in one pass; each is indexed by building_id
# so that rows can be matched across files by building id.
train_X, train_y, X_test = (
    pd.read_csv(csv_path, index_col='building_id')
    for csv_path in (
        '../data/train_values.csv',   # training features
        '../data/train_labels.csv',   # training labels
        '../data/test_values.csv',    # features to predict for the submission
    )
)

# Explore features

In [3]:
# Geographic ids are codes, not quantities: cast them to strings so that they
# are selected as categorical (object dtype) columns below.
geo_id_columns = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
# Numeric measurements that are cast to float.
float_columns = ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage']

# Apply exactly the same transformations to the training and the test features,
# so the fitted pipeline sees identical dtypes and columns at predict time
# (previously the float cast was applied to train_X only).
for frame in (train_X, X_test):
    frame[geo_id_columns] = frame[geo_id_columns].astype(str)
    frame[float_columns] = frame[float_columns].astype(float)
    # Derived feature: product of the area and height columns.
    frame['volume_percentage'] = frame['area_percentage'] * frame['height_percentage']

# Categorical columns: everything stored with object dtype.
categorical_columns = list(train_X.select_dtypes(include=['object']).columns)
# Numerical columns: all remaining columns, kept in DataFrame order.
# A set difference would work too, but set iteration order for strings changes
# between interpreter runs (hash randomisation), which would reshuffle the
# column order fed to the model on every kernel restart.
numerical_columns = [c for c in train_X.columns if c not in categorical_columns]

In [4]:
# De-duplicate on building_id *before* discarding it: once the index has been
# reset every label is unique, so an index-based duplicate check performed
# afterwards can never find anything.
train_X = train_X[~train_X.index.duplicated(keep='first')]
train_y = train_y[~train_y.index.duplicated(keep='first')]

# Pair labels with features by building_id instead of relying on both files
# listing buildings in the same row order; .loc raises a KeyError if a
# building in train_X has no label.
train_y = train_y.loc[train_X.index]

# Drop building_id (index) from X and y
train_X = train_X.reset_index(drop=True)
train_y = train_y.reset_index(drop=True)

In [5]:
# Flag every repeated index label (the first occurrence is not flagged) and
# apply the same row selection to features and labels so they stay paired.
# NOTE(review): train_X's index was replaced by a default integer index in the
# previous cell, so this mask is expected to be all False at this point.
duplicate_index_mask = train_X.index.duplicated(keep='first')
first_occurrence = ~duplicate_index_mask
X, y = train_X[first_occurrence], train_y[first_occurrence]

# Splitting the data

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
split_frames = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
X_train, X_valid, y_train, y_valid = split_frames

## Encode categorical values

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, TargetEncoder

# Categorical features are target-encoded, treating the label as a continuous
# value (target_type="continuous").
categorical_steps = [
    ("target", TargetEncoder(target_type="continuous")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Full preprocessing: encode the categorical columns, forward the numerical
# columns unchanged. Output columns follow the order of this list.
column_transformers = [
    ("cat", categorical_transformer, categorical_columns),
    ("numerical", "passthrough", numerical_columns),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Model Training

## XGBoost

In [8]:
# Shift the labels down by one so the classes start at 0 (0, 1, 2), matching
# num_class=3 in the XGBoost model. Subtraction returns a new frame, so
# y_train itself is left untouched.
y_train_encoded = y_train.sub(1)
y_train_encoded

Unnamed: 0,damage_grade
5654,0
28094,0
151910,0
53449,1
202567,2
...,...
176963,0
117952,2
173685,2
43567,2


In [9]:
# Same 0-based shift as for the training labels, so validation labels are
# directly comparable with the model's predictions.
y_valid_encoded = y_valid.sub(1)
y_valid_encoded

Unnamed: 0,damage_grade
64404,1
258922,2
55765,1
103394,1
25942,2
...,...
222539,2
197929,1
24208,2
199919,0


In [12]:
from xgboost import XGBClassifier

# Gradient-boosted trees for the three damage classes (labels 0, 1, 2).
xg_model = XGBClassifier(
    objective='multi:softmax',  # multi-class objective
    num_class=3,                # number of classes in the target variable
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=57,
)

# Chain preprocessing and model so the encoders are fitted only on the data
# passed to fit().
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("model", xg_model),
]
xg_clf = Pipeline(steps=pipeline_steps)

# Fit on the 0-based label column; fit() returns the pipeline itself, so
# leaving it as the last expression also displays the fitted pipeline.
xg_clf.fit(X_train, y_train_encoded["damage_grade"])



In [13]:
# Predict 0-based class labels for the validation features with the fitted pipeline.
xg_preds = xg_clf.predict(X_valid)

In [15]:
from sklearn.metrics import f1_score

# Micro-averaged F1 on the validation split; both the labels and the
# predictions use the 0-based class encoding.
xg_f1_score = f1_score(
    y_true=y_valid_encoded,
    y_pred=xg_preds,
    average='micro',
)

print(f"F1 score: {xg_f1_score}")

F1 score: 0.7363634619443219


In [16]:
import plotly.graph_objects as go

# Extract feature importances from the fitted XGBoost model inside the pipeline
feature_importances = xg_clf.named_steps['model'].feature_importances_

# Rebuild the feature names in the order the ColumnTransformer emits them:
# the encoded categorical columns first, then the passthrough numerical ones.
features = categorical_columns + numerical_columns

# Guard against names and importances drifting apart (e.g. if an encoder that
# emits several columns per feature is introduced), which would otherwise
# silently mislabel the bars.
assert len(features) == len(feature_importances), (
    f"{len(features)} feature names for {len(feature_importances)} importances"
)

sorted_idx = feature_importances.argsort()

# Create a horizontal bar plot, least important feature first
fig = go.Figure([go.Bar(x=feature_importances[sorted_idx], y=[features[i] for i in sorted_idx], orientation='h')])

# Update layout (the title previously referred to a random forest, but the
# plotted model is the XGBoost classifier)
fig.update_layout(title='Feature Importances in XGBoost Model',
                  xaxis_title='Importance',
                  yaxis_title='Feature',
                  yaxis={'categoryorder':'total ascending'},
                  height=600, width=800)

# Show the plot
fig.show()


## Grid Search

In [8]:
from sklearn.model_selection import GridSearchCV

In [17]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters; the "model__" prefix routes each value to the
# "model" step of the pipeline.
param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__learning_rate=[0.01, 0.05, 0.1],
)

# Exhaustive search over the grid: 3-fold cross-validation scored with
# micro-averaged F1, using all available cores and per-fit progress output.
grid_search = GridSearchCV(
    estimator=xg_clf,
    param_grid=param_grid,
    scoring='f1_micro',
    cv=3,
    n_jobs=-1,
    verbose=3,
)


In [18]:
# Run the grid search on the training split, using the 0-based label column.
# This is the expensive cell: every parameter combination is fitted once per fold.
grid_search.fit(X_train, y_train_encoded["damage_grade"])

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[CV 1/3] END model__learning_rate=0.01, model__n_estimators=100;, score=0.728 total time= 5.1min
[CV 3/3] END model__learning_rate=0.01, model__n_estimators=100;, score=0.728 total time= 5.1min
[CV 2/3] END model__learning_rate=0.01, model__n_estimators=100;, score=0.729 total time= 5.2min
[CV 1/3] END model__learning_rate=0.01, model__n_estimators=200;, score=0.729 total time= 9.9min
[CV 2/3] END model__learning_rate=0.01, model__n_estimators=200;, score=0.731 total time=10.0min
[CV 3/3] END model__learning_rate=0.01, model__n_estimators=200;, score=0.730 total time=10.0min
[CV 1/3] END model__learning_rate=0.05, model__n_estimators=100;, score=0.733 total time= 5.0min
[CV 2/3] END model__learning_rate=0.05, model__n_estimators=100;, score=0.735 total time= 5.1min
[CV 3/3] END model__learning_rate=0.05, model__n_estimators=100;, score=0.734 total time= 5.4min
[CV 1/3] END model__learning_rate=0.01, model__n_estimators=300;, score=0.731 total time=15.5min
[CV 2/3] END model__learning_r

In [19]:
# Parameter combination that achieved the best cross-validated score.
best_parameters = grid_search.best_params_
print("Best Parameters:", best_parameters)

Best Parameters: {'model__learning_rate': 0.1, 'model__n_estimators': 300}


In [20]:
# Mean cross-validated score (scoring='f1_micro') of the best parameter combination.
best_score = grid_search.best_score_
print("Best Score:", best_score)

Best Score: 0.7386502737902004


In [21]:
# Pipeline configured with the best parameters; with GridSearchCV's default
# refit behaviour it has been refitted on all data passed to grid_search.fit().
best_estimator = grid_search.best_estimator_
print("Best Estimator:", best_estimator)

Best Estimator: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('target',
                                                                   TargetEncoder(target_type='continuous'))]),
                                                  ['geo_level_1_id',
                                                   'geo_level_2_id',
                                                   'geo_level_3_id',
                                                   'land_surface_condition',
                                                   'foundation_type',
                                                   'roof_type',
                                                   'ground_floor_type',
                                                   'other_floor_type',
                                                   'position',
                                                   'plan_configuration',
         

In [29]:
# Predict on the test features and undo the 0-based label shift (+1) so the
# grades are back on the original scale.
best_xg_preds = best_estimator.predict(X_test) +1

# One 'damage_grade' column, indexed like X_test so every prediction stays
# attached to its building.
best_xg_submission = pd.Series(
    best_xg_preds,
    index=X_test.index,
    name='damage_grade',
).to_frame()

best_xg_submission.to_csv('best_xg_submission.csv')

In [27]:
# Write the submission frame built in the previous cell. The name used here
# before (best_xg_submission_df) is never assigned in this notebook, so the
# cell raised a NameError on a fresh kernel.
best_xg_submission.to_csv('best_xg_submission.csv')

In [29]:
# Much smaller variant of the XGBoost model (11 trees instead of 100).
# NOTE: this rebinds xg_model / xg_clf, replacing the pipeline defined earlier.
# The use_label_encoder argument is omitted, matching the earlier model cell,
# where it is commented out.
xg_model = XGBClassifier(
    n_estimators=11,
    learning_rate=0.1,
    max_depth=5,
    random_state=57,
    objective='multi:softmax', # Specify the multi-class objective
    num_class=3 # Number of classes in the target variable
)

# Create the pipeline with XGBoost
xg_clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", xg_model)
])

# Fitting the model on the 1-D label column (not the one-column DataFrame),
# consistent with the other fit calls in this notebook.
xg_clf.fit(X_train, y_train_encoded["damage_grade"])




In [30]:
# Validation predictions (0-based labels) from the pipeline fitted in the
# previous cell; this overwrites the earlier xg_preds.
xg_preds = xg_clf.predict(X_valid)