In [1]:
import mlflow
import pickle
import pandas as pd
import numpy as np
from collections import Counter
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, TargetEncoder
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTENC

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://127.0.0.1:5000'


In [4]:
# mlflow.search_experiments()

In [3]:
mlflow.set_experiment("earthquake_damage-experiment-1")

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1722947787483, experiment_id='1', last_update_time=1722947787483, lifecycle_stage='active', name='earthquake_damage-experiment-1', tags={}>

In [6]:
with mlflow.start_run():
    
    mlflow.log_param('data-preprocessing', 'target encoding')
    
    data_df = pd.read_csv('./data/train_values.csv')
    data_df.drop(columns='building_id', inplace=True)

    label_df = pd.read_csv('./data/train_labels.csv')
    y = label_df['damage_grade'] - 1
    
#     balancing the labels using SMOTENC
#     smotenc = SMOTENC(categorical_features=[7,8,9,10,11,12,13,14,25])
#     train_x, train_y = smotenc.fit_resample(data_df, y)
#     print(f"balanced labels: {Counter(train_y)}")

    train_x = data_df
    train_y = y
    
    # split the data into training and validation
    train_x, valid_x, train_y, valid_y = train_test_split(train_x,
                                                          train_y, 
                                                          test_size=0.3,
                                                          random_state=49
                                                         )
    # encoding
    # using target encoding
    te = TargetEncoder(smooth='auto')
    train_x = te.fit_transform(train_x, train_y)
    
    valid_x = te.fit_transform(valid_x, valid_y)
    
    
    # using label encoding
#     label_encoder = LabelEncoder()
#     cat_cols = train_x.select_dtypes('object').columns
#     for col in cat_cols:
#         train_x[col] = label_encoder.fit_transform(train_x[col])
        
#     label_encoder = LabelEncoder()
#     cat_cols = valid_x.select_dtypes('object').columns
#     for col in cat_cols:
#         valid_x[col] = label_encoder.fit_transform(valid_x[col])
    

    # start hyperparam tuning
    space = {
    #         'max_depth': hp.quniform('max_depth', 2, 8, 1), # tree
             'max_depth': hp.choice('max_depth', np.arange(2, 8, dtype=int)),
             'learning_rate': hp.loguniform('learning_rate', -5, -2), #boosting
             'subsample': hp.uniform('subsample', 0.5, 1), #stochastic
             'n_estimators': hp.choice('n_estimators', np.arange(300, 400, dtype=int))
            }

    # objective function to minimize

    def objective(params):
        xgb_model = xgb.XGBClassifier(**params)

        xgb_model.fit(train_x, train_y)

        preds = xgb_model.predict(valid_x)

        score = f1_score(valid_y, preds, average='micro')

        return {'loss': -score,
               'status': STATUS_OK}


    # perform the optimization
    trials = Trials()
    
    best_params = fmin(objective,
                       space,
                       algo=tpe.suggest,
                       max_evals=100,
                       trials=trials)
    
#     mlflow.log_param('Best F1 score', best)
    mlflow.log_param('Best params', best_params)
    mlflow.log_param('learning rate', best_params['learning_rate'])
    mlflow.log_param('subsample', best_params['subsample'])
    mlflow.log_param('max depth', best_params['max_depth'])
    mlflow.log_param('n_estimators', best_params['n_estimators'])
    mlflow.log_metric('Best F1 score', trials.best_trial['result']['loss'])

    print(f"best params: {best_params}")

100%|█| 100/100 [1:06:45<00:00, 40.06s/trial, best loss: -0.7197017


2024/08/09 13:08:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run stately-dog-247 at: http://127.0.0.1:5000/#/experiments/1/runs/9b8f702fcd71462ea418418f0de1c001.
2024/08/09 13:08:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


best params: {'learning_rate': 0.010553140697085937, 'max_depth': 5, 'n_estimators': 93, 'subsample': 0.554146890258596}


In [None]:
# best loss: -0.7177319297527532]
# best params: {'learning_rate': 0.1341710726910401, 'max_depth': 5, 'subsample': 0.7574310350346926}

In [7]:
test_df = pd.read_csv('./data/test_values.csv')
test_df.drop(columns=['building_id'], inplace=True)

# label_encoder = LabelEncoder()
# cat_cols = test_df.select_dtypes('object').columns
# for col in cat_cols:
#     test_df[col] = label_encoder.fit_transform(test_df[col])

# using target encoder
test_x = te.transform(test_df)

In [8]:
clf = xgb.XGBClassifier(best_params,
#                         early_stopping_rounds=50,
#                         n_estimators=500
                       )

clf.fit(train_x, train_y)

preds = clf.predict(test_x)
pred_labels = [p+1 for p in preds]
print(Counter(pred_labels))



Counter({2: 64365, 3: 21726, 1: 777})


In [9]:
test_df = pd.read_csv('./data/test_values.csv')
results = pd.DataFrame(data={'building_id':test_df['building_id'],
                             'damage_grade':pred_labels
                            },
                      dtype='int64')

In [10]:
results.to_csv('predictions.csv',
               index=False)