In [33]:
import mlflow
import pickle
import pandas as pd
import numpy as np
from collections import Counter
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTENC

ModuleNotFoundError: No module named 'imblearn'

In [10]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://127.0.0.1:5000'


In [11]:
mlflow.search_experiments()

[<Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1722946156504, experiment_id='0', last_update_time=1722946156504, lifecycle_stage='active', name='Default', tags={}>]

In [12]:
mlflow.set_experiment("earthquake_damage-experiment-1")

2024/08/06 12:36:27 INFO mlflow.tracking.fluent: Experiment with name 'earthquake_damage-experiment-1' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1722947787483, experiment_id='1', last_update_time=1722947787483, lifecycle_stage='active', name='earthquake_damage-experiment-1', tags={}>

In [32]:
with mlflow.start_run():
    
    mlflow.log_param('data-preprocessing', 'label encoding')
    
    data_df = pd.read_csv('./data/train_values.csv')
    data_df.drop(columns='building_id', inplace=True)

    label_encoder = LabelEncoder()
    cat_cols = data_df.select_dtypes('object').columns
    for col in cat_cols:
        data_df[col] = label_encoder.fit_transform(data_df[col])

    label_df = pd.read_csv('./data/train_labels.csv')
    y = label_df['damage_grade'] - 1  

    train_x, valid_x, train_y, valid_y = train_test_split(data_df,
                                                          y, 
                                                          test_size=0.3,
                                                          random_state=49
                                                         )

    space = {
    #         'max_depth': hp.quniform('max_depth', 2, 8, 1), # tree
             'max_depth': hp.choice('max_depth', np.arange(2, 8, dtype=int)),
             'learning_rate': hp.loguniform('learning_rate', -5, -2), #boosting
             'subsample': hp.uniform('subsample', 0.5, 1), #stochastic
             'n_estimators': hp.choice('n_estimators', np.arange(300, 400, dtype=int))
            }

    # objective function to minimize

    def objective(params):
        xgb_model = xgb.XGBClassifier(**params)

        xgb_model.fit(train_x, train_y)

        preds = xgb_model.predict(valid_x)

        score = f1_score(valid_y, preds, average='micro')

        return {'loss': -score,
               'status': STATUS_OK}


    # perform the optimization
    trials = Trials()
    
    best_params = fmin(objective,
                       space,
                       algo=tpe.suggest,
                       max_evals=100,
                       trials=trials)
    
#     mlflow.log_param('Best F1 score', best)
    mlflow.log_param('Best params', best_params)
    mlflow.log_param('learning rate', best_params['learning_rate'])
    mlflow.log_param('subsample', best_params['subsample'])
    mlflow.log_param('max depth', best_params['max_depth'])
    mlflow.log_param('n_estimators', best_params['n_estimators'])
    mlflow.log_metric('Best F1 score', trials.best_trial['result']['loss'])

    print(f"best params: {best_params}")

100%|█| 100/100 [30:46<00:00, 18.47s/trial, best loss: -0.738926337
best params: {'learning_rate': 0.11790157411707712, 'max_depth': 5, 'n_estimators': 73, 'subsample': 0.7373874185036384}


In [None]:
# best loss: -0.7177319297527532]
# best params: {'learning_rate': 0.1341710726910401, 'max_depth': 5, 'subsample': 0.7574310350346926}

In [21]:
test_df = pd.read_csv('./data/test_values.csv')
test_df.drop(columns=['building_id'], inplace=True)

label_encoder = LabelEncoder()
cat_cols = test_df.select_dtypes('object').columns
for col in cat_cols:
    test_df[col] = label_encoder.fit_transform(test_df[col])

In [23]:
clf = xgb.XGBClassifier(best_params,
#                         early_stopping_rounds=50,
#                         n_estimators=500
                       )

clf.fit(train_x, train_y)

preds = clf.predict(test_df)
pred_labels = [p+1 for p in preds]
print(Counter(pred_labels))



Counter({2: 58150, 3: 23162, 1: 5556})


In [30]:
test_df = pd.read_csv('./data/test_values.csv')
results = pd.DataFrame(data={'building_id':test_df['building_id'],
                             'damage_grade':pred_labels
                            },
                      dtype='int64')

In [31]:
results.to_csv('predictions.csv',
               index=False)

In [29]:
trials.best_trial['result']['loss']

-0.7366495695885189