In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import time
from sklearn.metrics import f1_score

In [3]:
DATA_DIR = './data/'
train_values = pd.read_csv(DATA_DIR + 'train_values.csv', index_col='building_id')
train_labels = pd.read_csv(DATA_DIR + 'train_labels.csv', index_col='building_id')
test_values = pd.read_csv(DATA_DIR + 'test_values.csv', index_col='building_id')
submission_format = pd.read_csv(DATA_DIR + 'submission_format.csv', index_col='building_id')

In [4]:
numerical_columns = [ 'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage', 'count_families'] #'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
categorical_columns = list(set(train_values.columns)-set(numerical_columns))

In [5]:
train_values = train_values.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True)
test_values = test_values.reset_index(drop=True)

In [6]:
# Handle categorical features that only appear at train set, but not test set.
train_feature = set(train_values['geo_level_2_id'].unique())
test_feature = set(test_values['geo_level_2_id'].unique())
unseen_features = test_feature - train_feature
for feature in unseen_features:
    # get geo_level_1_id of data that geo_level_2_id == feature
    geo_level_1_id = test_values[test_values['geo_level_2_id'] == feature]['geo_level_1_id'].values[0]
    # get the most common geo_level_1_id in the train_values taht has the same geo_level_2_id
    most_common = train_values[train_values['geo_level_1_id'] == geo_level_1_id]['geo_level_2_id'].value_counts().index[0]
    # replace the unseen feature with the most common one
    test_values.loc[test_values['geo_level_2_id'] == feature, 'geo_level_2_id'] = most_common

train_feature = set(train_values['geo_level_3_id'].unique())
test_feature = set(test_values['geo_level_3_id'].unique())
unseen_features = test_feature - train_feature
for feature in unseen_features:
    geo_level_2_id = test_values[test_values['geo_level_3_id'] == feature]['geo_level_2_id'].values[0]
    most_common = train_values[train_values['geo_level_2_id']==geo_level_2_id]['geo_level_3_id'].value_counts().index[0]
    test_values.loc[test_values['geo_level_3_id'] == feature, 'geo_level_3_id'] = most_common

In [7]:
#train_values = pd.get_dummies(train_values)# one-Hot Encoding
#test_values = pd.get_dummies(test_values) # one-Hot Encoding

categorical_index = [train_values.columns.get_loc(col) for col in categorical_columns]
for col in categorical_columns:
    train_values[col] = pd.Categorical(train_values[col])
    test_values[col] = pd.Categorical(test_values[col])


In [9]:
import optuna  # pip install optuna
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from optuna.integration import LightGBMPruningCallback

def f1_micro(labels, preds):
    preds = preds.reshape(3, -1).T
    preds = preds.argmax(axis = 1)
    #preds = preds.reshape(-1,3).argmax(axis=1)
    is_higher_better = True
    return "f1_micro", f1_score(labels, preds, average='micro'), is_higher_better

def objective(trial, X, y):
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 32, 1024, step=20),
        "max_depth": trial.suggest_int("max_depth", 5, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 100, step=10),
        #"lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        #"lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        #"min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        #"bagging_fraction": trial.suggest_float(
        #    "bagging_fraction", 0.2, 0.95, step=0.1
        #),
        #"bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        #"feature_fraction": trial.suggest_float(
        #    "feature_fraction", 0.2, 0.95, step=0.1
        #),
    }

    N_SPLITS = 5
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = np.empty(N_SPLITS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        print("=" * 12 + f"Training fold {idx+1}" + 12 * "=")
        start = time.time()

        X_train, X_val = train_values.iloc[train_idx], train_values.iloc[test_idx]
        y_train, y_val = train_labels.iloc[train_idx], train_labels.iloc[test_idx]    

        lgbm_clf = lgbm.LGBMClassifier(objective='multiclass', **param_grid)

        lgbm_clf.fit(
            X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            categorical_feature=categorical_index,
            early_stopping_rounds=100,
            eval_metric=f1_micro,
            verbose=False,
            callbacks=[
                LightGBMPruningCallback(trial, 'f1_micro', valid_name="valid_1")
            ],
        )

        preds = lgbm_clf.predict(X_val)
        score = f1_score(y_val, preds, average='micro')
        scores[idx] = score
        runtime = time.time() - start
        print(f"Fold {idx+1} finished with score: {score:.5f} in {runtime:.2f} seconds.\n")

    return np.mean(scores)

In [11]:
study = optuna.create_study(direction="maximize", study_name="LGBM Classifier")
func = lambda trial: objective(trial, train_values, train_labels)
study.optimize(func, n_trials=20)

[32m[I 2023-04-17 00:04:20,802][0m A new study created in memory with name: LGBM Classifier[0m
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]






Fold 1 finished with score: 0.74778 in 32.90 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 2 finished with score: 0.75031 in 30.93 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 3 finished with score: 0.75079 in 34.17 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 4 finished with score: 0.75102 in 32.61 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]
[32m[I 2023-04-17 00:07:06,808][0m Trial 0 finished with value: 0.7496364244850112 and parameters: {'n_estimators': 10000, 'learning_rate': 0.24734684729571782, 'num_leaves': 732, 'max_depth': 9, 'min_data_in_leaf': 50}. Best is trial 0 with value: 0.7496364244850112.[0m


Fold 5 finished with score: 0.74829 in 35.36 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 1 finished with score: 0.74728 in 31.32 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 2 finished with score: 0.74854 in 32.65 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 3 finished with score: 0.75083 in 32.70 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 4 finished with score: 0.74891 in 33.32 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]
[32m[I 2023-04-17 00:09:48,665][0m Trial 1 finished with value: 0.7488114072127136 and parameters: {'n_estimators': 10000, 'learning_rate': 0.2486608694715617, 'num_leaves': 352, 'max_depth': 9, 'min_data_in_leaf': 50}. Best is trial 0 with value: 0.7496364244850112.[0m
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 5 finished with score: 0.74850 in 31.82 seconds.





Fold 1 finished with score: 0.74826 in 73.78 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 2 finished with score: 0.75017 in 74.36 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 3 finished with score: 0.75190 in 74.37 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 4 finished with score: 0.75048 in 77.73 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]
[32m[I 2023-04-17 00:16:05,470][0m Trial 2 finished with value: 0.750012477901538 and parameters: {'n_estimators': 10000, 'learning_rate': 0.10534606024975147, 'num_leaves': 572, 'max_depth': 5, 'min_data_in_leaf': 30}. Best is trial 2 with value: 0.750012477901538.[0m


Fold 5 finished with score: 0.74925 in 76.51 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 1 finished with score: 0.74857 in 38.35 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 2 finished with score: 0.75031 in 34.74 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 3 finished with score: 0.75125 in 28.10 seconds.





Fold 4 finished with score: 0.75092 in 39.54 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]
[32m[I 2023-04-17 00:19:02,653][0m Trial 3 finished with value: 0.7498014253037715 and parameters: {'n_estimators': 10000, 'learning_rate': 0.2026034129782126, 'num_leaves': 612, 'max_depth': 7, 'min_data_in_leaf': 100}. Best is trial 2 with value: 0.750012477901538.[0m


Fold 5 finished with score: 0.74797 in 36.41 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 1 finished with score: 0.74868 in 39.13 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 2 finished with score: 0.74946 in 39.01 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 3 finished with score: 0.75205 in 35.72 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 4 finished with score: 0.74944 in 39.94 seconds.



[32m[I 2023-04-17 00:22:15,556][0m Trial 4 finished with value: 0.7496632821145284 and parameters: {'n_estimators': 10000, 'learning_rate': 0.2152746791714886, 'num_leaves': 112, 'max_depth': 6, 'min_data_in_leaf': 30}. Best is trial 2 with value: 0.750012477901538.[0m


Fold 5 finished with score: 0.74868 in 39.05 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]
[32m[I 2023-04-17 00:22:16,009][0m Trial 5 pruned. Trial was pruned at iteration 0.[0m
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]




[32m[I 2023-04-17 00:22:16,885][0m Trial 6 pruned. Trial was pruned at iteration 0.[0m
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]




[32m[I 2023-04-17 00:22:18,866][0m Trial 7 pruned. Trial was pruned at iteration 8.[0m
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]




[32m[I 2023-04-17 00:22:19,240][0m Trial 8 pruned. Trial was pruned at iteration 0.[0m
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]




[32m[I 2023-04-17 00:22:20,082][0m Trial 9 pruned. Trial was pruned at iteration 0.[0m
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]




[32m[I 2023-04-17 00:22:21,439][0m Trial 10 pruned. Trial was pruned at iteration 4.[0m
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]




[32m[I 2023-04-17 00:22:21,834][0m Trial 11 pruned. Trial was pruned at iteration 0.[0m
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]




[32m[I 2023-04-17 00:22:22,293][0m Trial 12 pruned. Trial was pruned at iteration 0.[0m
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]




[32m[I 2023-04-17 00:22:22,840][0m Trial 13 pruned. Trial was pruned at iteration 0.[0m
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]




[32m[I 2023-04-17 00:22:23,294][0m Trial 14 pruned. Trial was pruned at iteration 0.[0m
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]




[32m[I 2023-04-17 00:22:23,696][0m Trial 15 pruned. Trial was pruned at iteration 0.[0m
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]




[32m[I 2023-04-17 00:22:24,292][0m Trial 16 pruned. Trial was pruned at iteration 0.[0m
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]




[32m[I 2023-04-17 00:22:24,775][0m Trial 17 pruned. Trial was pruned at iteration 0.[0m
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]




[32m[I 2023-04-17 00:22:25,668][0m Trial 18 pruned. Trial was pruned at iteration 0.[0m
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]




[32m[I 2023-04-17 00:22:26,054][0m Trial 19 pruned. Trial was pruned at iteration 0.[0m


In [12]:
print(f"\tBest value (f1 score): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (f1 score): 0.75001
	Best params:
		n_estimators: 10000
		learning_rate: 0.10534606024975147
		num_leaves: 572
		max_depth: 5
		min_data_in_leaf: 30


In [8]:


params = {'n_estimators': 10000,
		'learning_rate': 0.10534606024975147,
		'num_leaves': 572,
		'max_depth': 5,
		'min_data_in_leaf': 30}

def f1_micro(labels, preds):
    preds = preds.reshape(3, -1).T
    preds = preds.argmax(axis = 1)
    #preds = preds.reshape(-1,3).argmax(axis=1)
    is_higher_better = True
    return "f1_micro", f1_score(labels, preds, average='micro'), is_higher_better


N_SPLITS = 5
strat_kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
scores = np.empty(N_SPLITS)
train_prediction = np.zeros((train_values.shape[0],3))
test_prediction = np.zeros((test_values.shape[0],3))
for idx, (train_idx, test_idx) in enumerate(strat_kf.split(train_values, train_labels)):
    print("=" * 12 + f"Training fold {idx}" + 12 * "=")
    start = time.time()

    X_train, X_val = train_values.iloc[train_idx], train_values.iloc[test_idx]
    y_train, y_val = train_labels.iloc[train_idx], train_labels.iloc[test_idx]    

    lgbm_clf = lgbm.LGBMClassifier(objective='multiclass', **params)

    lgbm_clf.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        categorical_feature=categorical_index,
        early_stopping_rounds=100,
        eval_metric=f1_micro,
        verbose=False,
    )

    preds = lgbm_clf.predict(X_val)
    score = f1_score(y_val, preds, average='micro')
    scores[idx] = score

    y_pred = lgbm_clf.predict_proba(X_val)
    train_prediction[test_idx, :] = y_pred

    y_pred = lgbm_clf.predict_proba(test_values)
    test_prediction += y_pred

    runtime = time.time() - start
    print(f"Fold {idx+1} finished with score: {score:.5f} in {runtime:.2f} seconds.\n")

test_prediction /= N_SPLITS
print('CV mean: {:.4f}, CV std: {:.4f}'.format(np.mean(scores), np.std(scores)))



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 1 finished with score: 0.74826 in 973.89 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 2 finished with score: 0.75017 in 958.53 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 3 finished with score: 0.75190 in 917.18 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 4 finished with score: 0.75048 in 1032.33 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 5 finished with score: 0.74925 in 1040.74 seconds.

CV mean: 0.7500, CV std: 0.0012


In [None]:
prediction_labels = np.argmax(test_prediction, axis=1)+1
submission = pd.DataFrame(data=prediction_labels,
                             columns=submission_format.columns,
                             index=submission_format.index)
submission.to_csv('lightgbm_submission.csv')
submission.value_counts()

damage_grade
2               55499
3               24533
1                6836
dtype: int64

In [None]:
# Pseudo-labeling

threshold = 0.8

test_X_pseudo_1 = test_values.iloc[np.where(test_prediction[:,0] > threshold)]
test_y_pseudo_1 = [1]*test_X_pseudo_1.shape[0]
test_X_pseudo_2 = test_values.iloc[np.where(test_prediction[:,1] > threshold)]
test_y_pseudo_2 = [2]*test_X_pseudo_2.shape[0]
test_X_pseudo_3 = test_values.iloc[np.where(test_prediction[:,2] > threshold)]
test_y_pseudo_3 = [3]*test_X_pseudo_3.shape[0]

test_pseudo = pd.concat([test_X_pseudo_1, test_X_pseudo_2, test_X_pseudo_3]).reset_index(drop = True)
test_pseudo['damage_grade'] = pd.Series(test_y_pseudo_1+test_y_pseudo_2+test_y_pseudo_3)
test_pseudo_y = pd.DataFrame(test_pseudo['damage_grade'])
test_pseudo_X = test_pseudo.drop('damage_grade', axis = 1)
#test_pseudo_sample = test_pseudo.sample(int(0.2*test_values.shape[0]))

train_X_pseudo = pd.concat([train_values, test_pseudo_X],ignore_index=True)
train_y_pseudo = pd.concat([train_labels, test_pseudo_y],ignore_index=True)

categorical_index = [train_values.columns.get_loc(col) for col in categorical_columns]
for col in categorical_columns:
    train_X_pseudo[col] = pd.Categorical(train_X_pseudo[col])

N_SPLITS = 5
strat_kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
scores = np.empty(N_SPLITS)
train_prediction = np.zeros((train_X_pseudo.shape[0],3))
test_prediction = np.zeros((test_values.shape[0],3))
for idx, (train_idx, test_idx) in enumerate(strat_kf.split(train_X_pseudo, train_y_pseudo)):
    print("=" * 12 + f"Training fold {idx}" + 12 * "=")
    start = time.time()

    X_train, X_val = train_X_pseudo.iloc[train_idx], train_X_pseudo.iloc[test_idx]
    y_train, y_val = train_y_pseudo.iloc[train_idx], train_y_pseudo.iloc[test_idx]    

    lgbm_clf = lgbm.LGBMClassifier(objective='multiclass', **params)

    lgbm_clf.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        categorical_feature=categorical_index,
        early_stopping_rounds=100,
        eval_metric=f1_micro,
        verbose=False,
    )

    preds = lgbm_clf.predict(X_val)
    score = f1_score(y_val, preds, average='micro')
    scores[idx] = score

    y_pred = lgbm_clf.predict_proba(X_val)
    train_prediction[test_idx, :] = y_pred

    y_pred = lgbm_clf.predict_proba(test_values)
    test_prediction += y_pred

    runtime = time.time() - start
    print(f"Fold {idx+1} finished with score: {score:.5f} in {runtime:.2f} seconds.\n")

test_prediction /= N_SPLITS
print('CV mean: {:.4f}, CV std: {:.4f}'.format(np.mean(scores), np.std(scores)))




  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 1 finished with score: 0.78605 in 80.24 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 2 finished with score: 0.78322 in 91.72 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 3 finished with score: 0.78400 in 100.75 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 4 finished with score: 0.78428 in 95.99 seconds.



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
New categorical_feature is [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]


Fold 5 finished with score: 0.77976 in 76.12 seconds.

CV mean: 0.7835, CV std: 0.0021


In [None]:

prediction_labels = np.argmax(test_prediction, axis=1)+1
submission = pd.DataFrame(data=prediction_labels,
                             columns=submission_format.columns,
                             index=submission_format.index)
submission.to_csv('pseudo_lightgbm_submission.csv')
submission.value_counts()

damage_grade
2               55548
3               24492
1                6828
dtype: int64

In [9]:
test_prediction_df = pd.DataFrame(data=test_prediction,
                             columns=['1','2','3'],
                             index=submission_format.index)
test_prediction_df.to_csv('test_pseudo_lightgbm_prediction.csv')
test_prediction_df.shape

(86868, 3)

In [10]:
train_prediction_df = pd.DataFrame(data=train_prediction[:train_values.shape[0],:],
                             columns=['1','2','3'])
train_prediction_df.to_csv('train_pseudo_lightgbm_prediction.csv')
train_prediction_df.shape

(260601, 3)

In [71]:
train_values.shape

(260601, 38)

In [72]:
test_values.shape

(86868, 38)