In [163]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold,  RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold, train_test_split, GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import cv

In [164]:
DATA_DIR = './data/'
train_values = pd.read_csv(DATA_DIR + 'train_values.csv', index_col='building_id')
train_labels = pd.read_csv(DATA_DIR + 'train_labels.csv', index_col='building_id')
test_values = pd.read_csv(DATA_DIR + 'test_values.csv', index_col='building_id')
submission_format = pd.read_csv(DATA_DIR + 'submission_format.csv', index_col='building_id')

In [165]:
numerical_columns = ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage', 'count_families']
categorical_columns = list(set(train_values.columns)-set(numerical_columns))

In [166]:
train_values = train_values.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True)
test_values = test_values.reset_index(drop=True)

In [167]:
# Handle categorical features that only appear at train set, but not test set.
train_feature = set(train_values['geo_level_2_id'].unique())
test_feature = set(test_values['geo_level_2_id'].unique())
unseen_features = test_feature - train_feature
for feature in unseen_features:
    # get geo_level_1_id of data that geo_level_2_id == feature
    geo_level_1_id = test_values[test_values['geo_level_2_id'] == feature]['geo_level_1_id'].values[0]
    # get the most common geo_level_1_id in the train_values taht has the same geo_level_2_id
    most_common = train_values[train_values['geo_level_1_id'] == geo_level_1_id]['geo_level_2_id'].value_counts().index[0]
    # replace the unseen feature with the most common one
    test_values.loc[test_values['geo_level_2_id'] == feature, 'geo_level_2_id'] = most_common

train_feature = set(train_values['geo_level_3_id'].unique())
test_feature = set(test_values['geo_level_3_id'].unique())
unseen_features = test_feature - train_feature
for feature in unseen_features:
    geo_level_2_id = test_values[test_values['geo_level_3_id'] == feature]['geo_level_2_id'].values[0]
    most_common = train_values[train_values['geo_level_2_id']==geo_level_2_id]['geo_level_3_id'].value_counts().index[0]
    test_values.loc[test_values['geo_level_3_id'] == feature, 'geo_level_3_id'] = most_common

In [168]:
from sklearn.preprocessing import LabelEncoder

train_values = pd.get_dummies(train_values)# one-Hot Encoding
test_values = pd.get_dummies(test_values) # one-Hot Encoding
"""
train_values = train_values[categorical_columns].astype("category")
test_values = test_values[categorical_columns].astype("category")
"""


'\ntrain_values = train_values[categorical_columns].astype("category")\ntest_values = test_values[categorical_columns].astype("category")\n'

In [169]:
#train_labels['damage_grade'] = train_labels['damage_grade'] 
le = LabelEncoder()
train_labels['damage_grade']=le.fit_transform(train_labels['damage_grade'])
"""
le = LabelEncoder()
for col in categorical_columns:
    train_values[col]=le.fit_transform(train_values[col])
for col in categorical_columns:
    test_values[col]=le.fit_transform(test_values[col])
    """

'\nle = LabelEncoder()\nfor col in categorical_columns:\n    train_values[col]=le.fit_transform(train_values[col])\nfor col in categorical_columns:\n    test_values[col]=le.fit_transform(test_values[col])\n    '

In [116]:
import optuna  # pip install optuna
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from optuna.integration import XGBoostPruningCallback
import time
import numpy as np
import xgboost as xgb
import copy

def f1_micro(preds: np.ndarray, dtrain: xgb.DMatrix):
    labels = dtrain.get_label()
    return 'f1_micro', f1_score(labels, preds, average='micro')

early_stop = xgb.callback.EarlyStopping(rounds=100,
                                        metric_name='f1_micro',
                                        save_best=True,
                                        maximize=True,
                                        data_name='validation_1')

def objective(trial, X, y):
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_leaves": trial.suggest_int("max_leaves", 32, 1024, step=20),
        "max_depth": trial.suggest_int("max_depth", 5, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 50, step=5),
        #"lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        #"lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        #"min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        #"bagging_fraction": trial.suggest_float(
        #    "bagging_fraction", 0.2, 0.95, step=0.1
        #),
        #"bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        #"feature_fraction": trial.suggest_float(
        #    "feature_fraction", 0.2, 0.95, step=0.1
        #),
    }

    N_SPLITS = 5
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = np.empty(N_SPLITS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        print("=" * 12 + f"Training fold {idx+1}" + 12 * "=")
        start = time.time()

        X_train, X_val = train_values.iloc[train_idx], train_values.iloc[test_idx]
        y_train, y_val = train_labels.iloc[train_idx], train_labels.iloc[test_idx]  

        model = XGBClassifier(objective='multi:softmax',
                              num_class=3,
                              #tree_method="gpu_hist", 
                              #enable_categorical=True,
                              disable_default_eval_metric=1,
                              gpu_id = 0,
                              seed = 42,
                              callbacks=[
                                    XGBoostPruningCallback(trial, 'validation_1-f1_micro'),
                                    copy.deepcopy(early_stop)
                                ],
                              **param_grid)
        
        model.fit(X_train, y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)], 
                #early_stopping_rounds=100,
                eval_metric=f1_micro,
                verbose=False
                )
        
        # score = model.get_best_score()['validation']['mlogloss']
        #score = model.evals_result()['validation_1']['f1_micro'][-1]
        #scores[idx] = score 

        preds = model.predict(X_val)
        score = f1_score(y_val, preds, average='micro')
        scores[idx] = score

        runtime = time.time() - start
        print(f"Fold {idx+1} finished with score: {score:.5f} in {runtime:.2f} seconds.\n")

    return np.mean(scores)

In [117]:
study = optuna.create_study(direction="maximize", study_name="Xgboost Classifier")
func = lambda trial: objective(trial, train_values, train_labels)
study.optimize(func, n_trials=20)

[32m[I 2023-04-21 17:43:08,954][0m A new study created in memory with name: Xgboost Classifier[0m


Fold 1 finished with score: 0.74647 in 202.98 seconds.





Fold 2 finished with score: 0.74651 in 187.97 seconds.





Fold 3 finished with score: 0.74990 in 241.15 seconds.





Fold 4 finished with score: 0.74914 in 232.26 seconds.



[32m[I 2023-04-21 18:01:32,522][0m Trial 0 finished with value: 0.7475527763831559 and parameters: {'n_estimators': 10000, 'learning_rate': 0.2511143644835039, 'max_leaves': 332, 'max_depth': 5, 'min_child_weight': 26}. Best is trial 0 with value: 0.7475527763831559.[0m


Fold 5 finished with score: 0.74574 in 239.15 seconds.





Fold 1 finished with score: 0.74701 in 150.26 seconds.





Fold 2 finished with score: 0.74589 in 106.06 seconds.





Fold 3 finished with score: 0.75019 in 114.64 seconds.





Fold 4 finished with score: 0.74875 in 131.27 seconds.



[32m[I 2023-04-21 18:11:42,882][0m Trial 1 finished with value: 0.7471498579748233 and parameters: {'n_estimators': 10000, 'learning_rate': 0.18995576536879238, 'max_leaves': 912, 'max_depth': 9, 'min_child_weight': 1}. Best is trial 0 with value: 0.7475527763831559.[0m


Fold 5 finished with score: 0.74390 in 108.09 seconds.





Fold 1 finished with score: 0.74617 in 132.41 seconds.





Fold 2 finished with score: 0.74591 in 116.39 seconds.





Fold 3 finished with score: 0.74985 in 188.20 seconds.





Fold 4 finished with score: 0.74965 in 197.38 seconds.



[32m[I 2023-04-21 18:23:57,460][0m Trial 2 finished with value: 0.7470002065710991 and parameters: {'n_estimators': 10000, 'learning_rate': 0.27789460496465795, 'max_leaves': 172, 'max_depth': 6, 'min_child_weight': 26}. Best is trial 0 with value: 0.7475527763831559.[0m


Fold 5 finished with score: 0.74342 in 100.16 seconds.





Fold 1 finished with score: 0.74530 in 120.79 seconds.





Fold 2 finished with score: 0.74666 in 117.54 seconds.





Fold 3 finished with score: 0.74816 in 155.40 seconds.





Fold 4 finished with score: 0.75132 in 131.20 seconds.



[32m[I 2023-04-21 18:34:20,607][0m Trial 3 finished with value: 0.7473570786485187 and parameters: {'n_estimators': 10000, 'learning_rate': 0.22913429249197867, 'max_leaves': 432, 'max_depth': 8, 'min_child_weight': 1}. Best is trial 0 with value: 0.7475527763831559.[0m


Fold 5 finished with score: 0.74534 in 98.18 seconds.





Fold 1 finished with score: 0.74655 in 164.69 seconds.





Fold 2 finished with score: 0.74589 in 178.84 seconds.





Fold 3 finished with score: 0.75036 in 257.84 seconds.





Fold 4 finished with score: 0.74935 in 147.54 seconds.



[32m[I 2023-04-21 18:49:59,550][0m Trial 4 finished with value: 0.7477830140011738 and parameters: {'n_estimators': 10000, 'learning_rate': 0.11958651875669075, 'max_leaves': 612, 'max_depth': 9, 'min_child_weight': 16}. Best is trial 4 with value: 0.7477830140011738.[0m


Fold 5 finished with score: 0.74676 in 190.00 seconds.



[32m[I 2023-04-21 18:49:59,959][0m Trial 5 pruned. Trial was pruned at iteration 0.[0m




[32m[I 2023-04-21 18:50:00,418][0m Trial 6 pruned. Trial was pruned at iteration 0.[0m




[32m[I 2023-04-21 18:50:00,899][0m Trial 7 pruned. Trial was pruned at iteration 0.[0m




[32m[I 2023-04-21 18:50:01,378][0m Trial 8 pruned. Trial was pruned at iteration 0.[0m




[32m[I 2023-04-21 18:50:01,853][0m Trial 9 pruned. Trial was pruned at iteration 0.[0m




[32m[I 2023-04-21 18:50:05,183][0m Trial 10 pruned. Trial was pruned at iteration 9.[0m


Fold 1 finished with score: 0.74757 in 108.61 seconds.





Fold 2 finished with score: 0.74520 in 75.08 seconds.





Fold 3 finished with score: 0.74964 in 110.85 seconds.





Fold 4 finished with score: 0.74893 in 95.87 seconds.



[32m[I 2023-04-21 18:58:04,149][0m Trial 11 finished with value: 0.7472764866916397 and parameters: {'n_estimators': 10000, 'learning_rate': 0.28305439044510644, 'max_leaves': 272, 'max_depth': 9, 'min_child_weight': 16}. Best is trial 4 with value: 0.7477830140011738.[0m


Fold 5 finished with score: 0.74505 in 88.50 seconds.



[32m[I 2023-04-21 18:58:04,579][0m Trial 12 pruned. Trial was pruned at iteration 0.[0m


Fold 1 finished with score: 0.74751 in 122.41 seconds.





Fold 2 finished with score: 0.74513 in 90.92 seconds.





Fold 3 finished with score: 0.74933 in 117.47 seconds.





Fold 4 finished with score: 0.74900 in 83.88 seconds.



[32m[I 2023-04-21 19:06:19,862][0m Trial 13 finished with value: 0.7473800939731376 and parameters: {'n_estimators': 10000, 'learning_rate': 0.21615265065192985, 'max_leaves': 572, 'max_depth': 10, 'min_child_weight': 11}. Best is trial 4 with value: 0.7477830140011738.[0m


Fold 5 finished with score: 0.74593 in 80.55 seconds.





Fold 1 finished with score: 0.74694 in 87.56 seconds.





Fold 2 finished with score: 0.74637 in 117.85 seconds.





Fold 3 finished with score: 0.74946 in 112.88 seconds.





Fold 4 finished with score: 0.74802 in 76.25 seconds.



[32m[I 2023-04-21 19:14:36,442][0m Trial 14 finished with value: 0.7473493977934896 and parameters: {'n_estimators': 10000, 'learning_rate': 0.29745860601338137, 'max_leaves': 792, 'max_depth': 9, 'min_child_weight': 16}. Best is trial 4 with value: 0.7477830140011738.[0m


Fold 5 finished with score: 0.74595 in 101.99 seconds.



[32m[I 2023-04-21 19:14:36,877][0m Trial 15 pruned. Trial was pruned at iteration 0.[0m




[32m[I 2023-04-21 19:14:37,375][0m Trial 16 pruned. Trial was pruned at iteration 0.[0m




[32m[I 2023-04-21 19:14:37,926][0m Trial 17 pruned. Trial was pruned at iteration 0.[0m




[32m[I 2023-04-21 19:14:38,918][0m Trial 18 pruned. Trial was pruned at iteration 1.[0m




[32m[I 2023-04-21 19:14:39,474][0m Trial 19 pruned. Trial was pruned at iteration 0.[0m


In [118]:
print(f"\tBest value (f1 score): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (f1 score): 0.74778
	Best params:
		n_estimators: 10000
		learning_rate: 0.11958651875669075
		max_leaves: 612
		max_depth: 9
		min_child_weight: 16


In [None]:
"""
def train(train_values, train_labels, test_values, folds, params):
    scores = []
    prediction = np.zeros((test_values.shape[0],3))
    for fold_n, (train_index, valid_index) in enumerate(folds.split(train_values, train_labels)):

        X_train, X_valid = train_values.iloc[train_index], train_values.iloc[valid_index] # train and validation data splits
        y_train, y_valid = train_labels.iloc[train_index], train_labels.iloc[valid_index]
        
        # sample_weights = compute_sample_weight(
        #   class_weight='balanced',
        #   y=y_train
        # )

        model = XGBClassifier(**params)
        model.fit(X_train, y_train,
                eval_set=[(X_train, y_train), (X_valid, y_valid)], 
                # sample_weight = sample_weights
                )
        
        # score = model.get_best_score()['validation']['mlogloss']
        score = model.evals_result()['validation_0']['merror'][-1]
        scores.append(score)
        
        y_pred = model.predict_proba(test_values)
        prediction += y_pred
    return prediction, scores
"""

In [172]:
params={'n_estimators': 10000,
		'learning_rate': 0.11958651875669075,
		'max_leaves': 612,
		'max_depth': 9,
		'min_child_weight': 16}

def f1_micro(preds: np.ndarray, dtrain: xgb.DMatrix):
    labels = dtrain.get_label()
    return 'f1_micro', f1_score(labels, preds, average='micro')

early_stop = xgb.callback.EarlyStopping(rounds=100,
                                        metric_name='f1_micro',
                                        save_best=True,
                                        maximize=True,
                                        data_name='validation_1')

N_SPLITS = 5
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = np.empty(N_SPLITS)
train_prediction = np.zeros((train_values.shape[0],3))
test_prediction = np.zeros((test_values.shape[0],3))
for idx, (train_idx, test_idx) in enumerate(cv.split(train_values, train_labels)):
    print("=" * 12 + f"Training fold {idx+1}" + 12 * "=")
    start = time.time()

    X_train, X_val = train_values.iloc[train_idx], train_values.iloc[test_idx]
    y_train, y_val = train_labels.iloc[train_idx], train_labels.iloc[test_idx]  

    model = XGBClassifier(objective='multi:softmax',
                          num_class=3, 
                            #tree_method="gpu_hist", 
                            #enable_categorical=True,
                            disable_default_eval_metric=1,
                            gpu_id = 0,
                            seed = 42,
                            callbacks=[
                                    copy.deepcopy(early_stop)
                                ],
                            **params)
    
    model.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)], 
            #early_stopping_rounds=100,
            eval_metric=f1_micro,
            verbose=True,
            )
    
    # score = model.get_best_score()['validation']['mlogloss']
    #score = model.evals_result()['validation_1']['f1_micro'][-1]
    #scores[idx] = score 

    preds = model.predict(X_val)
    score = f1_score(y_val, preds, average='micro')
    scores[idx] = score

    y_pred = model.predict_proba(X_val)
    train_prediction[test_idx, :] = y_pred

    y_pred = model.predict_proba(test_values)
    test_prediction += y_pred

    runtime = time.time() - start
    print(f"Fold {idx+1} finished with score: {score:.5f} in {runtime:.2f} seconds.\n")

test_prediction /= N_SPLITS
print('CV mean: {:.4f}, CV std: {:.4f}'.format(np.mean(scores), np.std(scores)))






[0]	validation_0-f1_micro:0.69053	validation_1-f1_micro:0.68567
[1]	validation_0-f1_micro:0.69075	validation_1-f1_micro:0.68696
[2]	validation_0-f1_micro:0.69184	validation_1-f1_micro:0.68677
[3]	validation_0-f1_micro:0.69206	validation_1-f1_micro:0.68742
[4]	validation_0-f1_micro:0.69287	validation_1-f1_micro:0.68803
[5]	validation_0-f1_micro:0.69315	validation_1-f1_micro:0.68897
[6]	validation_0-f1_micro:0.69495	validation_1-f1_micro:0.69041
[7]	validation_0-f1_micro:0.69558	validation_1-f1_micro:0.69080
[8]	validation_0-f1_micro:0.69762	validation_1-f1_micro:0.69271
[9]	validation_0-f1_micro:0.69991	validation_1-f1_micro:0.69438
[10]	validation_0-f1_micro:0.70274	validation_1-f1_micro:0.69630
[11]	validation_0-f1_micro:0.70485	validation_1-f1_micro:0.69857
[12]	validation_0-f1_micro:0.70598	validation_1-f1_micro:0.69899
[13]	validation_0-f1_micro:0.70714	validation_1-f1_micro:0.70008
[14]	validation_0-f1_micro:0.70849	validation_1-f1_micro:0.70108
[15]	validation_0-f1_micro:0.70909	



[0]	validation_0-f1_micro:0.68918	validation_1-f1_micro:0.68477
[1]	validation_0-f1_micro:0.69031	validation_1-f1_micro:0.68569
[2]	validation_0-f1_micro:0.69175	validation_1-f1_micro:0.68776
[3]	validation_0-f1_micro:0.69182	validation_1-f1_micro:0.68862
[4]	validation_0-f1_micro:0.69247	validation_1-f1_micro:0.68834
[5]	validation_0-f1_micro:0.69382	validation_1-f1_micro:0.68973
[6]	validation_0-f1_micro:0.69594	validation_1-f1_micro:0.69211
[7]	validation_0-f1_micro:0.69774	validation_1-f1_micro:0.69330
[8]	validation_0-f1_micro:0.70092	validation_1-f1_micro:0.69589
[9]	validation_0-f1_micro:0.70356	validation_1-f1_micro:0.69837
[10]	validation_0-f1_micro:0.70603	validation_1-f1_micro:0.70031
[11]	validation_0-f1_micro:0.70697	validation_1-f1_micro:0.70079
[12]	validation_0-f1_micro:0.70885	validation_1-f1_micro:0.70272
[13]	validation_0-f1_micro:0.70972	validation_1-f1_micro:0.70301
[14]	validation_0-f1_micro:0.70980	validation_1-f1_micro:0.70328
[15]	validation_0-f1_micro:0.71082	



[0]	validation_0-f1_micro:0.69057	validation_1-f1_micro:0.68571
[1]	validation_0-f1_micro:0.69244	validation_1-f1_micro:0.68845
[2]	validation_0-f1_micro:0.69378	validation_1-f1_micro:0.68922
[3]	validation_0-f1_micro:0.69367	validation_1-f1_micro:0.68799
[4]	validation_0-f1_micro:0.69380	validation_1-f1_micro:0.68795
[5]	validation_0-f1_micro:0.69539	validation_1-f1_micro:0.68966
[6]	validation_0-f1_micro:0.69648	validation_1-f1_micro:0.69071
[7]	validation_0-f1_micro:0.69910	validation_1-f1_micro:0.69417
[8]	validation_0-f1_micro:0.70240	validation_1-f1_micro:0.69630
[9]	validation_0-f1_micro:0.70476	validation_1-f1_micro:0.69779
[10]	validation_0-f1_micro:0.70521	validation_1-f1_micro:0.69883
[11]	validation_0-f1_micro:0.70652	validation_1-f1_micro:0.69966
[12]	validation_0-f1_micro:0.70733	validation_1-f1_micro:0.70050
[13]	validation_0-f1_micro:0.70817	validation_1-f1_micro:0.70077
[14]	validation_0-f1_micro:0.70895	validation_1-f1_micro:0.70163
[15]	validation_0-f1_micro:0.70971	



[0]	validation_0-f1_micro:0.68937	validation_1-f1_micro:0.68632
[1]	validation_0-f1_micro:0.68993	validation_1-f1_micro:0.68707
[2]	validation_0-f1_micro:0.69023	validation_1-f1_micro:0.68692
[3]	validation_0-f1_micro:0.69149	validation_1-f1_micro:0.68693
[4]	validation_0-f1_micro:0.69307	validation_1-f1_micro:0.68858
[5]	validation_0-f1_micro:0.69489	validation_1-f1_micro:0.68958
[6]	validation_0-f1_micro:0.69561	validation_1-f1_micro:0.69037
[7]	validation_0-f1_micro:0.69723	validation_1-f1_micro:0.69206
[8]	validation_0-f1_micro:0.70013	validation_1-f1_micro:0.69388
[9]	validation_0-f1_micro:0.70163	validation_1-f1_micro:0.69482
[10]	validation_0-f1_micro:0.70356	validation_1-f1_micro:0.69743
[11]	validation_0-f1_micro:0.70464	validation_1-f1_micro:0.69820
[12]	validation_0-f1_micro:0.70561	validation_1-f1_micro:0.69916
[13]	validation_0-f1_micro:0.70579	validation_1-f1_micro:0.69971
[14]	validation_0-f1_micro:0.70733	validation_1-f1_micro:0.70019
[15]	validation_0-f1_micro:0.70798	



[0]	validation_0-f1_micro:0.69097	validation_1-f1_micro:0.68770
[1]	validation_0-f1_micro:0.69068	validation_1-f1_micro:0.68734
[2]	validation_0-f1_micro:0.69266	validation_1-f1_micro:0.68772
[3]	validation_0-f1_micro:0.69212	validation_1-f1_micro:0.68690
[4]	validation_0-f1_micro:0.69437	validation_1-f1_micro:0.68893
[5]	validation_0-f1_micro:0.69613	validation_1-f1_micro:0.69041
[6]	validation_0-f1_micro:0.69674	validation_1-f1_micro:0.69094
[7]	validation_0-f1_micro:0.69932	validation_1-f1_micro:0.69344
[8]	validation_0-f1_micro:0.70132	validation_1-f1_micro:0.69438
[9]	validation_0-f1_micro:0.70353	validation_1-f1_micro:0.69563
[10]	validation_0-f1_micro:0.70524	validation_1-f1_micro:0.69674
[11]	validation_0-f1_micro:0.70619	validation_1-f1_micro:0.69793
[12]	validation_0-f1_micro:0.70750	validation_1-f1_micro:0.69881
[13]	validation_0-f1_micro:0.70798	validation_1-f1_micro:0.69969
[14]	validation_0-f1_micro:0.70814	validation_1-f1_micro:0.69988
[15]	validation_0-f1_micro:0.70930	

In [153]:
prediction_labels = np.argmax(test_prediction, axis=1)+1
submission = pd.DataFrame(data=prediction_labels,
                             columns=submission_format.columns,
                             index=submission_format.index)
submission.to_csv('xgboost_submission.csv')
submission.value_counts()

damage_grade
2               56193
3               24281
1                6394
dtype: int64

In [154]:
test_prediction

array([[6.31937850e-04, 2.66852918e-01, 7.32515121e-01],
       [7.25242717e-04, 9.19799960e-01, 7.94747837e-02],
       [1.17800878e-02, 7.68895733e-01, 2.19324212e-01],
       ...,
       [4.12284277e-02, 6.94076586e-01, 2.64694983e-01],
       [2.25609125e-03, 8.22646415e-01, 1.75097552e-01],
       [5.19391775e-01, 4.58354151e-01, 2.22540708e-02]])

In [155]:
# Pseudo-labeling

threshold = 0.8

test_X_pseudo_1 = test_values.iloc[np.where(test_prediction[:,0] > threshold)]
test_y_pseudo_1 = [0]*test_X_pseudo_1.shape[0]
test_X_pseudo_2 = test_values.iloc[np.where(test_prediction[:,1] > threshold)]
test_y_pseudo_2 = [1]*test_X_pseudo_2.shape[0]
test_X_pseudo_3 = test_values.iloc[np.where(test_prediction[:,2] > threshold)]
test_y_pseudo_3 = [2]*test_X_pseudo_3.shape[0]

test_pseudo = pd.concat([test_X_pseudo_1, test_X_pseudo_2, test_X_pseudo_3]).reset_index(drop = True)
test_pseudo['damage_grade'] = pd.Series(test_y_pseudo_1+test_y_pseudo_2+test_y_pseudo_3)
test_pseudo_y = pd.DataFrame(test_pseudo['damage_grade'])
test_pseudo_X = test_pseudo.drop('damage_grade', axis = 1)
#test_pseudo_sample = test_pseudo.sample(int(0.2*test_values.shape[0]))

train_X_pseudo = pd.concat([train_values, test_pseudo_X],ignore_index=True)
train_y_pseudo = pd.concat([train_labels, test_pseudo_y],ignore_index=True)

N_SPLITS = 5
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = np.empty(N_SPLITS)
train_prediction = np.zeros((train_X_pseudo.shape[0],3))
test_prediction = np.zeros((test_values.shape[0],3))
for idx, (train_idx, test_idx) in enumerate(cv.split(train_X_pseudo, train_y_pseudo)):
    print("=" * 12 + f"Training fold {idx+1}" + 12 * "=")
    start = time.time()

    X_train, X_val = train_X_pseudo.iloc[train_idx], train_X_pseudo.iloc[test_idx]
    y_train, y_val = train_y_pseudo.iloc[train_idx], train_y_pseudo.iloc[test_idx]  

    model = XGBClassifier(objective='multi:softmax',
                          num_class=3, 
                            #tree_method="gpu_hist", 
                            #enable_categorical=True,
                            disable_default_eval_metric=1,
                            gpu_id = 0,
                            seed = 42,
                            callbacks=[
                                    copy.deepcopy(early_stop)
                                ],
                            **params)
    
    model.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)], 
            #early_stopping_rounds=100,
            eval_metric=f1_micro,
            verbose=False,
            )
    
    # score = model.get_best_score()['validation']['mlogloss']
    #score = model.evals_result()['validation_1']['f1_micro'][-1]
    #scores[idx] = score 

    preds = model.predict(X_val)
    score = f1_score(y_val, preds, average='micro')
    scores[idx] = score

    y_pred = model.predict_proba(X_val)
    train_prediction[test_idx, :] = y_pred

    y_pred = model.predict_proba(test_values)
    test_prediction += y_pred

    runtime = time.time() - start
    print(f"Fold {idx+1} finished with score: {score:.5f} in {runtime:.2f} seconds.\n")

test_prediction /= N_SPLITS
print('CV mean: {:.4f}, CV std: {:.4f}'.format(np.mean(scores), np.std(scores)))







Fold 1 finished with score: 0.77872 in 224.18 seconds.





Fold 2 finished with score: 0.78073 in 212.16 seconds.





Fold 3 finished with score: 0.77767 in 192.65 seconds.





Fold 4 finished with score: 0.77765 in 214.61 seconds.





Fold 5 finished with score: 0.77958 in 249.44 seconds.

CV mean: 0.7789, CV std: 0.0012


In [159]:
prediction_labels = np.argmax(test_prediction, axis=1)+1
submission = pd.DataFrame(data=prediction_labels,
                             columns=submission_format.columns,
                             index=submission_format.index)
submission.to_csv('pseudo_xgboost_submission.csv')
submission.value_counts()

damage_grade
2               56063
3               24370
1                6435
dtype: int64

In [175]:
test_prediction_df = pd.DataFrame(data=test_prediction,
                             columns=['1','2','3'],
                             index=submission_format.index)
test_prediction_df.to_csv('test_pseudo_xgboost_prediction.csv')
test_prediction_df.shape

(86868, 3)

In [176]:
train_prediction_df = pd.DataFrame(data=train_prediction[:train_values.shape[0],:],
                             columns=['1','2','3'])
train_prediction_df.to_csv('train_pseudo_xgboost_prediction.csv')
train_prediction_df.shape

(260601, 3)

In [None]:
"""
params = {
    'booster':'gbtree',
    'gpu_id':-1,
    'objective':'multi:softmax', 
    'learning_rate':0.6,
    'max_depth':6,
    'reg_lambda':1, 
    'early_stopping_rounds':200,
    'eval_metric':['merror','mlogloss'],
    'seed':42
}

n_fold = 5 # amount of data folds
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)
prediction, scores  = train(train_values, train_labels-1, test_values, folds, params)
"""



In [None]:
def write(prediction, submission_format, file_name):
    prediction_labels = np.argmax(prediction, axis=1)+1
    print(pd.DataFrame(prediction_labels).value_counts())
    submission = pd.DataFrame(data=prediction_labels,
                                columns=submission_format.columns,
                                index=submission_format.index)
    submission.to_csv(file_name)
    prediction = pd.DataFrame(data=prediction,
                                columns=['1','2','3'],
                                index=submission_format.index)
    prediction.to_csv('xgboost_prediction.csv')
    
write(prediction, submission_format, file_name="xgboost_submission.csv")

In [None]:
# lr = 0.1
#[99]	validation_0-merror:0.28835	validation_0-mlogloss:0.64207	validation_1-merror:0.29781	validation_1-mlogloss:0.65580
# 2    60427
# 3    21494
# 1     4947

In [None]:
# lr = 0.3
# [99]	validation_0-merror:0.25355	validation_0-mlogloss:0.57991	validation_1-merror:0.27354	validation_1-mlogloss:0.61489
# 2    58271
# 3    23081
# 1     5516

In [None]:
# lr = 0.6
#2    57341
#3    23571
#1     5956

In [None]:
submission_format.columns