In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
from catboost import Pool

In [3]:
DATA_DIR = './data/'
train_values = pd.read_csv(DATA_DIR + 'train_values.csv', index_col='building_id')
train_labels = pd.read_csv(DATA_DIR + 'train_labels.csv', index_col='building_id')
test_values = pd.read_csv(DATA_DIR + 'test_values.csv', index_col='building_id')
submission_format = pd.read_csv(DATA_DIR + 'submission_format.csv', index_col='building_id')

In [4]:
numerical_columns = ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage', 'count_families']
categorical_columns = list(set(train_values.columns)-set(numerical_columns))

In [5]:
train_values = train_values.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True)
test_values = test_values.reset_index(drop=True)

In [6]:
# Handle categorical features that only appear at train set, but not test set.
train_feature = set(train_values['geo_level_2_id'].unique())
test_feature = set(test_values['geo_level_2_id'].unique())
unseen_features = test_feature - train_feature
for feature in unseen_features:
    # get geo_level_1_id of data that geo_level_2_id == feature
    geo_level_1_id = test_values[test_values['geo_level_2_id'] == feature]['geo_level_1_id'].values[0]
    # get the most common geo_level_1_id in the train_values taht has the same geo_level_2_id
    most_common = train_values[train_values['geo_level_1_id'] == geo_level_1_id]['geo_level_2_id'].value_counts().index[0]
    # replace the unseen feature with the most common one
    test_values.loc[test_values['geo_level_2_id'] == feature, 'geo_level_2_id'] = most_common

train_feature = set(train_values['geo_level_3_id'].unique())
test_feature = set(test_values['geo_level_3_id'].unique())
unseen_features = test_feature - train_feature
for feature in unseen_features:
    geo_level_2_id = test_values[test_values['geo_level_3_id'] == feature]['geo_level_2_id'].values[0]
    most_common = train_values[train_values['geo_level_2_id']==geo_level_2_id]['geo_level_3_id'].value_counts().index[0]
    test_values.loc[test_values['geo_level_3_id'] == feature, 'geo_level_3_id'] = most_common
    

In [61]:
import optuna  # pip install optuna
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from optuna.integration import CatBoostPruningCallback
import time

def objective(trial, X, y):
    param_grid = {
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        #"num_leaves": trial.suggest_int("num_leaves", 32, 1024, step=20),
        "max_depth": trial.suggest_int("max_depth", 5, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 100, step=10),
        "l2_leaf_reg": trial.suggest_float('l2_leaf_reg', 1.0, 10.0, step=0.5)
        #"lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        #"lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        #"min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        #"bagging_fraction": trial.suggest_float(
        #    "bagging_fraction", 0.2, 0.95, step=0.1
        #),
        #"bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        #"feature_fraction": trial.suggest_float(
        #    "feature_fraction", 0.2, 0.95, step=0.1
        #),
    }

    N_SPLITS = 5
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = np.empty(N_SPLITS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        print("=" * 12 + f"Training fold {idx+1}" + 12 * "=")
        start = time.time()

        X_train, X_val = train_values.iloc[train_idx], train_values.iloc[test_idx]
        y_train, y_val = train_labels.iloc[train_idx], train_labels.iloc[test_idx]  
        
        train_data = Pool(data=X_train, 
                      label=y_train,
                      cat_features=categorical_columns)
        valid_data = Pool(data=X_val, 
                        label=y_val,
                        cat_features=categorical_columns)
        
        model = CatBoostClassifier(loss_function ='MultiClass', 
                                   eval_metric='TotalF1',
                                   #task_type='GPU',
                                   random_seed = 42,
                                   cat_features = categorical_columns,
                                   **param_grid)
        
        model.fit(train_data,
                eval_set=valid_data, 
                use_best_model=True,
                early_stopping_rounds=100,
                verbose=False,
                callbacks=[
                    CatBoostPruningCallback(trial, 'TotalF1')
                ],
            )

        
        score = model.get_best_score()['validation']['TotalF1']
        scores[idx] = score
        runtime = time.time() - start
        print(f"Fold {idx+1} finished with score: {score:.5f} in {runtime:.2f} seconds.\n")

    return np.mean(scores)

In [62]:
study = optuna.create_study(direction="maximize", study_name="Catboost Classifier")
func = lambda trial: objective(trial, train_values, train_labels)
study.optimize(func, n_trials=20)

[32m[I 2023-04-20 07:21:43,721][0m A new study created in memory with name: Catboost Classifier[0m




  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.74393 in 220.59 seconds.





Fold 2 finished with score: 0.74563 in 142.48 seconds.





Fold 3 finished with score: 0.74515 in 283.22 seconds.





Fold 4 finished with score: 0.74714 in 254.63 seconds.



[32m[I 2023-04-20 07:40:45,175][0m Trial 0 finished with value: 0.7447483787031921 and parameters: {'n_estimators': 10000, 'learning_rate': 0.058372967095211045, 'max_depth': 10, 'min_data_in_leaf': 30, 'l2_leaf_reg': 7.0}. Best is trial 0 with value: 0.7447483787031921.[0m


Fold 5 finished with score: 0.74189 in 240.48 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.74413 in 199.49 seconds.





Fold 2 finished with score: 0.74394 in 81.55 seconds.





Fold 3 finished with score: 0.74538 in 227.60 seconds.

Fold 4 finished with score: 0.74662 in 136.07 seconds.



[32m[I 2023-04-20 07:54:10,951][0m Trial 1 finished with value: 0.7443167881242967 and parameters: {'n_estimators': 10000, 'learning_rate': 0.050326442627218776, 'max_depth': 7, 'min_data_in_leaf': 60, 'l2_leaf_reg': 7.5}. Best is trial 0 with value: 0.7447483787031921.[0m


Fold 5 finished with score: 0.74152 in 161.01 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.74362 in 118.12 seconds.





Fold 2 finished with score: 0.74524 in 113.45 seconds.





Fold 3 finished with score: 0.74516 in 147.73 seconds.





Fold 4 finished with score: 0.74698 in 138.05 seconds.



[32m[I 2023-04-20 08:03:56,396][0m Trial 2 finished with value: 0.744140219468424 and parameters: {'n_estimators': 10000, 'learning_rate': 0.06902661834821325, 'max_depth': 6, 'min_data_in_leaf': 30, 'l2_leaf_reg': 9.0}. Best is trial 0 with value: 0.7447483787031921.[0m


Fold 5 finished with score: 0.73970 in 68.04 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.74405 in 104.39 seconds.





Fold 2 finished with score: 0.74361 in 45.81 seconds.





Fold 3 finished with score: 0.74540 in 99.59 seconds.





Fold 4 finished with score: 0.74554 in 103.53 seconds.



[32m[I 2023-04-20 08:11:11,774][0m Trial 3 finished with value: 0.7440700188052808 and parameters: {'n_estimators': 10000, 'learning_rate': 0.22105182000034182, 'max_depth': 9, 'min_data_in_leaf': 70, 'l2_leaf_reg': 4.0}. Best is trial 0 with value: 0.7447483787031921.[0m


Fold 5 finished with score: 0.74176 in 81.99 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.74426 in 64.06 seconds.





Fold 2 finished with score: 0.74613 in 80.87 seconds.





Fold 3 finished with score: 0.74591 in 127.59 seconds.





Fold 4 finished with score: 0.74457 in 87.73 seconds.



[32m[I 2023-04-20 08:18:01,442][0m Trial 4 finished with value: 0.7442356595128929 and parameters: {'n_estimators': 10000, 'learning_rate': 0.2721389711676507, 'max_depth': 9, 'min_data_in_leaf': 40, 'l2_leaf_reg': 3.0}. Best is trial 0 with value: 0.7447483787031921.[0m


Fold 5 finished with score: 0.74032 in 49.37 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.68388 in 1.83 seconds.





Fold 2 finished with score: 0.68219 in 1.83 seconds.

Fold 3 finished with score: 0.68354 in 1.81 seconds.

Fold 4 finished with score: 0.68269 in 1.82 seconds.



[32m[I 2023-04-20 08:18:10,591][0m Trial 5 finished with value: 0.6831121089639673 and parameters: {'n_estimators': 10000, 'learning_rate': 0.13798432680647277, 'max_depth': 8, 'min_data_in_leaf': 90, 'l2_leaf_reg': 9.5}. Best is trial 0 with value: 0.7447483787031921.[0m


Fold 5 finished with score: 0.68326 in 1.81 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.74647 in 309.34 seconds.





Fold 2 finished with score: 0.74446 in 120.88 seconds.





Fold 3 finished with score: 0.74610 in 380.34 seconds.

Fold 4 finished with score: 0.74763 in 167.58 seconds.



[32m[I 2023-04-20 08:38:34,679][0m Trial 6 finished with value: 0.7455764426030733 and parameters: {'n_estimators': 10000, 'learning_rate': 0.08739798206271002, 'max_depth': 10, 'min_data_in_leaf': 60, 'l2_leaf_reg': 6.5}. Best is trial 6 with value: 0.7455764426030733.[0m


Fold 5 finished with score: 0.74323 in 245.90 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.67822 in 1.93 seconds.





Fold 2 finished with score: 0.68104 in 1.80 seconds.

Fold 3 finished with score: 0.68026 in 1.77 seconds.

Fold 4 finished with score: 0.68011 in 1.77 seconds.



[32m[I 2023-04-20 08:38:43,779][0m Trial 7 finished with value: 0.6797165641722442 and parameters: {'n_estimators': 10000, 'learning_rate': 0.01394195918377152, 'max_depth': 6, 'min_data_in_leaf': 40, 'l2_leaf_reg': 2.0}. Best is trial 6 with value: 0.7455764426030733.[0m


Fold 5 finished with score: 0.67895 in 1.78 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.68388 in 1.87 seconds.





Fold 2 finished with score: 0.68219 in 1.83 seconds.

Fold 3 finished with score: 0.68354 in 1.85 seconds.

Fold 4 finished with score: 0.68269 in 1.80 seconds.



[32m[I 2023-04-20 08:38:53,010][0m Trial 8 finished with value: 0.6831121089639673 and parameters: {'n_estimators': 10000, 'learning_rate': 0.20780017596544137, 'max_depth': 8, 'min_data_in_leaf': 60, 'l2_leaf_reg': 2.5}. Best is trial 6 with value: 0.7455764426030733.[0m


Fold 5 finished with score: 0.68326 in 1.84 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.67114 in 1.80 seconds.





Fold 2 finished with score: 0.68089 in 1.79 seconds.

Fold 3 finished with score: 0.67992 in 1.75 seconds.

Fold 4 finished with score: 0.67952 in 1.79 seconds.



[32m[I 2023-04-20 08:39:01,990][0m Trial 9 finished with value: 0.6780829762352222 and parameters: {'n_estimators': 10000, 'learning_rate': 0.17097105823974296, 'max_depth': 5, 'min_data_in_leaf': 60, 'l2_leaf_reg': 3.0}. Best is trial 6 with value: 0.7455764426030733.[0m


Fold 5 finished with score: 0.67895 in 1.78 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.68750 in 2.17 seconds.





Fold 2 finished with score: 0.68219 in 1.83 seconds.

Fold 3 finished with score: 0.68354 in 1.85 seconds.

Fold 4 finished with score: 0.68269 in 1.83 seconds.



[32m[I 2023-04-20 08:39:11,636][0m Trial 10 finished with value: 0.6850096933296335 and parameters: {'n_estimators': 10000, 'learning_rate': 0.12102978621705443, 'max_depth': 10, 'min_data_in_leaf': 100, 'l2_leaf_reg': 5.5}. Best is trial 6 with value: 0.7455764426030733.[0m


Fold 5 finished with score: 0.68912 in 1.90 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.68753 in 2.20 seconds.





Fold 2 finished with score: 0.68219 in 1.83 seconds.

Fold 3 finished with score: 0.68354 in 1.79 seconds.

Fold 4 finished with score: 0.68269 in 1.82 seconds.



[32m[I 2023-04-20 08:39:21,235][0m Trial 11 finished with value: 0.6850139140302572 and parameters: {'n_estimators': 10000, 'learning_rate': 0.09079943719281537, 'max_depth': 10, 'min_data_in_leaf': 20, 'l2_leaf_reg': 7.0}. Best is trial 6 with value: 0.7455764426030733.[0m


Fold 5 finished with score: 0.68912 in 1.89 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.68750 in 2.17 seconds.





Fold 2 finished with score: 0.68219 in 1.81 seconds.

Fold 3 finished with score: 0.68354 in 1.81 seconds.

Fold 4 finished with score: 0.68269 in 1.81 seconds.



[32m[I 2023-04-20 08:39:30,810][0m Trial 12 finished with value: 0.6850096933296335 and parameters: {'n_estimators': 10000, 'learning_rate': 0.0965905210897734, 'max_depth': 10, 'min_data_in_leaf': 40, 'l2_leaf_reg': 6.0}. Best is trial 6 with value: 0.7455764426030733.[0m


Fold 5 finished with score: 0.68912 in 1.91 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.68980 in 2.19 seconds.





Fold 2 finished with score: 0.68219 in 1.79 seconds.

Fold 3 finished with score: 0.68354 in 1.81 seconds.

Fold 4 finished with score: 0.68269 in 1.82 seconds.



[32m[I 2023-04-20 08:39:40,325][0m Trial 13 finished with value: 0.6842863355598249 and parameters: {'n_estimators': 10000, 'learning_rate': 0.03350149773130505, 'max_depth': 9, 'min_data_in_leaf': 20, 'l2_leaf_reg': 8.0}. Best is trial 6 with value: 0.7455764426030733.[0m


Fold 5 finished with score: 0.68321 in 1.84 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.69833 in 5.83 seconds.





Fold 2 finished with score: 0.68219 in 1.84 seconds.

Fold 3 finished with score: 0.68354 in 1.81 seconds.

Fold 4 finished with score: 0.68269 in 1.81 seconds.



[32m[I 2023-04-20 08:39:53,600][0m Trial 14 finished with value: 0.6871749328087671 and parameters: {'n_estimators': 10000, 'learning_rate': 0.07251690790490986, 'max_depth': 10, 'min_data_in_leaf': 70, 'l2_leaf_reg': 5.0}. Best is trial 6 with value: 0.7455764426030733.[0m


Fold 5 finished with score: 0.68912 in 1.92 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.68727 in 1.91 seconds.





Fold 2 finished with score: 0.68219 in 1.83 seconds.

Fold 3 finished with score: 0.68354 in 1.81 seconds.

Fold 4 finished with score: 0.68269 in 1.83 seconds.



[32m[I 2023-04-20 08:40:02,893][0m Trial 15 finished with value: 0.6837800233876024 and parameters: {'n_estimators': 10000, 'learning_rate': 0.01788336349120663, 'max_depth': 9, 'min_data_in_leaf': 50, 'l2_leaf_reg': 7.0}. Best is trial 6 with value: 0.7455764426030733.[0m


Fold 5 finished with score: 0.68321 in 1.85 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.68388 in 1.87 seconds.





Fold 2 finished with score: 0.68219 in 1.81 seconds.

Fold 3 finished with score: 0.68354 in 1.80 seconds.

Fold 4 finished with score: 0.68269 in 1.82 seconds.



[32m[I 2023-04-20 08:40:12,093][0m Trial 16 finished with value: 0.6831121089639673 and parameters: {'n_estimators': 10000, 'learning_rate': 0.10129852924151472, 'max_depth': 8, 'min_data_in_leaf': 80, 'l2_leaf_reg': 8.5}. Best is trial 6 with value: 0.7455764426030733.[0m


Fold 5 finished with score: 0.68326 in 1.83 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.68387 in 1.79 seconds.





Fold 2 finished with score: 0.68219 in 1.77 seconds.

Fold 3 finished with score: 0.68354 in 1.78 seconds.

Fold 4 finished with score: 0.68269 in 1.78 seconds.



[32m[I 2023-04-20 08:40:21,073][0m Trial 17 finished with value: 0.6837893166738712 and parameters: {'n_estimators': 10000, 'learning_rate': 0.05293289793334697, 'max_depth': 7, 'min_data_in_leaf': 30, 'l2_leaf_reg': 10.0}. Best is trial 6 with value: 0.7455764426030733.[0m


Fold 5 finished with score: 0.68665 in 1.79 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.68752 in 2.14 seconds.





Fold 2 finished with score: 0.68219 in 1.81 seconds.

Fold 3 finished with score: 0.68354 in 1.81 seconds.

Fold 4 finished with score: 0.68269 in 1.82 seconds.



[32m[I 2023-04-20 08:40:30,630][0m Trial 18 finished with value: 0.6850122168815417 and parameters: {'n_estimators': 10000, 'learning_rate': 0.11982999571267233, 'max_depth': 10, 'min_data_in_leaf': 50, 'l2_leaf_reg': 1.0}. Best is trial 6 with value: 0.7455764426030733.[0m


Fold 5 finished with score: 0.68912 in 1.90 seconds.



  CatBoostPruningCallback(trial, 'TotalF1')


Fold 1 finished with score: 0.69190 in 2.52 seconds.





Fold 2 finished with score: 0.68219 in 1.81 seconds.

Fold 3 finished with score: 0.68354 in 1.82 seconds.

Fold 4 finished with score: 0.68269 in 1.82 seconds.



[32m[I 2023-04-20 08:40:40,547][0m Trial 19 finished with value: 0.6847069055483523 and parameters: {'n_estimators': 10000, 'learning_rate': 0.05062764650864782, 'max_depth': 9, 'min_data_in_leaf': 80, 'l2_leaf_reg': 6.5}. Best is trial 6 with value: 0.7455764426030733.[0m


Fold 5 finished with score: 0.68321 in 1.87 seconds.



In [63]:
print(f"\tBest value (f1 score): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (f1 score): 0.74558
	Best params:
		n_estimators: 10000
		learning_rate: 0.08739798206271002
		max_depth: 10
		min_data_in_leaf: 60
		l2_leaf_reg: 6.5


In [7]:
"""
params = {
    'loss_function':'MultiClass', # objective function
    'eval_metric':'TotalF1', # metric
    'cat_features': categorical_columns,
    'early_stopping_rounds': 100,
    'verbose': 200, # output to stdout info about training process every 200 iterations
    'random_seed': 42,
    'n_estimators': 10000,
    'learning_rate': 0.19971663552268112,
    'max_depth': 8,
    'min_data_in_leaf': 50
}
"""
params = {
    'loss_function':'MultiClass', # objective function
    'eval_metric':'TotalF1', # metric
    'cat_features': categorical_columns,
    'early_stopping_rounds': 100,
    'verbose': 200, # output to stdout info about training process every 200 iterations
    'random_seed': 42,
    'n_estimators': 10000,
    'learning_rate': 0.08739798206271002,
    'max_depth': 10,
    'min_data_in_leaf': 60,
    'l2_leaf_reg': 6.5
}

In [8]:
"""
params = {'loss_function':'MultiClass', # objective function
          'eval_metric':'TotalF1', # metric
          'cat_features': categorical_columns,
          'early_stopping_rounds': 200,
          'verbose': 200, # output to stdout info about training process every 200 iterations
          'random_seed': 42,
          'iterations': 1200,
          'depth': 8,
         }
"""
n_fold = 5 # amount of data folds
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

test_data = Pool(data=test_values,
                 cat_features=categorical_columns)

scores = []
train_prediction = np.zeros((train_values.shape[0],3))
test_prediction = np.zeros((test_values.shape[0],3))
for fold_n, (train_index, valid_index) in enumerate(folds.split(train_values, train_labels)):

    X_train, X_valid = train_values.iloc[train_index], train_values.iloc[valid_index] # train and validation data splits
    y_train, y_valid = train_labels.iloc[train_index], train_labels.iloc[valid_index]
    
    train_data = Pool(data=X_train, 
                      label=y_train,
                      cat_features=categorical_columns)
    valid_data = Pool(data=X_valid, 
                      label=y_valid,
                      cat_features=categorical_columns)
    
    model = CatBoostClassifier(**params)
    
    model.fit(train_data,
              eval_set=valid_data, 
              use_best_model=True
             )
    
    score = model.get_best_score()['validation']['TotalF1']
    scores.append(score)

    y_pred = model.predict_proba(valid_data)
    train_prediction[valid_index, :] = y_pred
    
    y_pred = model.predict_proba(test_data)
    test_prediction += y_pred

test_prediction /= n_fold
print('Before Pseudo-labeling')
print('CV mean: {:.4f}, CV std: {:.4f}'.format(np.mean(scores), np.std(scores)))


0:	learn: 0.6864626	test: 0.6875259	best: 0.6875259 (0)	total: 324ms	remaining: 53m 59s
200:	learn: 0.7457952	test: 0.7400085	best: 0.7400144 (197)	total: 51.2s	remaining: 41m 34s
400:	learn: 0.7576795	test: 0.7421408	best: 0.7422539 (399)	total: 1m 46s	remaining: 42m 20s
600:	learn: 0.7682523	test: 0.7438693	best: 0.7440492 (592)	total: 2m 40s	remaining: 41m 56s
800:	learn: 0.7794053	test: 0.7449686	best: 0.7453196 (780)	total: 3m 37s	remaining: 41m 42s
1000:	learn: 0.7899731	test: 0.7462547	best: 0.7464703 (993)	total: 4m 32s	remaining: 40m 51s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.746470335
bestIteration = 993

Shrink model to first 994 iterations.
0:	learn: 0.6768078	test: 0.6821929	best: 0.6821929 (0)	total: 145ms	remaining: 24m 6s
200:	learn: 0.7449691	test: 0.7424643	best: 0.7424643 (200)	total: 50.5s	remaining: 41m 1s
400:	learn: 0.7579400	test: 0.7436989	best: 0.7444559 (337)	total: 1m 44s	remaining: 41m 53s
Stopped by overfitting detector  (100 

In [80]:
prediction_labels = np.argmax(test_prediction, axis=1)+1
submission = pd.DataFrame(data=prediction_labels,
                             columns=submission_format.columns,
                             index=submission_format.index)
submission.to_csv('catboost_submission.csv')
submission.value_counts()

damage_grade
2               56082
3               24329
1                6457
dtype: int64

In [81]:
# Pseudo-labeling

threshold = 0.8

test_X_pseudo_1 = test_values.iloc[np.where(prediction[:,0] > threshold)]
test_y_pseudo_1 = [1]*test_X_pseudo_1.shape[0]
test_X_pseudo_2 = test_values.iloc[np.where(prediction[:,1] > threshold)]
test_y_pseudo_2 = [2]*test_X_pseudo_2.shape[0]
test_X_pseudo_3 = test_values.iloc[np.where(prediction[:,2] > threshold)]
test_y_pseudo_3 = [3]*test_X_pseudo_3.shape[0]

test_pseudo = pd.concat([test_X_pseudo_1, test_X_pseudo_2, test_X_pseudo_3]).reset_index(drop = True)
test_pseudo['damage_grade'] = pd.Series(test_y_pseudo_1+test_y_pseudo_2+test_y_pseudo_3)
test_pseudo_y = pd.DataFrame(test_pseudo['damage_grade'])
test_pseudo_X = test_pseudo.drop('damage_grade', axis = 1)
#test_pseudo_sample = test_pseudo.sample(int(0.2*test_values.shape[0]))

train_X_pseudo = pd.concat([train_values, test_pseudo_X],ignore_index=True)
train_y_pseudo = pd.concat([train_labels, test_pseudo_y],ignore_index=True)

n_fold = 5 # amount of data folds
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

test_data = Pool(data=test_values,
                 cat_features=categorical_columns)

scores = []
train_prediction = np.zeros((train_X_pseudo.shape[0],3))
test_prediction = np.zeros((test_values.shape[0],3))
for fold_n, (train_index, valid_index) in enumerate(folds.split(train_X_pseudo, train_y_pseudo)):

    X_train, X_valid = train_X_pseudo.iloc[train_index], train_X_pseudo.iloc[valid_index] # train and validation data splits
    y_train, y_valid = train_y_pseudo.iloc[train_index], train_y_pseudo.iloc[valid_index]
    
    train_data = Pool(data=X_train, 
                      label=y_train,
                      cat_features=categorical_columns)
    valid_data = Pool(data=X_valid, 
                      label=y_valid,
                      cat_features=categorical_columns)
    
    model = CatBoostClassifier(**params)
    model.fit(train_data,
              eval_set=valid_data, 
              use_best_model=True
             )
    
    score = model.get_best_score()['validation']['TotalF1']
    scores.append(score)

    y_pred = model.predict_proba(valid_data)
    train_prediction[valid_index, :] = y_pred
    
    y_pred = model.predict_proba(test_data)
    test_prediction += y_pred

test_prediction /= n_fold
print('After Pseudo-labeling')
print('CV mean: {:.4f}, CV std: {:.4f}'.format(np.mean(scores), np.std(scores)))

0:	learn: 0.7211742	test: 0.7231986	best: 0.7231986 (0)	total: 243ms	remaining: 40m 27s
200:	learn: 0.7771448	test: 0.7732626	best: 0.7732626 (200)	total: 57.4s	remaining: 46m 36s
400:	learn: 0.7870904	test: 0.7740902	best: 0.7744125 (386)	total: 2m	remaining: 48m 14s
600:	learn: 0.7971353	test: 0.7763373	best: 0.7765316 (596)	total: 3m 3s	remaining: 47m 53s
800:	learn: 0.8052758	test: 0.7767412	best: 0.7770490 (788)	total: 4m 8s	remaining: 47m 38s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7770490069
bestIteration = 788

Shrink model to first 789 iterations.
0:	learn: 0.7207934	test: 0.7234581	best: 0.7234581 (0)	total: 255ms	remaining: 42m 26s
200:	learn: 0.7772021	test: 0.7739632	best: 0.7739632 (200)	total: 58.1s	remaining: 47m 14s
400:	learn: 0.7877940	test: 0.7753828	best: 0.7756133 (392)	total: 2m	remaining: 48m 3s
600:	learn: 0.7976423	test: 0.7763167	best: 0.7764410 (599)	total: 3m 3s	remaining: 47m 57s
800:	learn: 0.8072582	test: 0.7763167	best: 0.77

In [82]:
prediction_labels = np.argmax(test_prediction, axis=1)+1
submission = pd.DataFrame(data=prediction_labels,
                             columns=submission_format.columns,
                             index=submission_format.index)
submission.to_csv('pseudo_catboost_submission.csv')
submission.value_counts()

damage_grade
2               56041
3               24286
1                6541
dtype: int64

In [9]:
test_prediction_df = pd.DataFrame(data=test_prediction,
                             columns=['1','2','3'],
                             index=submission_format.index)
test_prediction_df.to_csv('test_pseudo_catboost_prediction.csv')
test_prediction_df.shape

(86868, 3)

In [10]:
train_prediction_df = pd.DataFrame(data=train_prediction[:train_values.shape[0],:],
                             columns=['1','2','3'])
train_prediction_df.to_csv('train_pseudo_catboost_prediction.csv')
train_prediction_df.shape

(260601, 3)

In [None]:
"""
predictions = []
for model in models:
    predictions.append(model.predict(X_test))
predictions = np.concatenate(predictions, axis=1)
print(predictions.shape)
predictions = stats.mode(predictions, axis=1)[0].reshape(-1)
print(predictions.shape)
"""

"""
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in train_values.select_dtypes(include='O').columns:
    train_values[col]=le.fit_transform(train_values[col])

smt = SMOTETomek(sampling_strategy='auto')
X_smt, y_smt = smt.fit_resample(train_values, train_labels)

#plot_2d_space(X_smt, y_smt, 'SMOTE + Tomek links')
"""

"\nfrom imblearn.combine import SMOTETomek\nfrom sklearn.preprocessing import LabelEncoder\n\nle = LabelEncoder()\nfor col in train_values.select_dtypes(include='O').columns:\n    train_values[col]=le.fit_transform(train_values[col])\n\nsmt = SMOTETomek(sampling_strategy='auto')\nX_smt, y_smt = smt.fit_resample(train_values, train_labels)\n\n#plot_2d_space(X_smt, y_smt, 'SMOTE + Tomek links')\n"