In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Sequence, List
from sklearn.model_selection import train_test_split, StratifiedKFold
SEED = 42  # random seed; forwarded as `seed=` to every XGBClassifier trained below
GPU_ID = 0  # GPU device index; forwarded as `gpu_id=` to every XGBClassifier trained below

In [2]:
# Load the labelled training table; the "Id" column becomes the row index.
TRAIN_CSV_PATH = "/kaggle/input/fall-ml2-mipt-2022/train.csv"
train_df = pd.read_csv(TRAIN_CSV_PATH, index_col="Id")

In [3]:
# Make the label column integer-typed. `astype(int)` is the vectorized
# equivalent of calling `int` on each element with `.apply`.
train_df["Category"] = train_df["Category"].astype(int)

In [4]:
# Preview five random rows; seeded so a fresh "Restart & Run All" shows the same rows.
train_df.sample(5, random_state=SEED)

Unnamed: 0_level_0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_391,x_392,x_393,x_394,x_395,x_396,x_397,x_398,x_399,Category
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1215,0.006693,-0.001064,-0.006892,0.014032,0.054385,-0.005642,0.001221,0.002902,0.012434,0.003785,...,-1.276993,0.120938,1.201245,0.003317,0.155524,0.029784,0.575387,0.200899,0.040871,5
7779,-0.016078,-0.00921,-0.002227,0.002441,-0.009096,0.003098,-0.003394,0.004631,-0.008543,0.008599,...,1.534085,-0.030018,-2.340589,0.008323,0.551283,0.001453,2.098261,0.428228,-0.043324,9
7852,0.000577,0.000534,0.007742,0.012345,0.009163,0.004301,-0.011658,-0.008075,0.014636,0.011465,...,2.882422,0.059333,-1.924507,0.007635,-0.504841,0.044238,2.708727,1.124071,0.017038,15
3077,0.001272,0.012638,0.012538,-0.003586,0.012777,0.009809,-0.007599,0.000347,-0.008807,0.006637,...,0.239884,0.078222,-0.914482,0.025377,-1.149579,0.016397,-0.355844,-0.808708,0.018965,15
1138,0.007297,0.009756,0.003809,-0.024964,-0.003309,0.020713,0.004888,0.00042,0.010038,-0.018755,...,-0.896569,-0.011944,0.700887,-0.020561,-0.44827,-0.020873,-0.253505,0.052463,-0.007987,3


In [5]:
# Per-column summary statistics (count, mean, std, min, quartiles, max),
# used to eyeball the feature scales and the range of the Category label.
train_df.describe()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_391,x_392,x_393,x_394,x_395,x_396,x_397,x_398,x_399,Category
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.019503,0.006501,0.015427,0.031731,0.033968,0.024515,0.006047,0.023426,0.024212,0.030499,...,-0.001936,0.000199,-0.007278,0.000172,-0.012318,0.000856,-0.00732,-0.00023,0.000224,17.0625
std,0.361609,0.347227,0.344725,0.368493,0.389563,0.368459,0.33187,0.373213,0.345279,0.384832,...,1.0048,0.045655,0.99883,0.032463,1.006192,0.03769,0.993976,1.002536,0.047652,9.724107
min,-3.164112,-3.010361,-3.308394,-2.881321,-3.439496,-2.698778,-3.180149,-3.014124,-2.824733,-3.439037,...,-3.634521,-0.325977,-3.938113,-0.270285,-3.523945,-0.265358,-3.933017,-3.77377,-0.297632,0.0
25%,-0.00752,-0.007291,-0.007233,-0.007286,-0.006951,-0.007153,-0.007183,-0.007398,-0.007143,-0.006988,...,-0.689239,-0.021368,-0.688154,-0.01264,-0.700943,-0.014773,-0.67776,-0.672713,-0.022221,9.0
50%,0.000691,0.000188,0.000622,0.000758,0.000727,0.000653,0.000171,0.000581,0.000663,0.000931,...,-0.011012,7.3e-05,0.001013,0.000126,-0.013112,0.00033,-0.023013,-0.000571,-0.000253,17.0
75%,0.008769,0.008157,0.008546,0.009497,0.009073,0.008897,0.008016,0.008989,0.009019,0.009359,...,0.688578,0.021935,0.66918,0.012842,0.6697,0.016359,0.663634,0.668707,0.022257,27.0
max,4.044388,3.526068,3.014865,2.999935,3.055828,3.344133,3.282431,2.998927,3.491133,3.406931,...,3.90998,0.246096,4.278834,0.238513,3.801937,0.247514,3.949601,3.671366,0.371345,31.0


In [6]:
def stratified_k_fold(ids: Sequence, target: Sequence, n_splits: int=3, random_state=42) -> List[List[List]]:
    """Build stratified train / validation / test partitions of ``ids``.

    The samples are first cut into ``n_splits`` shuffled stratified folds.
    For each fold, the held-out part is cut once more into two stratified
    halves (without shuffling): the first becomes the validation set, the
    second the test set.

    Parameters
    ----------
    ids : Sequence
        Sample identifiers (e.g. DataFrame index labels), one per sample.
    target : Sequence
        Class label of each sample, aligned with ``ids`` by position.
    n_splits : int, default 3
        Number of outer folds, i.e. number of partitions returned.
    random_state : int, default 42
        Seed for the shuffling of the outer folds.

    Returns
    -------
    List[List[List]]
        One ``[train_ids, val_ids, test_ids]`` triple per fold. The entries are
        elements of ``ids`` (not positions), so they can be used directly for
        label-based lookups such as ``DataFrame.loc``.
    """
    # StratifiedKFold.split yields *positional* indices. Work on plain arrays so
    # that every lookup below is positional too (indexing a pandas Series with
    # an int is label-based and would silently pick the wrong rows whenever the
    # index is not 0..n-1).
    ids_arr = np.asarray(ids)
    target_arr = np.asarray(target)

    outer_skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # The 50/50 validation/test cut is unshuffled, hence deterministic.
    inner_skf = StratifiedKFold(2)

    split = []
    for train_pos, holdout_pos in outer_skf.split(ids_arr, target_arr):
        # Only the first inner fold is needed: it halves the held-out part.
        val_rel, test_rel = next(inner_skf.split(holdout_pos, target_arr[holdout_pos]))
        # val_rel / test_rel index into holdout_pos; map everything back to ids.
        split.append([
            ids_arr[train_pos].tolist(),
            ids_arr[holdout_pos[val_rel]].tolist(),
            ids_arr[holdout_pos[test_rel]].tolist(),
        ])

    return split

In [7]:
# Five [train, val, test] partitions, stratified by Category. The shuffle seed is
# tied to the notebook-wide SEED instead of relying on the function's default.
split = stratified_k_fold(
    train_df.index.to_list(),
    train_df["Category"],
    n_splits=5,
    random_state=SEED,
)

In [8]:
# Sizes of the train / validation / test parts of the first fold.
tuple(len(part) for part in split[0])

(8000, 1000, 1000)

In [9]:
import xgboost

In [10]:
# Train one XGBoost classifier per fold and collect the fitted models together
# with their per-iteration validation metrics.
models, val_preds = [], []
eval_metrics = ['mlogloss', 'auc', 'merror']
results = []

n_estimators = 1000
max_depth = 5
reg_alpha = 10

# The feature columns are the same for every fold: build the list once instead
# of four times per iteration (and without reusing the fold counter's name).
feature_cols = [f"x_{col}" for col in range(400)]

# NOTE: the test part of each fold (test_ids) is not used in this cell.
for i, (train_ids, val_ids, test_ids) in enumerate(split):
    print(f"FOLD #{i}")
    model = xgboost.XGBClassifier(n_estimators=n_estimators, tree_method='gpu_hist', reg_alpha=reg_alpha, learning_rate=0.01, gpu_id=GPU_ID,
                                  objective="multi:softprob", num_class=32, eval_metric=eval_metrics, max_depth=max_depth, seed=SEED)

    # Select rows and columns in a single .loc call to avoid an intermediate
    # full-width copy of the selected rows.
    X_train = train_df.loc[train_ids, feature_cols]
    y_train = train_df.loc[train_ids, "Category"]
    X_val = train_df.loc[val_ids, feature_cols]
    y_val = train_df.loc[val_ids, "Category"]

    model.fit(X_train, y_train, verbose=False, eval_set=[(X_val, y_val)])
    models.append(model)
    results.append(model.evals_result())
    model.save_model(f'model_{i}.json')

    # Report the value of each validation metric after the last boosting round.
    print({k: m[-1] for k, m in results[i]["validation_0"].items()})

FOLD #0
{'mlogloss': 2.6900833263099195, 'auc': 0.7891548532713187, 'merror': 0.733}
FOLD #1
{'mlogloss': 2.679531070411205, 'auc': 0.7915545341848914, 'merror': 0.749}
FOLD #2
{'mlogloss': 2.6473266729712486, 'auc': 0.7949330165991852, 'merror': 0.723}
FOLD #3
{'mlogloss': 2.6793478204905985, 'auc': 0.7926650707558932, 'merror': 0.739}
FOLD #4
{'mlogloss': 2.7003428756892682, 'auc': 0.7940933450599799, 'merror': 0.737}


In [11]:
# Predict the competition test set with every fold model, average the class
# probabilities across models, and keep the most probable class per row.
test_df = pd.read_csv("/kaggle/input/fall-ml2-mipt-2022/test.csv", index_col="Id")
test_features = test_df[[f"x_{i}" for i in range(400)]]
fold_probas = [model.predict_proba(test_features) for model in models]
probas = np.mean(fold_probas, axis=0)
test_df["Category"] = probas.argmax(axis=1)

In [12]:
# Show only the predicted label column (argmax of the fold-averaged probabilities).
test_df[["Category"]]

Unnamed: 0_level_0,Category
Id,Unnamed: 1_level_1
0,31
1,3
2,31
3,3
4,30
...,...
995,3
996,6
997,16
998,12
