In [1]:
import warnings

warnings.filterwarnings(action='ignore')

import numpy as np
import pandas as pd
import scipy.stats as ss
import seaborn as sns
import matplotlib.pyplot as plt
import optuna

from tqdm.notebook import tqdm
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from feature_engine.outliers import Winsorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif, f_classif
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.kernel_approximation import AdditiveChi2Sampler
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.feature_selection import RFECV, SelectKBest

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.cluster import KMeans, FeatureAgglomeration

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
# Read the competition files. The `id` column is removed from both feature
# frames and is used as the index of the sample-submission template.
train = pd.read_csv('./data/train.csv').drop(columns='id')
test = pd.read_csv('./data/test.csv').drop(columns='id')
sub = pd.read_csv('./data/sample_submission.csv', index_col='id')

In [3]:
# Display the training frame (pandas truncates the rendering to head/tail rows).
train

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,584,590,909972,909977,16,8,5,2274,113,140,...,-0.5000,-0.0104,0.1417,0,0,0,1,0,0,0
1,808,816,728350,728372,433,20,54,44478,70,111,...,0.7419,-0.2997,0.9491,0,0,0,0,0,0,1
2,39,192,2212076,2212144,11388,705,420,1311391,29,141,...,-0.0105,-0.0944,1.0000,0,0,1,0,0,0,0
3,781,789,3353146,3353173,210,16,29,3202,114,134,...,0.6667,-0.0402,0.4025,0,0,1,0,0,0,0
4,1540,1560,618457,618502,521,72,67,48231,82,111,...,0.9158,-0.2455,0.9998,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19214,749,757,143210,143219,17,4,4,2193,122,140,...,-0.1429,0.0044,0.2901,0,0,0,1,0,0,0
19215,723,735,2488529,2488541,231,17,26,27135,104,133,...,0.7222,-0.0989,0.5378,0,0,0,0,0,0,1
19216,6,31,1578055,1578129,780,114,98,71112,41,94,...,0.7719,-0.4283,0.9997,1,0,0,0,0,0,0
19217,9,18,1713172,1713184,126,13,26,14808,88,132,...,0.9610,-0.1162,0.3509,0,0,0,0,0,0,1


In [4]:
# Sanity check: (rows, columns) of the train, test and submission frames.
train.shape, test.shape, sub.shape

((19219, 34), (12814, 27), (12814, 7))

In [5]:
# Names of the target columns: the last seven columns of the training frame.
ys = train.columns[-7:].tolist()

In [6]:
# Separate the feature matrix from the multi-label target matrix.
X = train.drop(columns=ys)
Y = train[ys]

In [7]:
# Count missing values per feature column.
X.isna().sum()

X_Minimum                0
X_Maximum                0
Y_Minimum                0
Y_Maximum                0
Pixels_Areas             0
X_Perimeter              0
Y_Perimeter              0
Sum_of_Luminosity        0
Minimum_of_Luminosity    0
Maximum_of_Luminosity    0
Length_of_Conveyer       0
TypeOfSteel_A300         0
TypeOfSteel_A400         0
Steel_Plate_Thickness    0
Edges_Index              0
Empty_Index              0
Square_Index             0
Outside_X_Index          0
Edges_X_Index            0
Edges_Y_Index            0
Outside_Global_Index     0
LogOfAreas               0
Log_X_Index              0
Log_Y_Index              0
Orientation_Index        0
Luminosity_Index         0
SigmoidOfAreas           0
dtype: int64

In [8]:
# Mutual information between every raw feature and each target.
# Rows are features; columns are the positional index of the target in `Y`.
# The feature/target counts are derived from the data instead of being
# hard-coded, and the estimator is seeded so the table is reproducible
# (mutual_info_classif adds random noise to continuous features).
raw_features = train.iloc[:, :-Y.shape[1]]

mut_info = pd.DataFrame(
    {
        i: mutual_info_classif(raw_features, Y.iloc[:, i], random_state=48)
        for i in range(Y.shape[1])
    },
    index=raw_features.columns,
)

mut_info

Unnamed: 0,0,1,2,3,4,5,6
X_Minimum,0.025458,0.034972,0.293916,0.024944,0.01526,0.060018,0.040471
X_Maximum,0.026515,0.033673,0.293229,0.021911,0.015916,0.055695,0.039463
Y_Minimum,0.007726,0.003044,0.152746,0.015953,0.003855,0.034486,0.024232
Y_Maximum,0.006852,0.002633,0.150847,0.016745,0.00432,0.031649,0.024016
Pixels_Areas,0.021564,0.010237,0.305906,0.080198,0.008623,0.066648,0.04218
X_Perimeter,0.015475,0.012842,0.304042,0.057974,0.008487,0.056322,0.035032
Y_Perimeter,0.017154,0.010074,0.278087,0.073169,0.007236,0.06586,0.029896
Sum_of_Luminosity,0.017132,0.012868,0.296848,0.071552,0.008003,0.063846,0.034652
Minimum_of_Luminosity,0.018078,0.012595,0.279333,0.032425,0.009289,0.056504,0.037018
Maximum_of_Luminosity,0.005376,0.010287,0.076491,0.021552,0.003297,0.013846,0.009348


In [9]:
# Pairwise correlation matrix of the raw features (pandas default method).
X.corr()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
X_Minimum,1.0,0.989767,0.016071,0.014247,-0.464759,-0.451961,-0.426514,-0.430073,0.337748,-0.132955,...,-0.522112,0.112746,0.460421,0.222109,-0.505024,-0.530728,-0.375324,0.276871,-0.068844,-0.376973
X_Maximum,0.989767,1.0,0.018909,0.016513,-0.391937,-0.378434,-0.356528,-0.363092,0.275571,-0.129669,...,-0.432545,0.095799,0.391743,0.190474,-0.428875,-0.448348,-0.31677,0.237157,-0.079235,-0.319598
Y_Minimum,0.016071,0.018909,1.0,0.969552,-0.00737,-0.012712,-0.024876,-0.008189,-0.017321,-0.02916,...,0.019685,0.05271,-0.002647,-0.025374,-0.001754,0.015116,-0.023702,-0.044202,-0.023,-0.031511
Y_Maximum,0.014247,0.016513,0.969552,1.0,-0.007019,-0.012377,-0.024798,-0.007809,-0.018739,-0.028312,...,0.017271,0.051546,-0.001382,-0.023285,-0.000995,0.015624,-0.022941,-0.04249,-0.022869,-0.030345
Pixels_Areas,-0.464759,-0.391937,-0.00737,-0.007019,1.0,0.835079,0.834543,0.797843,-0.622867,0.130499,...,0.76481,-0.303335,-0.650704,-0.205579,0.77458,0.749731,0.640033,-0.269013,-0.015081,0.542672
X_Perimeter,-0.451961,-0.378434,-0.012712,-0.012377,0.835079,1.0,0.912579,0.802072,-0.603891,0.158036,...,0.75247,-0.378672,-0.664781,-0.174615,0.771546,0.750418,0.667441,-0.234969,0.005212,0.561256
Y_Perimeter,-0.426514,-0.356528,-0.024876,-0.024798,0.834543,0.912579,1.0,0.809171,-0.602826,0.160831,...,0.700036,-0.469092,-0.612157,-0.081225,0.790863,0.705657,0.761117,-0.113968,-0.00565,0.608167
Sum_of_Luminosity,-0.430073,-0.363092,-0.008189,-0.007809,0.797843,0.802072,0.809171,1.0,-0.580857,0.128569,...,0.703912,-0.315125,-0.608297,-0.177048,0.726483,0.698035,0.615651,-0.234625,-0.013845,0.520282
Minimum_of_Luminosity,0.337748,0.275571,-0.017321,-0.018739,-0.622867,-0.603891,-0.602826,-0.580857,1.0,0.397265,...,-0.638549,0.234592,0.467841,0.11248,-0.691406,-0.637711,-0.585194,0.179385,0.619174,-0.504692
Maximum_of_Luminosity,-0.132955,-0.129669,-0.02916,-0.028312,0.130499,0.158036,0.160831,0.128569,0.397265,1.0,...,0.07461,-0.014785,-0.153276,-0.120083,0.033191,0.079426,0.00465,-0.113531,0.853856,0.018501


In [10]:
# Feature engineering on the training frame.
# New columns are appended in a fixed order; the quantile edges `bins_1`..`bins_5`
# are kept as notebook variables because the test set is binned with them later.

# Handles to the raw columns that several engineered features reuse.
area, log_area = X['Pixels_Areas'], X['LogOfAreas']
log_x, log_y = X['Log_X_Index'], X['Log_Y_Index']
x_perim, y_perim = X['X_Perimeter'], X['Y_Perimeter']
lum_sum = X['Sum_of_Luminosity']
lum_min, lum_max = X['Minimum_of_Luminosity'], X['Maximum_of_Luminosity']
thickness, conveyer = X['Steel_Plate_Thickness'], X['Length_of_Conveyer']
edges_y, outside_x = X['Edges_Y_Index'], X['Outside_X_Index']

# Simple non-linear transforms and geometric features.
X['col_1'] = np.sqrt(area)
X['col_2'] = np.exp(log_area)
X['col_2_1'] = np.exp(log_x)
X['col_2_2'] = np.exp(log_y)
X['col_3'] = x_perim / y_perim
X['col_4'] = (X['X_Maximum'] - X['X_Minimum']) * (X['Y_Maximum'] - X['Y_Minimum'])
X['col_5'] = np.exp(log_x)
X['col_51'] = np.exp(log_y)

# Decile edges learned on the training data (duplicate edges are dropped).
binned_sources = ['LogOfAreas', 'Pixels_Areas', 'Sum_of_Luminosity',
                  'Steel_Plate_Thickness', 'Length_of_Conveyer']
bins_1, bins_2, bins_3, bins_4, bins_5 = (
    pd.qcut(X[src], q=10, retbins=True, duplicates='drop')[1] for src in binned_sources
)

# Integer bin index of each row for the five binned columns.
binned_targets = ['col_6', 'col_7', 'col_8', 'col_9', 'col_10']
for target, src, edges in zip(binned_targets, binned_sources, (bins_1, bins_2, bins_3, bins_4, bins_5)):
    X[target] = pd.cut(X[src], edges, labels=False, include_lowest=True)

# Ratios between paired raw measurements.
X['col_11'] = X['X_Minimum'] / X['X_Maximum']
X['col_12'] = X['Y_Minimum'] / X['Y_Maximum']
X['col_13'] = lum_sum / area
X['col_14'] = lum_min / lum_max
X['col_15'] = x_perim / y_perim

# Further transforms of area, luminosity, conveyer length and thickness.
X['col_16'] = np.cos(np.exp(log_area))
X['col_17'] = lum_max - lum_min
X['col_18'] = np.log1p(lum_sum)
X['col_19'] = np.log1p(conveyer)
X['col_20'] = np.log1p(thickness)

# Composite interaction features.
X['col_21'] = (X['Outside_Global_Index'] - edges_y) / (outside_x * log_x ** 2)
X['col_22'] = thickness / conveyer
X['col_23'] = thickness / edges_y * outside_x
X['col_24'] = (log_x * log_area) / edges_y
X['col_25'] = area / (x_perim + y_perim) * edges_y
X['col_26'] = (area / (y_perim * edges_y)) / (lum_sum - lum_min)

In [11]:
# test dataset
# Apply the same feature engineering to the test frame as was applied to `X`.


def _bin_with_train_edges(values, edges):
    """Return the 0-based bin index of each value using edges learned on train.

    Values outside [edges[0], edges[-1]] are clipped to the nearest outer edge
    first. Without the clip `pd.cut` returns NaN for them, which a later
    `fillna(0)` would turn into bin 0 even for values above the top edge, and
    the NaN would also force the column to a float dtype.
    """
    clipped = values.clip(lower=edges[0], upper=edges[-1])
    return pd.cut(clipped, edges, labels=False, include_lowest=True)


# Simple non-linear transforms and geometric features.
test['col_1'] = np.sqrt(test['Pixels_Areas'])
test['col_2'] = np.exp(test['LogOfAreas'])
test['col_2_1'] = np.exp(test['Log_X_Index'])
test['col_2_2'] = np.exp(test['Log_Y_Index'])
test['col_3'] = test['X_Perimeter'] / test['Y_Perimeter']
test['col_4'] = (test['X_Maximum'] - test['X_Minimum']) * (test['Y_Maximum'] - test['Y_Minimum'])
test[['col_5', 'col_51']] = np.exp(test[['Log_X_Index', 'Log_Y_Index']])

# Bin with the decile edges computed on the training data (bins_1..bins_5).
test['col_6'] = _bin_with_train_edges(test['LogOfAreas'], bins_1)
test['col_7'] = _bin_with_train_edges(test['Pixels_Areas'], bins_2)
test['col_8'] = _bin_with_train_edges(test['Sum_of_Luminosity'], bins_3)
test['col_9'] = _bin_with_train_edges(test['Steel_Plate_Thickness'], bins_4)
test['col_10'] = _bin_with_train_edges(test['Length_of_Conveyer'], bins_5)

# Ratios between paired raw measurements.
test['col_11'] = test['X_Minimum'] / test['X_Maximum']
test['col_12'] = test['Y_Minimum'] / test['Y_Maximum']
test['col_13'] = test['Sum_of_Luminosity'] / test['Pixels_Areas']
test['col_14'] = test['Minimum_of_Luminosity'] / test['Maximum_of_Luminosity']
test['col_15'] = test['X_Perimeter'] / test['Y_Perimeter']

# Further transforms of area, luminosity, conveyer length and thickness.
test['col_16'] = np.cos(np.exp(test['LogOfAreas']))
test['col_17'] = test['Maximum_of_Luminosity'] - test['Minimum_of_Luminosity']
test['col_18'] = np.log1p(test['Sum_of_Luminosity'])
test['col_19'] = np.log1p(test['Length_of_Conveyer'])
test['col_20'] = np.log1p(test['Steel_Plate_Thickness'])

# Composite interaction features.
test['col_21'] = (test['Outside_Global_Index'] - test['Edges_Y_Index']) / (
        test['Outside_X_Index'] * test['Log_X_Index'] ** 2)
test['col_22'] = test['Steel_Plate_Thickness'] / test['Length_of_Conveyer']
test['col_23'] = test['Steel_Plate_Thickness'] / test['Edges_Y_Index'] * test['Outside_X_Index']
test['col_24'] = (test['Log_X_Index'] * test['LogOfAreas']) / test['Edges_Y_Index']
test['col_25'] = test['Pixels_Areas'] / (test['X_Perimeter'] + test['Y_Perimeter']) * test['Edges_Y_Index']
test['col_26'] = (test['Pixels_Areas'] / (test['Y_Perimeter'] * test['Edges_Y_Index'])) / (
        test['Sum_of_Luminosity'] - test['Minimum_of_Luminosity'])

In [12]:
# Replace any NaN produced by the engineered features with 0 (test set only).
# NOTE(review): `fillna` does not touch +/-inf, and the training frame `X` is
# not filled anywhere in the visible cells - confirm that is intended.
test = test.fillna(0)

In [13]:
# Both frames must have the same number of columns after feature engineering.
X.shape, test.shape

((19219, 56), (12814, 56))

In [14]:
# Per-column kurtosis (scipy.stats defaults), labelled with the column names.
pd.Series(ss.kurtosis(X), index=X.columns)

X_Minimum                   -1.327648
X_Maximum                   -1.257175
Y_Minimum                    8.865337
Y_Maximum                    8.839496
Pixels_Areas               181.689607
X_Perimeter                167.364618
Y_Perimeter                  9.289275
Sum_of_Luminosity          121.178595
Minimum_of_Luminosity       -0.003005
Maximum_of_Luminosity        9.781119
Length_of_Conveyer          -1.214170
TypeOfSteel_A300            -1.842475
TypeOfSteel_A400            -1.845782
Steel_Plate_Thickness        5.757569
Edges_Index                 -1.206891
Empty_Index                 -0.174450
Square_Index                -1.154852
Outside_X_Index             20.005039
Edges_X_Index               -0.779575
Edges_Y_Index               -0.561567
Outside_Global_Index        -1.819354
LogOfAreas                  -0.253556
Log_X_Index                 -0.064777
Log_Y_Index                 -0.173621
Orientation_Index           -1.093678
Luminosity_Index             6.899609
SigmoidOfAre

In [15]:
# Columns to winsorize: everything except the binned features and the two
# steel-type indicator columns.
_not_capped = ('col_6', 'col_7', 'col_8', 'col_9', 'col_10', 'TypeOfSteel_A300', 'TypeOfSteel_A400')
cap_cols = [name for name in X.columns if name not in _not_capped]

# Learn IQR-based capping limits from the training data only.
capper = Winsorizer(capping_method='iqr')
capper.fit(X[cap_cols])

In [16]:
# Cap outliers in place, using the limits learned on the training frame for
# both the training and the test data.
X.loc[:, cap_cols] = capper.transform(X[cap_cols])
test.loc[:, cap_cols] = capper.transform(test[cap_cols])

In [17]:
# Columns to min-max scale: everything except the binned features and the two
# steel-type indicator columns.
_not_scaled = ('col_6', 'col_7', 'col_8', 'col_9', 'col_10', 'TypeOfSteel_A300', 'TypeOfSteel_A400')
scalable_cols = [name for name in X.columns if name not in _not_scaled]

# Learn the per-column min/max from the training data only.
scaler = MinMaxScaler()
scaler.fit(X[scalable_cols])

In [18]:
# Scale the training columns in place with the fitted MinMaxScaler.
X.loc[:, scalable_cols] = scaler.transform(X[scalable_cols])

In [19]:
# Scale the test columns with the scaler fitted on the training data.
# Test values are therefore not guaranteed to stay within [0, 1].
test.loc[:, scalable_cols] = scaler.transform(test[scalable_cols])

In [24]:
# Raw feature subset used as input to the LDA / SVD / KMeans /
# FeatureAgglomeration steps below.
# NOTE(review): presumably picked from the mutual-information table - the
# selection rule is not shown in the notebook.
high_informative = ['Pixels_Areas', 'Outside_X_Index', 'LogOfAreas', 'Sum_of_Luminosity', 'X_Perimeter', 'Log_X_Index',
                    'X_Minimum', 'Y_Perimeter', 'X_Maximum', 'Edges_Index', 'SigmoidOfAreas', 'Log_Y_Index',
                    'Edges_Y_Index', 'Minimum_of_Luminosity', 'Steel_Plate_Thickness', 'Edges_X_Index',
                    'Length_of_Conveyer', ]

In [25]:
# Nine LDA topic weights computed from the `high_informative` columns.
# The model is seeded so the generated features are reproducible across runs.
lda_cols = [f'lda_{i}' for i in range(9)]
lda = LatentDirichletAllocation(n_components=9, n_jobs=-1, random_state=48).fit(X[high_informative])

X.loc[:, lda_cols] = lda.transform(X[high_informative])
# LDA rejects negative inputs. The test frame is scaled with the train-fitted
# scaler, so it may contain values below 0; they are mirrored with np.abs.
# NOTE(review): the training rows are transformed without np.abs - confirm that
# mirroring (rather than clipping at 0) is the intended handling for test.
test.loc[:, lda_cols] = lda.transform(np.abs(test[high_informative]))

In [26]:
# Three truncated-SVD components of the `high_informative` columns.
# TruncatedSVD uses a randomized solver by default, so it is seeded to make the
# components reproducible.
svd_cols = [f'svd_{i}' for i in range(3)]
svd = TruncatedSVD(n_components=3, random_state=48).fit(X[high_informative])

X.loc[:, svd_cols] = svd.transform(X[high_informative])
test.loc[:, svd_cols] = svd.transform(test[high_informative])

In [27]:
# Cluster id (0..4) of each row, from a 5-cluster KMeans fitted on the training
# `high_informative` columns. Seeded because the centroid initialisation is
# random, which would otherwise change the cluster labels between runs.
cluster = KMeans(n_clusters=5, random_state=48).fit(X[high_informative])

X.loc[:, 'clus'] = cluster.predict(X[high_informative])
test.loc[:, 'clus'] = cluster.predict(test[high_informative])

In [28]:
# Pool the `high_informative` columns into three agglomerated features.
agg_cols = [f'agg_{k}' for k in range(3)]
agg = FeatureAgglomeration(n_clusters=3)
agg.fit(X[high_informative])

X.loc[:, agg_cols] = agg.transform(X[high_informative])
test.loc[:, agg_cols] = agg.transform(test[high_informative])

In [29]:
# Column counts of train and test must still match after the derived features.
X.shape, test.shape

((19219, 72), (12814, 72))

In [30]:
# Mutual information between every (raw + engineered) feature and each target.
# Rows are features; columns are the positional index of the target in `Y`.
# Sizes are derived from `X`/`Y` rather than hard-coded, and the estimator is
# seeded so that the table is reproducible.
mut_info = pd.DataFrame(
    {
        i: mutual_info_classif(X, Y.iloc[:, i], random_state=48)
        for i in range(Y.shape[1])
    },
    index=X.columns,
)

mut_info

Unnamed: 0,0,1,2,3,4,5,6
X_Minimum,0.022298,0.032570,0.296045,0.026451,0.015946,0.061340,0.042482
X_Maximum,0.024751,0.033910,0.294665,0.021752,0.017448,0.056461,0.033952
Y_Minimum,0.007027,0.003878,0.152214,0.017140,0.004881,0.035755,0.023282
Y_Maximum,0.009078,0.002148,0.152822,0.016270,0.004085,0.032933,0.023726
Pixels_Areas,0.019161,0.010124,0.295656,0.080813,0.006974,0.064750,0.034954
...,...,...,...,...,...,...,...
svd_2,0.013561,0.010077,0.153069,0.022667,0.002046,0.017789,0.015875
clus,0.015461,0.028989,0.298976,0.021149,0.008830,0.057875,0.029375
agg_0,0.018236,0.013031,0.310666,0.075952,0.008524,0.071910,0.037641
agg_1,0.010268,0.029775,0.267286,0.021202,0.007744,0.058284,0.038953


In [31]:
# Recursive feature elimination with cross-validation, run once per target.
# `best_cols` is the union of the features kept for every target whose selected
# subset is smaller than MAX_SELECTED; larger subsets are ignored because they
# barely prune anything.
MAX_SELECTED = 48

# Seeded so the selected feature set does not change between runs.
estimator = RandomForestClassifier(max_depth=3, n_jobs=-1, random_state=48)
best_cols = set()

for i, target in enumerate(ys):
    selector = RFECV(estimator, min_features_to_select=18, cv=3, scoring='roc_auc', n_jobs=-1)
    selector.fit(X, Y[target])

    print(f'feat: {i}')
    print(selector.n_features_)

    if selector.n_features_ < MAX_SELECTED:
        best_cols.update(selector.get_feature_names_out())

feat: 0
62
feat: 1
29
feat: 2
45
feat: 3
68
feat: 4
60
feat: 5
71
feat: 6
72


In [35]:
# Freeze the selected features into a list with a deterministic order.
# Iterating a set of strings depends on hash randomisation, so `list(set)` can
# give a different column order in every kernel and change the fitted models.
best_cols = sorted(best_cols)

## models

In [33]:
# Seeded 3-fold multilabel-stratified splitter, shuffled before splitting.
cv = MultilabelStratifiedKFold(n_splits=3, shuffle=True, random_state=48)

In [73]:
def objective(trial):
    """Optuna objective: cross-validated mean ROC AUC of a per-target CatBoost model.

    Named `objective` rather than `object` so the builtin `object` is not
    shadowed in the notebook kernel.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample `depth` and `learning_rate`.

    Returns
    -------
    float
        Mean over the folds of the notebook-level `cv` splitter of the average
        ROC AUC across all targets in `ys`.
    """
    params = dict(iterations=100,
                  depth=trial.suggest_int('depth', 1, 6),
                  learning_rate=trial.suggest_float('learning_rate', 1e-4, 1e-1))
    # One CatBoost classifier is fitted per target column by MultiOutputClassifier.
    estimator = make_pipeline(MultiOutputClassifier(CatBoostClassifier(**params,
                                                                       task_type="GPU",
                                                                       cat_features=['col_7', 'col_9', 'col_10',
                                                                                     'TypeOfSteel_A300',
                                                                                     'TypeOfSteel_A400'],
                                                                       silent=True)))
    fold_scores = []

    for t_idx, v_idx in cv.split(X[best_cols], Y):
        # The splitter yields positional indices, so select rows with .iloc.
        train_x, valid_x = X.iloc[t_idx][best_cols], X.iloc[v_idx][best_cols]
        train_y, valid_y = Y.iloc[t_idx], Y.iloc[v_idx]

        estimator.fit(train_x, train_y)
        # predict_proba returns one (n_samples, 2) array per target.
        pred_vals = estimator.predict_proba(valid_x)
        avg_auc = np.mean([roc_auc_score(valid_y.iloc[:, j], pred_vals[j][:, 1]) for j in range(len(ys))])

        fold_scores.append(avg_auc)

    return np.mean(fold_scores)


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

[I 2024-03-30 23:54:02,407] A new study created in memory with name: no-name-fd0147a5-651d-4f66-94f3-9ad54a89c13d
[I 2024-03-30 23:55:15,999] Trial 0 finished with value: 0.880554124356779 and parameters: {'depth': 2, 'learning_rate': 0.07312909163565622}. Best is trial 0 with value: 0.880554124356779.
[I 2024-03-30 23:56:20,920] Trial 1 finished with value: 0.8640079559138293 and parameters: {'depth': 1, 'learning_rate': 0.04677669220148422}. Best is trial 0 with value: 0.880554124356779.
[I 2024-03-30 23:57:25,002] Trial 2 finished with value: 0.8725806268708336 and parameters: {'depth': 1, 'learning_rate': 0.0893862054296131}. Best is trial 0 with value: 0.880554124356779.
[I 2024-03-30 23:58:33,847] Trial 3 finished with value: 0.8819773863024064 and parameters: {'depth': 3, 'learning_rate': 0.04897910349889187}. Best is trial 3 with value: 0.8819773863024064.
[I 2024-03-30 23:59:57,622] Trial 4 finished with value: 0.8447285817295603 and parameters: {'depth': 1, 'learning_rate': 0

In [74]:
# Hyper-parameters of the best trial found by the study.
study.best_params

{'depth': 6, 'learning_rate': 0.03701738998059757}

In [55]:
# 7-fold evaluation of a per-target XGBoost model on the selected features.
# The splitter is seeded so that every model evaluated with `cv` sees the same
# folds and the printed scores are comparable.
cv = MultilabelStratifiedKFold(n_splits=7, shuffle=True, random_state=48)

for fold, (t_idx, v_idx) in enumerate(cv.split(X[best_cols], Y)):
    # The splitter yields positional indices, so select rows with .iloc.
    train_x, valid_x = X.iloc[t_idx][best_cols], X.iloc[v_idx][best_cols]
    train_y, valid_y = Y.iloc[t_idx], Y.iloc[v_idx]

    model = XGBClassifier(n_estimators=1000,
                          max_depth=6,
                          subsample=0.5342564004636723,
                          learning_rate=0.0064835939090773,
                          colsample_bytree=0.8195724454605662,
                          reg_lambda=0.09829546268908843,
                          device='cuda:0',
                          verbosity=0)
    estimator = make_pipeline(MultiOutputClassifier(model))
    estimator.fit(train_x, train_y)

    # One (n_samples, 2) probability array per target.
    pred_vals = estimator.predict_proba(valid_x)

    # Average ROC AUC over all targets for this fold.
    print(f'fold: {fold}')
    print(np.mean([roc_auc_score(valid_y.iloc[:, j], pred_vals[j][:, 1]) for j in range(len(ys))]))

fold: 0
0.8875536474481811
fold: 1
0.8832327893500873
fold: 2
0.8814863150952492
fold: 3
0.8903093520283354
fold: 4
0.8917283800580417
fold: 5
0.8854142812883155
fold: 6
0.891819768782588


In [66]:
# Cross-validated evaluation of a per-target LightGBM model on the selected
# features, using the notebook-level `cv` splitter.
for fold, (t_idx, v_idx) in enumerate(cv.split(X[best_cols], Y)):
    # The splitter yields positional indices, so select rows with .iloc.
    train_x, valid_x = X.iloc[t_idx][best_cols], X.iloc[v_idx][best_cols]
    train_y, valid_y = Y.iloc[t_idx], Y.iloc[v_idx]

    # NOTE(review): LightGBM documents `colsample_bytree` as an alias of
    # `feature_fraction`; both are passed with different values, so only one
    # of them takes effect - confirm which value is intended.
    model = LGBMClassifier(num_iterations=1000,
                           max_depth=3,
                           learning_rate=0.0080988815333333,
                           feature_fraction=0.8578138132619784,
                           colsample_bytree=0.8768880865721203,
                           reg_lambda=0.7173767456929144,
                           device="gpu",
                           verbose=-1)
    estimator = make_pipeline(MultiOutputClassifier(model))
    estimator.fit(train_x, train_y)

    # One (n_samples, 2) probability array per target.
    pred_vals = estimator.predict_proba(valid_x)

    # Average ROC AUC over all targets for this fold.
    print(f'fold: {fold}')
    print(np.mean([roc_auc_score(valid_y.iloc[:, j], pred_vals[j][:, 1]) for j in range(len(ys))]))

fold: 0
0.8920737970472583
fold: 1
0.8870478399153396
fold: 2
0.890077136260705
fold: 3
0.8841728971014969
fold: 4
0.8837622048330978
fold: 5
0.8871467629125925
fold: 6
0.8751664505575947


In [79]:
# Cross-validated evaluation of a per-target CatBoost model on the selected
# features, using the notebook-level `cv` splitter.
for fold, (t_idx, v_idx) in enumerate(cv.split(X[best_cols], Y)):
    # The splitter yields positional indices, so select rows with .iloc.
    train_x, valid_x = X.iloc[t_idx][best_cols], X.iloc[v_idx][best_cols]
    train_y, valid_y = Y.iloc[t_idx], Y.iloc[v_idx]

    # The listed categorical columns must all be present in `best_cols`.
    model = CatBoostClassifier(iterations=110,
                               depth=6,
                               learning_rate=0.05201739,
                               task_type="GPU",
                               cat_features=['col_7', 'col_9', 'col_10',
                                             'TypeOfSteel_A300',
                                             'TypeOfSteel_A400'],
                               silent=True)
    estimator = make_pipeline(MultiOutputClassifier(model))
    estimator.fit(train_x, train_y)

    # One (n_samples, 2) probability array per target.
    pred_vals = estimator.predict_proba(valid_x)

    # Average ROC AUC over all targets for this fold.
    print(f'fold: {fold}')
    print(np.mean([roc_auc_score(valid_y.iloc[:, j], pred_vals[j][:, 1]) for j in range(len(ys))]))

fold: 0
0.8890593369897309
fold: 1
0.8789952136800532
fold: 2
0.8969150694199223
fold: 3
0.8859010073870588
fold: 4
0.8765044889605511
fold: 5
0.8873215540463276
fold: 6
0.8826735494017937


In [80]:
# Base models for the final ensemble, keyed by a short name. The
# hyper-parameters are the same ones used in the per-model CV cells.
estimators = {
    'lgb': LGBMClassifier(
        num_iterations=1000,
        max_depth=3,
        learning_rate=0.0080988815333333,
        feature_fraction=0.8578138132619784,
        colsample_bytree=0.8768880865721203,
        reg_lambda=0.7173767456929144,
        device="gpu",
        verbose=-1,
    ),
    'xgb': XGBClassifier(
        n_estimators=1000,
        max_depth=6,
        subsample=0.5342564004636723,
        learning_rate=0.0064835939090773,
        colsample_bytree=0.8195724454605662,
        reg_lambda=0.09829546268908843,
        device='cuda:0',
        verbosity=0,
    ),
    'cgb': CatBoostClassifier(
        iterations=110,
        depth=6,
        learning_rate=0.05201739,
        task_type="GPU",
        cat_features=['col_7', 'col_9', 'col_10', 'TypeOfSteel_A300', 'TypeOfSteel_A400'],
        silent=True,
    ),
}

In [None]:
# Fit every base model on each CV fold, report the fold's mean ROC AUC and
# predict the test set with the fold model.
preds = []

for k, est in estimators.items():
    for i, (t_idx, v_idx) in enumerate(cv.split(X[best_cols], Y)):
        # The splitter yields positional indices, so select rows with .iloc.
        train_x, valid_x = X.iloc[t_idx][best_cols], X.iloc[v_idx][best_cols]
        train_y, valid_y = Y.iloc[t_idx], Y.iloc[v_idx]

        estimator = make_pipeline(MultiOutputClassifier(est))
        estimator.fit(train_x, train_y)

        # One (n_samples, 2) probability array per target.
        pred_vals = estimator.predict_proba(valid_x)

        print(f'fold: {i}')
        print(np.mean([roc_auc_score(valid_y.iloc[:, j], pred_vals[j][:, 1]) for j in range(len(ys))]))

        # The models are fitted on `best_cols` only, so the test frame must be
        # restricted to the same columns (in the same order) before predicting.
        pred_test = estimator.predict_proba(test[best_cols])