In [1]:
# !pip install tabpfn --no-index --find-links=file:///kaggle/input/tab-pfn-dataset
# !mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
# !cp /kaggle/input/tab-pfn-dataset/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

# !pip install adjdatatools --no-index --find-links=file:///kaggle/input/adjdatatools
# !pip -q install featurewiz --no-index --find-links=file:///kaggle/input/featurewiz

In [2]:
import sys
sys.path.append('/kaggle/input/iter-strat/iter_strat')

import math
import copy 

import numpy as np
import pandas as pd

import lightgbm as lgb
import catboost as cat
from catboost import Pool
import xgboost as xgb
from tabpfn import TabPFNClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
import category_encoders as encoders

import itertools
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits, BaseShuffleSplit, _validate_shuffle_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.impute import SimpleImputer, KNNImputer
from adjdatatools.preprocessing import AdjustedScaler
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures

from sklearn.utils import check_random_state
from sklearn.utils.validation import _num_samples, check_array
from sklearn.utils.multiclass import type_of_target

from scipy import stats

import eli5
from IPython.display import display
from eli5.permutation_importance import get_score_importances
from eli5.sklearn import PermutationImportance

import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import seaborn as sns

import optuna

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)


from colorama import Style, Fore

palette = ['#302c36', '#037d97', '#E4591E', '#C09741',
           '#EC5B6D', '#90A6B1', '#6ca957', '#D8E3E2']

blk = Style.BRIGHT + Fore.BLACK
red = Style.BRIGHT + Fore.RED
blu = Style.BRIGHT + Fore.BLUE
res = Style.RESET_ALL


class CFG:
    # main
    kaggle = False
    test = False
    
    # features
    fe_drop = True
    
    fix_errs = False
    err_objs = [292, 102, 509, 367, 313, 380, 556]

    del_outliers = False
    del_outliers_adj = True
    
    feature_sel = False
    n_feature_sel_repeats = 5
    n_feature_sel_folds = 5
    
    undersample = False
    oversample = False
    
    nan_impute = True
    encode_cat = False
    standard_scale = False
    log = False
    
    # optimization
    n_estimators = 3000
    early_stopping_rounds = 100
    
    lgbm_optimize = False
    lgbm_optimize2 = False
    xgb_optimize = False
    cb_optimize = False
    
    n_trials = 500
    n_optimize_folds = 5
    n_optimize_repeats = 3
    
    # train
    k_fold = False
    strat_k_fold = True
    add_err_objs = False
    select_best_fold = False
    
    lgbm_train = True
    xgb_train = False
    cb_train = False
    tabpfn_train = False
    logreg_train = False

    # inference
    n_stacking_folds = 5
    n_stacking_folds_min = 1
    n_stacking_folds_max = 5

    n_stacking_models_lgbm = 5
    n_stacking_models_xgb = 10
    n_stacking_models_cb = 5
    n_stacking_models_tabpfn = 20

    adjust_class_threshold = False
    

# Load Data

In [3]:
if CFG.kaggle:
    COMP_PATH = "/kaggle/input/icr-identify-age-related-conditions"
else:
    COMP_PATH = "icr-identify-age-related-conditions"

train_df = pd.read_csv(f'{COMP_PATH}/train.csv')
test_df = pd.read_csv(f'{COMP_PATH}/test.csv')
greeks = pd.read_csv(f"{COMP_PATH}/greeks.csv")
sample_submission = pd.read_csv(f"{COMP_PATH}/sample_submission.csv")

train_df['EJ'] = train_df['EJ'].replace({'A': 0, 'B': 1})
test_df['EJ'] = test_df['EJ'].replace({'A': 0, 'B': 1})
test_df['EJ'] = test_df['EJ'].fillna(0)

train_df.columns = train_df.columns.str.replace(' ', '')
test_df.columns = test_df.columns.str.replace(' ', '')

# NaN impute

In [4]:
if CFG.nan_impute:
    #EL datasets
    train_el_df = train_df[~train_df.EL.isna()]
    X_train_el_df=train_el_df.drop(['BQ', 'EL', 'Class', 'Id'], axis=1)
    y_train_el_df=train_el_df.EL

    val_el_df = train_df[train_df.EL.isna()]
    X_val_el_df = val_el_df.drop(['BQ', 'EL', 'Class', 'Id'], axis=1)

#     test_df['EL'] = [0, 0, np.nan, 0, np.nan]
#     test_df['CU'] = [np.nan, 0, np.nan, np.nan, 0]
    
    test_el_df = test_df[test_df.EL.isna()]
    X_test_el_df = test_el_df.drop(['BQ', 'EL', 'Id'], axis=1)

    #making grid for hyperparamters optimization for feature selection
    el_grid_fs = GridSearchCV(xgb.XGBRegressor(), param_grid={'n_estimators':[50,80,100], 'eta': [0.001, 0.005, 0.01, 0.03, 0.1, 1]},
                            n_jobs=-1, cv=10, verbose=1,scoring='neg_mean_squared_error')
    el_grid_fs.fit(X_train_el_df, y_train_el_df)

    #making model for features selection
    el_model_fs = xgb.XGBRegressor(n_estimators=el_grid_fs.best_params_['n_estimators'], eta=el_grid_fs.best_params_['eta'])
    el_model_fs.fit(X_train_el_df, y_train_el_df)

    #chosing 10 most important features
    feature_importances = el_model_fs.feature_importances_
    sorted_indices = np.argsort(feature_importances)[::-1]
    features_el = sorted_indices[:10]

    X_train_el_df = X_train_el_df.iloc[:,features_el]
    X_val_el_df = X_val_el_df.iloc[:,features_el]
    X_test_el_df = X_test_el_df.iloc[:,features_el]

    #making grid for hyperparamters optimization for prediction
    el_grid = GridSearchCV(xgb.XGBRegressor(), param_grid={'n_estimators':[50,80,100], 'eta': [0.001, 0.005, 0.01, 0.03, 0.1, 1]},
                            n_jobs=-1, cv=10, verbose=1, scoring='neg_mean_squared_error')
    el_grid.fit(X_train_el_df, y_train_el_df)

    #making model for prediction
    el_model = xgb.XGBRegressor(n_estimators=el_grid.best_params_['n_estimators'], eta=el_grid.best_params_['eta'])
    el_model.fit(X_train_el_df, y_train_el_df)

    el_pred = el_model.predict(X_val_el_df)
    train_df.loc[train_df.EL.isna(), 'EL'] = el_pred

    if X_test_el_df.shape[0] > 0:
        try:
            el_pred_test = el_model.predict(X_test_el_df)
            test_df.loc[test_df.EL.isna(), 'EL'] = el_pred_test
        except:
            el_pred_test = el_model.predict(X_test_el_df.fillna(X_test_el_df.mean()))
            test_df.loc[test_df.EL.isna(), 'EL'] = el_pred_test

Fitting 10 folds for each of 18 candidates, totalling 180 fits
Fitting 10 folds for each of 18 candidates, totalling 180 fits


# Drop not necessary features

In [5]:
if CFG.fe_drop:
    features = [fe for fe in train_df.columns if fe not in ['CF', 'CB', 'DV', 'BR', 'DF', 'GB', 'AH',
                                                            'CW', 'CL', 'BP', 'BD', 'FC', 'GE', 'GF',
                                                            'AR', 'GI', 'Id', 'Class', 'AX', 'DA']]
else:
    features = [fe for fe in train_df.columns if fe not in ['Id', 'Class', 'EJ']]

num_cols = [nc for nc in train_df.select_dtypes(include=['float64']).columns if nc != 'Class']
    
# clip values to avoid different values in the test set from train !!!
# test_df[features] = test_df[features].clip(train_df[features].min(axis=0).values, train_df[features].max(axis=0).values, axis=1)
    
len(train_df.columns), len(features), len(num_cols)

(58, 38, 55)

# Delete outliers

In [6]:
features_with_outliers = [fe for fe in train_df.columns if fe not in ['BN', 'BQ', 'CW', 'EL', 'GH', 
                                                                      'GI', 'GL', 'Id', 'Class', 'EJ']]

if CFG.del_outliers:
    for f in features_with_outliers:
        train_df[f] = train_df[f].clip(upper=train_df[f].quantile(0.99))
        test_df[f] = test_df[f].clip(upper=test_df[f].quantile(0.99))

if CFG.del_outliers_adj:
    adj_scaler = AdjustedScaler(with_centering=True)
    # adj_features = [f for f in features if f != 'EJ']
    adj_features = ['EL']
    
    adj_scaler.fit(train_df[adj_features])
    train_df['EL_adj'] = adj_scaler.transform(train_df[adj_features])
    test_df['EL_adj'] = adj_scaler.transform(test_df[adj_features])

# Load LGBM parameters

In [7]:
import glob

def load_lgbm_parameters(filename):
    param_list = glob.glob(filename)

    models = list()
    best_lgbm_params = list()

    lgbm_params = pd.DataFrame()

    for f in param_list:
        tmp = pd.read_csv(f, index_col='Unnamed: 0')
        if lgbm_params.shape[0] == 0:
            lgbm_params = tmp
        else:
            lgbm_params = pd.concat([lgbm_params, tmp])
            
    lgbm_params = lgbm_params.sort_values('value').head(CFG.n_stacking_models_lgbm)
    param_cols = [c for c in lgbm_params.columns if c.startswith('params_')]
    lgbm_params = lgbm_params[param_cols]

    for idx, row in lgbm_params.iterrows():
        row_dict = {k[7:]: v for k, v in row.items()}
        row_dict['objective'] = 'binary'
        row_dict['metric'] = 'none'
    #     row_dict['subsample_for_bin'] = 300000
        row_dict['force_col_wise'] = False
        row_dict['verbose'] = -1
        # row_dict['max_bin'] = 255
        
        if CFG.n_stacking_folds > 0:
            row_dict['n_estimators'] = CFG.n_estimators
            row_dict['early_stopping_round'] = CFG.early_stopping_rounds
        else:
            row_dict['n_estimators'] = int(row_dict['n_estimators'])
        row_dict['num_leaves'] = int(row_dict['num_leaves'])
        row_dict['max_depth'] = int(row_dict['max_depth'])
        row_dict['min_child_samples'] = int(row_dict['min_child_samples'])
        row_dict['subsample_freq'] = int(row_dict['subsample_freq'])
        row_dict['learning_rate'] = float(row_dict['learning_rate'])
        row_dict['max_bin'] = int(row_dict['max_bin'])
        
        if not CFG.oversample and not CFG.undersample:
            row_dict['is_unbalance'] = True
            row_dict['class_weight'] = 'balanced'
        else:
            row_dict['scale_pos_weight'] = class_imbalance
        
        if row_dict['boosting_type'] == 'goss':
            row_dict['subsample'] = None
            
        best_lgbm_params.append(row_dict)
    return best_lgbm_params

best_lgbm_params = load_lgbm_parameters('optuna_lgbm.csv')

if CFG.test:
    best_lgbm_params = [{
            'boosting_type': 'goss',
            'n_estimators': 50000,
            'early_stopping_round': 300,
            'max_depth': 8,
            'learning_rate': 0.06733232950390658,
            'subsample': 0.6970532011679706,
            'colsample_bytree': 0.6055755840633003,
            'is_unbalance': True, 
            'class_weight': 'balanced',
            'metric':'none',
            'verbose': -1,
            'random_state': 42,
        }
    ]

                         

# Train models

In [8]:
def balanced_log_loss(y_true, y_pred):
    # Nc is the number of observations
    N_1 = np.sum(y_true == 1, axis=0)
    N_0 = np.sum(y_true == 0, axis=0)

    N_inv_0 = 1/N_0 if N_0 > 0 else 0
    N_inv_1 = 1/N_1 if N_1 > 0 else 0

    # In order to avoid the extremes of the log function, each predicted probability 𝑝 is replaced with max(min(𝑝,1−10−15),10−15)
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)

    # balanced logarithmic loss
    loss_numerator = - N_inv_0 * np.sum((1 - y_true) * np.log(1 - y_pred)) - N_inv_1 * np.sum(y_true * np.log(y_pred))

    return loss_numerator / 2

def bll_metric(y_true, y_pred):
    return 'balanced_log_loss', balanced_log_loss(y_true, y_pred), False

def pp_prob(p):
    c0 = p[:,0].sum()
    c1 = p[:,1:].sum()
    new_p = p * np.array([[1/(c0 if i==0 else c1) for i in range(p.shape[1])]])
    new_p = new_p / np.sum(new_p,axis=1,keepdims=1)
    return np.sum(new_p[:,1:],1,keepdims=False)

def model_train(how, best_params, X, y, test):
    oof_level2 = np.zeros([y.shape[0], len(best_params) + 1])
    oof_level2[:, len(best_params)] = y
    oof_level2_test = np.zeros([test_df.shape[0], len(best_params)])
    oof_val = np.zeros([CFG.n_stacking_folds, len(best_params)])
    
    for i, params in tqdm(enumerate(best_params), total=len(best_params)):
        model_dict = dict()
    
        if CFG.n_stacking_folds > 0:
            if CFG.k_fold:
                kf = KFold(n_splits=CFG.n_stacking_folds, shuffle=True, random_state=3082023)
                y_fold = y
            elif CFG.strat_k_fold:
                kf = StratifiedKFold(n_splits=CFG.n_stacking_folds, shuffle=True, random_state=3082023)
                y_fold = y
            else:
                kf = MultilabelStratifiedKFold(n_splits=CFG.n_stacking_folds, shuffle=True, random_state=3082023)
                y_fold = greeks.iloc[:,1:4]
            
            print(f"Training with {blu}{len(features)}{res} features")

            best_val = np.inf
            
            for fold, (fit_idx, val_idx) in enumerate(kf.split(X=X, y=y_fold)):
                
                # Split the dataset according to the fold indexes.
                X_train = X.iloc[fit_idx]
                X_val = X.iloc[val_idx]
                y_train = y.iloc[fit_idx]
                y_val = y.iloc[val_idx]

                # Make random under- or oversampling to balance classes
                if CFG.undersample or CFG.oversample:
                    if CFG.undersample:
                        positive_count_train = y_train.value_counts()[1]
                        sampler = RandomUnderSampler(sampling_strategy={0: positive_count_train * class_imbalance, 
                                                                        1: positive_count_train}, 
                                                    random_state=3082023, 
                                                    replacement=True)
                    elif CFG.oversample:
                        negative_count_train = y_train.value_counts()[0]
                        sampler = RandomOverSampler(sampling_strategy={0: negative_count_train, 
                                                                    1: negative_count_train // class_imbalance}, 
                                                    random_state=3082023)

                    X_train, y_train = sampler.fit_resample(X_train, y_train)
                
                if how == 'lgbm':
                    model = lgb.LGBMClassifier(**params)
                    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=bll_metric, verbose=0)
                    best_iter = model.best_iteration_
                elif how == 'xgboost':
                    model = xgb.XGBClassifier(**params)
                    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
                    best_iter = model.get_booster().best_iteration
                elif how == 'catboost':
                    train_pool = Pool(X_train, y_train, cat_features=['EJ'])
                    val_pool = Pool(X_val, y_val, cat_features=['EJ'])   
                    model = cat.CatBoostClassifier(**params)
                    model.fit(train_pool, eval_set=val_pool, verbose=0)
                    best_iter = model.best_iteration_
                elif how == 'tabpfn':
                    model = TabPFNClassifier(N_ensemble_configurations=64, device='cuda:0')
                    model.fit(X_train, y_train, overwrite_warning=True)
                    best_iter = 0
                elif how == 'logreg':
                    model = LogisticRegression(random_state=2306020231+i, C=0.1, n_jobs=-1, max_iter=2000, class_weight='balanced')
                    model.fit(X_train, y_train)
                    best_iter = 0
                else:
                    return None, None, None
                    
                try:
                    if how == 'tabpfn':
                        val_preds = pp_prob(model.predict_proba(X_val))
                        val_score = balanced_log_loss(y_val, val_preds)
                        pp = pp_prob(model.predict_proba(test))
                    else:
                        val_preds = model.predict_proba(X_val)[:,1]
                        val_score = balanced_log_loss(y_val, val_preds)
                        pp = model.predict_proba(test)[:,1]
                except:
                    val_score = 100
                    pp = np.zeros(test.shape[0])

                model_dict[val_score] = pp
                
                oof_level2[val_idx, i] = val_preds
                oof_val[fold, i] = val_score
                
                print(f'Fold: {blu}{fold:>3}{res}| bll_metric: {blu}{val_score:.5f}{res}'
                      f' | Best iteration: {blu}{best_iter:>4}{res}')  
            
            model_dict = sorted(model_dict.items(), key=lambda x: x[0])
            
            n_stacking_folds = CFG.n_stacking_folds
            
            for j, _pp in enumerate(model_dict):
                # if j >= CFG.n_stacking_folds_min or _pp[0] >= 0.1:
                if _pp[0] >= 0.1:
                    oof_level2_test[:, i] += _pp[1]
                else:
                    n_stacking_folds -= 1
            oof_level2_test[:, i] = oof_level2_test[:, i] / max(1, n_stacking_folds)
        else:
            if how == 'lgbm':
                model = lgb.LGBMClassifier(**params)
                model.fit(X, y, verbose=0)
            elif how == 'xgboost':
                model = xgb.XGBClassifier(**params)
                model.fit(X, y, verbose=0)
            elif how == 'catboost':
                train_pool = Pool(X, y, cat_features=['EJ'])
                model = cat.CatBoostClassifier(**params)
                model.fit(train_pool, verbose=0)
            elif how == 'tabpfn':
                model = TabPFNClassifier(N_ensemble_configurations=64, device='cuda:0')
                model.fit(X, y, overwrite_warning=True)
            elif how == 'logreg':
                model = LogisticRegression(random_state=2306020231+i, C=0.1, n_jobs=-1, max_iter=2000, class_weight='balanced')
                model.fit(X, y)
            else:
                return None, None, None
    
    return oof_level2, oof_level2_test, oof_val

oof_train_list = list()
oof_test_list = list()
oof_val_list = list()

if CFG.lgbm_train:
    oof_level2_lgbm, oof_level2_test_lgbm, oof_val_lgbm = model_train('lgbm', best_lgbm_params, train_df[features], train_df['Class'], test_df[features])
    oof_train_list.append(oof_level2_lgbm[:,:-1])
    oof_test_list.append(oof_level2_test_lgbm)
    oof_val_list.append(oof_val_lgbm)
    y = oof_level2_lgbm[:,-1]

if CFG.xgb_train:
    oof_level2_xgb, oof_level2_test_xgb, oof_val_xgb = model_train('xgboost', best_xgb_params, train_df[features], train_df['Class'], test_df[features])
    oof_train_list.append(oof_level2_xgb[:,:-1])
    oof_test_list.append(oof_level2_test_xgb)
    oof_val_list.append(oof_val_xgb)
    y = oof_level2_xgb[:,-1]

if CFG.cb_train:
    oof_level2_cb, oof_level2_test_cb, oof_val_cb = model_train('catboost', best_cb_params, train_df[features], train_df['Class'], test_df[features])
    oof_train_list.append(oof_level2_cb[:,:-1])
    oof_test_list.append(oof_level2_test_cb)
    oof_val_list.append(oof_val_cb)
    y = oof_level2_cb[:,-1]

if CFG.tabpfn_train:
    oof_level2_tabpfn, oof_level2_test_tabpfn, oof_val_tabpfn = model_train('tabpfn', [i for i in range(CFG.n_stacking_models_tabpfn)], 
                                                                             train_df[features], train_df['Class'], test_df[features])
    oof_train_list.append(oof_level2_tabpfn[:,:-1])
    oof_test_list.append(oof_level2_test_tabpfn)
    oof_val_list.append(oof_val_tabpfn)
    y = oof_level2_tabpfn[:,-1]

# if CFG.logreg_train:
#     oof_level2_logreg, oof_level2_test_logreg, oof_val_logreg = model_train('logreg', [i for i in range (10)])
#     oof_train_list.append(oof_level2_logreg[:,:-1])
#     oof_test_list.append(oof_level2_test_logreg)
#     oof_val_list.append(oof_val_logreg)
#     y = oof_level2_logreg[:,-1]

  0%|          | 0/5 [00:00<?, ?it/s]

Training with [1m[34m38[0m features
Fold: [1m[34m  0[0m| bll_metric: [1m[34m0.11412[0m | Best iteration: [1m[34m  80[0m
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.15002[0m | Best iteration: [1m[34m 116[0m
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.24423[0m | Best iteration: [1m[34m  75[0m
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.27084[0m | Best iteration: [1m[34m  74[0m
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.17767[0m | Best iteration: [1m[34m 169[0m
Training with [1m[34m38[0m features
Fold: [1m[34m  0[0m| bll_metric: [1m[34m0.16482[0m | Best iteration: [1m[34m 112[0m
Fold: [1m[34m  1[0m| bll_metric: [1m[34m0.16766[0m | Best iteration: [1m[34m  68[0m
Fold: [1m[34m  2[0m| bll_metric: [1m[34m0.29180[0m | Best iteration: [1m[34m  69[0m
Fold: [1m[34m  3[0m| bll_metric: [1m[34m0.25362[0m | Best iteration: [1m[34m  55[0m
Fold: [1m[34m  4[0m| bll_metric: [1m[34m0.15666[0m | Best iteration: [1m[34m 

# Blending and Stacking with Logistic Regression

In [9]:
oof_level2_LGBM = np.concatenate(oof_train_list, axis=1)
oof_level2_LGBM_test = np.concatenate(oof_test_list, axis=1)
# oof_level2_val = np.concatenate(oof_val_list, axis=1).reshape(-1, )

X = oof_level2_LGBM

# mean bll
print(balanced_log_loss(y, np.mean(X, axis=1)))

# lr bll
lr = LogisticRegression(class_weight='balanced')
lr.fit(X, y)

pred = lr.predict_proba(X)[:,1]
weights = lr.coef_[0]
print(balanced_log_loss(y, (weights * X).sum(axis=1) / sum(weights)))

0.1839780951638027
0.1793033734474975


# Use Optuna to calculate model weights

In [10]:
from functools import partial

class OptunaWeights:
    def __init__(self, random_state, n_trials=2000):
        self.study = None
        self.weights = None
        self.random_state = random_state
        self.n_trials = n_trials

    def _objective(self, trial, y_true, y_preds):
        # Define the weights for the predictions from each model
        weights = [trial.suggest_float(f"weight{n}", 1e-13, 1) for n in range(len(y_preds))]

        # Calculate the weighted prediction
        weighted_pred = np.average(np.array(y_preds).T, axis=1, weights=weights)

        # Calculate the score for the weighted prediction
        # score = log_loss(y_true, weighted_pred)
        score = balanced_log_loss(y_true, weighted_pred)
        return score

    def fit(self, y_true, y_preds):
        optuna.logging.set_verbosity(optuna.logging.ERROR)
        sampler = optuna.samplers.CmaEsSampler(seed=self.random_state)
        pruner = optuna.pruners.HyperbandPruner()
        self.study = optuna.create_study(sampler=sampler, pruner=pruner, study_name="OptunaWeights", direction='minimize')
        objective_partial = partial(self._objective, y_true=y_true, y_preds=y_preds)
        self.study.optimize(objective_partial, n_trials=self.n_trials)
        self.weights = [self.study.best_params[f"weight{n}"] for n in range(len(y_preds))]

    def predict(self, y_preds):
        assert self.weights is not None, 'OptunaWeights error, must be fitted before predict'
        weighted_pred = np.average(np.array(y_preds).T, axis=1, weights=self.weights)
        return weighted_pred

    def fit_predict(self, y_true, y_preds):
        self.fit(y_true, y_preds)
        return self.predict(y_preds)
    
    def weights(self):
        return self.weights

# Use Optuna to find the best ensemble weights
optweights = OptunaWeights(random_state=19072023)
y_val_pred = optweights.fit_predict(y, [oof_level2_LGBM[:,i] for i in range(oof_level2_LGBM.shape[1])])
optuna_weights_LGBM = np.array(optweights.weights)
display(balanced_log_loss(y, y_val_pred))

oof_level2_LGBM = (optuna_weights_LGBM * oof_level2_LGBM).sum(axis=1) / sum(optuna_weights_LGBM)
oof_level2_LGBM_test = (optuna_weights_LGBM * oof_level2_LGBM_test).sum(axis=1) / sum(optuna_weights_LGBM)

# oof_level2_LGBM = oof_level2_LGBM.mean(axis=1)
# oof_level2_LGBM_test = oof_level2_LGBM_test.mean(axis=1)

0.1728000019463083

# =================================================================
# Logistic Regression

# Load data

In [11]:
train = pd.read_csv(f'{COMP_PATH}/train.csv')
test = pd.read_csv(f'{COMP_PATH}/test.csv')
greeks = pd.read_csv(f'{COMP_PATH}/greeks.csv')

train.columns = train.columns.str.replace(' ', '')
test.columns = test.columns.str.replace(' ', '')

# Greeks will be used in the stratified k-fold strategy

In [12]:
greeks['k'] = greeks['Alpha'] + greeks['Beta'] + greeks['Gamma'] + greeks['Delta']
train = pd.merge( greeks[['k', 'Id']],train,on='Id')

names = ['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD', 'BN',
         'BP', 'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD', 'CF', 'CH', 'CL', 'CR', 'CS',
         'CU', 'CW', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',
         'EB', 'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD', 'FE', 'FI',
         'FL', 'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL']
target_name = 'Class'

# Data Cleaning

In [13]:
train['EJ'] = pd.Series(np.where(train.EJ.values == 'A', 1, 0), train.index)
test['EJ'] = pd.Series(np.where(test.EJ.values == 'A', 1, 0), test.index)

# fill nan data with mean values 
train[names] = train[names].fillna(train[names].mean())
test[names] = test[names].fillna(train[names].mean())
# clip values to avoid different values in the test set from train
test = test[names].clip(train[names].min(axis=0).values,train[names].max(axis=0).values, axis=1)

# data scaled to allow the features interaction (by multiplication)
scaler = StandardScaler()

train2 = copy.copy(train)
teste2 = copy.copy(test)

vals = scaler.fit_transform(train[names])
vals_test = scaler.transform(test[names])

train2[names] = vals
teste2[names] = vals_test

if CFG.nan_impute:
    train2['EL'] = train_df['EL_adj']
    teste2['EL'] = test_df['EL_adj']

# Defining 2 order interactions

In [14]:
# def multiply and make a array of all interactions
def mab(df,nome1,nome2):
    a  = df[nome1]*df[nome2]
    return(a/max(a))

h = []
ht = []

n = 1
for n1 in names:
    for n2 in names[n:]:
        h.append(mab(train2,n1,n2).rename(n1+'_mul_'+n2))
        ht.append(mab(teste2,n1,n2).rename(n1+'_mul_'+n2))
        
    n+=1
    
newF = pd.DataFrame(h)
newF_test = pd.DataFrame(ht)

# Get IV and WOE features

In [15]:
#https://lucastiagooliveira.github.io/datascience/iv/woe/python/2020/12/15/iv_woe.html
def iv_woe(data, target, bins=10, show_woe=False):
    
    #Empty Dataframe
    newDF,woeDF = pd.DataFrame(), pd.DataFrame()
    
    #Extract Column Names
    cols = data.columns
    
    #Run WOE and IV on all the independent variables
    for ivars in cols[~cols.isin([target])]:
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars]))>10):
            binned_x = pd.qcut(data[ivars], bins,  duplicates='drop')
            d0 = pd.DataFrame({'x': binned_x, 'y': data[target]})
        else:
            d0 = pd.DataFrame({'x': data[ivars], 'y': data[target]})

        
        # Calculate the number of events in each group (bin)
        d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']
        
        # Calculate % of events in each group.
        d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()

        # Calculate the non events in each group.
        d['Non-Events'] = d['N'] - d['Events']
        # Calculate % of non events in each group.
        d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()

        # Calculate WOE by taking natural log of division of % of non-events and % of events
        d['WoE'] = np.log(d['% of Events']/d['% of Non-Events'])
        d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
        d.insert(loc=0, column='Variable', value=ivars)
        #print("Information value of " + ivars + " is " + str(round(d['IV'].sum(),6)))
        temp =pd.DataFrame({"Variable" : [ivars], "IV" : [d['IV'].sum()]}, columns = ["Variable", "IV"])
        newDF=pd.concat([newDF,temp], axis=0)
        woeDF=pd.concat([woeDF,d], axis=0)

        #Show WOE Table
        if show_woe == True:
            print(d)
    return newDF, woeDF

a,b = iv_woe(train2, target_name, bins=10, show_woe=False)

In [16]:
# most important features based on IV
a.sort_values(by='IV',ascending=False).Variable.values 

array(['k', 'Id', 'DU', 'GL', 'FL', 'CR', 'DA', 'AF', 'AB', 'BQ', 'DI',
       'EB', 'FD', 'EE', 'EH', 'FR', 'CD', 'DE', 'CC', 'BN', 'FI', 'FE',
       'DH', 'EU', 'GF', 'DF', 'BC', 'DL', 'AM', 'BP', 'AH', 'AR', 'GH',
       'DN', 'CS', 'GB', 'DY', 'CF', 'CB', 'GI', 'BD', 'FC', 'BR', 'CU',
       'EL', 'FS', 'AZ', 'EJ', 'CW', 'AX', 'GE', 'AY', 'EG', 'EP', 'CH',
       'CL', 'BZ', 'DV'], dtype=object)

# Prepare dataset with the new features

In [17]:
# Reordering the dataframe to keep IV with higger values in front
trainE = train[a.sort_values(by='IV',ascending=False).Variable.values]
trainE[target_name] = train[target_name]
testeE = test[a.sort_values(by='IV',ascending=False).Variable.values[2:]]

# join the original vars and the interactions between them
ff = pd.concat([trainE,newF.T],axis=1)
ff_teste = pd.concat([testeE,newF_test.T],axis=1)

a,b = iv_woe(ff, target_name, bins=10, show_woe=False)

# deleting all IVs below 0.05
a = a.loc[a['IV']> 0.05]

allNames = a.sort_values(by='IV',ascending=False).Variable.values
crossNames = [x for x in allNames if '_mul_' in x]

nomes2 = list(trainE)+crossNames
nomes2.remove('Class')

# Set threshold for correlation features

In [18]:
threshold = 0.3

cc = ff[nomes2[2:]].corr()

mat_x = abs(cc)>threshold
mat_x = mat_x.to_numpy()

# Select variables with low correlation

In [19]:
# there are +- 70 features with low correlation
var1 = []
nomes = list(cc)
var1.append(nomes[0])
max_vars = 100

count = 1
for n in range(1,len(nomes)):
    
    if (mat_x[n,:n+1].sum() ) == 1:
        
        var1.append(nomes[n])        
        count+=1
        
        if(count == max_vars):
            break

# Drop features that get low score

In [20]:
# 'CW', 'AZ', 'FS', 'BR', 'FE', 'BN', 'DE', 'AF', 'CR',

features_to_drop = ['CR_mul_DE', 'BQ_mul_FE', 'CR_mul_GE', 'EE_mul_GF', 
                    'CR_mul_FE', 'BQ_mul_FC', 'DE_mul_DL', 'AZ_mul_GL', 'CW_mul_DL', 
                    'BN_mul_CR', 'DN_mul_FI', 'AZ_mul_FE', 'CW_mul_EL', 'AZ_mul_CU',
                    'CW_mul_DY', 'DH_mul_DL', 'AX_mul_CU', 'BN_mul_DE', 'BN_mul_CW', 
                    'AZ_mul_EL', 'AZ_mul_DE']

var1 = [v for v in var1 if v not in features_to_drop]

# Create dict with WoE transformation

In [21]:
# create dic with WoE transformation
list_dics = []

for var in var1:
    df_temp = b.loc[b['Variable']==var].reset_index()
    # crieate dict
    dict_var = {}
    for x in range(len(df_temp)):
        line = df_temp.iloc[x]
        dict_var[line['Cutoff']] = line['WoE']
    list_dics.append(dict_var)

# Prepare train and test data

In [22]:
# train and test data
df_original = ff[var1+[target_name] + ['k'] ]
df_test2 = ff_teste[var1]
names = var1

In [23]:
# In this part there is some data leakage as the map is using the full dataset
n=0

for var in var1:
    df_original.loc[:,var] = df_original[var].map(list_dics[n])
    df_test2.loc[:,var] = df_test2[var].map(list_dics[n])
    n = n + 1

# Fill NaNs

In [24]:
df_original.loc[:,names] = df_original[names].fillna(df_original[names].mean())
df_test2.loc[:,names] = df_test2[names].fillna(df_original[names].mean())

# Train LR

In [25]:
n_splits = 10

predictions_LR = 0
cv_score_LR = 0

rr = [42, 21, 100, 45, 1, 228]

oof_level2_LR = np.zeros([df_original['Class'].shape[0], len(rr)])
oof_level2_LR_test = np.zeros([test_df.shape[0], len(rr)])

for f, v_fold in enumerate(rr):
    skf = StratifiedKFold(n_splits=CFG.n_stacking_folds, shuffle=True, random_state=3082023)
    for i, (train_index, val_index) in enumerate(skf.split(df_original[names], df_original['Class'])):

            model = LogisticRegression(random_state=3082023+i, C=0.1, n_jobs=-1, max_iter=2000, class_weight='balanced')

            df_train = df_original.iloc[train_index]
            df_val = df_original.iloc[val_index]
            
            df_train1 = df_train[names].to_numpy()
            df_val1 = df_val[names].to_numpy()            
            
            model.fit(df_train1, df_train['Class'])
            
            y_hat_val_LR = model.predict_proba(df_val1)[:,1]
            val = balanced_log_loss(df_val[target_name], y_hat_val_LR.reshape(-1, ))
            
            try:
                oof_level2_LR[val_index, f] = model.predict_proba(df_val1)[:,1]
                oof_level2_LR_test[:,f] = model.predict_proba(df_test2[names])[:,1]
            except:
                oof_level2_LR[val_index, f] = np.zeros(len(val_index))
                oof_level2_LR_test[:,f] = np.zeros(df_test2.shape[0])

# Use Optuna to find the best ensemble weights
optweights = OptunaWeights(random_state=10082023)
y_val_pred = optweights.fit_predict(y, [oof_level2_LR[:,i] for i in range(oof_level2_LR.shape[1])])
optuna_weights_LR = np.array(optweights.weights)
display(balanced_log_loss(y, y_val_pred))

# oof_level2_LR = (optuna_weights_LR * oof_level2_LR).sum(axis=1) / sum(optuna_weights_LR)
# oof_level2_LR_test = (optuna_weights_LR * oof_level2_LR_test).sum(axis=1) / sum(optuna_weights_LR)

oof_level2_LR = oof_level2_LR.mean(axis=1)
oof_level2_LR_test = oof_level2_LR_test.mean(axis=1)

0.18202025653081474

# ======================================================
# Get ensemble predictions

In [26]:
oof_level2 = [oof_level2_LGBM, oof_level2_LR]
oof_level2_test = [oof_level2_LGBM_test, oof_level2_LR_test]

# Use Optuna to find the best ensemble weights
optweights = OptunaWeights(random_state=19072023)
y_val_pred = optweights.fit_predict(y, [oof_level2[i] for i in range(len(oof_level2))])
optuna_weights = np.array(optweights.weights)
# optuna_weights[optuna_weights < 0.05] = 0
display(optuna_weights)
balanced_log_loss(y, y_val_pred)

array([0.63276243, 0.34647374])

0.15979566349415253

In [27]:
# array([0.9439178 , 0.88226912])

# 0.15875810163589982

# Predict test

In [28]:
def predict(X):
    y = np.zeros_like(X[0])
    for i in range(len(X)):
        # y += oof_level2_test[i]
        y += optuna_weights[i] * X[i]
    # return y / len(X)
    return y / sum(optuna_weights)

def lr_predict(X):
    return lr.predict_proba(X)[:,1]

predictions = predict(oof_level2_test)
# predictions = lr_predict(oof_level2_test)

if CFG.adjust_class_threshold:
    _, predictions = inflate_preds(y, np.mean(X, axis=1), predictions)

predictions = np.nan_to_num(predictions)
test_df['class_1'] = np.round(predictions, 15)
test_df['class_0'] = 1 - predictions

sample_submission[['class_0', 'class_1']] = test_df[['class_0', 'class_1']]
sample_submission.to_csv(r"submission.csv", index=False)
sample_submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.467,0.533
1,010ebe33f668,0.467,0.533
2,02fa521e1838,0.467,0.533
3,040e15f562a2,0.467,0.533
4,046e85c7cc7f,0.467,0.533
