In [2]:
!pip install lightgbm
!pip install catboost
!pip install xgboost
!pip install tabpfn
!pip install seaborn
!pip install optuna

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer
from sklearn import metrics
from sklearn.preprocessing import RobustScaler
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import StratifiedKFold

import optuna

import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date, datetime
import warnings
warnings.filterwarnings("ignore")

In [4]:
#from tabpfn import TabPFNClassifier
from tabpfn.scripts.transformer_prediction_interface import TabPFNClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Read data

In [5]:
df = pd.read_csv("../input/icr-identify-age-related-conditions/train.csv", index_col=[0])
greeks_df = pd.read_csv("../input/icr-identify-age-related-conditions/greeks.csv", index_col=[0])

# The "magic" feature: turn the Epsilon date into a numeric `times` column.
# Known dates become proleptic-Gregorian ordinals; 'Unknown' rows receive the
# median ordinal of the known dates.
has_date = greeks_df.Epsilon != 'Unknown'
date_ordinals = greeks_df.Epsilon[has_date].map(
    lambda x: datetime.strptime(x, '%m/%d/%Y').toordinal()
)
times = greeks_df.Epsilon.copy()
times[has_date] = date_ordinals
times[~has_date] = date_ordinals.median()
greeks_df['times'] = times

# Attach `times` to the training rows by index, then swap the last two columns
# so that the target column ends up last and `times` sits just before it.
df = df.merge(greeks_df['times'], left_index=True, right_index=True, how='left')
column_order = df.columns.to_list()
column_order[-2:] = column_order[-2:][::-1]
df_train = df.reindex(columns=column_order)
del df

In [6]:
# Every test row gets the same `times` value: one more than the largest value in `times`.
test_time = times.max() + 1
df_test = pd.read_csv(
    "../input/icr-identify-age-related-conditions/test.csv", index_col=[0]
).assign(times=test_time)
df_test

In [7]:
# Encode the two-level EJ column numerically ('A' -> 0.0, 'B' -> 1.0) in both frames.
ej_codes = {'A': 0.0, 'B': 1.0}
for frame in (df_train, df_test):
    frame['EJ'] = frame['EJ'].map(ej_codes, na_action=None)

## Replace NaN with the median

In [8]:
# Per-column medians of the training frame; the test frame is imputed with the
# same values later. (The name `col_mean` is kept for the later cell that reads
# it — it holds medians, not means.)
col_mean = df_train.median(axis=0)

# Fill NaNs column by column, matched on column *label*. This replaces the
# positional `col_mean[i]` lookup (deprecated on a label-indexed Series) and the
# chained `df[col].fillna(..., inplace=True)`, which does not write back to the
# frame when pandas copy-on-write is active.
df_train = df_train.fillna(col_mean)
df_train

In [9]:
# Impute test NaNs with the *training* medians, matched on column label.
# Label alignment does not depend on the test and train frames sharing the same
# column order, and columns absent from `col_mean` are simply left untouched.
df_test = df_test.fillna(col_mean)
df_test

## Rescale the features

In [10]:
# Keep an unscaled snapshot of the training frame before scaling in place.
df_train_noscale = df_train.copy()

# Scale every column except the last two; the scaler returns DataFrames so the
# result can be written back by label.
train_scaled_cols = df_train.columns[:-2]
scaler = RobustScaler().set_output(transform="pandas")
df_train.loc[:, train_scaled_cols] = scaler.fit_transform(df_train[train_scaled_cols])
df_train

In [11]:
# Keep an unscaled snapshot of the test frame before scaling in place.
df_test_noscale = df_test.copy()

# Apply the already-fitted scaler to every test column except the last one.
test_scaled_cols = df_test.columns[:-1]
df_test.loc[:, test_scaled_cols] = scaler.transform(df_test[test_scaled_cols])
df_test

In [12]:
# Store `times` as integers in both the train and the test frame.
for frame in (df_train, df_test):
    frame['times'] = frame['times'].astype(int)

# LightGBM/XGBoost/CatBoost

In [13]:
# One 'balanced' weight per training row, derived from the target column.
# The array is indexed positionally by the fold indices further down.
classes_weights = compute_sample_weight(class_weight='balanced', y=df_train['Class'])

In [14]:
def balanced_log_loss(y_true, y_pred):
    """Class-weighted binary log loss.

    Each class's mean log loss is weighted by the inverse of that class's
    frequency in ``y_true`` and the result is normalised by the sum of the two
    weights.

    Parameters
    ----------
    y_true : array-like of shape (n,)
        Binary labels. Anything ``np.asarray`` accepts (list, ndarray, Series);
        values are cast to int. Every non-zero label is scored as positive.
    y_pred : array-like of shape (n,)
        Predicted probability of the positive class. Values are clipped to
        [1e-15, 1 - 1e-15] so the logarithms stay finite.

    Returns
    -------
    float
        The balanced log loss (lower is better).

    Raises
    ------
    ValueError
        If ``y_true`` does not contain both class 0 and class 1; the per-class
        terms are undefined in that case.
    """
    y_true = np.asarray(y_true).astype(int)
    y_pred = np.clip(np.asarray(y_pred, dtype=float), 1e-15, 1 - 1e-15)

    # minlength=2 guarantees nc[1] exists even when no positive label is present.
    nc = np.bincount(y_true, minlength=2)
    if nc[0] == 0 or nc[1] == 0:
        raise ValueError("balanced_log_loss requires both class 0 and class 1 in y_true")

    n_total = y_true.shape[0]
    # Inverse class frequencies.
    w0, w1 = 1 / (nc[0] / n_total), 1 / (nc[1] / n_total)

    is_negative = y_true == 0
    loss_negative = -np.sum(np.log(1 - y_pred[is_negative])) / nc[0]
    loss_positive = -np.sum(np.log(y_pred[~is_negative])) / nc[1]
    return (w0 * loss_negative + w1 * loss_positive) / (w0 + w1)

# Wrap the custom metric as a scikit-learn scorer (a loss, hence greater_is_better=False).
# The metric consumes positive-class probabilities, so the scorer must call
# predict_proba; without this, make_scorer would pass hard labels from predict().
# Newer scikit-learn spells the option `response_method="predict_proba"`, older
# releases `needs_proba=True`, so pick whichever the installed version supports.
import inspect

_proba_kwargs = (
    {"response_method": "predict_proba"}
    if "response_method" in inspect.signature(make_scorer).parameters
    else {"needs_proba": True}
)
balanced_log_loss_scorer = make_scorer(balanced_log_loss, greater_is_better=False, **_proba_kwargs)

## optuna

In [15]:
# Set to True to re-run the Optuna searches below; otherwise the stored
# hyper-parameters in this cell are used as-is.
do_opt = False

# Stored LightGBM hyper-parameters.
lgbm_params = dict(
    n_estimators=590,
    reg_alpha=3.3000000000000003,
    reg_lambda=36.4,
    num_leaves=25,
    min_child_samples=40,
    colsample_bytree=0.7,
    subsample=0.9,
    subsample_freq=1,
)

# Stored XGBoost hyper-parameters.
xgb_params = dict(
    n_estimators=700,
    max_depth=3,
    learning_rate=0.03560016067885742,
    subsample=0.7408505704121144,
    colsample_bytree=0.5705779083310649,
    gamma=0.5286651204230014,
)

# Stored CatBoost hyper-parameters.
cb_params = dict(
    iterations=114,
    depth=4,
    learning_rate=0.08433020501242801,
    random_strength=1,
    bagging_temperature=0.1111714827952525,
    od_wait=23,
)

### lightGBM

In [16]:
seed = 57


def obj_lgbm(trial):
    """Optuna objective for LightGBM.

    Samples a hyper-parameter set from ``trial``, runs a seeded 5-fold
    stratified CV over ``df_train`` and returns the mean balanced log loss of
    the positive-class probabilities on the validation folds.
    """
    # NOTE(review): class_weight='balanced' is used together with balanced
    # sample weights in fit(), so the class re-weighting is presumably applied
    # twice — confirm this is intended.
    search_space = dict(
        boosting_type='gbdt',
        class_weight='balanced',
        n_estimators=trial.suggest_int('n_estimators', 100, 1000, step=10),
        reg_alpha=trial.suggest_float('reg_alpha', 0, 100, step=0.1),
        reg_lambda=trial.suggest_float('reg_lambda', 0, 100, step=0.1),
        num_leaves=trial.suggest_int('num_leaves', 2, 64, step=1),
        min_child_samples=trial.suggest_int('min_child_samples', 8, 64, step=1),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.1),
        subsample=trial.suggest_float('subsample', 0.5, 1.0, step=0.1),
        subsample_freq=trial.suggest_int('subsample_freq', 0, 3, step=1),
    )
    clf = LGBMClassifier(**search_space)
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    fold_losses = []
    for fit_rows, eval_rows in splitter.split(df_train, df_train.Class):
        fit_part = df_train.iloc[fit_rows]
        eval_part = df_train.iloc[eval_rows]
        # Features are every column except the last one.
        clf.fit(
            fit_part[fit_part.columns[:-1]],
            fit_part.Class,
            sample_weight=classes_weights[fit_rows],
        )
        proba = clf.predict_proba(eval_part[eval_part.columns[:-1]])
        fold_losses.append(balanced_log_loss(eval_part.Class, proba[:, 1]))

    return sum(fold_losses) / 5
        

# Optional LightGBM search; runs only when do_opt is enabled.
if do_opt:
    tpe_sampler = optuna.samplers.TPESampler(seed=42)
    # No direction is passed, so Optuna's default direction applies.
    study = optuna.create_study(sampler=tpe_sampler)
    # Evaluate the stored parameters as the first trial.
    study.enqueue_trial(lgbm_params)
    study.optimize(obj_lgbm, n_trials=100)

In [17]:
# 0.18265121471197596  (presumably the best loss from an earlier search — TODO confirm)
# After a search, replace the stored parameters with the best trial's.
if do_opt:
    lgbm_params = dict(study.best_trial.params)
lgbm_params

In [18]:
def obj_xgb(trial):
    """Optuna objective for XGBoost.

    Samples a hyper-parameter set from ``trial``, runs a seeded 5-fold
    stratified CV over ``df_train`` with balanced sample weights and returns the
    mean balanced log loss on the validation folds.
    """
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000, step=100),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        gamma=trial.suggest_float('gamma', 0.001, 1.0, log=True),
        objective='binary:logistic',
        eval_metric='logloss',
    )
    clf = XGBClassifier(**search_space)
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    fold_losses = []
    for fit_rows, eval_rows in splitter.split(df_train, df_train.Class):
        fit_part = df_train.iloc[fit_rows]
        eval_part = df_train.iloc[eval_rows]
        # Features are every column except the last one.
        clf.fit(
            fit_part[fit_part.columns[:-1]],
            fit_part.Class,
            sample_weight=classes_weights[fit_rows],
        )
        proba = clf.predict_proba(eval_part[eval_part.columns[:-1]])
        fold_losses.append(balanced_log_loss(eval_part.Class, proba[:, 1]))

    return sum(fold_losses) / 5
        
# Optional XGBoost search; runs only when do_opt is enabled.
if do_opt:
    tpe_sampler = optuna.samplers.TPESampler(seed=42)
    # No direction is passed, so Optuna's default direction applies.
    study = optuna.create_study(sampler=tpe_sampler)
    # Evaluate the stored parameters as the first trial.
    study.enqueue_trial(xgb_params)
    study.optimize(obj_xgb, n_trials=100)

In [19]:
# After a search, replace the stored parameters with the best trial's.
if do_opt:
    xgb_params = dict(study.best_trial.params)

xgb_params

In [20]:
def obj_cb(trial):
    """Optuna objective for CatBoost.

    Samples a hyper-parameter set from ``trial``, runs a seeded 5-fold
    stratified CV over ``df_train`` with balanced sample weights (training
    output silenced) and returns the mean balanced log loss on the validation
    folds.
    """
    search_space = dict(
        iterations=trial.suggest_int('iterations', 50, 300),
        depth=trial.suggest_int('depth', 4, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        random_strength=trial.suggest_int('random_strength', 0, 100),
        bagging_temperature=trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
        # od_type is not searched:
        # od_type=trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        od_wait=trial.suggest_int('od_wait', 10, 50),
    )
    clf = CatBoostClassifier(**search_space)
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    fold_losses = []
    for fit_rows, eval_rows in splitter.split(df_train, df_train.Class):
        fit_part = df_train.iloc[fit_rows]
        eval_part = df_train.iloc[eval_rows]
        # Features are every column except the last one.
        clf.fit(
            fit_part[fit_part.columns[:-1]],
            fit_part.Class,
            logging_level="Silent",
            sample_weight=classes_weights[fit_rows],
        )
        proba = clf.predict_proba(eval_part[eval_part.columns[:-1]])
        fold_losses.append(balanced_log_loss(eval_part.Class, proba[:, 1]))

    return sum(fold_losses) / 5
        
# Optional CatBoost search; runs only when do_opt is enabled.
if do_opt:
    tpe_sampler = optuna.samplers.TPESampler(seed=42)
    # No direction is passed, so Optuna's default direction applies.
    study = optuna.create_study(sampler=tpe_sampler)
    # Evaluate the stored parameters as the first trial.
    study.enqueue_trial(cb_params)
    study.optimize(obj_cb, n_trials=50)

In [21]:
# After a search, replace the stored parameters with the best trial's.
if do_opt:
    cb_params = dict(study.best_trial.params)
cb_params

In [22]:
"""
lgbm_params = {'n_estimators': 600,
 'reg_alpha': 2.8000000000000003,
 'reg_lambda': 12.4,
 'num_leaves': 41}
 
xgb_params = {'n_estimators': 800,
 'max_depth': 4,
 'learning_rate': 0.07321832691960338,
 'subsample': 0.7576690053727902,
 'colsample_bytree': 0.7481165187056286,
 'gamma': 0.4338012239235796}
 
cb_params = {'iterations': 114,
 'depth': 4,
 'learning_rate': 0.08433020501242801,
 'random_strength': 1,
 'bagging_temperature': 0.1111714827952525,
 'od_wait': 23}
"""

In [23]:
# lgb = LGBMClassifier(boosting_type='gbdt', class_weight='balanced', colsample_bytree=1.0,
#                 importance_type='split', learning_rate=0.1, max_depth=-1,
#                 min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
#                 n_estimators=100, n_jobs=-1, num_leaves=31, objective="binary",
#                 random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
#                 subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
# xgb = XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
#                colsample_bylevel=None, colsample_bynode=None,
#                colsample_bytree=None, early_stopping_rounds=None,
#                enable_categorical=False, eval_metric=None, feature_types=None,
#                gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
#                interaction_constraints=None, learning_rate=None, max_bin=None,
#                max_cat_threshold=None, max_cat_to_onehot=None,
#                max_delta_step=None, max_depth=None, max_leaves=None,
#                min_child_weight=None, monotone_constraints=None,
#                n_estimators=100, n_jobs=-1, num_parallel_tree=None,
#                objective='binary:logistic', predictor=None)

# cb = CatBoostClassifier()
# tab = TabPFNClassifier(N_ensemble_configurations=64)

In [24]:
# Instantiate the four ensemble members with the final hyper-parameters.
lgb = LGBMClassifier(
    boosting_type='gbdt',
    class_weight='balanced',
    **lgbm_params,
)
xgb = XGBClassifier(objective='binary:logistic', eval_metric='logloss', **xgb_params)
cb = CatBoostClassifier(**cb_params)

# TabPFN is not tuned here; it only receives a fixed ensemble-configuration count.
tab = TabPFNClassifier(N_ensemble_configurations=64)

## Compute the CV score (5-fold)

In [25]:
seed = 57
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# (progress message, model, extra fit kwargs) for the boosted models, in run order.
gbm_specs = (
    ("lgbm..", lgb, {}),
    ("xgboost..", xgb, {}),
    ("catboost..", cb, {"logging_level": "Silent"}),
)

score = 0
for train_idx, valid_idx in kf.split(df_train, df_train.Class):
    weights = classes_weights[train_idx]

    # Boosted models: fit and predict on the scaled frame, with balanced row weights.
    train_df, valid_df = df_train.iloc[train_idx], df_train.iloc[valid_idx]
    X_train, y_train = train_df[train_df.columns[:-1]], train_df.Class
    X_valid, y_valid = valid_df[valid_df.columns[:-1]], valid_df.Class

    gbm_probas = []
    for message, estimator, fit_kwargs in gbm_specs:
        print(message)
        estimator.fit(X_train, y_train, sample_weight=weights, **fit_kwargs)
        gbm_probas.append(estimator.predict_proba(X_valid))
    y_lgb, y_xgb, y_cb = gbm_probas

    # TabPFN: same rows, taken from the unscaled snapshot and fitted without weights.
    train_df, valid_df = df_train_noscale.iloc[train_idx], df_train_noscale.iloc[valid_idx]
    X_train, y_train = train_df[train_df.columns[:-1]], train_df.Class
    X_valid, y_valid = valid_df[valid_df.columns[:-1]], valid_df.Class

    print("TabPFN..")
    tab.fit(X_train, y_train)
    y_tab = tab.predict_proba(X_valid)

    # Per-model losses and the loss of the equal-weight average of the four models.
    lgb_loss = balanced_log_loss(y_valid, y_lgb[:, 1])
    xgb_loss = balanced_log_loss(y_valid, y_xgb[:, 1])
    cb_loss = balanced_log_loss(y_valid, y_cb[:, 1])
    tab_loss = balanced_log_loss(y_valid, y_tab[:, 1])
    ensemble_loss = balanced_log_loss(y_valid, ((y_lgb + y_xgb + y_cb + y_tab)/4)[:, 1])
    score += ensemble_loss
    print("lgb", lgb_loss, "xgb", xgb_loss, "catboost", cb_loss, "TabPFN", tab_loss, "ensamble",
          ensemble_loss)

# Mean ensemble loss over the five folds.
score/5

In [None]:
# display(df_train)
# display(df_train_noscale)

## Fit models (final parameters)

In [None]:
# Full scaled training set: features are every column except the last; target is Class.
feature_names = df_train.columns[:-1]
X = df_train[feature_names]
y = df_train['Class']

In [None]:
# Fit LightGBM on the full training set, then predict the same rows (in-sample).
y_lgb = lgb.fit(X, y, sample_weight=classes_weights).predict_proba(X)

In [None]:
# Fit XGBoost on the full training set, then predict the same rows (in-sample).
y_xgb = xgb.fit(X, y, sample_weight=classes_weights).predict_proba(X)

In [None]:
# Fit CatBoost (output silenced) on the full training set, then predict the same rows.
y_cb = cb.fit(X, y, logging_level="Silent", sample_weight=classes_weights).predict_proba(X)

In [None]:
# TabPFN is fed the data as it was before scaling.
noscale_feature_names = df_train.columns[:-1]
X = df_train_noscale[noscale_feature_names]
y = df_train_noscale['Class']

In [None]:
# Fit TabPFN on the unscaled training set, then predict the same rows (in-sample).
y_tab = tab.fit(X, y).predict_proba(X)

In [None]:
# In-sample balanced log loss of the equal-weight four-model average.
train_ensemble_proba = (y_lgb + y_xgb + y_cb + y_tab) / 4
balanced_log_loss(y, train_ensemble_proba[:, 1])

# Inference

In [None]:
# The boosted models were fitted on the scaled features, so they score the scaled test frame.
pred_lgb = lgb.predict_proba(df_test)
pred_xgb = xgb.predict_proba(df_test)
pred_cb = cb.predict_proba(df_test)

# TabPFN was fitted on the unscaled training snapshot, so it must score the
# unscaled test snapshot; feeding it the scaled frame would mismatch the
# feature distribution it was fitted on.
pred_tab = tab.predict_proba(df_test_noscale)

# Equal-weight average of the four models' class probabilities.
pred = (pred_lgb + pred_xgb + pred_cb + pred_tab)/4
pred

In [None]:
# Start from the sample submission and overwrite both probability columns with
# the ensemble output (column 0 -> class_0, column 1 -> class_1).
# NOTE(review): rows are matched by position — assumes the sample submission
# lists the ids in the same order as test.csv; confirm.
submission = pd.read_csv(
    '../input/icr-identify-age-related-conditions/sample_submission.csv'
).assign(class_0=pred[:, 0], class_1=pred[:, 1])

In [None]:
# Write the submission file without the index column, then display the frame.
submission.to_csv(path_or_buf='submission.csv', index=False)
submission