In [0]:
# Fetch the project repository; later cells read their input files from the
# cloned Adult_readmission/ directory.
!git clone https://github.com/acmilannesta/Adult_readmission

In [0]:
# Install CatBoost, which the import cell below needs for CatBoostClassifier.
!pip install catboost

**CatBoost model**

In [0]:
import os,  pandas as pd,  numpy as np, gc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from hyperopt import fmin, hp, tpe, STATUS_OK, Trials

In [0]:
# Load the clinical variables (SAS) and the extracted biomarker features (CSV).
analysis = pd.read_sas('Adult_readmission//combined_vars.sas7bdat', encoding='latin1').drop(columns='TRR_ID')
biomarker = pd.read_csv('Adult_readmission/feature_extracted.csv').drop(columns='CODE_REHOSP')

# Join on PERSON_ID once and derive BOTH the feature matrix and the label from
# the merged frame.  Taking y from `analysis` instead would silently misalign
# rows whenever the inner join drops or duplicates a PERSON_ID.
merged = analysis.merge(biomarker, on='PERSON_ID')
X = merged.drop(columns=['CODE_REHOSP', 'PERSON_ID'])
# Recode the outcome: value 2 becomes 0, all other values are kept as-is.
y = merged['CODE_REHOSP'].replace(2, 0)

# Positional indices of the columns with at most 10 distinct non-null values;
# these are handed to CatBoost as categorical features.
cat_colidx = [X.columns.get_loc(col) for col in X.columns if X[col].nunique() <= 10]

In [0]:
# Make every categorical column NaN-free and give it an integer or string type
# before it is passed to CatBoost via cat_features.
for col_idx in cat_colidx:
    col_name = X.columns[col_idx]
    column = X[col_name]
    if column.dtype == 'float64':
        # -1 is the sentinel level for "missing".
        filled = column.fillna(-1)
        if (filled % 1 == 0).all():
            # Integer-valued floats (the usual case for coded variables).
            X[col_name] = filled.astype(int)
        else:
            # A plain int cast would truncate e.g. 0.2 and 0.7 to the same
            # level; keep fractional levels distinct by using their text form.
            X[col_name] = filled.astype(str)
    else:
        # Non-float columns: missing values become the empty-string level.
        X[col_name] = column.fillna('')

In [0]:
# Hyperopt search space for the CatBoost hyper-parameters.
# colsample_bylevel is currently left out of the search.
cbc_params = dict(
    max_depth=hp.choice('max_depth', np.arange(2, 11)),  # integers 2..10
    l2_leaf_reg=hp.uniform('l2_leaf_reg', 0, 100),
    subsample=hp.uniform('subsample', 0.1, 1),
    eta=hp.uniform('eta', 0.01, 0.1),
)

In [0]:
def f_cbc(params):
    """Hyperopt objective: 5-fold cross-validated AUC of a CatBoost classifier.

    Reads the module-level ``X`` (features), ``y`` (binary label) and
    ``cat_colidx`` (positional indices of the categorical columns).

    Parameters
    ----------
    params : dict
        Hyper-parameters sampled by hyperopt; forwarded unchanged to
        ``CatBoostClassifier``.

    Returns
    -------
    dict
        ``{'loss': -mean_fold_auc (rounded to 5 decimals), 'status': STATUS_OK}``.
        The AUC is negated because ``fmin`` minimises the loss.
    """
    # Keyword arguments are required here: recent scikit-learn releases accept
    # only n_splits positionally.
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    auc = np.zeros(kfold.get_n_splits())
    cbc_pred = np.zeros(len(X))
    cbc = CatBoostClassifier(
        **params,
        n_estimators=2000,
        random_state=2019,
        eval_metric='AUC',
        cat_features=cat_colidx,
        silent=True,
        one_hot_max_size=2,
        bootstrap_type='Bernoulli',
        boosting_type='Plain',
        task_type='GPU',
    )
    for i, (tr_idx, val_idx) in enumerate(kfold.split(X, y)):
        # split() yields row POSITIONS, so both X and y are indexed with .iloc;
        # plain y[tr_idx] is label-based and only works on a default index.
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]
        clf = cbc.fit(X_tr,
                      y_tr,
                      use_best_model=True,
                      eval_set=(X_val, y_val),
                      early_stopping_rounds=100,
                      verbose_eval=False)
        # Out-of-fold probability of the positive class.
        cbc_pred[val_idx] = clf.predict_proba(X_val)[:, 1]
        auc[i] = roc_auc_score(y_val, cbc_pred[val_idx])
        del clf
        gc.collect()
    return {'loss': -np.mean(auc).round(5), 'status': STATUS_OK}

# Run 100 TPE-guided evaluations of f_cbc over cbc_params; `trials` records
# every evaluation.
# NOTE(review): for hp.choice parameters hyperopt's fmin is documented to
# return the index of the chosen option rather than its value -- confirm
# before reusing cbc_best['max_depth'] directly.
trials = Trials()
cbc_best = fmin(
    fn=f_cbc,
    space=cbc_params,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    rstate=np.random.RandomState(2019),
)

**LightGBM model**

In [0]:
!pip uninstall lightgbm
!pip install lightgbm --install-option=--gpu

In [0]:
import os,  pandas as pd,  numpy as np, gc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from hyperopt import fmin, hp, tpe, STATUS_OK, Trials
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgbm
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas / scikit-learn /
# LightGBM that would flag API changes affecting the cells below.
warnings.filterwarnings("ignore")

In [0]:
# Load the clinical variables (SAS) and the extracted biomarker features (CSV).
analysis = pd.read_sas('Adult_readmission//combined_vars.sas7bdat', encoding='latin1').drop(columns='TRR_ID')
biomarker = pd.read_csv('Adult_readmission/feature_extracted.csv').drop(columns='CODE_REHOSP')

# Derive features AND label from the same merged frame so their rows stay
# aligned even if the inner join drops or duplicates a PERSON_ID.
merged = analysis.merge(biomarker, on='PERSON_ID')
X = merged.drop(columns=['CODE_REHOSP', 'PERSON_ID'])
# Recode the outcome: value 2 becomes 0, all other values are kept as-is.
y = merged['CODE_REHOSP'].replace(2, 0)

# Positional indices of columns with 2..10 distinct non-null values; passed to
# LightGBM as categorical_feature.  Computed before encoding.
cat_col = [X.columns.get_loc(col) for col in X.columns if 2 <= X[col].nunique() <= 10]

# Integer-encode the columns in place.
# NOTE(review): object columns with more than 10 levels are label-encoded here
# but are not in cat_col, so LightGBM will treat their codes as ordered numbers;
# binary numeric columns are in cat_col but are not re-encoded -- confirm both
# are intended.
for col in X.columns:
    if X[col].dtype == 'O':
        # String columns: missing values become their own 'Unknown' level.
        X[col] = LabelEncoder().fit_transform(X[col].fillna('Unknown'))
    elif 2 < X[col].nunique() <= 10:
        # Low-cardinality numeric columns: 99 is the sentinel for missing.
        X[col] = LabelEncoder().fit_transform(X[col].fillna(99))

In [0]:
# Hyperopt search space for LightGBM: the first four entries are tuned, the
# remaining entries are fixed settings passed through unchanged.
lgbm_param = dict(
    num_leaves=hp.choice('num_leaves', np.arange(2, 21)),  # integers 2..20
    learning_rate=hp.uniform('learning_rate', 0.005, 0.1),
    feature_fraction=hp.uniform('feature_fraction', 0.01, 0.5),
    max_depth=hp.choice('max_depth', np.arange(2, 11)),  # integers 2..10
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    verbose=-1,
    device_type='gpu',
)

def f_lgbm(params):
    """Hyperopt objective: 5-fold cross-validated AUC of a LightGBM model.

    Reads the module-level ``X`` (encoded features), ``y`` (binary label) and
    ``cat_col`` (positional indices of the categorical columns).

    Parameters
    ----------
    params : dict
        LightGBM parameters sampled by hyperopt; passed unchanged to
        ``lgbm.train``.

    Returns
    -------
    dict
        ``{'loss': -mean_fold_auc (rounded to 5 decimals), 'status': STATUS_OK}``.
        The AUC is negated because ``fmin`` minimises the loss.
    """
    # Loop-invariant array views of the data, built once instead of per fold.
    # to_numpy() replaces Series.ravel(), which newer pandas deprecates.
    features = X.values
    target = y.to_numpy()

    # Keyword arguments are required here: recent scikit-learn releases accept
    # only n_splits positionally.
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    lgbm_pred = np.zeros((len(X), ))
    auc = np.zeros(kfold.get_n_splits())
    for i, (tr_idx, te_idx) in enumerate(kfold.split(X, y)):
        tr_data = lgbm.Dataset(features[tr_idx], target[tr_idx], categorical_feature=cat_col)
        te_data = lgbm.Dataset(features[te_idx], target[te_idx], categorical_feature=cat_col)
        # The round cap is effectively unbounded; training is ended by early
        # stopping (500 rounds).
        # NOTE(review): newer LightGBM releases replace verbose_eval and
        # early_stopping_rounds with callbacks -- confirm against the
        # installed version.
        clf = lgbm.train(params,
                         tr_data,
                         num_boost_round=9999999,
                         verbose_eval=False,
                         valid_sets=[tr_data, te_data],
                         early_stopping_rounds=500,
                    )
        # Out-of-fold predictions at the best iteration found by early stopping.
        lgbm_pred[te_idx] = clf.predict(features[te_idx], num_iteration=clf.best_iteration)
        auc[i] = roc_auc_score(target[te_idx], lgbm_pred[te_idx])
        del clf
        gc.collect()
    return {'loss': -np.mean(auc).round(5), 'status': STATUS_OK}

# Run 100 TPE-guided evaluations of f_lgbm over lgbm_param; `trials` records
# every evaluation.
# NOTE(review): for hp.choice parameters hyperopt's fmin is documented to
# return the index of the chosen option rather than its value -- confirm
# before reusing lgbm_best['num_leaves'] / lgbm_best['max_depth'] directly.
trials = Trials()
lgbm_best = fmin(
    fn=f_lgbm,
    space=lgbm_param,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    rstate=np.random.RandomState(2019),
)

  2%|▏         | 2/100 [00:47<44:35, 27.30s/it, best loss: -0.67622]