In [0]:
!git clone https://github.com/acmilannesta/Adult_readmission

In [0]:
!pip install catboost

#**Catboost model**

In [0]:
import os,  pandas as pd,  numpy as np, gc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier, Pool, cv
from hyperopt import fmin, hp, tpe, STATUS_OK, Trials

In [0]:
analysis = pd.read_sas('Adult_readmission//combined_vars_04092019.sas7bdat', encoding='latin1').drop(['TRR_ID', 'TRANSPLANT_DT', 'TRANSPLANT_DISCHARGE_DT', 'READMISSION_DT'], axis=1)
biomarker = pd.read_csv('Adult_readmission/feature_extracted_365days.csv')
X = analysis.merge(biomarker, on='PERSON_ID').drop(['CODE_REHOSP', 'PERSON_ID'], 1)
# X = biomarker.drop('PERSON_ID', 1)
y = analysis['CODE_REHOSP'].replace(2, 0)
cat_colidx = [X.columns.get_loc(col) for col in X.columns if X[col].nunique() <= 10]

In [0]:
for col in cat_colidx:
    if X[X.columns[col]].dtype == 'float64':
        X[X.columns[col]] = X[X.columns[col]].fillna(-1).astype(int)
    else:
        X[X.columns[col]] = X[X.columns[col]].fillna('')

In [0]:
cbc_params = {
    'max_depth': hp.quniform('max_depth', 2, 11, 1),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 0, 500),
    'eta': hp.uniform('eta', 0.1, 0.5),
    'silent': True,
    'eval_metric': 'AUC',
    'objective': 'Logloss',
    'task_type': 'GPU',
    'one_hot_max_size': 5
#     'colsample_bylevel': hp.uniform('colsample_bylevel', 0.1, 1),
#     'subsample': hp.uniform('subsample', 0.1, 1),    
#     'bootstrap_type': hp.choice('bootstrap_type', ['Bernoulli', 'Poisson', 'No']),
#     'one_hot_max_size': hp.choice('one_hot_max_size', np.arange(2,6))
}

In [0]:
def f_cbc(params):
  data = Pool(X, y, cat_colidx)
  out = cv(
      pool = data, 
      params = params, 
      nfold = 3, 
      early_stopping_rounds = 50, 
      iterations = 500, 
      # metric_period = 20
      )
  # kfold = StratifiedKFold(5, True, 2019)
  # auc = np.zeros(kfold.get_n_splits())
  # cbc_pred = np.zeros(len(X))
  # featureimp = np.zeros(X.shape[1])
  # cbc = CatBoostClassifier(
  #     **params,
  #     n_estimators=2000,
  #     random_state=2019,
  #     eval_metric='AUC',
  #     cat_features=cat_colidx,
  #     silent=True,
  #     one_hot_max_size=2,
  #     bootstrap_type='Bernoulli',
  #     boosting_type='Plain',
  #     task_type='GPU',
  # )
  # for i, (tr_idx, val_idx) in enumerate(kfold.split(X, y)):
  #     clf = cbc.fit(X.iloc[tr_idx], 
  #                   y[tr_idx], 
  #                   use_best_model=True,
  #                   eval_set=(X.iloc[val_idx], y[val_idx]),
  #                   early_stopping_rounds=200,
  #                   verbose_eval=False)
  #     cbc_pred[val_idx] = clf.predict_proba(X.iloc[val_idx])[:, 1]
  #     featureimp += np.asarray(clf.get_feature_importance()) / kfold.n_splits
  #     auc[i] = roc_auc_score(y[val_idx], cbc_pred[val_idx])
  #     del clf
  #     gc.collect()
  #     print("Mean AUC(%g|%g): %.5f" %(i, kfold.get_n_splits(), np.sum(auc)/i))  +
  # return {'loss': -np.mean(auc).round(5), 'status': STATUS_OK, 'featureimp': featureimp}
  return {'loss': -round(out['test-AUC-mean'].max(), 5), 'status': STATUS_OK}

trials = Trials()
cbc_best = fmin(f_cbc, cbc_params, algo=tpe.suggest, rstate=np.random.RandomState(2019), max_evals=10, trials=trials)

  0%|          | 0/10 [00:00<?, ?it/s, best loss: ?]




 10%|█         | 1/10 [00:54<08:09, 54.37s/it, best loss: -0.6758]




 20%|██        | 2/10 [03:35<11:30, 86.29s/it, best loss: -0.6758]




 30%|███       | 3/10 [06:43<13:38, 116.87s/it, best loss: -0.67986]




 40%|████      | 4/10 [07:55<10:20, 103.43s/it, best loss: -0.67986]




 50%|█████     | 5/10 [12:53<13:28, 161.73s/it, best loss: -0.67986]




 60%|██████    | 6/10 [15:47<11:01, 165.45s/it, best loss: -0.67986]




In [0]:
pd.DataFrame({'features': X.columns, 'importance': trials.best_trial['result']['featureimp']}).sort_values('importance', ascending=False).head(20)

Unnamed: 0,features,importance
63,hemoglobin_min_tx,7.698757
45,creatinine_min_discharge,7.001471
72,prior_tx,6.792803
66,hemoglobin_min_discharge,4.884014
38,albumin_min_discharge,4.40439
71,prograf_max_discharge,3.345072
59,a1c_tx,3.066204
133,GLUCOSE__skewness,2.426604
54,wbc_min_tx,2.226028
110,MAGNESIUM__energy_ratio_by_chunks__num_segment...,2.207351


#**LGBM model**

In [9]:
!pip uninstall lightgbm -y
!pip install lightgbm --install-option=--gpu

Uninstalling lightgbm-2.2.3:
  Would remove:
    /usr/local/lib/python3.6/dist-packages/lightgbm-2.2.3.dist-info/*
    /usr/local/lib/python3.6/dist-packages/lightgbm/*
Proceed (y/n)? y
  Successfully uninstalled lightgbm-2.2.3
  cmdoptions.check_install_build_global(options)
Collecting lightgbm
[?25l  Downloading https://files.pythonhosted.org/packages/c9/ce/3aff55e25e282383c19c5a5fb7387fd400e64b1a1036671aefa63ceeaaf4/lightgbm-2.2.3.tar.gz (649kB)
[K     |████████████████████████████████| 655kB 2.6MB/s 
Skipping bdist_wheel for lightgbm, due to binaries being disabled for it.
Installing collected packages: lightgbm
  Running setup.py install for lightgbm ... [?25l[?25hdone
Successfully installed lightgbm-2.2.3


In [0]:
import os,  pandas as pd,  numpy as np, gc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from hyperopt import fmin, hp, tpe, STATUS_OK, Trials
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgbm
import warnings
warnings.filterwarnings("ignore")

In [0]:
analysis = pd.read_sas('Adult_readmission//combined_vars_04092019.sas7bdat', encoding='latin1').drop(['TRR_ID', 'TRANSPLANT_DT', 'TRANSPLANT_DISCHARGE_DT', 'READMISSION_DT'], axis=1)
biomarker = pd.read_csv('Adult_readmission/feature_extracted_365days.csv')
X = analysis.merge(biomarker, on='PERSON_ID').drop(['CODE_REHOSP', 'PERSON_ID'], 1)
y = analysis['CODE_REHOSP'].replace(2, 0)
cat_col = [X.columns.get_loc(col) for col in X.columns if 2 <= X[col].nunique() <= 10]
for col in X.columns:
    if X[col].dtype == 'O':
        X[col] = LabelEncoder().fit_transform(X[col].fillna('Unknown'))
    elif 2 < X[col].nunique() <= 10:
        X[col] = LabelEncoder().fit_transform(X[col].fillna(99))

In [0]:
lgbm_param = {
        'num_leaves': hp.choice('num_leaves', np.arange(2, 21)),
        'learning_rate': hp.uniform('learning_rate', 0.005, 0.1),
        'feature_fraction': hp.uniform('feature_fraction', 0.01, 0.5),
        'max_depth': hp.choice('max_depth', np.arange(2, 11)),
        'objective': 'binary',
        # 'boosting_type': 'dart',
        'metric': 'auc',
        'verbose': -1,
        'device_type': 'gpu'
    }

def f_lgbm(params):
    lgbm_pred = np.zeros((len(X), ))
    auc = np.zeros(5)
    for i, (tr_idx, te_idx) in enumerate(StratifiedKFold(5, True, 2019).split(X, y)):
        tr_data = lgbm.Dataset(X.values[tr_idx], y.ravel()[tr_idx], categorical_feature=cat_col)
        te_data = lgbm.Dataset(X.values[te_idx], y.ravel()[te_idx], categorical_feature=cat_col)
        clf = lgbm.train(params,
                         tr_data,
                         num_boost_round=2000,
                         verbose_eval=200,
                         valid_sets=[tr_data, te_data],
                         early_stopping_rounds=False,
                    )
        lgbm_pred[te_idx] = clf.predict(X.values[te_idx], num_iteration=clf.best_iteration)
        auc[i] = roc_auc_score(y.ravel()[te_idx], lgbm_pred[te_idx])
        del clf
        gc.collect()
    return {'loss': -np.mean(auc).round(5), 'status': STATUS_OK}

trials = Trials()
lgbm_best = fmin(f_lgbm, lgbm_param, algo=tpe.suggest, rstate=np.random.RandomState(2019), max_evals=20, trials=trials)