In [1]:
import pandas as pd
import numpy as np
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.compose import make_column_transformer, ColumnTransformer

from interpret import show
from interpret.data import ClassHistogram
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.metrics import confusion_matrix
from dash import html
import sys
from interpret.perf import ROC

from sklearn.metrics import matthews_corrcoef,accuracy_score,make_scorer,balanced_accuracy_score,classification_report,roc_auc_score,confusion_matrix,plot_roc_curve,f1_score,precision_score,recall_score
from sklearn.model_selection import cross_val_score

import lightgbm as lgb
from sklearn.linear_model import LogisticRegression

from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks 
from imblearn.under_sampling import ClusterCentroids

In [2]:
dataset= pd.read_csv ('eICU_readmission_280122.csv')
pd.set_option('display.max_columns', None)
dataset.shape

(149009, 297)

In [3]:
print('Before deletion of missing values' + str(dataset.shape))
thresh = len(dataset) * 0.8
dataset.dropna(axis=1, thresh=thresh, inplace=True)
print('After deletion of missing values' + str(dataset.shape))

Before deletion of missing values(149009, 297)
After deletion of missing values(149009, 185)


# create sub datasets

In [4]:
def cohort_2_transform_df(X_train, X_test, imputer, scaler = True):
    values_num = X_train.dtypes != object
    values_cat = X_train.dtypes == object

    si_0 = imputer 
    ss = StandardScaler() 
    ohe = OneHotEncoder(handle_unknown = 'ignore') #for extrimly unbalanced cases
    # define column groups with same processing
    cat_vars = values_cat
    num_vars = values_num
    # set up pipelines for each column group
    categorical_pipe = Pipeline([('ohe', ohe)])
    
    if scaler:
        numeric_pipe = Pipeline([('si_0', si_0), ('ss', ss)])
    else:
        numeric_pipe = Pipeline([('si_0', si_0)])
    # set up columnTransformer
    col_transformer = ColumnTransformer(
                        transformers=[
                            ('nums', numeric_pipe, num_vars),
                            ('cats', categorical_pipe, cat_vars)
                        ],
                        remainder='drop',
                        n_jobs=-1
                        )


    X_train_np = col_transformer.fit_transform(X_train)
    X_test_np = col_transformer.transform(X_test)
    
    # getting names for transform data
    # categorical values
    pipe_cats_actual = col_transformer.named_transformers_['cats']
    names_cats = pipe_cats_actual['ohe'].get_feature_names()
    #print('Number of categorical names: %d ' %  len(names_cats))
    # numerical values
    names = [name for name, value in num_vars.iteritems() if value]
    names_num = names
    #print('Number of numerical names: %d ' %  len(names_num))
    # lasst of all new names
    names_all = list(names_num) + list(names_cats)

    X_train_df = pd.DataFrame(X_train_np,columns= names_all)
    X_test_df = pd.DataFrame(X_test_np,columns= names_all)
    return X_train_np, X_train_df, X_test_np,X_test_df

In [5]:
imputers=[SimpleImputer(missing_values=np.NaN,strategy='median'), KNNImputer(n_neighbors=3, weights="uniform")] #IterativeImputer(max_iter=10, random_state=0, verbose=2) # #

In [6]:
def return_scores(model,  X_train, X_test, y_train, y_test, model_name):
    Matthew = make_scorer(matthews_corrcoef)
    if model_name != 'ebm':    
        cv_roc_auc_mean = round(np.mean(cross_val_score(model, X_train, y_train , cv=10, scoring='roc_auc')),2)
        cv_mcc_mean = round(np.mean(cross_val_score(model, X_train, y_train ,cv=10,scoring=Matthew)),2)    
        cv_accuracy_mean = round(np.mean(cross_val_score(model, X_train, y_train , cv=10, scoring='accuracy')),2)
        cv_f1_mean = round(np.mean(cross_val_score(model, X_train, y_train , cv=10, scoring='f1')),2)
        cv_precision_mean = round(np.mean(cross_val_score(model, X_train, y_train , cv=10, scoring='precision')),2)
        #display(cv_precision_mean)
        cv_recall_mean = round(np.mean(cross_val_score(model, X_train, y_train , cv=10, scoring='recall')),2)
    #for blind test
    roc_auc = round(roc_auc_score(y_test, model.predict_proba(X_test)[:,1]),2)
    y_pred_best = model.predict(X_test)
    mcc = round(matthews_corrcoef(y_test, y_pred_best),2)
    accuracy = round(accuracy_score(y_test, y_pred_best),2)
    f1 = round(f1_score(y_test, y_pred_best, average='binary'),2)
    precision = round(precision_score(y_test, y_pred_best, average='binary'),2)
    recall = round(recall_score(y_test, y_pred_best, average='binary'),2)
    if model_name == 'ebm': 
        return 0,0,0,0,0,0, roc_auc,mcc,accuracy,f1,precision,recall
    else:
        return cv_roc_auc_mean,cv_mcc_mean,cv_accuracy_mean,cv_f1_mean,cv_precision_mean,cv_recall_mean, roc_auc,mcc,accuracy,f1,precision,recall

In [11]:
def transform_model(dataset, model_name):
    X = dataset.copy()
    y = X['label_30days']
    X.drop(columns=['label_30days','patientunitstayid','label_72', 'label_48', 'label_7days'], inplace=True) 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
    X_train_np, X_train_df, X_test_np, X_test_df = cohort_2_transform_df(X_train, X_test, imputers[0], scaler = False)
    if model_name=='ebm':
        rus = RandomUnderSampler(random_state=0)
        X_resampled, y_resampled = rus.fit_resample(X_train_df, y_train)
        #----
        #tl = TomekLinks(sampling_strategy ='majority')
        #X_resampled, y_resampled = tl.fit_resample(X_train_df, y_train)
        #---
        #cc = ClusterCentroids(sampling_strategy ='majority')
        #X_resampled, y_resampled = cc.fit_resample(X_train_df, y_train)
        print(X_resampled.shape)
            ebm = ExplainableBoostingClassifier(random_state=42)
        ebm.fit(X_resampled, y_resampled)
        ebm_global = ebm.explain_global(name='EBM')
        show(ebm_global)
        ebm_perf = ROC(ebm.predict_proba).explain_perf(X_test_df, y_test, name='EBM')
        show(ebm_perf)
        y_pred=ebm.predict(X_test_df)
        cm=confusion_matrix(y_test, y_pred) 
        cv_roc_auc_mean,cv_mcc_mean,cv_accuracy_mean,cv_f1_mean,cv_precision_mean,cv_recall_mean, roc_auc,mcc,accuracy,f1,precision,recall=return_scores(ebm,X_resampled, X_test_df, y_resampled, y_test,'ebm')
    elif  model_name=='lgbm':
        clf = lgb.LGBMClassifier(class_weight= 'balanced')
        clf.fit(X_train_df, y_train)
        y_pred=clf.predict(X_test_df)
        cm=confusion_matrix(y_test, y_pred)
        cv_roc_auc_mean,cv_mcc_mean,cv_accuracy_mean,cv_f1_mean,cv_precision_mean,cv_recall_mean, roc_auc,mcc,accuracy,f1,precision,recall=return_scores(clf,X_train_df, X_test_df, y_train, y_test,'lgbm')
        feature_imp = pd.DataFrame({'Value':list(clf.feature_importances_),'Feature':list(X_train_df.columns)})
        #print(cm)
        #print(cv_roc_auc_mean,cv_mcc_mean,cv_accuracy_mean,cv_f1_mean,cv_precision_mean,cv_recall_mean, roc_auc,mcc,accuracy,f1,precision,recall)
    elif model_name=='lr':
        clf = LogisticRegression(random_state=0, max_iter=1000, class_weight='balanced')
        clf.fit(X_train_df, y_train)
        y_pred=clf.predict(X_test_df)
        cm=confusion_matrix(y_test, y_pred)
        cv_roc_auc_mean,cv_mcc_mean,cv_accuracy_mean,cv_f1_mean,cv_precision_mean,cv_recall_mean, roc_auc,mcc,accuracy,f1,precision,recall=return_scores(clf,X_train_df, X_test_df, y_train, y_test, 'lr')
        # get importance
        importance = clf.coef_[0]
        # summarize feature importance
        for i,v in enumerate(importance):
            print('Feature: %0d, Score: %.5f' % (i,v))
    print(cm)
    d = {'metric': ['cv_roc_auc_mean','cv_mcc_mean','cv_accuracy_mean','cv_f1_mean','cv_precision_mean','cv_recall_mean', 'roc_auc','mcc','accuracy','f1','precision','recall'], 'value': [cv_roc_auc_mean,cv_mcc_mean,cv_accuracy_mean,cv_f1_mean,cv_precision_mean,cv_recall_mean, roc_auc,mcc,accuracy,f1,precision,recall]}
    result=pd.DataFrame(data=d)
    print(result)
    return feature_imp


In [16]:
transform_model(dataset, 'ebm')

EBM lib loading.
Loading native on win32 | debug = False
Passing a numpy array to schema autogen when it should be dataframe.


(8132, 219)


Passing a numpy array to schema autogen when it should be dataframe.
Detected non-cloud environment.
The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html
The dash_table package is deprecated. Please replace
`import dash_table` with `from dash import dash_table`

Also, if you're using any of the table format helpers (e.g. Group), replace 
`from dash_table.Format import Group` with 
`from dash.dash_table.Format import Group`
  import dash_table as dt
Generating mini dash
Generated mini dash


Generating mini dash
Generated mini dash


[[29311 17907]
 [  748  1207]]
               metric  value
0     cv_roc_auc_mean   0.00
1         cv_mcc_mean   0.00
2    cv_accuracy_mean   0.00
3          cv_f1_mean   0.00
4   cv_precision_mean   0.00
5      cv_recall_mean   0.00
6             roc_auc   0.67
7                 mcc   0.10
8            accuracy   0.62
9                  f1   0.11
10          precision   0.06
11             recall   0.62


UnboundLocalError: local variable 'feature_imp' referenced before assignment

# Prediction

In [14]:
feature_imp=transform_model(dataset, 'lgbm')



[[35346 11872]
 [  991   964]]
               metric  value
0     cv_roc_auc_mean   0.70
1         cv_mcc_mean   0.12
2    cv_accuracy_mean   0.75
3          cv_f1_mean   0.14
4   cv_precision_mean   0.08
5      cv_recall_mean   0.49
6             roc_auc   0.68
7                 mcc   0.11
8            accuracy   0.74
9                  f1   0.13
10          precision   0.08
11             recall   0.49


In [17]:
feature_imp.sort_values(["Value"], ascending=False)


Unnamed: 0,Value,Feature
4,99,hospitalid
76,66,respiratoryrate_avg_last_24
0,61,age
3,60,hospitaladmitoffset
147,55,predictedhospitallos_APACHE_IVa
...,...,...
112,0,NIV_last_24
133,0,midur_apache_pred_var
110,0,dialysis_last_24
134,0,ventday1_apache_pred_var


In [18]:
transform_model(dataset, 'lr')


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase 

Feature: 0, Score: -0.01522
Feature: 1, Score: -0.00270
Feature: 2, Score: 0.00478
Feature: 3, Score: -0.00001
Feature: 4, Score: 0.00095
Feature: 5, Score: -0.00171
Feature: 6, Score: -0.00676
Feature: 7, Score: 0.00643
Feature: 8, Score: -0.01335
Feature: 9, Score: -0.00016
Feature: 10, Score: -0.00070
Feature: 11, Score: -0.03684
Feature: 12, Score: -0.01415
Feature: 13, Score: 0.00027
Feature: 14, Score: 0.00015
Feature: 15, Score: -0.00024
Feature: 16, Score: 0.00299
Feature: 17, Score: -0.01833
Feature: 18, Score: -0.00193
Feature: 19, Score: 0.00317
Feature: 20, Score: -0.00117
Feature: 21, Score: 0.00501
Feature: 22, Score: -0.00356
Feature: 23, Score: 0.00772
Feature: 24, Score: 0.01024
Feature: 25, Score: 0.02166
Feature: 26, Score: -0.01370
Feature: 27, Score: -0.00404
Feature: 28, Score: -0.01426
Feature: 29, Score: -0.00656
Feature: 30, Score: -0.00827
Feature: 31, Score: 0.00009
Feature: 32, Score: 0.00039
Feature: 33, Score: 0.02080
Feature: 34, Score: -0.00231
Feature: 

UnboundLocalError: local variable 'feature_imp' referenced before assignment

In [None]:
#ebm_local = ebm.explain_local(X_test_df, y_test)
#show(ebm_local)

In [None]:
for name in list(dataset.unittype.unique()):
    new_dataset=dataset[(dataset['unittype']==name)]
    print(name, new_dataset.shape)
    transform_model(new_dataset, 'lgbm')

In [None]:
for name in list(dataset.apachedxgroup.unique()):
    new_dataset=dataset[(dataset['apachedxgroup']==name)]
    print(name, new_dataset.shape)
    transform_model(new_dataset, 'lgbm')

In [None]:
for name in list(dataset.numbedscategory.unique()):
    new_dataset=dataset[(dataset['numbedscategory']==name)]
    print(name, new_dataset.shape)
    transform_model(new_dataset, 'lgbm')

In [19]:
for name in list(dataset.hospitalid.unique()):
    new_dataset=dataset[(dataset['hospitalid']==name)]
    print(name, new_dataset.shape)
    transform_model(new_dataset, 'lgbm')

256 (713, 185)



Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 du

[[227   0]
 [  9   0]]
               metric  value
0     cv_roc_auc_mean   0.59
1         cv_mcc_mean  -0.01
2    cv_accuracy_mean   0.96
3          cv_f1_mean   0.00
4   cv_precision_mean   0.00
5      cv_recall_mean   0.00
6             roc_auc   0.62
7                 mcc   0.00
8            accuracy   0.96
9                  f1   0.00
10          precision   0.00
11             recall   0.00
258 (319, 185)



Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "C:\Users\fedyu\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\fedyu\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\fedyu\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 381, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\fedyu\anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 567, in roc_auc_score
    return _average_binary_score(
  File "C:\Users\fedyu\anaconda3\lib\site-packages\sklearn\metrics\_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\fedyu\anaconda3\lib\

[[103   0]
 [  3   0]]
               metric  value
0     cv_roc_auc_mean    NaN
1         cv_mcc_mean   0.00
2    cv_accuracy_mean   0.97
3          cv_f1_mean   0.00
4   cv_precision_mean   0.00
5      cv_recall_mean   0.00
6             roc_auc   0.42
7                 mcc   0.00
8            accuracy   0.97
9                  f1   0.00
10          precision   0.00
11             recall   0.00
259 (527, 185)



Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



[[167   1]
 [  6   0]]
               metric  value
0     cv_roc_auc_mean   0.62
1         cv_mcc_mean  -0.01
2    cv_accuracy_mean   0.95
3          cv_f1_mean   0.00
4   cv_precision_mean   0.00
5      cv_recall_mean   0.00
6             roc_auc   0.84
7                 mcc  -0.01
8            accuracy   0.96
9                  f1   0.00
10          precision   0.00
11             recall   0.00
262 (173, 185)



Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


The least populated class in y has only 7 members, which is less than n_splits=10.


Scoring failed. The score on this train-test partition for these parameters will be set to nan. Details: 
Traceback (most recent call last):
  File "C:\Users\fedyu\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\fedyu\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\fedyu\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 381, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "C:\Users\fedyu\anaconda3\lib\site-packages\sklearn\metrics\_ranking.py", line 567, in roc_auc_score
    return _aver

[[55  0]
 [ 2  1]]
               metric  value
0     cv_roc_auc_mean    NaN
1         cv_mcc_mean  -0.02
2    cv_accuracy_mean   0.92
3          cv_f1_mean   0.00
4   cv_precision_mean   0.00
5      cv_recall_mean   0.00
6             roc_auc   0.67
7                 mcc   0.57
8            accuracy   0.97
9                  f1   0.50
10          precision   1.00
11             recall   0.33
263 (59, 185)



Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



ValueError: Shape of passed values is (39, 188), indices imply (39, 197)