# Diabetes readmission notebook
Compiled by Victor Ruiz

In [None]:
import pandas as pd

# The CSV lives in the course's GitHub repository.
# For more information about the data, see Kaggle: https://www.kaggle.com/brandao/diabetes#description.pdf
DIABETES_CSV_URL = 'https://raw.githubusercontent.com/arcus/education-materials/master/ml-intermediate/datasets/diabetes/diabetic_data.csv'

# Download and parse the file into a DataFrame (needs network access).
diabetes_data = pd.read_csv(filepath_or_buffer=DIABETES_CSV_URL)


In [None]:
# Summarize the raw frame: column names, dtypes and non-null counts.
diabetes_data.info()

In [None]:
# Peek at the first rows of the raw frame.
diabetes_data.head()

What is the outcome?

In [None]:
# Count how often each value of the outcome column `readmitted` occurs.
diabetes_data.readmitted.value_counts()

In [None]:
### convert class to binary
# Positive class: readmitted == '<30'; every other value becomes False.
# (Question for the reader: why binarize the outcome this way?)
#
# The column is assigned directly instead of through `.loc[:, 'readmitted']`:
# on pandas >= 2.0 a `.loc` column assignment writes into the existing object-dtype
# column, leaving Python bools inside an object column, which scikit-learn rejects
# as an 'unknown' target type. Direct assignment yields a real bool column.
#
# The dtype guard keeps the cell safe to re-run: comparing already-converted
# booleans against '<30' would silently turn every label into False.
if diabetes_data['readmitted'].dtype != bool:
  diabetes_data['readmitted'] = diabetes_data['readmitted'] == '<30'
diabetes_data['readmitted'].value_counts()

Things to note from the data above:


*   There ARE missing data
*   Object columns have binary and categorical values

Let's replace '?' with np.nan and identify boolean features

In [None]:
import numpy as np

# Missing entries are stored as the literal string '?'; turn them into real NaNs.
diabetes_data = diabetes_data.replace('?', np.nan)

# A column counts as binary when every entry is 'Yes', 'No' or missing.
yes_no_or_missing = ['Yes', 'No', np.nan]
binary_features = []
for column_name in diabetes_data.columns:
  if diabetes_data[column_name].isin(yes_no_or_missing).all():
    binary_features.append(column_name)

In [None]:
# Display the names of the columns whose values are all 'Yes' / 'No' / missing.
binary_features

In [None]:
# Value distribution of the `tolazamide` column.
diabetes_data.tolazamide.value_counts() # see anything problematic?

In [None]:
# Re-check dtypes and non-null counts now that '?' has been replaced by NaN.
diabetes_data.info()

So we have both numeric and categorical features. What do we do about that? --> Encoding! (dummy, one-hot, ordinal, ...)

sklearn requires numeric features only, so we will use dummy encoding for non-numeric features

In [None]:
# Split the columns by dtype.
# `select_dtypes(include='number')` selects every integer and float column regardless
# of bit width. Comparing dtypes against the strings 'int' / 'float' only matches the
# platform-default widths, so it can miss int64 columns where the default integer is
# 32-bit. Bool columns are not part of 'number', so a boolean outcome column lands on
# the non-numeric side.
numeric = diabetes_data.select_dtypes(include='number').columns.values
non_numeric = diabetes_data.columns[~diabetes_data.columns.isin(numeric)].values

non_numeric

Just in case, let's check how many unique values each of these features has

In [None]:
# Only the last expression of a cell is rendered automatically, so the per-column
# cardinality table is shown explicitly with `display` (available as a builtin in
# IPython/Jupyter kernels). Without it the table is computed and silently discarded.
# `nunique(dropna=False)` counts NaN as a distinct value, like `x.unique().size`.
display(diabetes_data[non_numeric].nunique(dropna=False))

# The 20 most frequent primary-diagnosis codes.
diabetes_data.diag_1.value_counts().head(20)

For simplicity, let's assume that diag_1 is the primary diagnosis and drop the rest

In [None]:
# Keep diag_1 only. `drop(..., errors='ignore')` lets the cell be re-run safely,
# whereas `del diabetes_data[...]` raises KeyError once the columns are gone.
diabetes_data = diabetes_data.drop(columns=['diag_2', 'diag_3'], errors='ignore')

Are numeric features really numeric?


In [None]:
# Inspect the first rows of the columns that were classified as numeric by dtype.
diabetes_data[numeric].head()



*   encounter_id should not be used for prediction, why?
*   ^ditto for patient_nbr
* What about admission_type_id, discharge_disposition_id, admission_source_id?



In [None]:
# These *_id columns are stored as integers but are treated as categories, so they
# are added to the list of columns to dummy-encode.
# Only names that are not already present are appended: re-running a plain
# concatenate would add duplicates that later reach pd.get_dummies.
id_coded_categoricals = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']
missing_ids = [name for name in id_coded_categoricals if name not in non_numeric]
non_numeric = np.concatenate((non_numeric, np.array(missing_ids, dtype=object)))

# Record identifiers should not be used for prediction; errors='ignore' tolerates
# re-running the cell after the columns are already gone.
diabetes_data = diabetes_data.drop(columns=['encounter_id', 'patient_nbr'], errors='ignore')

OK, encoding time!

In [None]:
# Columns that were removed from the frame, plus the outcome, must not be encoded.
excluded_from_encoding = ['diag_2', 'diag_3', 'readmitted', 'encounter_id', 'patient_nbr']
keep_mask = np.isin(non_numeric, excluded_from_encoding, invert=True)
non_numeric = non_numeric[keep_mask]

# Dummy-encode the remaining categorical columns: NaN gets its own indicator column
# (dummy_na) and the first level of every feature is dropped (drop_first).
encoded_data = pd.get_dummies(
  diabetes_data,
  columns=non_numeric,
  prefix_sep='__dummycat__',
  dummy_na=True,
  drop_first=True,
)

In [None]:
# (rows, columns) after dummy encoding.
encoded_data.shape

In [None]:
# (rows, columns) before encoding, for comparison.
diabetes_data.shape

In [None]:
# First rows of the encoded frame.
encoded_data.head()

Clean garbage from encoding

In [None]:
# Number of distinct non-null values in each encoded column.
nunique = encoded_data.nunique()

# Columns with exactly one distinct value are constant.
single_valued = nunique.eq(1)
constant_features = nunique.index[single_valued.to_numpy()].values
constant_features

In [None]:
# Keep every column that is not listed in constant_features.
is_informative = ~encoded_data.columns.isin(constant_features)
encoded_data = encoded_data.loc[:, is_informative].copy()

OK time for machine learning -.-'

# Set up cross-validation

In [None]:
### create 5 stratified folds
from sklearn.model_selection import StratifiedKFold

# Outcome vector and raw value matrix handed to the splitter.
class_labels = encoded_data.readmitted.values
data = encoded_data.values
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# split data between variables and outcome
X = encoded_data.drop(columns='readmitted')
y = encoded_data['readmitted'].copy()

# Each list entry is an (X, y) pair holding independent copies for one fold.
train_sets, test_sets = [], []
for train_index, test_index in skf.split(data, class_labels):
  train_sets.append((X.iloc[train_index].copy(), y.iloc[train_index].copy()))
  test_sets.append((X.iloc[test_index].copy(), y.iloc[test_index].copy()))
  print(train_index.shape, test_index.shape)

What does one fold look like?

In [None]:
# Feature frame (X) of the first fold's training split.
train_sets[0][0]

In [None]:
# Outcome labels (y) of the first fold's training split.
train_sets[0][1]

In [None]:
# (rows, columns) of the first fold's training features.
train_sets[0][0].shape

In [None]:
# (rows, columns) of the first fold's test features.
test_sets[0][0].shape

# Define preprocessing and classification pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Baseline pipeline: median-impute missing values, then fit a single decision tree.
# The classifier step keeps the name 'rf' because later cells look it up as
# models[0]['rf'], even though the estimator is a decision tree, not a random forest.
# random_state pins the tree's internal randomness so that Restart & Run All
# reproduces the same metrics.
pipe = Pipeline([
  ('impute', SimpleImputer(strategy='median')),
  ('rf', DecisionTreeClassifier(random_state=0)),
])

# Fit a decision tree for each fold

In [None]:
from copy import deepcopy  # to clone pipeline
from tqdm import tqdm

# One fitted pipeline and one pair of probability arrays per fold.
models, class_probs_train, class_probs_test = [], [], []
for ii in tqdm(range(5)):
  (X_train, y_train), (X_test, y_test) = train_sets[ii], test_sets[ii]
  # Every fold trains its own independent copy of the unfitted pipeline.
  model = deepcopy(pipe)
  model.fit(X_train, y=y_train)
  models.append(model)
  # Both probability columns are stored; a single column is selected when scoring.
  class_probs_train.append(model.predict_proba(X_train))
  class_probs_test.append(model.predict_proba(X_test))

# Compute AUC, sensitivity, specificity, and PPV

In [None]:
# What do the predicted class probabilities look like? (training split, first fold)
class_probs_train[0]

In [None]:
# Shape of the first fold's training probability array.
class_probs_train[0].shape

In [None]:
# What does a fitted model look like? (pipeline fitted on the first fold)
models[0]

In [None]:
# Pull the classifier step out of the fitted pipeline by its step name 'rf'.
models[0]['rf']

In [None]:
# Dump the fitted classifier's attributes via its instance dictionary.
models[0]['rf'].__dict__

In [None]:
def get_metrics(y_true, y_score, fold, desc, threshold=0.5):
  """Compute sensitivity, specificity, PPV and AUC for one set of predictions.

  Parameters
  ----------
  y_true : array-like
      Binary ground-truth labels (bool or 0/1).
  y_score : array-like
      Scores for the positive class; compared against `threshold`.
  fold : any
      Stored unchanged in the result under 'fold'.
  desc : any
      Stored unchanged in the result under 'desc'.
  threshold : float, default 0.5
      Scores greater than or equal to this value are predicted positive.

  Returns
  -------
  pandas.Series
      Entries 'sensitivity', 'specificity', 'ppv', 'auc', 'fold', 'desc'.
      A ratio whose denominator is zero is reported as NaN.

  Notes
  -----
  Errors raised by `roc_auc_score` are not caught here.
  """
  from sklearn.metrics import roc_auc_score, confusion_matrix
  import numpy as np
  import pandas as pd

  def _safe_ratio(numerator, denominator):
    # A zero denominator (e.g. no predicted positives at a high threshold)
    # yields NaN instead of a 0/0 runtime warning.
    return numerator / denominator if denominator else float('nan')

  y_pred = np.asarray(y_score) >= threshold
  # Passing labels explicitly guarantees a 2x2 matrix, so the four-way unpack
  # cannot fail when only one class appears in y_true and y_pred.
  tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[False, True]).ravel()
  sensitivity = _safe_ratio(tp, tp + fn)
  specificity = _safe_ratio(tn, tn + fp)
  ppv = _safe_ratio(tp, tp + fp)
  auc = roc_auc_score(y_true, y_score)
  vals = [sensitivity, specificity, ppv, auc, fold, desc]
  names = ['sensitivity', 'specificity', 'ppv', 'auc', 'fold', 'desc']
  return pd.Series(vals, names)


In [None]:
# Score every fold on its training and test split.
# Rows are gathered in a list and turned into a DataFrame once: DataFrame.append
# was removed in pandas 2.0, and growing a frame row by row is quadratic.
metric_rows = []
for ii in tqdm(range(5)):
  # Column 1 of the stored predict_proba output is used as the positive-class score.
  probs_train, probs_test = class_probs_train[ii][:, 1], class_probs_test[ii][:, 1]
  y_train, y_test = train_sets[ii][1], test_sets[ii][1]
  metrics_train = get_metrics(y_train, probs_train, ii, 'training')
  metrics_test = get_metrics(y_test, probs_test, ii, 'test')
  metric_rows += [metrics_train, metrics_test]
# infer_objects gives the numeric columns numeric dtypes for later aggregation.
metrics = pd.DataFrame(metric_rows).infer_objects()

In [None]:
# Per-fold metrics: one row per fold and split ('training' / 'test').
metrics

# Get average statistics

In [None]:
# Mean and standard deviation of every metric per split.
# String aggregator names are used instead of np.mean / np.std: passing the NumPy
# callables to `agg` is deprecated in recent pandas, and 'std' explicitly selects
# pandas' sample standard deviation (ddof=1).
metrics.groupby(['desc']).agg(['mean', 'std'])

In [None]:
# CHALLENGE: Would you like to use a specific or different measure? Play around a bit here if you'd like!

# Results' interpretation
* Perfect classification in training
* Poor classification in test
* Model has high specificity but low sensitivity. How can this be used?
* Very little variation in test performance (still poor though)
* Could model complexity (N features) be causing overfitting?

Let's repeat the experiment with fewer features and see what happens

In [None]:
# Disabled alternative: automatic feature selection (SelectKBest with mutual
# information, k=20) inside the pipeline. Left commented out; nothing in this cell runs.
# NOTE(review): it relies on DataFrame.append, which pandas 2.0 removed; confirm the
# installed pandas version before re-enabling.
# metrics = pd.DataFrame()
# models = []
# from sklearn.feature_selection import SelectKBest, mutual_info_classif
# for ii in tqdm(range(5)):
#   X_train, y_train = train_sets[ii]
#   X_test, y_test = test_sets[ii]
#   model = Pipeline([
#                 ('impute', SimpleImputer(strategy='median')),
#                 ('select', SelectKBest(score_func=mutual_info_classif, k=20)),
#                 ('rf', DecisionTreeClassifier())
#   ])
#   model.fit(X_train, y=y_train)
#   models += [model]
#   probs_train, probs_test = model.predict_proba(X_train)[:, 1], model.predict_proba(X_test)[:, 1]
#   metrics_train = get_metrics(y_train, probs_train, ii, 'training')
#   metrics_test = get_metrics(y_test, probs_test, ii, 'test')
#   metrics = metrics.append(metrics_train, ignore_index=True, sort=False)
#   metrics = metrics.append(metrics_test, ignore_index=True, sort=False)

In [None]:
# Hand-picked feature subset, written as a regex over the encoded column names.
# - Dummy-encoded features are named '<feature>__dummycat__<level>', so '^feature.+'
#   matches their indicator columns.
# - Numeric features keep their bare name. They are matched by prefix only: a
#   trailing '.+' demands at least one extra character, so '^time_in_hospital.+'
#   would match nothing and silently drop the feature.
# - The medication flag is spelled 'diabetesMed' in this dataset, which the
#   spelling 'diabetes_med' would never match.
keep_features_regex = '^race.+|^gender.+|^age.+|^weight.+|^admission_type_id.+|^payer_code.+|^time_in_hospital|^num_lab_procedures|^num_procedures|^num_medications|^number_outpatient|^number_emergency|^number_inpatient|insulin|diabetesMed'

# The selected columns are identical for every fold, so compute them once.
keep_features = X.columns[X.columns.str.contains(keep_features_regex)].tolist()

# Rows are gathered in a list and turned into a DataFrame once: DataFrame.append
# was removed in pandas 2.0.
metric_rows = []
models = []
for ii in tqdm(range(5)):
  X_train, y_train = train_sets[ii]
  X_test, y_test = test_sets[ii]
  X_train = X_train[keep_features].copy()
  X_test = X_test[keep_features].copy()
  model = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    # random_state makes the tree reproducible across reruns.
    ('dt', DecisionTreeClassifier(random_state=0)),
  ])
  model.fit(X_train, y=y_train)
  models += [model]
  probs_train, probs_test = model.predict_proba(X_train)[:, 1], model.predict_proba(X_test)[:, 1]
  metrics_train = get_metrics(y_train, probs_train, ii, 'training')
  metrics_test = get_metrics(y_test, probs_test, ii, 'test')
  metric_rows += [metrics_train, metrics_test]
metrics = pd.DataFrame(metric_rows).infer_objects()

In [None]:
# Mean and standard deviation of every metric per split for the feature-subset run.
# String aggregator names replace np.mean / np.std: passing the NumPy callables to
# `agg` is deprecated in recent pandas, and 'std' explicitly selects pandas' sample
# standard deviation (ddof=1).
metrics.groupby(['desc']).agg(['mean', 'std'])

## Model still sucks... Let's try with a more complex model



In [None]:
# Random forest trained on ALL encoded features (no feature subsetting in this cell).
# To restrict it to the hand-picked subset instead, index X_train / X_test with the
# columns matching keep_features_regex before fitting.
from sklearn.ensemble import RandomForestClassifier

# Rows are gathered in a list and turned into a DataFrame once: DataFrame.append
# was removed in pandas 2.0.
metric_rows = []
models = []
for ii in tqdm(range(5)):
  X_train, y_train = train_sets[ii]
  X_test, y_test = test_sets[ii]
  model = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    # random_state makes the forest reproducible across reruns.
    ('rf', RandomForestClassifier(random_state=0)),
  ])
  model.fit(X_train, y=y_train)
  models += [model]
  probs_train, probs_test = model.predict_proba(X_train)[:, 1], model.predict_proba(X_test)[:, 1]
  metrics_train = get_metrics(y_train, probs_train, ii, 'training')
  metrics_test = get_metrics(y_test, probs_test, ii, 'test')
  metric_rows += [metrics_train, metrics_test]
metrics = pd.DataFrame(metric_rows).infer_objects()
# After the loop, y_test / probs_test hold the LAST fold's test labels and scores;
# the threshold sweep further down works on that fold.

In [None]:
# Mean and standard deviation of every metric per split for the random-forest run.
# String aggregator names replace np.mean / np.std: passing the NumPy callables to
# `agg` is deprecated in recent pandas, and 'std' explicitly selects pandas' sample
# standard deviation (ddof=1).
metrics.groupby(['desc']).agg(['mean', 'std'])

# Not stellar, but we're getting somewhere. However, with the default prediction threshold of 0.5 we have near-perfect specificity and very poor sensitivity. Let's explore how these values change with the prediction threshold.

In [None]:
# Sweep the decision threshold on the LAST fold's test split.
# That fold is taken explicitly from `test_sets` / `models` instead of relying on
# the `y_test` / `probs_test` variables left behind by the previous cell's loop,
# so the cell no longer depends on hidden loop state.
last_X_test, last_y_test = test_sets[-1]
last_probs_test = models[-1].predict_proba(last_X_test)[:, 1]

thresholds = np.linspace(0, 1, 100)
# get_metrics has no slot for the threshold in its output, so the value travels in
# the 'desc' field and the column is renamed afterwards. 'fold' is left as None.
metrics = pd.DataFrame([
  get_metrics(last_y_test, last_probs_test, None, cutoff, threshold=cutoff)
  for cutoff in thresholds
])
metrics = metrics.rename(columns={'desc': 'threshold'})


In [None]:
from matplotlib import pyplot as plt

# Larger font for every figure drawn from here on (global rcParams change).
plt.rcParams.update({'font.size': 22})

# Explicit Figure/Axes interface; each line's legend entry defaults to its y column name.
fig, ax = plt.subplots(figsize=(16, 12))
ax.plot('threshold', 'sensitivity', data=metrics, marker='o', markerfacecolor='blue', markersize=2, color='skyblue', linewidth=4)
ax.plot('threshold', 'specificity', data=metrics, marker='', color='olive', linewidth=2)
ax.plot('threshold', 'ppv', data=metrics, marker='', color='black', linewidth=2, linestyle='dashed')

# Label the axes so the figure can be read on its own.
ax.set_xlabel('prediction threshold')
ax.set_ylabel('metric value')
ax.set_title('Test-set metrics vs. prediction threshold')
ax.legend();