# Exercise - Decision Tree

This data set for this exercise is from healthcare. It contains data about 10 years of clinical care at 130 US Hospitals. Each row represents a single patient. The columns include the characteristics of deidentified diabetes patients. This is a binary classification task: predict whether a diabetes patient is readmitted to the hospital within 30 days of their discharge (1=Yes, 0=No). This is an important performance metric for hospitals as they try to minimize these types of readmissions.

## Description of Variables

The description of variables are provided in "Healthcare (small) - Data Dictionary.docx"

## Goal

Use the **healthcare_small.csv** data set and build a model to predict **readmitted**. 

# Read and Prepare the Data

In [81]:
# Common imports

import pandas as pd
import numpy as np

np.random.seed(42)

# Get the data

In [82]:
#We will predict the "price" value in the data set:

diabetes = pd.read_csv("healthcare_small.csv")
print(diabetes.shape)
diabetes.head()

(8666, 15)


Unnamed: 0,race,gender,age,admission_type,discharge_disposition,admission_source,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,readmitted
0,Other,Female,70-80,2,3,1,14,32,3,15,0,0,0,9,1
1,Caucasian,Female,80-90,1,3,5,4,44,0,15,0,0,0,9,0
2,AfricanAmerican,Male,50-60,5,1,1,6,29,1,15,0,0,0,9,1
3,Caucasian,Female,50-60,1,1,6,3,47,0,10,0,0,0,4,0
4,AfricanAmerican,Female,40-50,3,1,1,4,92,0,15,0,0,0,7,0


In [83]:
diabetes.isna().sum()

race                     174
gender                     0
age                        0
admission_type             0
discharge_disposition      0
admission_source           0
time_in_hospital           0
num_lab_procedures         0
num_procedures             0
num_medications            0
number_outpatient          0
number_emergency           0
number_inpatient           0
number_diagnoses           0
readmitted                 0
dtype: int64

In [84]:
diabetes.describe()

Unnamed: 0,admission_type,discharge_disposition,admission_source,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,readmitted
count,8666.0,8666.0,8666.0,8666.0,8666.0,8666.0,8666.0,8666.0,8666.0,8666.0,8666.0,8666.0
mean,1.992499,4.056658,5.77706,4.546273,43.661205,1.32841,16.352412,0.39303,0.277752,0.860028,7.549965,0.467113
std,1.424965,5.682168,4.0255,3.033187,19.518495,1.689974,8.135095,1.293926,1.328651,1.621737,1.855758,0.498946
min,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
25%,1.0,1.0,1.0,2.0,33.0,0.0,11.0,0.0,0.0,0.0,6.0,0.0
50%,1.0,1.0,7.0,4.0,45.0,1.0,15.0,0.0,0.0,0.0,9.0,0.0
75%,3.0,5.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0,1.0
max,8.0,28.0,20.0,14.0,109.0,6.0,72.0,40.0,63.0,19.0,16.0,1.0


# Split data (train/test)

In [85]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(diabetes, test_size=0.3)

# Data Prep

In [86]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.dummy import DummyClassifier

## Separate the target variable 

In [87]:
train_target = train['readmitted']
test_target = test['readmitted']

train_inputs = train.drop(['readmitted'], axis=1)
test_inputs = test.drop(['readmitted'], axis=1)

## Feature Engineering: Derive a new column

Examples:
- Whether the patient had any emergency visits or not
- Ratio of inpatient visits to outpatient visits
- Ratio of time in hospital to number of medications
- (etc.)

In [88]:
print(train_inputs.columns)

Index(['race', 'gender', 'age', 'admission_type', 'discharge_disposition',
       'admission_source', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'number_diagnoses'],
      dtype='object')


In [89]:
# # Whether the patient had any emergency visits or not
# train_inputs['had_emergency_visits'] = train_inputs['number_emergency'].apply(lambda x: 1 if x > 0 else 0)
# test_inputs['had_emergency_visits'] = test_inputs['number_emergency'].apply(lambda x: 1 if x > 0 else 0)

# Define the transformation function
def emergency_visits(x):
  return (x > 0).astype(int)

# Create the transformer
transformer = FunctionTransformer(emergency_visits)

# Apply the transformer
train_inputs['had_emergency_visits'] = transformer.fit_transform(train_inputs[['number_emergency']])
test_inputs['had_emergency_visits'] = transformer.transform(test_inputs[['number_emergency']])

In [90]:
# Ratio of inpatient visits to outpatient visits
train_inputs['inpatient_to_outpatient_ratio'] = train_inputs.apply(
    lambda row: row['number_inpatient'] / row['number_outpatient'] if row['number_outpatient'] > 0 else 0, axis=1
)
test_inputs['inpatient_to_outpatient_ratio'] = test_inputs.apply(
    lambda row: row['number_inpatient'] / row['number_outpatient'] if row['number_outpatient'] > 0 else 0, axis=1
)

In [91]:
train_inputs = train_inputs.drop(['number_emergency'], axis=1)
test_inputs = test_inputs.drop(['number_emergency'], axis=1)

In [92]:
train_inputs.head()

Unnamed: 0,race,gender,age,admission_type,discharge_disposition,admission_source,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_inpatient,number_diagnoses,had_emergency_visits,inpatient_to_outpatient_ratio
4166,Caucasian,Male,80-90,1,3,7,4,39,1,19,5,0,9,0,0.0
5546,Caucasian,Female,80-90,1,1,7,8,73,0,16,0,0,9,0,0.0
2957,Caucasian,Male,60-70,1,1,7,2,41,2,19,0,2,8,0,0.0
6329,Caucasian,Male,40-50,1,1,7,4,54,1,20,0,0,5,0,0.0
565,AfricanAmerican,Female,70-80,3,1,1,2,15,2,8,0,1,9,0,0.0


##  Identify the numeric, binary, and categorical columns

In [93]:
# Let's change some of the categorical variables' data type to object

train_inputs['admission_source'] = train_inputs['admission_source'].astype('object')
train_inputs['admission_type'] = train_inputs['admission_type'].astype('object')
train_inputs['discharge_disposition'] = train_inputs['discharge_disposition'].astype('object')

test_inputs['admission_source'] = test_inputs['admission_source'].astype('object')
test_inputs['admission_type'] = test_inputs['admission_type'].astype('object')
test_inputs['discharge_disposition'] = test_inputs['discharge_disposition'].astype('object')


In [94]:
# Identify the numerical columns
numeric_columns = train_inputs.select_dtypes(include=[np.number]).columns.to_list()

# Identify the categorical columns
categorical_columns = train_inputs.select_dtypes('object').columns.to_list()

In [95]:
numeric_columns

['time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_inpatient',
 'number_diagnoses',
 'had_emergency_visits',
 'inpatient_to_outpatient_ratio']

In [96]:
numeric_correct = [
 'time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses',
 'inpatient_to_outpatient_ratio'
 ]

In [97]:
categorical_columns

['race',
 'gender',
 'age',
 'admission_type',
 'discharge_disposition',
 'admission_source']

In [98]:
# Binary columns
binary_columns = ["had_emergency_visits"]

# Pipeline

In [99]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

In [100]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [101]:
# Create a pipeline for the binary column here
binary_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy= 'most_frequent'))
])

In [102]:
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns),   
        ],   
        remainder='passthrough')

#passtrough is an optional step. You don't have to use it.

# Transform: fit_transform() for TRAIN

In [103]:
#Fit and transform the train data
train_x = preprocessor.fit_transform(train_inputs)

train_x

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 91789 stored elements and shape (6066, 66)>

In [104]:
train_x.shape

(6066, 66)

# Tranform: transform() for TEST

In [105]:
# Transform the test data
test_x = preprocessor.transform(test_inputs)

test_x

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 39358 stored elements and shape (2600, 66)>

In [106]:
test_x_df = pd.DataFrame.sparse.from_spmatrix(test_x) 
test_x_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
0,0.466271,0.631539,-0.787916,-0.294988,-0.293483,-0.529201,0.779131,-0.389486,-0.218781,0,...,0,0,0,0,0,1.0,0,0,0,0
1,0.138493,1.09493,-0.787916,-0.789808,-0.293483,-0.529201,0.779131,-0.389486,-0.218781,0,...,0,0,0,0,0,1.0,0,0,0,0
2,-0.517064,1.35237,-0.787916,0.199833,-0.293483,-0.529201,0.779131,-0.389486,-0.218781,0,...,0,0,0,0,0,1.0,0,0,0,0
3,0.138493,0.013683,-0.787916,0.447243,-0.293483,0.095217,0.779131,-0.389486,-0.218781,0,...,0,0,0,0,0,1.0,0,0,0,0
4,-0.517064,0.477075,0.977474,-1.160924,-0.293483,-0.529201,-0.306231,-0.389486,-0.218781,0,...,0,0,0,0,0,1.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,1.777385,0.991954,1.565938,0.076128,-0.293483,-0.529201,0.779131,-0.389486,-0.218781,0,...,0,0,0,0,0,0,0,0,0,0
2596,3.088499,1.558322,0.389011,1.313179,-0.293483,0.095217,0.779131,-0.389486,-0.218781,1.0,...,0,0,0,0,0,1.0,0,0,0,0
2597,-0.189286,0.528563,-0.787916,-0.171283,-0.293483,-0.529201,0.779131,-0.389486,-0.218781,1.0,...,1.0,0,0,0,0,0,0,0,0,0
2598,0.79405,0.786002,-0.199453,-0.542398,-0.293483,0.719634,0.779131,2.567487,-0.218781,0,...,0,0,0,0,0,1.0,0,0,0,1.0


In [107]:
test_x.shape

(2600, 66)

# Calculate the Baseline

In [108]:
# Create and fit the dummy classifier
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(train_x, train_target)

In [109]:
# Predict and evaluate on evaluate on test data
baseline_predictions = dummy_clf.predict(test_x) 
from sklearn.metrics import accuracy_score

In [110]:
# Calculate baseline accuracy 
baseline_accuracy = accuracy_score(test_target, baseline_predictions)
print("Baseline test accuracy: ", baseline_accuracy)

Baseline test accuracy:  0.5365384615384615


# Train a DT model

In [111]:
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Initialize the Decision Tree classifier
dt_model = DecisionTreeClassifier(random_state=42)

In [112]:
# Train the model on the training data
dt_model.fit(train_x, train_target)

# Predict on the test set
dt_predictions = dt_model.predict(test_x)

### Calculate the accuracy

In [113]:
# Calculate the accuracy
dt_accuracy = accuracy_score(test_target, dt_predictions)
print("Decisin tree accuracy: ", dt_accuracy)

Decisin tree accuracy:  0.5473076923076923


# Train another DT model (with different parameters)

In [114]:
# Train a DT model with declared maximum depth
max_depth_dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)

# Train the model on the training data
max_depth_dt_model.fit(train_x, train_target)

# Predict on the test set
max_depth_dt_predictions = max_depth_dt_model.predict(test_x)

In [115]:
# Calculate the accuracy for the max_depth model
from sklearn.metrics import accuracy_score
max_depth_dt_accuracy = accuracy_score(test_target, max_depth_dt_predictions)
print("Max depth decision tree accuracy: ", max_depth_dt_accuracy)

Max depth decision tree accuracy:  0.6026923076923076


In [116]:
# Combination model
combo_dt_model_tuned = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    criterion='entropy',
    random_state=42
)

### Calculate the accuracy

In [117]:
# Train and evaluate the new model
combo_dt_model_tuned.fit(train_x, train_target)
dt_tuned_predictions = combo_dt_model_tuned.predict(test_x)
dt_tuned_accuracy = accuracy_score(test_target, dt_tuned_predictions)
print("Max depth decision tree accuracy: ", dt_tuned_accuracy)

Max depth decision tree accuracy:  0.5961538461538461


# Optional: try grid search

In [118]:
from sklearn.model_selection import GridSearchCV

In [119]:
# Define the parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 2, 6, 8, 10],
    'min_samples_split': [2, 6, 8, 10],
    'min_samples_leaf': [2, 6, 8, 10],
}

In [120]:
from sklearn.tree import DecisionTreeClassifier

# Initialize GridSearchCV with DecisionTreeClassifier and cross-validation
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,                       
    scoring='accuracy',        
    n_jobs=3              
)

# Fit the grid search to the training data
grid_search.fit(train_x, train_target)

# Retrieve the best parameters and the best estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters from grid search:", best_params)

Best parameters from grid search: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'splitter': 'random'}
