# Predicting Survival of Heart Failure Patients with Machine Learning Models

In [1]:
#General Purpose Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#Machine Learning Libraries
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

## Importing the dataset and pre-processing it

In [2]:
def data_preprocessor(file):
    #Reading the file into a dataframe
    df = pd.read_excel(file)
    #Rearranging the columns
    df = df[['sex', 'age', 
         'smoking', 'diabetes', 'high_blood_pressure',
         'anaemia', 'platelets', 'ejection_fraction',
         'creatinine_phosphokinase', 'serum_creatinine','serum_sodium',
         'time', 'DEATH_EVENT']]
    #Renaming the columns to keep them short
    df = df.rename(columns={'smoking':'smk',
                        'diabetes':'dia',
                        'anaemia':'anm',
                        'platelets':'plt',
                        'high_blood_pressure':'hbp',
                        'creatinine_phosphokinase':'cpk',
                        'ejection_fraction':'ejf',
                        'serum_creatinine':'scr',
                        'serum_sodium':'sna',
                        'DEATH_EVENT':'death'})
    return df

In [3]:
df = data_preprocessor('heart.xlsx')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   sex     299 non-null    int64  
 1   age     299 non-null    float64
 2   smk     299 non-null    int64  
 3   dia     299 non-null    int64  
 4   hbp     299 non-null    int64  
 5   anm     299 non-null    int64  
 6   plt     299 non-null    float64
 7   ejf     299 non-null    int64  
 8   cpk     299 non-null    int64  
 9   scr     299 non-null    float64
 10  sna     299 non-null    int64  
 11  time    299 non-null    int64  
 12  death   299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


## Separating Predictors and Targets

In [5]:
cat_feat = df[['sex', 'smk', 'dia', 'hbp', 'anm']]
num_feat = df[['age', 'plt', 'ejf', 'cpk', 'scr', 'sna']]

predictors = pd.concat([cat_feat, num_feat],axis=1)
target = df['death']

## Checking for target class distribution

In [6]:
target.value_counts(normalize=True)

0    0.67893
1    0.32107
Name: death, dtype: float64

The target classes are imbalanced

## Feature Scaling

In [7]:
scaler = StandardScaler()
scaled_feat = pd.DataFrame(scaler.fit_transform(num_feat.values),
                           columns = num_feat.columns)

scaled_predictors = pd.concat([cat_feat, scaled_feat], axis=1)

## Building Machine Learning Models

### Non-penalized Logisitic Regression Model

In [8]:
#Stratified 8 fold cross validation
strat_kfold = StratifiedKFold(n_splits=10, shuffle=True)

#Instantiating the logistic regressor
logreg_clf = LogisticRegression()
x = scaled_predictors.values
y = target.values

#Running the model and tallying results of stratified 8-fold cross validation
result1 = cross_validate(logreg_clf, x, y, cv=strat_kfold, scoring=['accuracy',
                                                                    'balanced_accuracy',
                                                                    'precision',
                                                                    'recall',
                                                                    'roc_auc'])
for key, values in result1.items():
    print(key, values.mean())

fit_time 0.01238079071044922
score_time 0.004261374473571777
test_accuracy 0.7389655172413793
test_balanced_accuracy 0.6590873015873016
test_precision 0.672142857142857
test_recall 0.43555555555555553
test_roc_auc 0.7596878306878307


### Penalized Logisitc Regression Model

In [9]:
#Stratified 8 fold cross validation
strat_kfold = StratifiedKFold(n_splits=10, shuffle=True)

#Instantiating the logistic regressor
logreg_clf_pen = LogisticRegression(class_weight='balanced')
x = scaled_predictors.values
y = target.values

#Running the model and tallying results of stratified 8-fold cross validation
result2 = cross_validate(logreg_clf_pen, x, y, cv=strat_kfold, scoring=['accuracy',
                                                                    'balanced_accuracy',
                                                                    'precision',
                                                                    'recall',
                                                                    'roc_auc'])
for key, values in result2.items():
    print(key, values.mean())

fit_time 0.005613851547241211
score_time 0.003759169578552246
test_accuracy 0.712183908045977
test_balanced_accuracy 0.7151190476190475
test_precision 0.5449309514015397
test_recall 0.7200000000000001
test_roc_auc 0.7719259259259259


In [10]:
pd.concat([pd.DataFrame(result1).mean(),
           pd.DataFrame(result2).mean()],axis=1).rename(columns={0:'Non-Penalized LogReg',1:'Penalized LogReg'})

Unnamed: 0,Non-Penalized LogReg,Penalized LogReg
fit_time,0.012381,0.005614
score_time,0.004261,0.003759
test_accuracy,0.738966,0.712184
test_balanced_accuracy,0.659087,0.715119
test_precision,0.672143,0.544931
test_recall,0.435556,0.72
test_roc_auc,0.759688,0.771926


### Non-penalized Spport Vector Classifier

In [11]:
#Stratified 8 fold cross validation
strat_kfold = StratifiedKFold(n_splits=10, shuffle=True)

#Instantiating the SVC
svc_clf = SVC(kernel='rbf')
x = scaled_predictors.values
y = target.values

#Running the model and tallying results of stratified 8-fold cross validation
result3 = cross_validate(svc_clf, x, y, cv=strat_kfold, scoring=['accuracy',
                                                                    'balanced_accuracy',
                                                                    'precision',
                                                                    'recall',
                                                                    'roc_auc'])
for key, values in result3.items():
    print(key, values.mean())

fit_time 0.003960132598876953
score_time 0.005464863777160644
test_accuracy 0.7062068965517241
test_balanced_accuracy 0.6321428571428571
test_precision 0.5658730158730159
test_recall 0.42666666666666664
test_roc_auc 0.7741190476190477


### Penalized Logisitc Regression Model

In [12]:
#Stratified 8 fold cross validation
strat_kfold = StratifiedKFold(n_splits=10, shuffle=True)

#Instantiating the SVC
svc_clf_pen = SVC(kernel='rbf', class_weight='balanced')
x = scaled_predictors.values
y = target.values

#Running the model and tallying results of stratified 8-fold cross validation
result4 = cross_validate(svc_clf_pen, x, y, cv=strat_kfold, scoring=['accuracy',
                                                                    'balanced_accuracy',
                                                                    'precision',
                                                                    'recall',
                                                                    'roc_auc'])
for key, values in result4.items():
    print(key, values.mean())

fit_time 0.004862165451049805
score_time 0.008171987533569337
test_accuracy 0.7422988505747126
test_balanced_accuracy 0.7437301587301588
test_precision 0.5838141025641026
test_recall 0.7488888888888889
test_roc_auc 0.7988386243386244


In [13]:
pd.concat([pd.DataFrame(result3).mean(),
           pd.DataFrame(result4).mean()],axis=1).rename(columns={0:'Non-Penalized SVC',1:'Penalized SVC'})

Unnamed: 0,Non-Penalized SVC,Penalized SVC
fit_time,0.00396,0.004862
score_time,0.005465,0.008172
test_accuracy,0.706207,0.742299
test_balanced_accuracy,0.632143,0.74373
test_precision,0.565873,0.583814
test_recall,0.426667,0.748889
test_roc_auc,0.774119,0.798839
