# Heart Failure Prediction
12 clinical features for predicting death events.

Source: https://www.kaggle.com/datasets/andrewmvd/heart-failure-clinical-data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from IPython.core.interactiveshell import InteractiveShell # Needed to change how cell results are echoed
InteractiveShell.ast_node_interactivity = "all" # Echo every top-level expression in a cell, not only the last one
# Remove pandas' column/row display limits so frames are shown in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# NOTE(review): this silences every warning for the whole session, including
# deprecation and convergence warnings raised by the modelling libraries.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# ▶  Data Import

In [None]:
# Load the heart-failure records; the path is resolved relative to the working directory.
csv_path = 'heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Per-column summary statistics.
df.describe()

In [None]:
# Data type of each column.
df.dtypes

Our target variable is **DEATH_EVENT**.  
We will try to predict **DEATH_EVENT** using the other features.

# ▶  Data Exploration

In [None]:
# Deaths per level of several candidate risk factors (not only age).
# A raw death count is hard to interpret when group sizes differ, so the group
# size and the death rate are reported next to it.
# `display` is called explicitly because expressions inside a loop are not
# echoed automatically by the notebook.
for factor in ['age', 'diabetes', 'high_blood_pressure', 'smoking']:
    display(
        df.groupby(factor)['DEATH_EVENT']
          .agg(deaths='sum', patients='count', death_rate='mean')
          .reset_index()
          .sort_values(by='deaths', ascending=False)
    )

In [None]:
# Visualize distribution of DEATH_EVENT: one bar per class, height = number of rows.
# The trailing semicolon suppresses the textual repr of the returned grid object.
sns.catplot(x='DEATH_EVENT', kind='count', data=df);

In [None]:
# Share of each DEATH_EVENT class, expressed as a percentage of all rows.
class_share = df['DEATH_EVENT'].value_counts(normalize=True)
class_share * 100

Our target class is imbalanced, but only mildly: the positive class (DEATH_EVENT = 1) makes up 32.10% of the records. We will leave the class distribution as-is.

# ▶  Data Cleaning

### Detection and Treatment of Nulls

In [None]:
# Number of missing values in each column.
df.isna().sum()

### Detection and Treatment of Duplicated Values

In [None]:
# Count rows flagged as duplicates of an earlier row (True) versus the rest (False).
df.duplicated().value_counts()

### Generate Dataframe Profile and export to HTML

In [None]:
from ydata_profiling import ProfileReport
import numba

# Metadata embedded in the generated profiling report.
report_metadata = {
    "description": "This profiling report was generated for the Heart Failure Prediction repository.",
    "author": "Bader Ale",
    "copyright_year": 2024,
    "url": "https://github.com/baderale/WGU_MSDA",
}

# Build the profile of the full frame.
profile_Report = ProfileReport(df, title="Heart Failure Prediction", dataset=report_metadata)

# Show the report as notebook widgets, then export a standalone HTML copy.
profile_Report.to_widgets()
profile_Report.to_file('Profile Report.html')

# NOTE(review): numba is imported only to print its version here — presumably
# kept for environment troubleshooting; confirm whether it is still needed.
print(numba.__version__)


# ▶  Exploratory Data Analysis

In [None]:
# Column dtypes, shown again as a reference for the column descriptions that follow.
df.dtypes

### The columns are described as follows:
1) age = age of patient
2) anaemia = Decrease of red blood cells or hemoglobin (boolean)
3) creatinine_phosphokinase = Level of the CPK enzyme in the blood (mcg/L)
4) diabetes = If the patient has diabetes (boolean)
5) ejection_fraction = Percentage of blood leaving the heart at each contraction (percentage)
6) high_blood_pressure = If the patient has hypertension (boolean)
7) platelets = Platelets in the blood (kiloplatelets/mL)
8) serum_creatinine = Level of serum creatinine in the blood (mg/dL)
9) serum_sodium = Level of serum sodium in the blood (mEq/L)
10) sex =   Woman or man (binary)
11) smoking = If the patient smokes or not (boolean)
12) time = Follow-up period (days)
13) DEATH_EVENT = If the patient deceased during the follow-up period (boolean)

### Univariate Analysis


In [None]:
# Histogram of every numeric column, laid out on a grid of subplots.

# Select numeric columns
num_cols = df.select_dtypes(include='number').columns

# Grid geometry: a fixed number of columns and as many rows as the data needs.
# (A hard cap on the row count would silently drop columns once the grid is full.)
cols = 4
rows = max(1, -(-len(num_cols) // cols))  # ceiling division; at least one row

# squeeze=False keeps `axes` two-dimensional even when there is a single row,
# so it can always be flattened; the height scales with the number of rows.
fig, axes = plt.subplots(rows, cols, figsize=(15, 2.5 * rows), squeeze=False)
axes = axes.flatten()

# Plot one histogram per numeric column
for ax, col in zip(axes, num_cols):
    ax.hist(df[col], bins=20)
    ax.set_title(col)

# Hide the unused panels at the end of the grid instead of showing empty axes
for ax in axes[len(num_cols):]:
    ax.set_visible(False)

# Adjust the spacing between subplots
fig.tight_layout()

# Show the plot
plt.show();


In [None]:
# Pairwise plots of all columns, coloured by the target
sns.pairplot(df, hue='DEATH_EVENT')
plt.show()

# Box plots for each variable against 'DEATH_EVENT'
for column in df.columns:
    if column != 'DEATH_EVENT':
        sns.boxplot(x='DEATH_EVENT', y=column, data=df)
        # Title each figure so it can be read on its own when skimming the output
        plt.title(f'{column} by DEATH_EVENT')
        plt.show()

# Heatmap of the correlation matrix
# Restrict to numeric columns so corr() does not fail if a non-numeric column is added.
corr = df.select_dtypes(include='number').corr()
# A larger canvas keeps the per-cell annotations from overlapping.
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation matrix')
plt.show();

## § Model - XGBoost Classifier

In [None]:
# Select the target by name rather than by position so a change in column order
# cannot silently swap the label, and keep `y` one-dimensional (a Series).
# NOTE(review): `time` is the follow-up period in days — confirm it is
# legitimately available at prediction time before keeping it as a feature.
X = df.drop(columns='DEATH_EVENT')
y = df['DEATH_EVENT']

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Preview the target.
y.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation. The split is stratified on the target
# so the train and test sets keep the same class proportions; with a small,
# imbalanced dataset an unstratified split can skew the minority-class share.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=123, stratify=y
)

In [None]:
# (rows, columns) of the training and test feature matrices after the split.
X_train.shape
X_test.shape

In [None]:
from xgboost import XGBClassifier

# Hyper-parameters of the boosted-tree classifier, gathered in one place.
xgb_params = dict(
    n_estimators=15,
    max_depth=5,
    learning_rate=1,
    objective='binary:logistic',
)

# Build the classifier, train it on the training split, then predict the held-out rows.
xgb_clf = XGBClassifier(**xgb_params)
xgb_clf.fit(X_train, y_train)
preds = xgb_clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy of the XGBoost predictions on the test split
accuracy = accuracy_score(y_test, preds)
print("Accuracy:", accuracy)

# Accuracy alone can hide poor minority-class performance on an imbalanced
# target, so also report per-class precision / recall / F1.
# (No target_names are passed: the labels are the two classes of DEATH_EVENT,
# not the DataFrame's column names.)
print("\nClassification Report:")
print(classification_report(y_test, preds))

## § Model - Support Vector Classifier

In [None]:
from sklearn.svm import SVC 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix 

In [None]:
# Split the frame into the label column and the predictor columns.
df_target = df['DEATH_EVENT']
df_features = df.drop(columns=['DEATH_EVENT'])

In [None]:
# Scaling features before model
df_features = StandardScaler().fit_transform(df_features)

In [None]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(df_features, df_target, test_size=0.3,random_state=109) # 70% training and 30% test

In [None]:
# Baseline support vector classifier with scikit-learn's default
# hyper-parameters, trained on the training split.
model = SVC().fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = model.predict(X_test)

In [None]:
# print prediction results
# Reuse the predictions already computed for the test split instead of running
# the model a second time; `predictions` is kept as a name for later cells.
predictions = y_pred
print(classification_report(y_test, predictions))

Here's an interpretation of the various metrics in the classification report:

Precision:

Precision for class 0: 0.75
Precision for class 1: 0.76
Precision measures the accuracy of positive predictions made by the model. For class 0, it means that 75% of the instances predicted as class 0 were actually class 0, and for class 1, it means that 76% of the instances predicted as class 1 were actually class 1.

Recall:

Recall for class 0: 0.89
Recall for class 1: 0.54
Recall, also known as sensitivity or true positive rate, measures the ability of the model to correctly identify all instances of a particular class. For class 0, it means that 89% of the actual class 0 instances were correctly identified, while for class 1, only 54% of the actual class 1 instances were correctly identified.

F1-Score:

F1-Score for class 0: 0.82
F1-Score for class 1: 0.63
The F1-Score is the harmonic mean of precision and recall and is a balanced metric that considers both false positives and false negatives. For class 0, the F1-Score is 0.82, indicating a good balance between precision and recall, while for class 1, the F1-Score is 0.63, which is lower due to the lower recall for this class.

Support:

Support for class 0: 55
Support for class 1: 35
The support is the number of true instances of each class in the data the report was computed on — here, the test set. In this case, the test set contains 55 instances of class 0 and 35 instances of class 1.

Accuracy:

Overall accuracy: 0.76
The accuracy represents the proportion of correctly classified instances over the total number of instances. The overall accuracy of the model is 76%, which means that 76% of the instances in the test set were correctly classified.

Macro Average:

Macro average precision: 0.76
Macro average recall: 0.72
Macro average F1-Score: 0.72
The macro average calculates the precision, recall, and F1-Score as the unweighted mean of the per-class values, so both classes count equally regardless of class imbalance. In this case, the macro average precision is 0.76, while the macro average recall and F1-Score are both 0.72, indicating a moderate performance across both classes.

Weighted Average:

Weighted average precision: 0.76
Weighted average recall: 0.76
Weighted average F1-Score: 0.75
The weighted average calculates the precision, recall, and F1-Score by considering the class imbalance in the dataset. It gives more weight to the class with more instances. In this case, the weighted average precision, recall, and F1-Score are all around 0.76, which is slightly higher than the macro average and indicates that the model's performance is more influenced by class 0 due to its higher support.

## § Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV 

# defining parameter range
# `gamma` has no effect on the linear kernel, so each kernel gets its own
# sub-grid; crossing gamma with 'linear' would only refit identical models.
c_values = [0.1, 1, 10, 100, 1000]
gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]
param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values},
]

# refit=True retrains the best configuration on the full training split
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 

# fitting the model for grid search 
grid.fit(X_train, y_train) 


In [None]:
# Print the parameter combination selected by the grid search.
print(grid.best_params_) 
  
# Print the estimator built from those parameters (refit by the grid search).
print(grid.best_estimator_) 

## § Model - Support Vector Classifier with Tuning

In [None]:
# Create the SVM classifier from the hyper-parameters selected by the grid search.
# Reading them from `grid.best_params_` (instead of copying values by hand from a
# previous run's output) keeps this cell correct whenever the search is re-run.
model_tuned = SVC(**grid.best_params_)

# Train the model using the training sets
model_tuned.fit(X_train, y_train)

# Predict the response for test dataset
y_pred_tuned = model_tuned.predict(X_test)

In [None]:
# Classification report before and after tuning, both computed in this cell so
# the two reports always reflect the current state of the two models.

print('Before Tuning')
predictions = model.predict(X_test) 
print(classification_report(y_test, predictions)) 

print('\n\nAfter Tuning')
predictions_tuned = model_tuned.predict(X_test) 
# Report the predictions computed just above rather than a variable left over
# from an earlier cell, which could be stale.
print(classification_report(y_test, predictions_tuned)) 

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
import pandas as pd

# Recursive feature elimination with cross-validation (RFECV).
# Note: this rebinds the notebook-level names X and y.
y = df["DEATH_EVENT"]
X = df.drop(columns=["DEATH_EVENT"])

# Random forest used to rank features at every elimination step.
classifier = RandomForestClassifier(random_state=42)

# Drop one feature per step; score each subset by accuracy over 10 stratified folds.
rfecv = RFECV(
    estimator=classifier,
    step=1,
    cv=StratifiedKFold(10),
    scoring='accuracy',
)
rfecv.fit(X, y)

# Number of features retained by the selector.
print('Optimal number of features: {}'.format(rfecv.n_features_))

# Names of the retained columns, taken from the selector's boolean support mask.
features = list(X.columns[rfecv.support_])

print('The selected features are: {}'.format(features))