In [75]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

import warnings

# Silence only upgrade-notice noise from the libraries. A blanket
# filterwarnings('ignore') would also hide actionable diagnostics such as
# pandas' SettingWithCopyWarning or scikit-learn's UndefinedMetricWarning
# (raised when precision/recall are ill-defined and silently reported as 0).
for _noisy_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=_noisy_category)

In [76]:
# Read the clinical-records CSV into a DataFrame and preview its first rows
heart_df = pd.read_csv(filepath_or_buffer='heart_failure_clinical_records_dataset.csv')
heart_df.head(n=5)

In [77]:
# Count the missing entries in each column of the raw frame
heart_df.isnull().sum()

In [78]:
# Discard every row that contains at least one missing value,
# then re-count the missing entries per column to confirm none remain
df = heart_df.dropna(axis=0, how='any')
df.isnull().sum()

In [79]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations between all columns of the cleaned frame
corr_matrix = df.corr()

# Annotated heatmap of the matrix, with the colour scale centred on zero
fig, ax = plt.subplots(figsize=(12, 8), dpi=100)
sns.heatmap(corr_matrix, annot=True, cmap='Blues', center=0, ax=ax)

plt.show()

In [80]:
## Separate the predictors from the label

# Label: the DEATH_EVENT column
y = df['DEATH_EVENT']

# Predictors: every remaining column
X = df.drop(columns=['DEATH_EVENT'])

y.head()

In [81]:
# Hold out a quarter of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.25,
)
(X_train.shape, X_test.shape)

In [82]:
# Eliminating redundant (highly correlated) features from the model inputs
#
# The correlations are measured on the numeric *training* features only:
#   * the target column is not part of X_train, so it can never be selected
#     for removal (dropping it from X_train would raise a KeyError);
#   * the held-out test rows do not influence which features are kept.
CORRELATION_THRESHOLD = 0.6

feature_corr = X_train.select_dtypes(include='number').corr().abs()

# A column is flagged when it is strongly correlated with any column that
# precedes it, so the first member of each correlated group is kept.
correlated_features = {
    column
    for position, column in enumerate(feature_corr.columns)
    if (feature_corr.iloc[position, :position] > CORRELATION_THRESHOLD).any()
}

print(correlated_features)


# Remove the correlated features from the dataset.
# Re-assigning (instead of inplace=True) keeps the cell re-runnable and avoids
# mutating frames that are slices of the original data.
redundant_columns = sorted(correlated_features)
X_train = X_train.drop(columns=redundant_columns)
X_test = X_test.drop(columns=redundant_columns)


## Encoding categorical variables

# Only genuinely categorical (object / category dtype) columns are encoded.
# Ordinal-encoding numeric columns would replace each distinct measurement
# with an arbitrary integer code, destroying the numeric ordering the tree
# splits on, and would map unseen test values to the encoder's unknown code.
categorical_columns = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

encoder = None  # stays None when there is nothing to encode
if categorical_columns:
    import category_encoders as ce

    encoder = ce.OrdinalEncoder(cols=categorical_columns)

    # Fit on the training data only, then apply the same mapping to the test data
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.transform(X_test)

In [83]:
# Depth-2 decision tree using the Gini criterion, fitted on the training split
clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=2, random_state=0).fit(X_train, y_train)

# Class predictions for the held-out test split
y_pred = clf_gini.predict(X_test)

# Score the predictions against the true test labels
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
f1 = metrics.f1_score(y_test, y_pred)

# Report the metrics, one per line
for label, value in (
    ('Accuracy: ', metrics.accuracy_score(y_test, y_pred)),
    ('Precision: ', precision),
    ('Recall: ', recall),
    ('F1 score: ', f1),
):
    print(label, value)