Dataset: https://www.kaggle.com/datasets/andrewmvd/heart-failure-clinical-data

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.tree import DecisionTreeClassifier

In [3]:
# Location of the heart-failure clinical records CSV inside the Kaggle input mount.
DATA_PATH = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'

# Load the records into a DataFrame and preview the first rows.
heart_data = pd.read_csv(DATA_PATH)
heart_data.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [4]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [5]:
# Count distinct values per column, sorted ascending, so two-valued (binary)
# columns are listed before the high-cardinality measurements.
heart_data.nunique().sort_values()

anaemia                       2
diabetes                      2
high_blood_pressure           2
sex                           2
smoking                       2
DEATH_EVENT                   2
ejection_fraction            17
serum_sodium                 27
serum_creatinine             40
age                          47
time                        148
platelets                   176
creatinine_phosphokinase    208
dtype: int64

In [6]:
# Defining the numerical and categorical features.
# Binary 0/1 (dummy) features are properly treated as numeric as well, but for the sake of this study we recode diabetes from 1/0 to "yes"/"no" and treat it as a categorical variable.

In [7]:
# Recode the binary diabetes flag (1/0) into "yes"/"no" labels so it can be
# handled as a categorical feature.
# Values missing from the lookup are passed through unchanged, which makes the
# cell safe to re-run: a plain Series.map(dict) would turn the already-recoded
# "yes"/"no" strings into NaN on a second execution.
diabetes_labels = {1: "yes", 0: "no"}
heart_data["diabetes"] = heart_data["diabetes"].map(
    lambda value: diabetes_labels.get(value, value)
)


In [8]:
# Check the change: count how many rows carry each diabetes label.
heart_data["diabetes"].value_counts()

diabetes
no     174
yes    125
Name: count, dtype: int64

In [9]:
# Modeling: building the machine learning model

In [10]:
# Display the column names of the frame.
heart_data.columns

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
       'DEATH_EVENT'],
      dtype='object')

In [11]:
# Column the model learns to predict.
target = 'DEATH_EVENT'

# Every remaining column is used as a model input.
features = [
    'age',
    'anaemia',
    'creatinine_phosphokinase',
    'diabetes',
    'ejection_fraction',
    'high_blood_pressure',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'smoking',
    'time',
]

In [12]:
# Separate the model inputs (X) from the label column (y).
X, y = heart_data[features], heart_data[target]

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Columns routed through the categorical branch of the preprocessing.
cat_features = ["diabetes"]

# Columns routed through the numerical branch (includes the remaining 0/1 flags).
num_features = [
    'age',
    'anaemia',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'high_blood_pressure',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'smoking',
    'time',
]

In [15]:
# Categorical branch: fill missing values with the most frequent label, then
# encode the labels numerically with a target encoder (fitted using y).
categorical_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", ce.TargetEncoder()),
    ]
)

# Numerical branch: median imputation only.
numerical_imputer = SimpleImputer(strategy="median")

# Output columns follow the order below: the encoded categorical column(s)
# first, then the numerical columns.
transformer = ColumnTransformer(
    transformers=[
        ("categorical_transformer", categorical_pipe, cat_features),
        ("numerical_transformer", numerical_imputer, num_features),
    ]
)

# Fit on the training split only (the encoder needs y_train), then apply the
# already-fitted transformer to the test split.
X_train_transformed = transformer.fit_transform(X_train, y_train)
X_test_transformed = transformer.transform(X_test)


In [16]:
# All the data has been transformed into numeric arrays so the model can be
# trained on it.
# Preview only the first rows: dumping the whole matrix floods the notebook
# output without adding information.
X_test_transformed[:5]

array([[2.95774654e-01, 7.00000000e+01, 0.00000000e+00, 5.82000000e+02,
        4.00000000e+01, 0.00000000e+00, 5.10000000e+04, 2.70000000e+00,
        1.36000000e+02, 1.00000000e+00, 1.00000000e+00, 2.50000000e+02],
       [2.95774654e-01, 5.00000000e+01, 1.00000000e+00, 2.98000000e+02,
        3.50000000e+01, 0.00000000e+00, 3.62000000e+05, 9.00000000e-01,
        1.40000000e+02, 1.00000000e+00, 1.00000000e+00, 2.40000000e+02],
       [2.98968213e-01, 4.50000000e+01, 0.00000000e+00, 2.44200000e+03,
        3.00000000e+01, 0.00000000e+00, 3.34000000e+05, 1.10000000e+00,
        1.39000000e+02, 1.00000000e+00, 0.00000000e+00, 1.29000000e+02],
       [2.95774654e-01, 8.00000000e+01, 1.00000000e+00, 1.23000000e+02,
        3.50000000e+01, 1.00000000e+00, 3.88000000e+05, 9.40000000e+00,
        1.33000000e+02, 1.00000000e+00, 1.00000000e+00, 1.00000000e+01],
       [2.98968213e-01, 4.20000000e+01, 0.00000000e+00, 1.02000000e+02,
        4.00000000e+01, 0.00000000e+00, 2.37000000e+05, 1.20

In [17]:
# Decision tree classifier. The seed is fixed because the tree's fitting is
# randomized (e.g. tie-breaking between equally good splits); without it the
# fitted tree, and therefore the metrics, can change between runs.
# 42 matches the seed already used for the train/test split.
tree = DecisionTreeClassifier(random_state=42)

In [18]:
# Train the tree on the transformed training matrix and its labels.
tree.fit(X_train_transformed, y_train)

In [19]:
# Predict a class label for every row of the transformed test matrix.
y_pred = tree.predict(X_test_transformed)

In [20]:
# Show the predicted labels for the test rows.
y_pred

array([1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1])

In [21]:
# Show the true test labels as a plain array, for comparison with y_pred.
y_test.to_numpy()

array([0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1])

In [22]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# ROC AUC measures how well the model ranks positives above negatives, so it
# must be computed from a continuous score: the predicted probability of the
# positive class (second column of predict_proba, i.e. tree.classes_[1]).
# Feeding it the hard 0/1 predictions collapses the ROC curve to a single
# operating point.
y_score = tree.predict_proba(X_test_transformed)[:, 1]

# Threshold-based metrics are computed from the hard predictions.
print(f"Acurácia: {accuracy_score(y_test, y_pred):.2f}")
print(f"Recall: {recall_score(y_test, y_pred):.2f}")
print(f"Precision: {precision_score(y_test, y_pred):.2f}")
print(f"F1-Score: {f1_score(y_test, y_pred):.2f}")
print(f"ROC/AUC: {roc_auc_score(y_test, y_score):.2f}")


      

Acurácia: 0.67
Recall: 0.48
Precision: 0.63
F1-Score: 0.55
ROC/AUC: 0.64
