# Heart Disease Analysis
Import the dataset and project dependencies

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("heart.csv")
df = df[df["RestingBP"] != 0]
# Replace zero Cholesterol with NaN
df['Cholesterol'] = df['Cholesterol'].replace(0, np.nan)

# Replace the NaN values with the mean Cholesterol.
df['Cholesterol'].fillna(df['Cholesterol'].mean(), inplace=True)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

## Classification Training

### Feature Scaling

In [24]:
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numerical_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    [
        ("one-hot-encoder", categorical_preprocessor, categorical_cols),
        ("standard_scaler", numerical_preprocessor, numerical_cols),
    ]
)

### Model

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline

model = make_pipeline(preprocessor, DecisionTreeClassifier(criterion="entropy", random_state=42))
model

### Data splitting

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

### Model Fitting and Evaluation

In [27]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

y_pred = np.array(y_pred)
y_test = np.array(y_test)

# print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Calculate model accuracy
print("Accuracy:",accuracy_score(y_test, y_pred))
# Calculate model precision
print("Precision:",precision_score(y_test, y_pred))
# Calculate model recall
print("Recall:",recall_score(y_test, y_pred))
# Calculate model f1 score
print("F1-Score:",f1_score(y_test, y_pred))

[[ 85  22]
 [ 32 137]]
Accuracy: 0.8043478260869565
Precision: 0.8616352201257862
Recall: 0.8106508875739645
F1-Score: 0.8353658536585367


## Hyperparameter Tuning

In [28]:
X1 = X
num_X1 = X1.select_dtypes(exclude=['object'])
X1[num_X1.columns] = numerical_preprocessor.fit_transform(num_X1)

X1 = pd.get_dummies(X1)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X1, y, random_state=42, test_size=0.3
)

### Random Search

In [29]:
from sklearn.model_selection import RandomizedSearchCV# Number of trees in random forest
criterion = ['gini', 'entropy']
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree


random_grid = {
    'criterion': criterion,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

rf_random = RandomizedSearchCV(estimator = DecisionTreeClassifier(), param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


#### Results

In [30]:
rf_random.best_params_

{'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 10,
 'criterion': 'gini'}

#### Evaluation

In [31]:
y_pred = rf_random.best_estimator_.predict(X_test)
y_pred = np.array(y_pred)
y_test = np.array(y_test)
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Calculate model accuracy
print("Accuracy:",accuracy_score(y_test, y_pred))
# Calculate model precision
print("Precision:",precision_score(y_test, y_pred))
# Calculate model recall
print("Recall:",recall_score(y_test, y_pred))
# Calculate model f1 score
print("F1-Score:",f1_score(y_test, y_pred))

[[ 86  21]
 [ 36 133]]
Accuracy: 0.7934782608695652
Precision: 0.8636363636363636
Recall: 0.7869822485207101
F1-Score: 0.823529411764706


### Grid Search

In [36]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_features': max_features,
    'max_depth': [50, 60, 70],
    'min_samples_split': [8,10,12],
    'min_samples_leaf': [2,3],
}

grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


#### Results and Evaluation

In [37]:
print(grid_search.best_params_)
best_grid = grid_search.best_estimator_

y_pred = best_grid.predict(X_test)
y_pred = np.array(y_pred)
y_test = np.array(y_test)
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Calculate model accuracy
print("Accuracy:",accuracy_score(y_test, y_pred))
# Calculate model precision
print("Precision:",precision_score(y_test, y_pred))
# Calculate model recall
print("Recall:",recall_score(y_test, y_pred))
# Calculate model f1 score
print("F1-Score:",f1_score(y_test, y_pred))

{'criterion': 'gini', 'max_depth': 60, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 8}
[[ 90  17]
 [ 33 136]]
Accuracy: 0.8188405797101449
Precision: 0.8888888888888888
Recall: 0.8047337278106509
F1-Score: 0.8447204968944099
