# Heart Disease Analysis

Import the dataset and project dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset and drop rows whose RestingBP is recorded as 0.
# .copy() makes `df` an independent frame, so the column assignment below
# does not write into a filtered view of the raw data.
df = pd.read_csv("heart.csv")
df = df[df["RestingBP"] != 0].copy()

# Treat a Cholesterol value of 0 as missing, then fill the gaps with the mean
# of the remaining (non-missing) values. Plain assignment is used instead of
# `df[col].fillna(..., inplace=True)`: that chained in-place call operates on a
# temporary under pandas Copy-on-Write and would leave `df` unchanged.
# NOTE(review): the mean is computed over all rows, i.e. before the train/test
# split further down, so held-out rows influence the imputed value.
cholesterol = df["Cholesterol"].replace(0, np.nan)
df["Cholesterol"] = cholesterol.fillna(cholesterol.mean())

# Every column except the last is a feature; the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

## Classification Training

### Feature Transformation

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Partition the feature columns by dtype: object columns are handled as
# categorical, every other column as numerical.
categorical_cols = list(X.select_dtypes(include=["object"]).columns)
numerical_cols = list(X.select_dtypes(exclude=["object"]).columns)

# One transformer per column group.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("one-hot-encoder", categorical_preprocessor, categorical_cols),
        ("standard_scaler", numerical_preprocessor, numerical_cols),
    ]
)

### Model

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Baseline classifier: a 10-tree random forest with entropy splits and a fixed seed,
# chained after the column preprocessor.
forest = RandomForestClassifier(
    n_estimators=10,
    criterion="entropy",
    random_state=69,
)
model = make_pipeline(preprocessor, forest)
model

### Data Split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the seed keeps the partition fixed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Model Fitting and Evaluation

In [6]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Train the pipeline on the training rows, then predict the held-out rows.
model.fit(X_train, y_train)

y_pred = np.array(model.predict(X_test))
y_test = np.array(y_test)

# Confusion matrix first, followed by the scalar scores.
cm = confusion_matrix(y_test, y_pred)
print(cm)

for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(label, metric(y_test, y_pred))

[[ 95  12]
 [ 24 145]]
Accuracy: 0.8695652173913043
Precision: 0.9235668789808917
Recall: 0.8579881656804734
F1-Score: 0.8895705521472392


## Hyperparameter Tuning

In [7]:
from sklearn.model_selection import train_test_split

# Names of the numerical feature columns, captured before one-hot encoding
# adds its (non-object) indicator columns.
num_cols = X.select_dtypes(exclude=['object']).columns

# pd.get_dummies returns a new frame, so X itself is never modified.
# (Binding `X1 = X` and assigning into X1 would have overwritten X in place.)
X1 = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(
    X1, y, random_state=42, test_size=0.3
)

# Work on explicit copies so the assignments below cannot write into X1.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only and reuse its statistics for the
# held-out rows, so no test-set information leaks into the features.
X_train[num_cols] = numerical_preprocessor.fit_transform(X_train[num_cols])
X_test[num_cols] = numerical_preprocessor.transform(X_test[num_cols])

### Random Search

In [8]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['log2', 'sqrt']
# Maximum number of levels in tree (None lets trees grow without a depth limit)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Search space sampled by the randomized search.
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

# The forest is seeded as well as the search: with an unseeded estimator the
# cross-validation scores, and therefore best_params_, change on every run
# even though the sampled candidates stay the same.
rf_random = RandomizedSearchCV(
    estimator = RandomForestClassifier(random_state=42),
    param_distributions = random_grid,
    n_iter = 100,
    cv = 3,
    verbose=2,
    random_state=42,
    n_jobs = -1,
)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


#### Results

In [9]:
# Display the hyperparameter combination the randomized search selected as best.
rf_random.best_params_

{'n_estimators': 2000,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

#### Evaluation

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Score the refitted best estimator from the randomized search on the held-out rows.
y_pred = np.array(rf_random.best_estimator_.predict(X_test))
y_test = np.array(y_test)

# Confusion matrix first, followed by the scalar scores.
cm = confusion_matrix(y_test, y_pred)
print(cm)

for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(label, metric(y_test, y_pred))

[[ 91  16]
 [ 21 148]]
Accuracy: 0.8659420289855072
Precision: 0.9024390243902439
Recall: 0.8757396449704142
F1-Score: 0.8888888888888891


### Grid Search

In [11]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid: 3 * 3 * 3 * 2 * 3 * 1 = 162 candidate combinations.
# NOTE(review): the randomized search output recorded in this notebook selected
# max_depth=10, while this grid only explores 90-110 - confirm the grid is
# centred on the intended values.
param_grid = {
    'n_estimators': [1900,2000,2100],
    'min_samples_split': [3,5,7],
    'min_samples_leaf': [1,2,3],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [90,100,110],
    'bootstrap': [True]
}

# The forest is seeded so that repeated runs rank the candidates identically;
# an unseeded estimator makes best_params_ vary from run to run.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 162 candidates, totalling 486 fits


#### Results and Evaluation

In [12]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Report the winning parameter set and keep a handle on the refitted estimator.
print(grid_search.best_params_)
best_grid = grid_search.best_estimator_

# Score that estimator on the held-out rows.
y_pred = np.array(best_grid.predict(X_test))
y_test = np.array(y_test)

# Confusion matrix first, followed by the scalar scores.
cm = confusion_matrix(y_test, y_pred)
print(cm)

for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(label, metric(y_test, y_pred))

{'bootstrap': True, 'max_depth': 90, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 7, 'n_estimators': 1900}
[[ 91  16]
 [ 20 149]]
Accuracy: 0.8695652173913043
Precision: 0.9030303030303031
Recall: 0.8816568047337278
F1-Score: 0.8922155688622754
