# Heart Disease Analysis
Import the dataset and project dependencies

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("heart.csv")
df = df[df["RestingBP"] != 0]
# Replace zero Cholesterol with NaN
df['Cholesterol'] = df['Cholesterol'].replace(0, np.nan)

# Replace the NaN values with the mean Cholesterol.
df['Cholesterol'].fillna(df['Cholesterol'].mean(), inplace=True)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

## Classification Training
### Feature Scaling

In [57]:
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numerical_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    [
        ("one-hot-encoder", categorical_preprocessor, categorical_cols),
        ("standard_scaler", numerical_preprocessor, numerical_cols),
    ]
)

### Model

In [58]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

model = make_pipeline(preprocessor, KNeighborsClassifier(n_neighbors=5))
model

### Data Splitting

In [59]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

### Model Fitting and Evaluation

In [60]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

y_pred = np.array(y_pred)
y_test = np.array(y_test)

# print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Calculate model accuracy
print("Accuracy:",accuracy_score(y_test, y_pred))
# Calculate model precision
print("Precision:",precision_score(y_test, y_pred))
# Calculate model recall
print("Recall:",recall_score(y_test, y_pred))
# Calculate model f1 score
print("F1-Score:",f1_score(y_test, y_pred))

[[ 88  19]
 [ 18 151]]
Accuracy: 0.8659420289855072
Precision: 0.888235294117647
Recall: 0.893491124260355
F1-Score: 0.8908554572271388


## Hyperparameter Tuning

In [61]:
X1 = X
num_X1 = X1.select_dtypes(exclude=['object'])
X1[num_X1.columns] = numerical_preprocessor.fit_transform(num_X1)

X1 = pd.get_dummies(X1)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X1, y, random_state=42, test_size=0.3
)

### Random Search

In [62]:
from sklearn.model_selection import RandomizedSearchCV# Number of trees in random forest

param_dist = {
    "n_neighbors": [3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
    "weights": ["uniform", "distance"],
    "p": [1, 2],
    "algorithm": ["ball_tree", "kd_tree", "brute"]
}

rf_random = RandomizedSearchCV(estimator = KNeighborsClassifier(), param_distributions = param_dist, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


#### Results

In [63]:
rf_random.best_params_

{'weights': 'distance', 'p': 1, 'n_neighbors': 15, 'algorithm': 'kd_tree'}

#### Evaluation

In [64]:
y_pred = rf_random.best_estimator_.predict(X_test)
y_pred = np.array(y_pred)
y_test = np.array(y_test)
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Calculate model accuracy
print("Accuracy:",accuracy_score(y_test, y_pred))
# Calculate model precision
print("Precision:",precision_score(y_test, y_pred))
# Calculate model recall
print("Recall:",recall_score(y_test, y_pred))
# Calculate model f1 score
print("F1-Score:",f1_score(y_test, y_pred))

[[ 92  15]
 [ 19 150]]
Accuracy: 0.8768115942028986
Precision: 0.9090909090909091
Recall: 0.8875739644970414
F1-Score: 0.8982035928143712


### Grid Search

In [65]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_neighbors": [13, 15, 17],
    "weights": ["uniform", "distance"],
    "p": [1, 2],
    "algorithm": ["ball_tree", "kd_tree", "brute"]
}

grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


#### Results and Evaluation

In [66]:
grid_search.best_params_

{'algorithm': 'ball_tree', 'n_neighbors': 15, 'p': 1, 'weights': 'distance'}

In [67]:
best_grid = grid_search.best_estimator_

y_pred = best_grid.predict(X_test)
y_pred = np.array(y_pred)
y_test = np.array(y_test)
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Calculate model accuracy
print("Accuracy:",accuracy_score(y_test, y_pred))
# Calculate model precision
print("Precision:",precision_score(y_test, y_pred))
# Calculate model recall
print("Recall:",recall_score(y_test, y_pred))
# Calculate model f1 score
print("F1-Score:",f1_score(y_test, y_pred))

[[ 92  15]
 [ 19 150]]
Accuracy: 0.8768115942028986
Precision: 0.9090909090909091
Recall: 0.8875739644970414
F1-Score: 0.8982035928143712
