In [1]:
import pandas as pd
import numpy as np
import altair as alt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

np.random.seed(1234)

Reading Data

In [2]:
# Fetch dataset 
url = "https://github.com/allanji100/dsci-100-group-project/blob/main/heart%2Bdisease/processed.cleveland.data?raw=true"

df = pd.read_csv(
    url,
    names=["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"]
)

column_transformer = make_column_transformer(
    (StandardScaler(), ["age", "trestbps", "chol", "thalach", "oldpeak"]),
    remainder='passthrough',
    verbose_feature_names_out = False
)

# Drop "ca" and "thal" due to missing values
# Scale numerical features
heart_disease_df = pd.DataFrame(
    column_transformer.fit_transform(df.drop(columns=["ca", "thal"])),
    columns=column_transformer.get_feature_names_out()
)
X = heart_disease_df.drop(columns=["num"])
y = heart_disease_df["num"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=123)

heart_disease_df.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex,cp,fbs,restecg,exang,slope,num
0,0.948726,0.757525,-0.2649,0.017197,1.087338,1.0,1.0,1.0,2.0,0.0,3.0,0.0
1,1.392002,1.61122,0.760415,-1.821905,0.397182,1.0,4.0,0.0,2.0,1.0,2.0,2.0
2,1.392002,-0.6653,-0.342283,-0.902354,1.346147,1.0,4.0,0.0,2.0,1.0,2.0,1.0
3,-1.932564,-0.09617,0.063974,1.637359,2.122573,1.0,3.0,0.0,0.0,0.0,3.0,0.0
4,-1.489288,-0.09617,-0.825922,0.980537,0.310912,0.0,2.0,0.0,2.0,0.0,1.0,0.0


In [3]:
knn = KNeighborsClassifier()
scores = {"name": [], "average_CV_score": []}

# This serves as a baseline to compare the other runs
cv_results = cross_val_score(knn, X_train, y_train)
scores["name"].append("all_features")
scores["average_CV_score"].append(np.average(cv_results))
print(
    "Baseline model with no optimizations: ",
    np.round(np.average(cv_results), decimals=4)
)

for col in X_train.columns:
    cv_results = cross_val_score(knn, X_train.drop(columns=[col]), y_train)
    scores["name"].append("no_" + col)
    scores["average_CV_score"].append(np.average(cv_results))

cv_results_df = pd.DataFrame(scores)

chart = alt.Chart(cv_results_df).mark_bar(clip=True).encode(
    alt.X(
        'name:O',
        title="CV Run Name"
    ),
    alt.Y(
        'average_CV_score:Q',
        title="Average CV Score"
    ).scale(domain=[0.5, 0.6]),
).properties(
    title = "Average CV Scores w/Removed Features (Lower is better)"
)
line = alt.Chart(cv_results_df.loc[cv_results_df["name"] == "all_features"]).mark_rule().encode(y='average_CV_score')

chart + line


Baseline model with no optimizations:  0.5411


This tells us that removing cp, exang, sex, and thalach lowers our overall CV scores. Therefore, these are probably the most useful features. So we can probably drop the other features.

In [4]:
X_train_cropped = X_train.drop(columns=["age", "chol", "fbs", "restecg", "slope", "trestbps"])
X_test_cropped = X_test.drop(columns=["age", "chol", "fbs", "restecg", "slope", "trestbps"])

Now we can compare how our model fairs with and without these cropped features.

In [5]:
print(
    "With all features: ",
    np.round(
        np.average(cross_val_score(knn, X_train, y_train)),
        decimals=4
    )
)
print(
    "With cropped features: ",
    np.round(
        np.average(cross_val_score(knn, X_train_cropped, y_train)),
        decimals=4
    )
)

With all features:  0.5411
With cropped features:  0.5695


We can see that our model is slightly improved by dropping the bad features.

We can further optimize our model by optimizing our hyper-parameters. In this case, we only need to optimize for n_neighbors in our KNeighborsClassifier.

In [6]:
parameters = {'n_neighbors':range(1, 50)}
grid = GridSearchCV(knn, parameters)

grid.fit(X_train_cropped, y_train)
print(grid.cv_results_["rank_test_score"])

[49 10  7  4 20 14  3  2  1 22 31 21 37  8 15 27 29 29  8 27 22 11  5 19
 16 16 16 12  5 44 36 44 13 25 24 35 42 26 42 48 44 44 33 31 33 38 38 38
 38]


Here we see that the 8th item gives us the best score. That correlates to an n_neighbors value of 9.

We can again see how that affects our model:

In [7]:
hyper_param_optimized_knn = KNeighborsClassifier(n_neighbors=9)
print(
    "Test score with hyper-parameters optimized: ",
    np.round(
        np.average(cross_val_score(hyper_param_optimized_knn, X_train_cropped, y_train)),
        decimals=4
    )
)

Test score with hyper-parameters optimized:  0.6078


Finally, we can see how our model stacks up against test data:

In [8]:
final_knn = KNeighborsClassifier(n_neighbors=9).fit(X_train_cropped, y_train)
print(np.round(final_knn.score(X_test_cropped, y_test), decimals=4))

0.5574


We can see that our test score is a bit lower than our cross-validation score. However, we can also see that our optimized model has scored better on the test data, than our unoptmized model did on the training data.

Baseline unoptimized: 0.5411

Optmized: 0.5574