In [473]:
# Import libraries to be used.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import floor

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import SCORERS

In [474]:
# Define a random state value
RANDOM_STATE = 3030

In [475]:
# Define column names which will be used instead of the ones in the datasets.
column_names = [
    "age", "sex", "chest_pain_type", "rest_bp", "cholesterol",
    "fasting_bs", "resting_ecg", "max_hr", "exercise_angina", "st_exercise_depression",
    "st_exercise_slope", "major_vessels_fluorospy", "thal", "angiographic_disease"
]
na_values = ["?"] # Define what values will be interpreted as missing values, in this dataset the "?" character is used for this.

# Load all datasets using relative paths and combine into one dataframe.
df = pd.concat([
    pd.read_csv("../data/processed.cleveland.data", names=column_names, na_values=na_values),
    pd.read_csv("../data/processed.hungarian.data", names=column_names, na_values=na_values),
    pd.read_csv("../data/processed.switzerland.data", names=column_names, na_values=na_values),
    pd.read_csv("../data/processed.va.data", names=column_names, na_values=na_values)
]).drop(['st_exercise_slope', 'major_vessels_fluorospy', 'thal'], axis=1)

In [476]:
df[df["angiographic_disease"] > 1] = 1

In [477]:
df['age'].fillna((df['age'].mean()), inplace=True)
df['rest_bp'].fillna((df['rest_bp'].mean()), inplace=True)
df['cholesterol'].fillna((df['cholesterol'].mean()), inplace=True)
df['max_hr'].fillna((df['max_hr'].mean()), inplace=True)
df['st_exercise_depression'].fillna((df['st_exercise_depression'].mean()), inplace=True)
df = df.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [478]:
# Set category attributes
df["sex"] = df["sex"].astype("category")
df["chest_pain_type"] = df["chest_pain_type"].astype("category")
df["fasting_bs"] = df["fasting_bs"].astype("category")
df["resting_ecg"] = df["resting_ecg"].astype("category")
df["exercise_angina"] = df["exercise_angina"].astype("category")
df["angiographic_disease"] = df["angiographic_disease"].astype("category")

In [479]:
X = MinMaxScaler().fit_transform(df.drop(["angiographic_disease"], axis=1).to_numpy())
Y = df["angiographic_disease"].to_numpy()


In [480]:
df2=df.drop(["angiographic_disease"], axis=1)
df2

Unnamed: 0,age,sex,chest_pain_type,rest_bp,cholesterol,fasting_bs,resting_ecg,max_hr,exercise_angina,st_exercise_depression
0,63.0,1.0,1.0,145.000000,233.0,1.0,2.0,150.000000,0.0,2.300000
1,1.0,1.0,1.0,1.000000,1.0,1.0,1.0,1.000000,1.0,1.000000
2,67.0,1.0,4.0,120.000000,229.0,0.0,2.0,129.000000,1.0,2.600000
3,37.0,1.0,3.0,130.000000,250.0,0.0,0.0,187.000000,0.0,3.500000
4,41.0,0.0,2.0,130.000000,204.0,0.0,2.0,172.000000,0.0,1.400000
...,...,...,...,...,...,...,...,...,...,...
195,54.0,0.0,4.0,127.000000,333.0,1.0,1.0,154.000000,0.0,0.000000
196,62.0,1.0,1.0,95.288613,139.0,0.0,1.0,103.095829,1.0,0.743665
197,1.0,1.0,1.0,1.000000,1.0,1.0,1.0,1.000000,1.0,1.000000
198,58.0,1.0,4.0,95.288613,385.0,1.0,2.0,103.095829,1.0,0.743665


In [512]:
knn = KNeighborsClassifier(n_neighbors=15)

In [514]:
 scores_result = {}
 for i in range(1,6):
    print(i*2)  
    bestk = SelectKBest(chi2, k=i*2).fit(X, Y)
    X_new=bestk.transform(X)
    scores = cross_validate(knn, X_new, Y, cv=5, scoring=["roc_auc", "f1_macro", "accuracy"])
    scores_result[i*2]={}
    scores_result[i*2]["AUC"] = np.mean(scores["test_roc_auc"])
    scores_result[i*2]["F1"] = np.mean(scores["test_f1_macro"])
    scores_result[i*2]["Accuracy"] = np.mean(scores["test_accuracy"])
    bestk.get_feature_names_out(input_features=df2.columns) 
    dfk=df[bestk.get_feature_names_out(input_features=df2.columns)]
print(scores_result)


2
4
6
8
10
{2: {'AUC': 0.8256397195526712, 'F1': 0.7405941562201978, 'Accuracy': 0.7521739130434782}, 4: {'AUC': 0.8537940945137711, 'F1': 0.7949910992241007, 'Accuracy': 0.7989130434782609}, 6: {'AUC': 0.8329513360058414, 'F1': 0.7841001736813782, 'Accuracy': 0.7923913043478261}, 8: {'AUC': 0.8936095918540705, 'F1': 0.7908832575760647, 'Accuracy': 0.7978260869565217}, 10: {'AUC': 0.8884513193761168, 'F1': 0.7957528462291091, 'Accuracy': 0.8021739130434783}}


In [482]:
bestk.get_feature_names_out(input_features=df2.columns)

array(['age', 'sex', 'chest_pain_type', 'rest_bp', 'cholesterol',
       'fasting_bs', 'max_hr', 'exercise_angina'], dtype=object)

In [483]:
dfk=df[bestk.get_feature_names_out(input_features=df2.columns)]
dfk


Unnamed: 0,age,sex,chest_pain_type,rest_bp,cholesterol,fasting_bs,max_hr,exercise_angina
0,63.0,1.0,1.0,145.000000,233.0,1.0,150.000000,0.0
1,1.0,1.0,1.0,1.000000,1.0,1.0,1.000000,1.0
2,67.0,1.0,4.0,120.000000,229.0,0.0,129.000000,1.0
3,37.0,1.0,3.0,130.000000,250.0,0.0,187.000000,0.0
4,41.0,0.0,2.0,130.000000,204.0,0.0,172.000000,0.0
...,...,...,...,...,...,...,...,...
195,54.0,0.0,4.0,127.000000,333.0,1.0,154.000000,0.0
196,62.0,1.0,1.0,95.288613,139.0,0.0,103.095829,1.0
197,1.0,1.0,1.0,1.000000,1.0,1.0,1.000000,1.0
198,58.0,1.0,4.0,95.288613,385.0,1.0,103.095829,1.0


In [484]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=RANDOM_STATE)

In [485]:
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X=X_train, y=Y_train)


In [486]:
#sorted(SCORERS.keys())

In [487]:
scores

{'fit_time': array([0.002527  , 0.00087404, 0.00077796, 0.00081682, 0.00092006]),
 'score_time': array([0.01434612, 0.01937366, 0.01611996, 0.02301002, 0.01942778]),
 'test_roc_auc': array([0.89293809, 0.80416069, 0.93794835, 0.94111669, 0.86609278]),
 'test_f1_macro': array([0.84216287, 0.75947712, 0.83678297, 0.85664629, 0.68369498]),
 'test_accuracy': array([0.8423913 , 0.76086957, 0.83695652, 0.85869565, 0.71195652])}

In [488]:
scores = cross_validate(knn, X, Y, cv=5, scoring=["roc_auc", "f1_macro", "accuracy"])
scores
scores_result = {}
scores_result["AUC"] = np.mean(scores["test_roc_auc"])
scores_result["F1"] = np.mean(scores["test_f1_macro"])
scores_result["Accuracy"] = np.mean(scores["test_accuracy"])
scores_result

{'AUC': 0.8884513193761168,
 'F1': 0.7957528462291091,
 'Accuracy': 0.8021739130434783}