In [None]:
### Please run this cell before continuing.
# Setup: attach the packages the notebook relies on, limit how many table
# rows are displayed, and fix the random seed.

# Data import/wrangling and plotting (read_csv, dplyr verbs, ggplot2, purrr).
library(tidyverse)
# Provides the repr.* display options set below.
library(repr)
# Modelling tools (initial_split, recipe, nearest_neighbor, tune_grid, ...).
library(tidymodels)
# Provides ggpairs() for the pairwise-comparison plot.
library(GGally)
# Keep displayed tables short: show at most 6 rows.
options(repr.matrix.max.rows = 6)
# Seed the RNG once so the random split and the cross-validation folds are
# reproducible when the notebook is run top to bottom.
set.seed(2022) 

In [None]:
# Reading and cleaning the dataset.
#
# The CSV's columns are generically named Column1..Column14; rename them to
# descriptive names, convert the categorical columns to factors, turn the
# "?" placeholders in `thal` and `ca` into NA, and then keep only the three
# numeric predictors used for classification plus the class label `num`.

heart_disease_data <- read_csv("https://raw.githubusercontent.com/labellali/dsci-100-2022w1-group-169/main/data/heart_disease_dataset.csv")
heart_disease_data <- heart_disease_data |>
  rename(
    age = Column1,
    sex = Column2,
    chest_pain = Column3,
    resting_blood_pressure = Column4,
    cholesterol = Column5,
    fasting_blood_sugar = Column6,
    rest_ecg = Column7,
    max_heart_rate = Column8,
    exercised_ind_angina = Column9,
    oldpeak = Column10,
    slope = Column11,
    ca = Column12,
    thal = Column13,
    num = Column14
  ) |>
  mutate(
    sex = as.factor(sex),
    chest_pain = as.factor(chest_pain),
    fasting_blood_sugar = as.factor(fasting_blood_sugar),
    rest_ecg = as.factor(rest_ecg),
    exercised_ind_angina = as.factor(exercised_ind_angina),
    # `num` is the class label, so it must be a factor for classification.
    num = as.factor(num),
    thal = na_if(thal, "?"),
    thal = as.factor(thal),
    ca = na_if(ca, "?"),
    ca = as.numeric(ca)
  ) |>
  select(
    resting_blood_pressure,
    cholesterol,
    max_heart_rate,
    num
  ) |>
  # Drop rows with a missing value in any of the four remaining columns.
  # (Comparing a column against is.na(column) would test `value != FALSE`,
  # i.e. `value != 0`, and would also discard legitimate zero measurements.)
  drop_na()
heart_disease_data

In [None]:
# Splitting the data into training (75%) and testing (25%) sets,
# stratified on the class label `num`.

heart_disease_split <- heart_disease_data |>
  initial_split(prop = 0.75, strata = num)

training_data <- heart_disease_split |> training()
testing_data <- heart_disease_split |> testing()

In [None]:
# Number of training observations in each class of the label `num`.

training_data_count <- count(training_data, num)

training_data_count

In [None]:
# Range (max - min) of each predictor in the training data.

# Work on the predictors only; the class label is not numeric.
training_predictors <- select(training_data, -num)

# One-row tables holding the per-column maximum and minimum.
training_data_max <- training_predictors |>
  summarise(across(everything(), \(x) max(x, na.rm = TRUE)))

training_data_min <- training_predictors |>
  summarise(across(everything(), \(x) min(x, na.rm = TRUE)))

# Column-wise difference of the two one-row tables.
training_data_ranges <- as_tibble(training_data_max - training_data_min)

training_data_ranges

In [None]:
# Histograms (30 bins each) of the three numeric predictors in the
# training data.

# Resting blood pressure
heart_training_hist_resting_blood_pressure <-
  ggplot(training_data, aes(x = resting_blood_pressure)) +
  geom_histogram(bins = 30) +
  labs(
    title = "Distribution of resting blood pressure values in heart disease dataset",
    x = "Resting Blood Pressure",
    y = "Count"
  )
heart_training_hist_resting_blood_pressure

# Cholesterol
heart_training_hist_cholesterol <-
  ggplot(training_data, aes(x = cholesterol)) +
  geom_histogram(bins = 30) +
  labs(
    title = "Distribution of cholesterol level values in heart disease dataset",
    x = "Cholesterol",
    y = "Count"
  )
heart_training_hist_cholesterol

# Maximum heart rate
heart_training_max_heart_rate <-
  ggplot(training_data, aes(x = max_heart_rate)) +
  geom_histogram(bins = 30) +
  labs(
    title = "Distribution of max heart rate values in heart disease dataset",
    x = "Max Heart Rate",
    y = "Count"
  )
heart_training_max_heart_rate

In [None]:
# Comparing variable correlations with a pairwise plot matrix.
#
# Only the training data is plotted, like the other exploratory cells, so
# that the held-out test rows do not influence any modelling decisions.

correlations <- ggpairs(training_data)
correlations

In [None]:
# Preprocessing recipe for classification: predict `num` from every other
# column, centering and then scaling all predictors.

heart_disease_recipe <- training_data |>
  recipe(formula = num ~ .) |>
  step_center(all_predictors()) |>
  step_scale(all_predictors())

In [None]:
# K-nearest-neighbours classification spec whose number of neighbours is
# left as a tuning parameter; "rectangular" weights every neighbour equally.

knn_tune <- nearest_neighbor(
  mode = "classification",
  neighbors = tune(),
  weight_func = "rectangular"
) |>
  set_engine("kknn")

In [None]:
# 5-fold cross-validation on the training data to evaluate k = 1..50.

# Candidate numbers of neighbours.
k_values <- tibble(neighbors = seq_len(50))

# Folds are stratified on the class label.
heart_disease_vfold <- vfold_cv(training_data, v = 5, strata = num)

# Recipe + tunable model bundled together.
knn_tune_workflow <- workflow() |>
  add_model(knn_tune) |>
  add_recipe(heart_disease_recipe)

# Despite its name, this holds the collected cross-validation metrics for
# every candidate k, not a fitted model.
knn_tune_fit <- knn_tune_workflow |>
  tune_grid(resamples = heart_disease_vfold, grid = k_values) |>
  collect_metrics()

knn_tune_fit

In [None]:
# Plotting mean cross-validation accuracy against k to pick the best k.

# Keep only the accuracy rows and the two columns needed for the plot.
knn_tune_accuracies <- knn_tune_fit |>
  filter(.metric == "accuracy") |>
  select(neighbors, mean)

accuracies <- knn_tune_accuracies |>
  ggplot(aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(
    title = "Accuracy mean vs. k-value",
    x = "k-value",
    y = "Accuracy mean"
  )

accuracies

In [None]:
# Choosing the best k-value.

# Show the accuracies for k = 20..25, the region inspected when picking k.
knn_tune_preview <- knn_tune_accuracies |>
  filter(neighbors >= 20, neighbors <= 25)
knn_tune_preview

# k selected by hand from the accuracy plot and the preview table above.
chosen_k <- 24

In [None]:
# Building the final classifier with the chosen k and fitting it on the
# full training set.

knn_spec <- nearest_neighbor(
  mode = "classification",
  neighbors = chosen_k,
  weight_func = "rectangular"
) |>
  set_engine("kknn")

knn_fit <- workflow() |>
  add_model(knn_spec) |>
  add_recipe(heart_disease_recipe) |>
  fit(data = training_data)