In [None]:
### Run the R libraries 
library(repr)
library(tidyverse)
library(tidymodels)
library(cowplot)
library(GGally)
options(repr.matrix.max.rows = 6) #limits output of dataframes to 6 rows
options(repr.plot.width=10, repr.plot.height=10)

In [None]:
cleveland_data <- read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data", col_names = FALSE)       
colnames(cleveland_data) <- c("age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca","thal", "heart_disease")

filtered_data <- cleveland_data |>
    filter(ca != "?", thal != "?")|>
    drop_na()


mutate_data <- filtered_data |>
    mutate(heart_disease = ifelse(heart_disease == 0, 0, 1)) |>
    mutate(heart_disease = as_factor(heart_disease))
final_data <- mutate_data |>
mutate(ca = as.numeric(ca), thal = as.numeric(thal))

final_data <- mutate_data |>
mutate(ca = as.numeric(ca), thal = as.numeric(thal))

final_data

set.seed(978)
heart_split <- initial_split(final_data, prop = 0.75, strata = heart_disease)  
heart_train <- training(heart_split)   
heart_test <- testing(heart_split)

[1mRows: [22m[34m303[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): X12, X13
[32mdbl[39m (12): X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X14

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
67,1,4,160,286,0,2,108,1,1.5,2,3,3,1
67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
68,1,4,144,193,1,0,141,0,3.4,2,2,7,1
57,1,4,130,131,0,0,115,1,1.2,2,1,7,1
57,0,2,130,236,0,2,174,0,0.0,2,1,3,1


In [14]:
names <- colnames(heart_train |> 
select(-heart_disease))

example_formula <- paste("heart_disease", "~", paste(names, collapse = "+"))

example_formula

accuracies <- tibble(size = integer(), 
                     model_string = character(), 
                     accuracy = numeric())

knn_spec <- nearest_neighbor(weight_func = "rectangular", 
                             neighbors = tune()) |>
     set_engine("kknn") |>
     set_mode("classification")

heart_vfold <- vfold_cv(heart_train, v = 5, strata = heart_disease)

n_total <- length(names)

selected <- c()

# for every size from 1 to the total number of predictors


In [15]:
for (i in 1:n_total) {
    # for every predictor still not added yet
    accs <- list()
    models <- list()
    for (j in 1:length(names)) {
        preds_new <- c(selected, names[[j]])
        model_string <- paste("heart_disease", "~", paste(preds_new, collapse="+"))

        heart_recipe <- recipe(as.formula(model_string), 
                                data = heart_train) |>
                          step_scale(all_predictors()) |>
                          step_center(all_predictors())

     
        acc <- workflow() |>
          add_recipe(heart_recipe) |>
          add_model(knn_spec) |>
          tune_grid(resamples = heart_vfold, grid = 10) |>
          collect_metrics() |>
          filter(.metric == "accuracy") |>
          summarize(mx = max(mean))
        acc <- acc$mx |> unlist()

      
        accs[[j]] <- acc
        models[[j]] <- model_string
    }
    jstar <- which.max(unlist(accs))
    accuracies <- accuracies |> 
      add_row(size = i, 
              model_string = models[[jstar]], 
              accuracy = accs[[jstar]])
    selected <- c(selected, names[[jstar]])
    names <- names[-jstar]
}

print(accuracies)

[90m# A tibble: 13 × 3[39m
    size model_string                                                   accuracy
   [3m[90m<int>[39m[23m [3m[90m<chr>[39m[23m                                                             [3m[90m<dbl>[39m[23m
[90m 1[39m     1 heart_disease ~ oldpeak                                           0.657
[90m 2[39m     2 heart_disease ~ oldpeak+ca                                        0.757
[90m 3[39m     3 heart_disease ~ oldpeak+ca+thal                                   0.802
[90m 4[39m     4 heart_disease ~ oldpeak+ca+thal+cp                                0.816
[90m 5[39m     5 heart_disease ~ oldpeak+ca+thal+cp+age                            0.829
[90m 6[39m     6 heart_disease ~ oldpeak+ca+thal+cp+age+exang                      0.829
[90m 7[39m     7 heart_disease ~ oldpeak+ca+thal+cp+age+exang+slope                0.833
[90m 8[39m     8 heart_disease ~ oldpeak+ca+thal+cp+age+exang+slope+sex            0.838
[90m 9[39m     9 heart