In [1]:
# Notebook setup: attach the packages used by the cells below.
library(tidyverse)   # readr, dplyr, tidyr, ggplot2, ... for loading and wrangling
library(repr)        # controls how objects are rendered in the notebook
library(tidymodels)  # rsample and friends for the train/test split
# Keep printed tables short: show at most 6 rows of any data frame.
options(repr.matrix.max.rows = 6)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.0.0 ──

[32m✔[39m [34mbroom       [39m 1.0.0     [32m✔[39m [34mrsample     [39m 1.0.0
[32m✔[39m [34mdials       [39m 1.0.0     [32m✔[39m [34mtune        [39m 1.0.0
[32m✔[39m [34minfer       [39m 1.0.2     [32m✔[39m [34mworkflows   [39m 1.0.0
[32m✔

Table 1: Tidy Dataset

In [2]:
# Load the processed Cleveland heart-disease data. The file has no header row,
# so it is read with placeholder names (X1..X14) and the 14 attribute names
# are assigned afterwards.
cleveland_data <- read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data",
        col_names = FALSE)

colnames(cleveland_data) <- c(
  "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
  "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"
)

# Keep the five numeric predictors plus `thal`, and turn `thal` into a labelled
# factor.
tidy_cleveland_data <- cleveland_data |>
  select(age, trestbps, chol, thalach, oldpeak, thal) |>
  mutate(
    # "?" is the file's missing-value marker. Among the selected columns only
    # `thal` is read as character, so it is the only one that can hold "?".
    # Applying na_if(., "?") to the double columns as well does nothing on
    # dplyr 1.0.x and errors on dplyr >= 1.1 (a character `y` cannot be cast
    # to double), so the replacement is restricted to `thal`.
    # https://dplyr.tidyverse.org/reference/na_if.html
    thal = na_if(thal, "?"),
    # Map the raw codes to readable labels in a single step. Levels are listed
    # so that the resulting factor levels come out in alphabetical order
    # (fixed_defect, normal, reversible_defect), matching what as.factor()
    # produced on the relabelled strings. NA stays NA; any code other than the
    # three listed here would also become NA.
    thal = factor(
      thal,
      levels = c("6.0", "3.0", "7.0"),
      labels = c("fixed_defect", "normal", "reversible_defect")
    )
  )
tidy_cleveland_data

[1mRows: [22m[34m303[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): X12, X13
[32mdbl[39m (12): X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X14

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


age,trestbps,chol,thalach,oldpeak,thal
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
63,145,233,150,2.3,fixed_defect
67,160,286,108,1.5,normal
67,120,229,129,2.6,reversible_defect
⋮,⋮,⋮,⋮,⋮,⋮
57,130,131,115,1.2,reversible_defect
57,130,236,174,0.0,normal
38,138,175,173,0.0,normal


In [3]:
# Fix the RNG seed so the random train/test partition is reproducible.
set.seed(234)

# Hold out 25% of the rows for testing, stratifying on `thal` so that both
# partitions have a similar mix of thal categories.
cleveland_split <- tidy_cleveland_data |>
  initial_split(prop = 0.75, strata = thal)

cleveland_train <- cleveland_split |> training()
cleveland_test <- cleveland_split |> testing()

# Display both partitions.
cleveland_train
cleveland_test

age,trestbps,chol,thalach,oldpeak,thal
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
63,145,233,150,2.3,fixed_defect
37,130,250,187,3.5,normal
41,130,204,172,1.4,normal
⋮,⋮,⋮,⋮,⋮,⋮
63,140,187,144,4.0,reversible_defect
68,144,193,141,3.4,reversible_defect
57,130,131,115,1.2,reversible_defect


age,trestbps,chol,thalach,oldpeak,thal
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
67,160,286,108,1.5,normal
67,120,229,129,2.6,reversible_defect
62,140,268,160,3.6,normal
⋮,⋮,⋮,⋮,⋮,⋮
57,140,241,123,0.2,reversible_defect
45,110,264,132,1.2,reversible_defect
57,130,236,174,0.0,normal


In [6]:
# Exploratory summaries, computed on the training set only.

# Mean of each numeric predictor (one-row table).
cleveland_table <- cleveland_train |>
  select(age:oldpeak) |>
  summarize(across(everything(), mean))
cleveland_table

# Number of training observations in each `thal` category (NA included).
cleveland_table_2 <- cleveland_train |>
  count(thal, name = "count")
cleveland_table_2

# Training rows that have a missing value in at least one column.
# https://www.easytweaks.com/select-rows-na-missing-values-r/#:~:text=In%20order%20to%20find%20rows,to%20subset%20the%20DataFrame%20rows.
cleveland_table_3 <- filter(cleveland_train, !complete.cases(cleveland_train))
cleveland_table_3

age,trestbps,chol,thalach,oldpeak
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
53.95595,131.5903,244.2952,150.5507,1.040088


thal,count
<fct>,<int>
fixed_defect,12
normal,126
reversible_defect,88
,1


age,trestbps,chol,thalach,oldpeak,thal
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
53,128,216,115,0,
