In [1]:
library(tidyverse)
library(repr)
library(tidymodels)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.2     ✔ purrr   1.0.1
✔ tibble  3.2.1     ✔ dplyr   1.1.1
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.3     ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──

✔ broom        1.0.2     ✔ rsample      1.1.1
✔ dials        1.1.0     ✔ tune         1.0.1
✔ infer        1.0.4     ✔ workflows    1.1.2
✔

In [2]:
# Load the processed Cleveland data file into a tibble. The file has no header
# row, so `col_names = FALSE` makes readr assign placeholder names (X1, X2, ...).
heart_data <- read_delim(
    file = "data/processed.cleveland.data",
    delim = ",",
    col_names = FALSE
)

Rows: 303 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): X12, X13
dbl (12): X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X14

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [3]:
# Assign a descriptive name to each column. The lookup is written as
# new name = "old placeholder name"; `all_of()` makes rename() fail loudly if
# any of the placeholder columns is missing.
heart_data <- heart_data |>
    rename(all_of(c(
        age = "X1",
        sex = "X2",
        cp = "X3",
        trestbps = "X4",
        chol = "X5",
        fbs = "X6",
        restecg = "X7",
        thalach = "X8",
        exang = "X9",
        oldpeak = "X10",
        slope = "X11",
        ca = "X12",
        thal = "X13",
        num = "X14"
    )))


In [4]:
# The raw file marks missing values with a literal "?"; turn every such cell
# into NA. `heart_data == "?"` is a logical mask over the whole table, and the
# `is.na<-` replacement form sets the masked cells to NA.
is.na(heart_data) <- heart_data == "?"


In [74]:
# Keep only the columns of interest (age, sex, chol and num); all other
# columns are dropped from `heart_data`.
heart_data <- heart_data |>
    select(age, sex, chol, num)


In [68]:
# Convert the numeric codes for `sex` and `num` into labelled factors.
#   sex: 1 -> "M", 0 -> "F"
#   num: 0 -> "H", 1-4 -> "D" (the four non-zero codes are merged into one
#        level; presumably H = healthy and D = diseased -- confirm against the
#        dataset documentation)
heart_data_clean <- mutate(
    heart_data,
    sex = fct_recode(as_factor(sex), "M" = "1", "F" = "0"),
    num = fct_recode(
        as_factor(num),
        "H" = "0",
        "D" = "1",
        "D" = "2",
        "D" = "3",
        "D" = "4"
    )
)

In [20]:
# Split the cleaned data into training (75%) and testing (25%) sets,
# stratified on `num` so both sets keep a similar class balance.

# initial_split() samples rows at random; fixing the RNG seed first makes the
# partition (and every result derived from it) identical on each run.
set.seed(2023)

heart_split <- initial_split(heart_data_clean, prop = 0.75, strata = num)
heart_training <- training(heart_split)
heart_testing <- testing(heart_split)

In [98]:
# Build a per-class (`num`) summary table of the training set: row count,
# share of all training rows, min/max/mean of age and chol, and the number of
# male and female patients.

# Number of male patients in the training set, per class.
male_count <- heart_training |>
    filter(sex == "M") |>
    group_by(num) |>
    summarize(male = n())

# Number of female patients in the training set, per class.
female_count <- heart_training |>
    filter(sex == "F") |>
    group_by(num) |>
    summarize(female = n())

# Combine the two count tables. The join key is stated explicitly so the join
# does not depend on (or print a message about) whichever columns happen to be
# shared.
sex_join <- full_join(male_count, female_count, by = "num")

# Total number of training rows, used as the denominator for `percentage`.
num_obs <- nrow(heart_training)

# Per-class count, percentage of the training set, and min/max/mean of each
# numeric predictor, with the male/female counts attached as extra columns.
heart_summary <- heart_training |>
    group_by(num) |>
    summarize(
        count = n(),
        percentage = n() / num_obs * 100,
        min_age = min(age),
        max_age = max(age),
        mean_age = mean(age),
        min_chol = min(chol),
        max_chol = max(chol),
        mean_chol = mean(chol)
    ) |>
    full_join(sex_join, by = "num")

heart_summary

[1m[22mJoining with `by = join_by(num)`
[1m[22mJoining with `by = join_by(num)`


num,count,percentage,min_age,max_age,mean_age,min_chol,max_chol,mean_chol,male,female
<fct>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>
H,123,54.18502,29,76,52.74797,126,564,241.2764,70,53
D,104,45.81498,35,77,56.40385,149,409,251.6731,84,20
