In [1]:
library(tidyverse)
library(repr)
library(tidymodels)
options(repr.matrix.max.rows = 6)

set.seed(8205)

 # Reading in Heart Disease Data

In [2]:

#download.file("https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.switzerland.data",
            #  destfile = "./switzerland_data.data")

#read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/switzerland.data/processed.switzerland.data")

cleveland_data <- read_csv("data/processed.cleveland.data", col_names = FALSE) |>
    rename(age      = X1,      # Age in years
           sex      = X2,      # Sex (1 = male, 0 = female)
           cp       = X3,      # Chest pain type 
                               # - 1 : typical angina
                               # - 2 : atypical angina
                               # - 3 : non-anginal pain
                               # - 4 : asymptomatic
           trestbps = X4,      # Resting blood pressure (mmHg)
           chol     = X5,      # Serum cholestoral (mg/dL)
           fbs      = X6,      # Fasting blood sugar > 120 mg/dL (1 = true, 0 = false)
           restecg  = X7,      # Resting electrocardiographic results 
                               # - 0 : normal,
                               # - 1 : having ST-T wave abnormality (T wave inversions and/or ST 
                               #       elevation or depression of > 0.05 mV)
                               # - 2 : showing probable or definite left ventricular hypertrophy by Estes' criteria
           thalach  = X8,      # Maximum heart rate achieved
           exang    = X9,      # Exercise induced angina (0 = no, 1 = yes)
           oldpeak  = X10,     # ST depression induced by exercise relative to rest
           slope    = X11,     # The slope of the peak exercise ST segment
                               # - 1 : upsloping
                               # - 2 : flat
                               # - 3 : downsloping
           ca       = X12,     # Number of major vessels (0-3) colored by flourosopy
           thal     = X13,     # 3 = normal; 6 = fixed defect; 7 = reversable defect
           num      = X14)     # diagnosis of heart disease (angiographic disease status)

cleveland_data

[1mRows: [22m[34m303[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): X12, X13
[32mdbl[39m (12): X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X14

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>
63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0
56,1,2,120,236,0,0,178,0,0.8,1,0.0,3.0,0
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
57,0,4,140,241,0,0,123,1,0.2,2,0.0,7.0,1
45,1,1,110,264,0,0,132,0,1.2,2,0.0,7.0,1
68,1,4,144,193,1,0,141,0,3.4,2,2.0,7.0,2


# Data wrangling

In [3]:
cleveland_data <- cleveland_data |>
    mutate(age      = as.integer(age),
           sex      = as.factor(sex),
           cp       = as.factor(cp),
           trestbps = as.integer(trestbps),
           chol     = as.integer(chol),
           fbs      = as.factor(fbs),
           restecg  = as.factor(restecg),
           thalach  = as.integer(thalach),
           exang    = as.factor(exang),
           oldpeak  = as.double(oldpeak),
           slope    = as.factor(slope),
           ca       = as.integer(ca),
           thal     = as.factor(thal),
           num      = as.factor(num))

cleveland_data

“NAs introduced by coercion”


age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
<int>,<fct>,<fct>,<int>,<int>,<fct>,<fct>,<int>,<fct>,<dbl>,<fct>,<int>,<fct>,<fct>
63,1,1,145,233,1,2,150,0,2.3,3,0,6.0,0
67,1,4,160,286,0,2,108,1,1.5,2,3,3.0,2
67,1,4,120,229,0,2,129,1,2.6,2,2,7.0,1
37,1,3,130,250,0,0,187,0,3.5,3,0,3.0,0
41,0,2,130,204,0,2,172,0,1.4,1,0,3.0,0
56,1,2,120,236,0,0,178,0,0.8,1,0,3.0,0
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
57,0,4,140,241,0,0,123,1,0.2,2,0,7.0,1
45,1,1,110,264,0,0,132,0,1.2,2,0,7.0,1
68,1,4,144,193,1,0,141,0,3.4,2,2,7.0,2


In [4]:
# Splitting the data into training and testing sets
cleveland_split <- initial_split(cleveland_data, prop = 0.75, strata = num)
cleveland_training <- training(cleveland_split) 
cleveland_testing <- testing(cleveland_split)

# Exploratory data analysis

### Heart Disease Summary

In [5]:
target_summary <- cleveland_training |>
    group_by(num) |>
    summarize(count = n())
target_summary

num,count
<fct>,<int>
0,123
1,42
2,26
3,27
4,8


From the dataset, the majority of patients do not have any presence of heart disease. For those who have presence of a heart disease, most have **Type 1** whereas **Type 4** is the least common.

### Demographics

In [6]:
demographic_summary <- cleveland_training |>
    group_by(sex) |>
    summarize(count = n(), average_age = mean(age))
demographic_summary

sex,count,average_age
<fct>,<int>,<dbl>
0,73,56.06849
1,153,53.66667


The number of male double that of female participants. The average age for this dataset is around 54 years old. In addition, the average age of the male is slightly younger than female participants.

### Blood Pressure

In [7]:
average_restbps <- cleveland_training |>
    summarize(average_restbps = mean(trestbps))
average_restbps

average_restbps
<dbl>
132.3319


The average resting blood pressure of all participants is 132*mmHg*. Let's see how this would affect the type of heart disease.

In [8]:
average_restbps_by_type <- cleveland_training |>
    group_by(num) |>
    summarize(average_restbps = mean(trestbps))
average_restbps_by_type

num,average_restbps
<fct>,<dbl>
0,128.8293
1,135.7143
2,136.2692
3,136.5926
4,141.25


People with no presence of heart disease have lower average resting blood pressure. Those who do tend to have increasingly higher resting blood pressure with worse cases of heart disease.

### Serum Cholestoral

In [11]:
average_chol <- cleveland_training |>
    summarize(average_chol = mean(chol))
average_chol

average_chol
<dbl>
248.5


The average serum cholestoral in the data set is 248.5*mg/dL*. Let's see what's the average for each type of heart disease.

In [12]:
average_chol_by_type <- cleveland_training |>
    group_by(num) |>
    summarize(average_chol = mean(chol))
average_chol_by_type

num,average_chol
<fct>,<dbl>
0,247.122
1,249.5952
2,255.7692
3,248.3333
4,240.875


Patients with Type 2 heart disease have higher than average serum cholestoral whereas Type 4 tend to have less than average.

### Fasting Blood Sugar

In [40]:
#pull the number of instances of each type of heart disease
disease_count <- c(123, 42, 26, 27, 8)

fbs_summary <- cleveland_training |>
    group_by(num, fbs) |>
    summarize(count = n()) |>
    filter(fbs == 0) |> 
    bind_cols(type_count = disease_count) |>
    mutate(lower_than_proportion = count / type_count) |>
    select(num, lower_than_proportion)
fbs_summary

[1m[22m`summarise()` has grouped output by 'num'. You can override using the `.groups`
argument.


num,lower_than_proportion
<fct>,<dbl>
0,0.8699187
1,0.9047619
2,0.8076923
3,0.7777778
4,1.0


### Resting electrocardiographic

In [None]:
restecg_summary <- cleveland_training |>
    group_by(num, restecg) |>
    summarize(count = n()) |>
    filter(restecg == 0) |> 
    bind_cols(type_count = disease_count) |>
    mutate(normal_proportion = count / type_count) |>
    select(num, normal_proportion)
restecg_summary

[1m[22m`summarise()` has grouped output by 'num'. You can override using the `.groups`
argument.


num,normal_proportion
<fct>,<dbl>
0,0.5934959
1,0.4285714
2,0.5384615
3,0.3703704
4,0.125


### Maximum Heart Rate

In [63]:
average_thalach <- cleveland_training |>
    summarize(average_thalach = mean(thalach))
average_thalach

average_thalach
<dbl>
149.4867


In [64]:
average_thalach_by_type <- cleveland_training |>
    group_by(num) |>
    summarize(average_thalach = mean(thalach))
average_thalach_by_type

num,average_thalach
<fct>,<dbl>
0,158.4309
1,147.9762
2,131.9231
3,131.3333
4,138.25
