## DSCI 100 007 Group 37

- #####  Preliminary exploratory data analysis

In [41]:
library(tidyverse)
library(ggplot2)
library(psych)
library(tidymodels)
options(repr.matrix.max.rows = 6)
library(repr)

##### Step 1 : read data from url

In [42]:
path <- "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.switzerland.data"
heart_disease_data <-read_csv(path)
heart_disease_data

[1m[22mNew names:
[36m•[39m `1` -> `1...2`
[36m•[39m `1` -> `1...3`
[36m•[39m `0` -> `0...5`
[36m•[39m `?` -> `?...6`
[36m•[39m `0` -> `0...7`
[36m•[39m `0` -> `0...9`
[36m•[39m `1` -> `1...11`
[36m•[39m `?` -> `?...12`
[36m•[39m `?` -> `?...13`
[36m•[39m `1` -> `1...14`
[1mRows: [22m[34m122[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (9): 95, ?...6, 0...7, 127, 0...9, .7, 1...11, ?...12, ?...13
[32mdbl[39m (5): 32, 1...2, 1...3, 0...5, 1...14

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


32,1...2,1...3,95,0...5,?...6,0...7,127,0...9,.7,1...11,?...12,?...13,1...14
<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
34,1,4,115,0,?,?,154,0,.2,1,?,?,1
35,1,4,?,0,?,0,130,1,?,?,?,7,3
36,1,4,110,0,?,0,125,1,1,2,?,6,1
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
72,1,3,160,0,?,2,114,0,1.6,2,2,?,0
73,0,3,160,0,0,1,121,0,0,1,?,3,1
74,1,2,145,0,?,1,123,0,1.3,1,?,?,1


##### Step 2 : Naming and filtering data

In [43]:
# Give each column a specific name from given dataset
heart_disease_data <- read_csv(path, col_names = FALSE) |> 
  set_names(c("age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach",
              "exang", "oldpeak", "slope", "ca", "thal", "num"))
heart_disease_data

[1mRows: [22m[34m123[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (9): X4, X6, X7, X8, X9, X10, X11, X12, X13
[32mdbl[39m (5): X1, X2, X3, X5, X14

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
32,1,1,95,0,?,0,127,0,.7,1,?,?,1
34,1,4,115,0,?,?,154,0,.2,1,?,?,1
35,1,4,?,0,?,0,130,1,?,?,?,7,3
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
72,1,3,160,0,?,2,114,0,1.6,2,2,?,0
73,0,3,160,0,0,1,121,0,0,1,?,3,1
74,1,2,145,0,?,1,123,0,1.3,1,?,?,1


In [44]:
# Now let's add a new column that includes the number of patients.
heart_disease_data <- heart_disease_data %>%
  mutate(patient_number = row_number()) %>%
  select(patient_number, everything())
heart_disease_data

patient_number,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
<int>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>
1,32,1,1,95,0,?,0,127,0,.7,1,?,?,1
2,34,1,4,115,0,?,?,154,0,.2,1,?,?,1
3,35,1,4,?,0,?,0,130,1,?,?,?,7,3
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
121,72,1,3,160,0,?,2,114,0,1.6,2,2,?,0
122,73,0,3,160,0,0,1,121,0,0,1,?,3,1
123,74,1,2,145,0,?,1,123,0,1.3,1,?,?,1


In [45]:
# We will use the select function to keep only the variables trestbps (#4) the  and cholesterol (#5), 
# and num (#14) the predicted heart disease outcome.
filltered_heart_disease_data <- heart_disease_data %>%
  select(1, 5, 10, 15)
filltered_heart_disease_data

patient_number,trestbps,exang,num
<int>,<chr>,<chr>,<dbl>
1,95,0,1
2,115,0,1
3,?,1,3
⋮,⋮,⋮,⋮
121,160,0,0
122,160,0,1
123,145,0,1


In [46]:
# We want to change column 14 to be only 0 (no to heart disease) and 1 (1 and above is yes to heart disease)
changed_num <- filltered_heart_disease_data %>%
  mutate(num = ifelse(num >= 1, 1, 0))
# remove all row with '?'
changed_num_1 <- changed_num |>
  filter_all(all_vars(. != "?"))
changed_num_1

patient_number,trestbps,exang,num
<int>,<chr>,<chr>,<dbl>
1,95,0,1
2,115,0,1
4,110,1,1
⋮,⋮,⋮,⋮
121,160,0,0
122,160,0,1
123,145,0,1


In [52]:
# final dataframe 
final_data_1 <- changed_num_1

final_data_1 <- final_data_1 |>
    mutate(num = as_factor(num))
final_data_1

patient_number,trestbps,exang,num
<int>,<chr>,<chr>,<fct>
1,95,0,1
2,115,0,1
4,110,1,1
⋮,⋮,⋮,⋮
121,160,0,0
122,160,0,1
123,145,0,1


In [53]:
# shows
head(final_data_1,10)

patient_number,trestbps,exang,num
<int>,<chr>,<chr>,<fct>
1,95,0,1
2,115,0,1
4,110,1,1
⋮,⋮,⋮,⋮
9,135,0,1
10,150,1,1
11,95,0,1


In [54]:
set.seed(9999)

heart_disease_split <- initial_split(final_data_1, prop = 0.75, strata = num)
heart_disease_training <- training(heart_disease_split)
heart_disease_testing <- testing(heart_disease_split)
heart_disease_training
heart_disease_testing

patient_number,trestbps,exang,num
<int>,<chr>,<chr>,<fct>
1,95,0,1
2,115,0,1
4,110,1,1
⋮,⋮,⋮,⋮
119,115,1,1
122,160,0,1
123,145,0,1


patient_number,trestbps,exang,num
<int>,<chr>,<chr>,<fct>
6,110,0,1
13,105,1,1
14,145,1,1
⋮,⋮,⋮,⋮
114,145,0,1
120,140,1,1
121,160,0,0


In [55]:
describe(heart_disease_training)

Unnamed: 0_level_0,vars,n,mean,sd,median,trimmed,mad,min,max,range,skew,kurtosis,se
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
patient_number,1,90,58.911111,34.7296604,56.5,58.361111,40.7715,1,123,122,0.1452813,-1.129056,3.66082764
trestbps*,2,90,8.177778,4.7960137,7.0,7.736111,4.4478,1,19,18,0.6693535,-0.365074,0.50554424
exang*,3,90,1.433333,0.4983118,1.0,1.416667,0.0,1,2,1,0.2645971,-1.9513,0.05252667
num*,4,90,1.933333,0.2508413,2.0,2.0,0.0,1,2,1,-3.4166507,9.782566,0.02644099
