In [19]:
library(tidyverse)
library(repr)
library(infer)
library(cowplot)
library(broom)
library(GGally)
library(modelr)
library(car)
library(stats)

In [20]:
diabetes_data <- read_csv('data/diabetes.csv')
head (diabetes_data)

[1mRows: [22m[34m768[39m [1mColumns: [22m[34m9[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[32mdbl[39m (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
6,148,72,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
8,183,64,0,0,23.3,0.672,32,1
1,89,66,23,94,28.1,0.167,21,0
0,137,40,35,168,43.1,2.288,33,1
5,116,74,0,0,25.6,0.201,30,0


In [21]:
zero_counts <- sapply(diabetes_data[, -which(names(diabetes_data) == "Outcome")], function(x) sum(x == 0))
zero_counts

In [22]:
diabetes_data_clean <- diabetes_data %>%
  select(-SkinThickness) %>% # drop this column because it has too many null values
  filter(Glucose != 0 & BloodPressure != 0 & BMI != 0)
head(diabetes_data_clean)

Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
6,148,72,0,33.6,0.627,50,1
1,85,66,0,26.6,0.351,31,0
8,183,64,0,23.3,0.672,32,1
1,89,66,94,28.1,0.167,21,0
0,137,40,168,43.1,2.288,33,1
5,116,74,0,25.6,0.201,30,0


In [23]:
train_indices <- sample(seq_len(nrow(diabetes_data_clean)), size = 0.8 * nrow(diabetes_data_clean))
train_data <- diabetes_data_clean[train_indices, ]
test_data <- diabetes_data_clean[-train_indices, ]

# Using backward selection for an additive model

In [24]:
model_add <- glm(formula = Outcome ~ Age + Pregnancies + Glucose + BloodPressure + Insulin + BMI + DiabetesPedigreeFunction,
      data = diabetes_data_clean,
      family = binomial)

# use step to do backward selection
backward_model_add <- step(model_add, direction = "backward")

summary(backward_model_add)

Start:  AIC=687.13
Outcome ~ Age + Pregnancies + Glucose + BloodPressure + Insulin + 
    BMI + DiabetesPedigreeFunction

                           Df Deviance    AIC
- BloodPressure             1   672.60 686.60
- Insulin                   1   672.86 686.86
<none>                          671.13 687.13
- Age                       1   673.75 687.75
- DiabetesPedigreeFunction  1   682.13 696.13
- Pregnancies               1   683.45 697.45
- BMI                       1   708.85 722.85
- Glucose                   1   786.60 800.60

Step:  AIC=686.6
Outcome ~ Age + Pregnancies + Glucose + Insulin + BMI + DiabetesPedigreeFunction

                           Df Deviance    AIC
- Insulin                   1   673.93 685.93
- Age                       1   674.51 686.51
<none>                          672.60 686.60
- DiabetesPedigreeFunction  1   683.78 695.78
- Pregnancies               1   684.59 696.59
- BMI                       1   709.58 721.58
- Glucose                   1   786.69 798


Call:
glm(formula = Outcome ~ Age + Pregnancies + Glucose + BMI + DiabetesPedigreeFunction, 
    family = binomial, data = diabetes_data_clean)

Coefficients:
                          Estimate Std. Error z value Pr(>|z|)    
(Intercept)              -9.313824   0.753602 -12.359  < 2e-16 ***
Age                       0.014446   0.009500   1.521  0.12838    
Pregnancies               0.116028   0.033284   3.486  0.00049 ***
Glucose                   0.034783   0.003566   9.753  < 2e-16 ***
BMI                       0.086147   0.015035   5.730    1e-08 ***
DiabetesPedigreeFunction  0.969342   0.305185   3.176  0.00149 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 931.94  on 723  degrees of freedom
Residual deviance: 673.93  on 718  degrees of freedom
AIC: 685.93

Number of Fisher Scoring iterations: 5


In [34]:
# backward_model <- glm (Outcome ~ Age + Pregnancies + Glucose + BMI + DiabetesPedigreeFunction,
#                       data = diabetes_data_clean,
#                       family = binomial)

# backward_model <- glm(Outcome ~ Age + Pregnancies + Glucose + Insulin + BMI + DiabetesPedigreeFunction,
#                       data = diabetes_data_clean,
#                       family = binomial)

backward_model <- glm (Outcome ~ Age + Pregnancies + Glucose + BloodPressure + Insulin + BMI + DiabetesPedigreeFunction,
                      data = diabetes_data_clean,
                      family = binomial)

backward_model


Call:  glm(formula = Outcome ~ Age + Pregnancies + Glucose + BloodPressure + 
    Insulin + BMI + DiabetesPedigreeFunction, family = binomial, 
    data = diabetes_data_clean)

Coefficients:
             (Intercept)                       Age               Pregnancies  
               -9.056895                  0.015955                  0.116142  
                 Glucose             BloodPressure                   Insulin  
                0.036921                 -0.010528                 -0.001116  
                     BMI  DiabetesPedigreeFunction  
                0.094419                  1.008186  

Degrees of Freedom: 723 Total (i.e. Null);  716 Residual
Null Deviance:	    931.9 
Residual Deviance: 671.1 	AIC: 687.1

In [35]:
test_data_backward <- test_data %>%
  mutate(predicted_prob = predict(backward_model, newdata = ., type = "response"),
         predicted_class = ifelse(predicted_prob > 0.5, 1, 0))

test_data_backward

Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,predicted_prob,predicted_class
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
6,148,72,0,33.6,0.627,50,1,0.72082263,1
1,85,66,0,26.6,0.351,31,0,0.04159364,0
1,89,66,94,28.1,0.167,21,0,0.03564050,0
0,137,40,168,43.1,2.288,33,1,0.90848640,1
5,116,74,0,25.6,0.201,30,0,0.13307455,0
3,78,50,88,31.0,0.248,26,1,0.05409342,0
11,143,94,146,36.6,0.254,51,1,0.70542253,1
9,102,76,0,32.9,0.665,46,1,0.36933190,0
7,133,84,0,40.2,0.696,37,0,0.70471682,1
7,62,78,0,32.6,0.391,41,0,0.06601445,0


In [36]:
conf_matrix <- table(Predicted = test_data_backward$predicted_class, Actual = test_data_backward$Outcome)
accuracy <- mean(test_data_backward$predicted_class == test_data_backward$Outcome)

tidy (conf_matrix)
accuracy

“'tidy.table' is deprecated.
Use 'tibble::as_tibble()' instead.
See help("Deprecated")”


Predicted,Actual,n
<chr>,<chr>,<int>
0,0,83
1,0,13
0,1,19
1,1,30
