In [1]:
library(tidyverse)
library(repr)
library(infer)
library(cowplot)
library(broom)
library(GGally)
library(modelr)
library(car)
library(stats)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘cowplot’


The following object is masked from ‘package:lubridate’:

    stamp


Registered S3 method overwritten by 'GGally':
  method from   
  +.

In [2]:
diabetes_data <- read_csv('data/diabetes.csv')
head (diabetes_data)

[1mRows: [22m[34m768[39m [1mColumns: [22m[34m9[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[32mdbl[39m (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
6,148,72,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
8,183,64,0,0,23.3,0.672,32,1
1,89,66,23,94,28.1,0.167,21,0
0,137,40,35,168,43.1,2.288,33,1
5,116,74,0,0,25.6,0.201,30,0


In [3]:
diabetes_data_clean <- diabetes_data %>%
  select(-SkinThickness) %>% # drop this column because it has too many null values
  filter(Glucose != 0 & BloodPressure != 0 & BMI != 0)
head(diabetes_data_clean)

Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
6,148,72,0,33.6,0.627,50,1
1,85,66,0,26.6,0.351,31,0
8,183,64,0,23.3,0.672,32,1
1,89,66,94,28.1,0.167,21,0
0,137,40,168,43.1,2.288,33,1
5,116,74,0,25.6,0.201,30,0


In [4]:
train_indices <- sample(seq_len(nrow(diabetes_data_clean)), size = 0.8 * nrow(diabetes_data_clean))
train_data <- diabetes_data_clean[train_indices, ]
test_data <- diabetes_data_clean[-train_indices, ]

In [5]:
model <- glm(formula = Outcome ~ Age + Pregnancies + Glucose + Insulin + 
    BMI + DiabetesPedigreeFunction + Age:Pregnancies + Age:Glucose + 
    Age:Insulin, family = binomial, data = diabetes_data_clean)
model


Call:  glm(formula = Outcome ~ Age + Pregnancies + Glucose + Insulin + 
    BMI + DiabetesPedigreeFunction + Age:Pregnancies + Age:Glucose + 
    Age:Insulin, family = binomial, data = diabetes_data_clean)

Coefficients:
             (Intercept)                       Age               Pregnancies  
              -1.344e+01                 1.234e-01                 4.545e-01  
                 Glucose                   Insulin                       BMI  
               6.134e-02                -9.203e-03                 9.134e-02  
DiabetesPedigreeFunction           Age:Pregnancies               Age:Glucose  
               9.865e-01                -9.318e-03                -6.598e-04  
             Age:Insulin  
               2.360e-04  

Degrees of Freedom: 723 Total (i.e. Null);  714 Residual
Null Deviance:	    931.9 
Residual Deviance: 647.8 	AIC: 667.8

In [6]:
test_data <- test_data %>%
  mutate(predicted_prob = predict(model, newdata = ., type = "response"),
         predicted_class = ifelse(predicted_prob > 0.5, 1, 0))

test_data

Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,predicted_prob,predicted_class
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0,137,40,168,43.1,2.288,33,1,0.88186462,1
2,197,70,543,30.5,0.158,53,1,0.95051659,1
5,166,72,175,25.8,0.587,51,1,0.68556640,1
13,145,82,110,22.2,0.245,57,0,0.22741015,0
2,90,68,0,38.2,0.503,27,1,0.14200137,0
7,159,64,0,27.4,0.294,40,0,0.60321702,1
1,146,56,0,29.7,0.564,29,0,0.43905390,0
7,150,66,342,34.7,0.718,42,0,0.79427560,1
0,105,64,142,41.5,0.173,22,0,0.08196092,0
13,126,90,0,43.4,0.583,42,1,0.79264705,1


In [7]:
conf_matrix <- table(Predicted = test_data$predicted_class, Actual = test_data$Outcome)
accuracy <- mean(test_data$predicted_class == test_data$Outcome)

conf_matrix
accuracy

         Actual
Predicted  0  1
        0 83 22
        1  8 32

# Using forward selection

In [8]:
model_add <- glm(formula = Outcome ~ Age + Pregnancies + Glucose + BloodPressure + Insulin + BMI + DiabetesPedigreeFunction,
      data = diabetes_data_clean,
      family = binomial)

# use step to do forward selection
forward_model_int <- step(model_int, direction = "backward")

summary(forward_model_int)

ERROR: Error in eval(expr, envir, enclos): object 'model_int' not found


In [None]:
forward_model <- glm (Outcome ~ Age + Pregnancies + Glucose + BMI + DiabetesPedigreeFunction,
                      data = diabetes_data_clean,
                      family = binomial)

forward_model

In [None]:
test_data_forward <- test_data %>%
  mutate(predicted_prob = predict(forward_model, newdata = ., type = "response"),
         predicted_class = ifelse(predicted_prob > 0.5, 1, 0))

test_data_forward

In [None]:
conf_matrix <- table(Predicted = test_data$predicted_class, Actual = test_data$Outcome)
accuracy <- mean(test_data$predicted_class == test_data$Outcome)

conf_matrix
accuracy