In [27]:
# Load Library

packages <- c("dplyr", "haven","ggplot2","MASS", "randomForest", "caret", "ranger", "parallel")

install_if_missing <- function(pkg) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

lapply(packages, install_if_missing)

select <- dplyr::select

# Maching Learning

In [34]:
df1 <- read.csv("Data/merge_df_cln.csv")
df1 <- df1 %>% 
        mutate(hyper = ifelse((bp_sys_mean >= 140 | bp_dia_mean >= 90), 1 , 0)) %>% 
        filter(htn_aware == "No") %>% #filter with Aware
        dplyr::select(-"X", -"bp_sys_mean", -"svy_id", -"bp_dia_mean", -"htn_aware")

colnames(df1)
# df1$hyper <- as.factor(df1$hyper)
# summary(df1$hyper)

## Random Forest

### Desrease size of majority

In [35]:
table(df1$hyper)

# Separate majority and minority classes
df1_majority <- df1 %>% filter(hyper == 0)
df1_minority <- df1 %>% filter(hyper == 1)

set.seed(42)
# Oversample minority class (duplicate existing data)
df1_majority_oversampled <- df1_majority %>% sample_n(nrow(df1_minority), replace = F)

# Combine balanced dataset
df1_balanced_decrease <- bind_rows(df1_minority, df1_majority_oversampled)

# Check new class distribution
table(df1_balanced_decrease$hyper)


    0     1 
29845  3680 


   0    1 
3680 3680 

In [36]:
# Change here for different resample method
df <- df1_balanced_decrease


# Split into training (70%) and testing (30%) sets
set.seed(42)
train_index <- createDataPartition(df$hyper, p = 0.7, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]

train_data$hyper <- as.factor(train_data$hyper)
test_data$hyper <- as.factor(test_data$hyper)

In [37]:
# Check available CPU cores
num_cores <- detectCores()
print(paste("Using", num_cores, "cores"))

# Train Random Forest classification model with multi-threading
rf_model <- ranger(hyper ~ ., data = train_data, 
                   num.trees = 500, 
                   mtry = sqrt(ncol(train_data) - 1), 
                   importance = "impurity",
                   probability = TRUE,  # Enables probability prediction
                   num.threads = num_cores)

# Predict class probabilities on the test set
predictions <- predict(rf_model, test_data)$predictions

# Convert probabilities to class labels
predicted_class <- ifelse(predictions[,2] > 0.5, "1", "0")  # Threshold at 0.5
predicted_class <- factor(predicted_class, levels = levels(test_data$hyper))

# Model Evaluation
conf_matrix <- confusionMatrix(predicted_class, test_data$hyper)

# Print Accuracy, Sensitivity, Specificity
print(conf_matrix)

# Variable Importance Plot
importance_values <- rf_model$variable.importance
sort(importance_values, decreasing = T)

[1] "Using 32 cores"
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 754 253
         1 350 851
                                          
               Accuracy : 0.7269          
                 95% CI : (0.7078, 0.7454)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.4538          
                                          
 Mcnemar's Test P-Value : 9.252e-05       
                                          
            Sensitivity : 0.6830          
            Specificity : 0.7708          
         Pos Pred Value : 0.7488          
         Neg Pred Value : 0.7086          
             Prevalence : 0.5000          
         Detection Rate : 0.3415          
   Detection Prevalence : 0.4561          
      Balanced Accuracy : 0.7269          
                                          
       'Positive' Class : 0               
         

### Increase size of minority (Bootstrap)

In [38]:
table(df1$hyper)

# Separate majority and minority classes
df1_majority <- df1 %>% filter(hyper == 0)
df1_minority <- df1 %>% filter(hyper == 1)

set.seed(24)
# Oversample minority class (duplicate existing data)
df1_minority_oversampled <- df1_minority %>% sample_n(nrow(df1_majority), replace = T)

# Combine balanced dataset
df1_balanced_bootstrap <- bind_rows(df1_majority, df1_minority_oversampled)

# Check new class distribution
table(df1_balanced_bootstrap$hyper)


    0     1 
29845  3680 


    0     1 
29845 29845 

In [39]:
# Change here for different resample method
df <- df1_balanced_bootstrap


# Split into training (70%) and testing (30%) sets
set.seed(42)
train_index <- createDataPartition(df$hyper, p = 0.7, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]

train_data$hyper <- as.factor(train_data$hyper)
test_data$hyper <- as.factor(test_data$hyper)

In [40]:
# Check available CPU cores
num_cores <- detectCores()
print(paste("Using", num_cores, "cores"))

# Train Random Forest classification model with multi-threading
rf_model <- ranger(hyper ~ ., data = train_data, 
                   num.trees = 500, 
                   mtry = sqrt(ncol(train_data) - 1), 
                   importance = "impurity",
                   probability = TRUE,  # Enables probability prediction
                   num.threads = num_cores)

# Predict class probabilities on the test set
predictions <- predict(rf_model, test_data)$predictions

# Convert probabilities to class labels
predicted_class <- ifelse(predictions[,2] > 0.5, "1", "0")  # Threshold at 0.5
predicted_class <- factor(predicted_class, levels = levels(test_data$hyper))

# Model Evaluation
conf_matrix <- confusionMatrix(predicted_class, test_data$hyper)

# Print Accuracy, Sensitivity, Specificity
print(conf_matrix)

# Variable Importance Plot
importance_values <- rf_model$variable.importance
sort(importance_values, decreasing = T)

[1] "Using 32 cores"
Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 8587   42
         1  366 8911
                                          
               Accuracy : 0.9772          
                 95% CI : (0.9749, 0.9794)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.9544          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.9591          
            Specificity : 0.9953          
         Pos Pred Value : 0.9951          
         Neg Pred Value : 0.9605          
             Prevalence : 0.5000          
         Detection Rate : 0.4796          
   Detection Prevalence : 0.4819          
      Balanced Accuracy : 0.9772          
                                          
       'Positive' Class : 0               
   

### Hybrid

In [74]:
# Step 1: Reduce the majority class (Under-sampling)
df1_majority <- df1 %>% filter(hyper == 0) %>% sample_n(nrow(df1_minority) * 1.5, replace = FALSE)

# Step 2: Increase the minority class (Over-sampling)
df1_minority_oversampled <- df1_minority %>% sample_n(nrow(df1_majority), replace = TRUE)

# Combine both
df1_balanced_hybrid <- bind_rows(df1_majority, df1_minority_oversampled)

# Check new class distribution
table(df1_balanced_hybrid$hyper)



   0    1 
5520 5520 

In [75]:
# Change here for different resample method
df <- df1_balanced_hybrid


# Split into training (70%) and testing (30%) sets
set.seed(24)
train_index <- createDataPartition(df$hyper, p = 0.7, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]

train_data$hyper <- as.factor(train_data$hyper)
test_data$hyper <- as.factor(test_data$hyper)

In [79]:
# Check available CPU cores
num_cores <- detectCores()
print(paste("Using", num_cores, "cores"))

# Train Random Forest classification model with multi-threading
rf_model <- ranger(hyper ~ ., data = train_data, 
                   num.trees = 1500, 
                   mtry = floor(sqrt(ncol(train_data) - 1) * 1.5), 
                   importance = "impurity",
                   probability = TRUE,
                   num.threads = parallel::detectCores())


# Predict class probabilities on the test set
predictions <- predict(rf_model, test_data)$predictions

# # Convert probabilities to class labels
# predicted_class <- ifelse(predictions[,2] > 0.5, "1", "0")  # Threshold at 0.5
# predicted_class <- factor(predicted_class, levels = levels(test_data$hyper))

# # Model Evaluation
# conf_matrix <- confusionMatrix(predicted_class, test_data$hyper)

# # Print Accuracy, Sensitivity, Specificity
# print(conf_matrix)

threshold <- 0.45  # Lower threshold slightly to increase sensitivity

predicted_class <- ifelse(predictions[,2] > threshold, "1", "0")  
predicted_class <- factor(predicted_class, levels = levels(test_data$hyper))

conf_matrix <- confusionMatrix(predicted_class, test_data$hyper)
print(conf_matrix)


# Variable Importance Plot
importance_values <- rf_model$variable.importance
sort(importance_values, decreasing = T)

[1] "Using 32 cores"
Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 1181  158
         1  475 1498
                                          
               Accuracy : 0.8089          
                 95% CI : (0.7951, 0.8221)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.6178          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.7132          
            Specificity : 0.9046          
         Pos Pred Value : 0.8820          
         Neg Pred Value : 0.7592          
             Prevalence : 0.5000          
         Detection Rate : 0.3566          
   Detection Prevalence : 0.4043          
      Balanced Accuracy : 0.8089          
                                          
       'Positive' Class : 0               
   

In [29]:
# Change here for different resample method
df <- df1


# Split into training (70%) and testing (30%) sets
set.seed(42)
train_index <- createDataPartition(df$hyper, p = 0.7, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]

train_data$hyper <- as.factor(train_data$hyper)
test_data$hyper <- as.factor(test_data$hyper)

### Desrease size of majority

In [30]:
table(train_data$hyper)

# Separate majority and minority classes
train_data_majority <- train_data %>% filter(hyper == 0)
train_data_minority <- train_data %>% filter(hyper == 1)

set.seed(42)
# Oversample minority class (duplicate existing data)
train_data_majority_oversampled <- train_data_majority %>% sample_n(nrow(train_data_minority), replace = F)

# Combine balanced dataset
train_data_decrease <- bind_rows(train_data_minority, train_data_majority_oversampled)

# Check new class distribution
table(train_data_decrease$hyper)


    0     1 
28409  6986 


   0    1 
6986 6986 

In [31]:
# Check available CPU cores
num_cores <- detectCores()
print(paste("Using", num_cores, "cores"))

# Train Random Forest classification model with multi-threading
rf_model <- ranger(hyper ~ ., data = train_data_decrease, 
                   num.trees = 1000, 
                   mtry = sqrt(ncol(train_data_decrease) - 1), 
                   importance = "impurity",
                   probability = TRUE,  # Enables probability prediction
                   num.threads = num_cores)

# Predict class probabilities on the test set
predictions <- predict(rf_model, test_data)$predictions

# Convert probabilities to class labels
predicted_class <- ifelse(predictions[,2] > 0.5, "1", "0")  # Threshold at 0.5
predicted_class <- factor(predicted_class, levels = levels(test_data$hyper))

# Model Evaluation
conf_matrix <- confusionMatrix(predicted_class, test_data$hyper)

# Print Accuracy, Sensitivity, Specificity
print(conf_matrix)

# Variable Importance Plot
importance_values <- rf_model$variable.importance
sort(importance_values, decreasing = T)

[1] "Using 32 cores"
Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 7472  601
         1 4684 2411
                                          
               Accuracy : 0.6516          
                 95% CI : (0.6439, 0.6592)
    No Information Rate : 0.8014          
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.275           
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.6147          
            Specificity : 0.8005          
         Pos Pred Value : 0.9256          
         Neg Pred Value : 0.3398          
             Prevalence : 0.8014          
         Detection Rate : 0.4926          
   Detection Prevalence : 0.5322          
      Balanced Accuracy : 0.7076          
                                          
       'Positive' Class : 0               
   

### Increase size of minority (Bootstrap)

In [32]:
table(train_data$hyper)

# Separate majority and minority classes
train_data_majority <- train_data %>% filter(hyper == 0)
train_data_minority <- train_data %>% filter(hyper == 1)

set.seed(24)
# Oversample minority class (duplicate existing data)
train_data_minority_oversampled <- train_data_minority %>% sample_n(nrow(train_data_majority), replace = T)

# Combine balanced dataset
train_data_bootstrap <- bind_rows(train_data_majority, train_data_minority_oversampled)

# Check new class distribution
table(train_data_bootstrap$hyper)


    0     1 
28409  6986 


    0     1 
28409 28409 

In [33]:
# Check available CPU cores
num_cores <- detectCores()
print(paste("Using", num_cores, "cores"))

# Train Random Forest classification model with multi-threading
rf_model <- ranger(hyper ~ ., data = train_data_bootstrap, 
                   num.trees = 500, 
                   mtry = sqrt(ncol(train_data_bootstrap) - 1), 
                   importance = "impurity",
                   probability = TRUE,  # Enables probability prediction
                   num.threads = num_cores)

# Predict class probabilities on the test set
predictions <- predict(rf_model, test_data)$predictions

# Convert probabilities to class labels
predicted_class <- ifelse(predictions[,2] > 0.5, "1", "0")  # Threshold at 0.5
predicted_class <- factor(predicted_class, levels = levels(test_data$hyper))

# Model Evaluation
conf_matrix <- confusionMatrix(predicted_class, test_data$hyper)

# Print Accuracy, Sensitivity, Specificity
print(conf_matrix)

# Variable Importance Plot
importance_values <- rf_model$variable.importance
sort(importance_values, decreasing = T)

[1] "Using 32 cores"
Confusion Matrix and Statistics

          Reference
Prediction     0     1
         0 11355  2375
         1   801   637
                                         
               Accuracy : 0.7906         
                 95% CI : (0.784, 0.7971)
    No Information Rate : 0.8014         
    P-Value [Acc > NIR] : 0.9996         
                                         
                  Kappa : 0.1812         
                                         
 Mcnemar's Test P-Value : <2e-16         
                                         
            Sensitivity : 0.9341         
            Specificity : 0.2115         
         Pos Pred Value : 0.8270         
         Neg Pred Value : 0.4430         
             Prevalence : 0.8014         
         Detection Rate : 0.7486         
   Detection Prevalence : 0.9052         
      Balanced Accuracy : 0.5728         
                                         
       'Positive' Class : 0              
                 

### Hybrid

In [24]:
# Step 1: Reduce the majority class (Under-sampling)
train_data_majority <- train_data %>% filter(hyper == 0) %>% sample_n(nrow(train_data_minority) * 1.5, replace = FALSE)

# Step 2: Increase the minority class (Over-sampling)
train_data_minority_oversampled <- train_data_minority %>% sample_n(nrow(train_data_majority), replace = TRUE)

# Combine both
train_data_hybrid <- bind_rows(train_data_majority, train_data_minority_oversampled)

# Check new class distribution
table(train_data_hybrid$hyper)



   0    1 
3913 3913 

In [26]:
# Check available CPU cores
num_cores <- detectCores()
print(paste("Using", num_cores, "cores"))

# Train Random Forest classification model with multi-threading
rf_model <- ranger(hyper ~ ., data = train_data_hybrid, 
                   num.trees = 1500, 
                   mtry = floor(sqrt(ncol(train_data_hybrid) - 1) * 1.5), 
                   importance = "impurity",
                   probability = TRUE,
                   num.threads = parallel::detectCores())


# Predict class probabilities on the test set
predictions <- predict(rf_model, test_data)$predictions

threshold <- 0.45  # Lower threshold slightly to increase sensitivity

predicted_class <- ifelse(predictions[,2] > threshold, "1", "0")  
predicted_class <- factor(predicted_class, levels = levels(test_data$hyper))

conf_matrix <- confusionMatrix(predicted_class, test_data$hyper)
print(conf_matrix)


# Variable Importance Plot
importance_values <- rf_model$variable.importance
sort(importance_values, decreasing = T)

[1] "Using 32 cores"
Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 6594  315
         1 2392  756
                                          
               Accuracy : 0.7308          
                 95% CI : (0.7221, 0.7395)
    No Information Rate : 0.8935          
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.2371          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.7338          
            Specificity : 0.7059          
         Pos Pred Value : 0.9544          
         Neg Pred Value : 0.2402          
             Prevalence : 0.8935          
         Detection Rate : 0.6557          
   Detection Prevalence : 0.6870          
      Balanced Accuracy : 0.7198          
                                          
       'Positive' Class : 0               
   