
# HepB-LiveTest


Title: "The Development of a Machine Learning Algorithm for Early Detection of Viral Hepatitis B Infection in Nigerian patients" author: "Busayo Ajuwon" date: "17 August 2021" output:

pdf_document: default html_document: default word_document: default

```{r setup, message=FALSE}
library(ggplot2)
library(plotly)
library(psych)
library(rattle)   # fancyRpartPlot()
library(caret)
library(e1071)    # svm(), used for the SVM slice plots below
```

## MACHINE LEARNING EXPERIMENT

Name of dataset: `Allcases`

## READ THE DATASET

```{r}
library(readr)
Allcases <- read_csv("D:/PhD Studies@ANU/Predicting HBsAg  in Oz/Allcases.csv")
View(Allcases)
```

## DESCRIPTIVE ANALYSIS

Are there any NAs in the data?

```{r}
length(which(is.na(Allcases)))
```

How many positive and negative hepatitis cases are in the data set?

```{r}
Allcases$HBSA <- as.factor(Allcases$HBSA)
summary(Allcases$HBSA)
```

Summarize the data by group:

```{r}
by(Allcases, Allcases$HBSA, summary)
```

## DATA EXPLORATION

```{r}
ggplot(Allcases, aes(x = HBSA, fill = HBSA)) +
  geom_bar()
```

```{r}
# Continue with data exploration: box plots of each candidate marker by HBsAg status
featurePlot(x = Allcases[, 1:20],
            y = Allcases$HBSA,
            plot = "box",
            strip = strip.custom(par.strip.text = list(cex = .9)),
            scales = list(x = list(relation = "free"),
                          y = list(relation = "free")))
```

EXAMINE THEIR DENSITY PLOTS

```{r}
featurePlot(x = Allcases[, 1:20],
            y = Allcases$HBSA,
            plot = "density",
            strip = strip.custom(par.strip.text = list(cex = .7)),
            scales = list(x = list(relation = "free"),
                          y = list(relation = "free")))
```

Feature selection using recursive feature elimination (`rfe`):

```{r}
set.seed(100)
subsets <- c(1:5, 10, 15, 20)

ctrl <- rfeControl(functions = rfFuncs,
                   method = "repeatedcv",
                   repeats = 5,
                   verbose = FALSE)

lmProfile <- rfe(x = Allcases[, 1:20], y = Allcases$HBSA,
                 sizes = subsets,
                 rfeControl = ctrl)
lmProfile
```
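To make the RFE outcome easier to read, the retained markers and the resampling profile can be inspected directly; `predictors()` and the `plot()` method below are standard caret accessors for `rfe` objects (this chunk is an optional addition, not part of the original analysis):

```{r}
predictors(lmProfile)                # markers retained at the optimal subset size
plot(lmProfile, type = c("g", "o"))  # resampled performance across subset sizes
```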

## TRAINING AND TUNING THE MODEL

Train the model and interpret the results.

```{r}
set.seed(100)
# Step 1: Get row numbers for the training data
trainRowNumbers <- createDataPartition(Allcases$HBSA, p = 0.7, list = FALSE)

# Step 2: Create the training dataset
trainData <- Allcases[trainRowNumbers, ]

# Step 3: Create the test dataset
testData <- Allcases[-trainRowNumbers, ]
```
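Because `createDataPartition()` samples within each outcome class, the positive/negative ratio should be preserved in both splits. A quick sanity check (my addition):

```{r}
# Class proportions should be near-identical across the two splits
prop.table(table(trainData$HBSA))
prop.table(table(testData$HBSA))
```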

```{r}
# Store X and Y for later use.
x = trainData[, 1:20]
y = trainData$HBSA
```

Run the RF algorithm on the patient data and rank the predictive markers by importance. First, define the training control:

```{r}
fitControl <- trainControl(
  method = 'repeatedcv',             # repeated cross-validation
  number = 10,                       # number of folds
  repeats = 10,                      # number of repeats
  savePredictions = 'final',         # save predictions for the optimal tuning parameter
  classProbs = T,                    # return class probabilities
  summaryFunction = twoClassSummary  # results summary function
)
```

Hyperparameter tuning using `tuneLength`: I take the `train()` call from before and additionally set `tuneLength`, `trControl` and `metric`.

```{r}
# Tune hyperparameters by setting tuneLength
set.seed(100)
model_rf = train(HBSA ~ ., data = trainData, method = 'rf', tuneLength = 5,
                 trControl = fitControl,
                 metric = 'ROC')  # twoClassSummary reports ROC/Sens/Spec, so optimise on ROC
model_rf

# Step 2: Predict on testData
predicted <- predict(model_rf, testData)
head(predicted)

# Compute the confusion matrix
confusionMatrix(reference = testData$HBSA, data = predicted, mode = 'everything', positive = 'positive')
```
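Since `classProbs = TRUE`, predicted probabilities are available, so test-set discrimination can also be summarised as an AUROC. A minimal sketch, assuming the pROC package is installed and that the positive class level is literally named `positive` (as the `confusionMatrix()` call above implies):

```{r}
library(pROC)
probs <- predict(model_rf, testData, type = 'prob')  # per-class probabilities
roc_rf <- roc(response = testData$HBSA, predictor = probs[, 'positive'])
auc(roc_rf)   # area under the ROC curve on the held-out test set
plot(roc_rf)
```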

Compute variable importance:

```{r}
varimp_rf <- varImp(model_rf)
plot(varimp_rf, main = "Variable Importance with RF")
```

Establish decision thresholds using the top-ranked predictive markers from the random forest.

```{r}
set.seed(100)
model_dt = train(HBSA ~ AST + WBC + Age + ALT, data = trainData, method = 'rpart',
                 tuneLength = 5, trControl = fitControl)
model_dt

fancyRpartPlot(model_dt$finalModel, sub = '')
```
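The tree plot shows the split points graphically; to read the same decision thresholds as plain text rules, `rpart.plot::rpart.rules()` can be applied to the final model (an optional addition; it assumes the rpart.plot package is installed):

```{r}
library(rpart.plot)
rpart.rules(model_dt$finalModel)  # decision thresholds as readable rules
```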

```{r}
# Predict on test data
predicted <- predict(model_dt, testData)
head(predicted)

# Compute the confusion matrix
confusionMatrix(reference = testData$HBSA, data = predicted, mode = 'everything', positive = 'positive')
```

## Random Forest Analysis of HBsAg Immunoassay Results

```{r}
set.seed(100)
model_rf = train(HBSA ~ AST + WBC + Age + ALT, data = trainData, method = 'rf',
                 tuneLength = 5, trControl = fitControl)
model_rf

# Predict on testData
predicted <- predict(model_rf, testData)
head(predicted)

# Compute the confusion matrix
confusionMatrix(reference = testData$HBSA, data = predicted, mode = 'everything', positive = 'positive')
```

## Support Vector Machine (SVM) Analysis of HBsAg Immunoassay Results

Train the model using support vector machines:

```{r}
set.seed(100)
model_svmRadial = train(HBSA ~ AST + WBC + Age + ALT + ALB, data = trainData,
                        method = 'svmRadial', tuneLength = 5, trControl = fitControl)
model_svmRadial

# Predict on test data
predicted <- predict(model_svmRadial, testData)
head(predicted)

# Compute the confusion matrix
confusionMatrix(reference = testData$HBSA, data = predicted, mode = 'everything', positive = 'positive')
```

## SVM PLOTS: ALT Kinetics Associated with the Primary Predictor Variables AST and WBC

Introduce ALT as a static slice. These decision-boundary plots use `e1071::svm()` with a radial kernel (`kernel = 'radial'`); the model is fitted once and sliced at each ALT value.

**ALT = 20 U/L**

```{r}
m1 <- svm(HBSA ~ AST + WBC + ALT, data = testData, kernel = 'radial', gamma = 0.36, cost = 4)
plot(m1, testData, AST ~ WBC,
     slice = list(ALT = 20))
```

**ALT = 30 U/L**

```{r}
plot(m1, testData, AST ~ WBC,
     slice = list(ALT = 30))
```

**ALT = 50 U/L**

```{r}
plot(m1, testData, AST ~ WBC,
     slice = list(ALT = 50))
```

**ALT = 100 U/L**

```{r}
plot(m1, testData, AST ~ WBC,
     slice = list(ALT = 100))
```

**ALT = 1000 U/L**

```{r}
plot(m1, testData, AST ~ WBC,
     slice = list(ALT = 1000))
```

## Impact of Age on the SVM Prediction of HBsAg Immunoassay Result by AST and WBC

Introduce age as a static slice; the fitted model is sliced at each decade of age.

```{r}
m1 <- svm(HBSA ~ AST + WBC + Age, data = testData, kernel = 'radial', gamma = 0.36, cost = 4)

# Age = 15 years
plot(m1, testData, AST ~ WBC, slice = list(Age = 15))
# Age = 25 years
plot(m1, testData, AST ~ WBC, slice = list(Age = 25))
# Age = 35 years
plot(m1, testData, AST ~ WBC, slice = list(Age = 35))
# Age = 45 years
plot(m1, testData, AST ~ WBC, slice = list(Age = 45))
# Age = 55 years
plot(m1, testData, AST ~ WBC, slice = list(Age = 55))
# Age = 65 years
plot(m1, testData, AST ~ WBC, slice = list(Age = 65))
```

## WBC AND ALB KINETICS

Kinetics for the HBsAg-positive and HBsAg-negative cohorts across the age range investigated by the SVM and tree-based machine learning algorithms: comparison of mean WBC (A) and mean ALB (B) versus age at the time of testing.

```{r}
ggplot(aes(x = Age, y = WBC, color = HBSA), data = Allcases) +
  stat_summary(fun = mean, geom = "line") +
  ylab("Mean WBC (x 10^9/L)")

ggplot(aes(x = Age, y = ALB, color = HBSA), data = Allcases) +
  stat_summary(fun = mean, geom = "line") +
  ylab("Mean ALB (g/L)")
```

## RANDOM FOREST MODEL PERFORMANCE COMPARISON AFTER SUBSAMPLING

Subsampling experiments. First, model the original unbalanced data:

```{r}
fitControl <- trainControl(
  method = 'repeatedcv',
  number = 10,
  repeats = 10,
  savePredictions = 'final',
  classProbs = T,
  summaryFunction = twoClassSummary
)

set.seed(100)
model_rf = train(HBSA ~ AST + WBC + Age + ALT, data = trainData, method = 'rf',
                 tuneLength = 5, trControl = fitControl)
model_rf

# Predict on test data
predicted <- predict(model_rf, testData)
head(predicted)

cm_original <- confusionMatrix(reference = testData$HBSA, data = predicted, mode = 'everything', positive = 'positive')
cm_original
```
Undersampling the majority class:

```{r}
fitControl <- trainControl(
  method = 'repeatedcv',
  number = 10,
  repeats = 10,
  savePredictions = 'final',
  classProbs = T,
  summaryFunction = twoClassSummary,
  sampling = 'down'  # caret's built-in undersampling ('under' is not a valid option)
)

set.seed(100)
model_rf_under = train(HBSA ~ AST + WBC + Age + ALT, data = trainData, method = 'rf',
                       tuneLength = 5, trControl = fitControl)
model_rf_under

predicted <- predict(model_rf_under, testData)
head(predicted)

cm_under <- confusionMatrix(reference = testData$HBSA, data = predicted, mode = 'everything', positive = 'positive')
cm_under
```
Oversampling the minority class:

```{r}
fitControl <- trainControl(
  method = 'repeatedcv',
  number = 10,
  repeats = 10,
  savePredictions = 'final',
  classProbs = T,
  summaryFunction = twoClassSummary,
  sampling = 'up'
)

set.seed(100)
model_rf_over = train(HBSA ~ AST + WBC + Age + ALT, data = trainData, method = 'rf',
                      tuneLength = 5, trControl = fitControl)
model_rf_over

predicted <- predict(model_rf_over, testData)
head(predicted)

cm_over <- confusionMatrix(reference = testData$HBSA, data = predicted, mode = 'everything', positive = 'positive')
cm_over
```
Hybrid sampling with ROSE:

```{r}
fitControl <- trainControl(
  method = 'repeatedcv',
  number = 10,
  repeats = 10,
  savePredictions = 'final',
  classProbs = T,
  summaryFunction = twoClassSummary,
  sampling = 'rose'  # requires the ROSE package
)

set.seed(100)
model_rf_rose = train(HBSA ~ AST + WBC + Age + ALT, data = trainData, method = 'rf',
                      tuneLength = 5, trControl = fitControl)
model_rf_rose

predicted <- predict(model_rf_rose, testData)
head(predicted)

cm_rose <- confusionMatrix(reference = testData$HBSA, data = predicted, mode = 'everything', positive = 'positive')
cm_rose
```
Collect the four models and compare their resampling distributions:

```{r}
models <- list(original = model_rf,
               under = model_rf_under,
               over = model_rf_over,
               rose = model_rf_rose)

resampling <- resamples(models)
bwplot(resampling)
```
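As a numeric companion to the box-and-whisker plot, `summary()` on the `resamples` object tabulates the cross-validated ROC, sensitivity and specificity for each sampling strategy (an optional addition):

```{r}
summary(resampling)  # per-strategy distribution of resampled performance metrics
```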
Plot a final comparison figure of the test-set metrics.

```{r}
library(dplyr)

# Collect the test-set metrics from the four confusion matrices
comparison <- data.frame(model = names(models),
                         Sensitivity = rep(NA, length(models)),
                         Specificity = rep(NA, length(models)),
                         Precision = rep(NA, length(models)),
                         F1 = rep(NA, length(models)))

for (name in names(models)) {
  cm <- get(paste0("cm_", name))

  comparison[comparison$model == name, "Sensitivity"] <- cm$byClass[["Sensitivity"]]
  comparison[comparison$model == name, "Specificity"] <- cm$byClass[["Specificity"]]
  comparison[comparison$model == name, "Precision"]   <- cm$byClass[["Precision"]]
  comparison[comparison$model == name, "F1"]          <- cm$byClass[["F1"]]
}
```


```{r}
library(tidyr)
comparison %>%
  gather(x, y, Sensitivity:F1) %>%
  ggplot(aes(x = x, y = y, color = model)) +
  geom_jitter(width = 0.1, alpha = 0.3, size = 3)
```
