# Regression Analysis

Load in required R packages

In [None]:
# Attach the packages used throughout the analysis.
library(stats)
library(dplyr)
library(caret)

# Install ROCR and ini only when they are not already available, so that
# re-running the notebook does not reinstall them (and need network access)
# on every execution.
if (!requireNamespace("ROCR", quietly = TRUE)) {
  install.packages("ROCR")
}
if (!requireNamespace("ini", quietly = TRUE)) {
  install.packages("ini")
}
library("ini")

# Read the raw study data from the working directory.
data = read.csv('StudyData.csv')

Relevant columns selected from data

In [None]:
# Keep only the columns needed for the regression analysis. The column order
# below is the order of the resulting data frame.
study_data <- select(
  data,
  # Identifier, recording context and test outcome
  participant_identifier,
  smoker_status,
  language,
  wearing_mask,
  covid_test_result,
  covid_test_method,
  # Demographics and recruitment
  ethnicity,
  age,
  gender,
  recruitment_source,
  submission_delay,
  # Self-reported symptoms
  symptom_fatigue,
  symptom_new_continuous_cough,
  symptom_diarrhoea,
  symptom_headache,
  symptom_runny_or_blocked_nose,
  symptom_sore_throat,
  symptom_fever_high_temperature,
  symptom_change_to_sense_of_smell_or_taste,
  symptom_shortness_of_breath,
  symptom_abdominal_pain,
  symptom_loss_of_taste,
  symptom_cough_any,
  symptom_other,
  # Respiratory conditions
  respiratory_condition_asthma,
  respiratory_condition_copd_or_emphysema,
  respiratory_condition_other,
  # Body measurements, location, viral load and vaccination
  height,
  weight,
  pseudonymised_local_authority_code,
  covid_viral_load_category,
  covid_vaccine_doses
)

Assign the appropriate R type (numeric or factor) to each variable

In [None]:
# Assign an R type to each analysis variable: numeric for the continuous
# measurements, factor for every categorical variable.

# --- Continuous variables ---------------------------------------------------
study_data$age <- as.numeric(study_data$age)

# Height: "Prefer not to say" is treated as missing, and the censored bucket
# "<=90" is recoded to 90 so the column can be converted to numeric.
study_data$height[study_data$height %in% "Prefer not to say"] <- NA
study_data$height[study_data$height %in% "<=90"] <- 90
study_data$height <- as.numeric(study_data$height)

# Weight: "Prefer not to say" is treated as missing.
study_data$weight[study_data$weight %in% "Prefer not to say"] <- NA
study_data$weight <- as.numeric(study_data$weight)

# --- Smoker status ----------------------------------------------------------
# The desired level order is passed to factor(). Assigning to levels() after
# as.factor() would instead *rename* the alphabetically sorted levels
# position by position and silently mislabel every category.
smoker_levels <- c(
  "Never smoked",
  "Ex-smoker",
  "Current smoker (e-cigarettes or vapes only)",
  "Current smoker (1 to 10 cigarettes per day)",
  "Current smoker (11 or more cigarettes per day)"
)
study_data$smoker_status[study_data$smoker_status %in% "Prefer not to say"] <- NA
# Any value outside smoker_levels becomes NA in factor(); report it rather
# than dropping it silently.
unexpected_smoker <- setdiff(
  unique(as.character(study_data$smoker_status)),
  c(smoker_levels, NA)
)
if (length(unexpected_smoker) > 0) {
  warning(
    "Unexpected smoker_status values set to NA: ",
    paste(unexpected_smoker, collapse = ", "),
    call. = FALSE
  )
}
study_data$smoker_status <- factor(study_data$smoker_status, levels = smoker_levels)

# --- Gender -----------------------------------------------------------------
# "Unknown" is treated as missing.
study_data$gender[study_data$gender %in% "Unknown"] <- NA
study_data$gender <- as.factor(study_data$gender)

# --- Remaining categorical variables ----------------------------------------
# symptom_runny_or_blocked_nose and symptom_sore_throat are included here:
# both are model terms, and the original code converted
# symptom_fever_high_temperature and symptom_headache twice instead.
factor_columns <- c(
  "language",
  "wearing_mask",
  "covid_test_method",
  "covid_test_result",
  "ethnicity",
  "symptom_cough_any",
  "symptom_new_continuous_cough",
  "symptom_shortness_of_breath",
  "symptom_abdominal_pain",
  "symptom_diarrhoea",
  "symptom_fatigue",
  "symptom_fever_high_temperature",
  "symptom_headache",
  "symptom_runny_or_blocked_nose",
  "symptom_sore_throat",
  "symptom_change_to_sense_of_smell_or_taste",
  "symptom_other",
  "symptom_loss_of_taste",
  "respiratory_condition_asthma",
  "respiratory_condition_copd_or_emphysema",
  "respiratory_condition_other",
  "covid_vaccine_doses",
  "pseudonymised_local_authority_code"
)
study_data[factor_columns] <- lapply(study_data[factor_columns], as.factor)

## Train Test Split

### Designed Train Test Split

Load the train/test split from the pickle (.pkl) file output by `Exploratory Analysis and Split Generation.ipynb`

In [None]:
# reticulate lets the Python helper script be sourced from R.
library(reticulate)

# Source the helper script, then use its read_pickle_file() to load the
# designed split saved as a pickle file.
source_python(file = "split_reader.py")
pickle_data <- read_pickle_file("original_split_stage1.pkl")

Set up train and test set objects

In [None]:
# Participant identifiers on each side of the designed split.
train_ids <- pickle_data$train
test_ids <- pickle_data$test

# Rows of the study data belonging to the training and test sets.
train <- subset(study_data, participant_identifier %in% train_ids)
test <- subset(study_data, participant_identifier %in% test_ids)

Logistic regression model with relevant variables and interaction terms

In [None]:
# Full logistic regression for the designed split, fitted on `train`.
# In an R formula `a * b` expands to `a + b + a:b`, so each product below
# adds the interaction on top of main effects that are already listed.
full_mod_designed <- glm(
  covid_test_result ~
    # Main effects
    smoker_status + age + gender +
    symptom_cough_any + symptom_new_continuous_cough +
    symptom_runny_or_blocked_nose + symptom_shortness_of_breath +
    symptom_sore_throat + symptom_abdominal_pain + symptom_diarrhoea +
    symptom_fatigue + symptom_fever_high_temperature + symptom_headache +
    symptom_change_to_sense_of_smell_or_taste + symptom_loss_of_taste +
    respiratory_condition_asthma + height + weight + covid_vaccine_doses +
    # Asthma x symptom interactions
    respiratory_condition_asthma * symptom_cough_any +
    respiratory_condition_asthma * symptom_new_continuous_cough +
    respiratory_condition_asthma * symptom_shortness_of_breath +
    respiratory_condition_asthma * symptom_diarrhoea +
    respiratory_condition_asthma * symptom_fatigue +
    respiratory_condition_asthma * symptom_headache +
    respiratory_condition_asthma * symptom_fever_high_temperature +
    respiratory_condition_asthma * symptom_change_to_sense_of_smell_or_taste +
    respiratory_condition_asthma * symptom_loss_of_taste +
    respiratory_condition_asthma * symptom_abdominal_pain +
    # Interactions among age, gender, height and weight
    age * gender + age * height + age * weight +
    gender * height + gender * weight +
    # Gender x symptom interactions
    gender * symptom_cough_any +
    gender * symptom_new_continuous_cough +
    gender * symptom_shortness_of_breath +
    gender * symptom_diarrhoea +
    gender * symptom_fatigue +
    gender * symptom_headache +
    gender * symptom_fever_high_temperature +
    gender * symptom_change_to_sense_of_smell_or_taste +
    gender * symptom_loss_of_taste +
    gender * symptom_abdominal_pain +
    # Age x symptom interactions
    age * symptom_cough_any +
    age * symptom_new_continuous_cough +
    age * symptom_shortness_of_breath +
    age * symptom_diarrhoea +
    age * symptom_fatigue +
    age * symptom_headache +
    age * symptom_fever_high_temperature +
    age * symptom_change_to_sense_of_smell_or_taste +
    age * symptom_loss_of_taste +
    age * symptom_abdominal_pain,
  family = "binomial",
  data = train
)

Stepwise model selection using the `step` function (with AIC as the model selection criterion)

In [None]:
# Reduce the full designed-split model with step(), called with its default
# settings.
red_mod_designed <- step(object = full_mod_designed)

In [None]:
# Store the summary of the reduced model and draw its diagnostic plots.
# NOTE(review): the variable `sum` shares its name with base::sum(); the name
# is kept because other code may refer to it.
sum <- summary(object = red_mod_designed)
plot(x = red_mod_designed)

Performance on training set

In [None]:
# ROC curve and AUC of the reduced designed-split model on the training set.
# Only rows without any missing value are scored.
train_roc <- na.omit(train)
library(ROCR)

# Score the rows with predict()'s default prediction type, then pair the
# scores with the observed test result for ROCR.
pred <- predict(red_mod_designed, newdata = train_roc)
pred <- prediction(pred, train_roc$covid_test_result)

# True-positive rate against false-positive rate, with the chance diagonal.
# The plot title must be passed as `main`; `title` is not a plot argument and
# left the figure untitled.
roc <- performance(pred, "tpr", "fpr")
plot(roc, lwd = 2, main = 'Accuracy on Training Set')
abline(a = 0, b = 1)

# Area under the ROC curve.
auc <- performance(pred, measure = "auc")
print(auc@y.values)

Performance on test set

In [None]:
# ROC curve and AUC of the reduced designed-split model on the test set.
# Only rows without any missing value are scored.
test_roc <- na.omit(test)
library(ROCR)

# Score the rows with predict()'s default prediction type, then pair the
# scores with the observed test result for ROCR.
pred <- predict(red_mod_designed, newdata = test_roc)
pred <- prediction(pred, test_roc$covid_test_result)

# True-positive rate against false-positive rate, with the chance diagonal.
# The title is passed as `main` (`title` is not a plot argument) and now
# names the test set rather than the training set.
roc <- performance(pred, "tpr", "fpr")
plot(roc, lwd = 2, main = 'Accuracy on Test Set')
abline(a = 0, b = 1)

# Area under the ROC curve.
auc <- performance(pred, measure = "auc")
print(auc@y.values)

### Other train test splits

Load other train test splits - s3 details retrieved from config file

In [None]:
# Read the S3 settings from the ini file and build the location of the
# train/test split file from its bucket and splits_path entries.
config_filepath <- "s3_config.ini"
config <- read.ini(config_filepath)
s3_path <- paste0("s3://", config$S3$bucket, "/", config$S3$splits_path)
region <- config$S3$region

In [None]:
# Install aws.s3 only when it is not already available, so that re-running
# the notebook does not reinstall it on every execution.
if (!requireNamespace("aws.s3", quietly = TRUE)) {
  install.packages("aws.s3")
}

# Set the AWS region from the config, then read the split file from S3 as CSV.
Sys.setenv("AWS_DEFAULT_REGION" = region)
train_test_splits <- aws.s3::s3read_using(read.csv, object = s3_path)

#### Matched Test Set

Evaluate the accuracy of the trained model on the matched test set

In [None]:
# ROC curve and AUC of the reduced designed-split model on the matched test
# set.

# read.csv() converts a column holding only True/False into a logical vector,
# in which case a comparison with the string 'True' never matches. Normalising
# through as.character()/toupper() accepts both the logical and the string
# form; %in% treats missing flags as "not in the matched set".
matched_flag <- toupper(as.character(train_test_splits$in_matched_original_test))
test_matched_ids <- train_test_splits$participant_identifier[matched_flag %in% "TRUE"]

# Only rows without any missing value are scored.
test_matched_roc <- study_data[study_data$participant_identifier %in% test_matched_ids, ]
test_matched_roc <- na.omit(test_matched_roc)

# Score the rows with predict()'s default prediction type, then pair the
# scores with the observed test result for ROCR.
pred <- predict(red_mod_designed, newdata = test_matched_roc)
pred <- prediction(pred, test_matched_roc$covid_test_result)

# True-positive rate against false-positive rate, with the chance diagonal.
# The title is passed as `main` (`title` is not a plot argument) and now
# names the matched test set rather than the training set.
roc <- performance(pred, "tpr", "fpr")
plot(roc, lwd = 2, main = 'Accuracy on Matched Test Set')
abline(a = 0, b = 1)

# Area under the ROC curve.
auc <- performance(pred, measure = "auc")
print(auc@y.values)

#### Randomised Train Test Split

The same stepwise selection procedure is used to train a logistic regression model on a randomised train-test split.

In [None]:
# Participant identifiers for the randomised split: rows labelled 'train' or
# 'val' form the training set, rows labelled 'test' form the test set.
train_random_ids <- train_test_splits$participant_identifier[
  train_test_splits$naive_splits == "train" |
    train_test_splits$naive_splits == "val"
]
test_random_ids <- train_test_splits$participant_identifier[
  train_test_splits$naive_splits == "test"
]

In [None]:
# Rows of the study data belonging to the randomised training and test sets.
train_random <- subset(study_data, participant_identifier %in% train_random_ids)
test_random <- subset(study_data, participant_identifier %in% test_random_ids)

In [None]:
# Full logistic regression for the randomised split, fitted on `train_random`.
# In an R formula `a * b` expands to `a + b + a:b`, so each product below
# adds the interaction on top of main effects that are already listed.
full_mod_random <- glm(
  covid_test_result ~
    # Main effects
    smoker_status + age + gender +
    symptom_cough_any + symptom_new_continuous_cough +
    symptom_runny_or_blocked_nose + symptom_shortness_of_breath +
    symptom_sore_throat + symptom_abdominal_pain + symptom_diarrhoea +
    symptom_fatigue + symptom_fever_high_temperature + symptom_headache +
    symptom_change_to_sense_of_smell_or_taste + symptom_loss_of_taste +
    respiratory_condition_asthma + height + weight + covid_vaccine_doses +
    # Asthma x symptom interactions
    respiratory_condition_asthma * symptom_cough_any +
    respiratory_condition_asthma * symptom_new_continuous_cough +
    respiratory_condition_asthma * symptom_shortness_of_breath +
    respiratory_condition_asthma * symptom_diarrhoea +
    respiratory_condition_asthma * symptom_fatigue +
    respiratory_condition_asthma * symptom_headache +
    respiratory_condition_asthma * symptom_fever_high_temperature +
    respiratory_condition_asthma * symptom_change_to_sense_of_smell_or_taste +
    respiratory_condition_asthma * symptom_loss_of_taste +
    respiratory_condition_asthma * symptom_abdominal_pain +
    # Interactions among age, gender, height and weight
    age * gender + age * height + age * weight +
    gender * height + gender * weight +
    # Gender x symptom interactions
    gender * symptom_cough_any +
    gender * symptom_new_continuous_cough +
    gender * symptom_shortness_of_breath +
    gender * symptom_diarrhoea +
    gender * symptom_fatigue +
    gender * symptom_headache +
    gender * symptom_fever_high_temperature +
    gender * symptom_change_to_sense_of_smell_or_taste +
    gender * symptom_loss_of_taste +
    gender * symptom_abdominal_pain +
    # Age x symptom interactions
    age * symptom_cough_any +
    age * symptom_new_continuous_cough +
    age * symptom_shortness_of_breath +
    age * symptom_diarrhoea +
    age * symptom_fatigue +
    age * symptom_headache +
    age * symptom_fever_high_temperature +
    age * symptom_change_to_sense_of_smell_or_taste +
    age * symptom_loss_of_taste +
    age * symptom_abdominal_pain,
  family = "binomial",
  data = train_random
)

In [None]:
# Reduce the full randomised-split model with step(), called with its default
# settings.
red_mod_random <- step(object = full_mod_random)

Evaluate randomised split logistic regression model on test set

In [None]:
# ROC curve and AUC of the reduced randomised-split model on its test set.
# Only rows without any missing value are scored.
test_random_roc <- na.omit(test_random)

# Score the rows with predict()'s default prediction type, then pair the
# scores with the observed test result for ROCR.
pred <- predict(red_mod_random, newdata = test_random_roc)
pred <- prediction(pred, test_random_roc$covid_test_result)

# True-positive rate against false-positive rate, with the chance diagonal.
# The plot title must be passed as `main`; `title` is not a plot argument and
# left the figure untitled.
roc <- performance(pred, "tpr", "fpr")
plot(roc, lwd = 2, main = 'Accuracy on Randomised Test Set')
abline(a = 0, b = 1)

# Area under the ROC curve.
auc <- performance(pred, measure = "auc")
print(auc@y.values)