In [None]:
# Programming language: R
# Load data_gfp_zrinka.csv (comma-separated, first row holds the column names).
# NOTE(review): the path is absolute and machine-specific; it will not resolve
# on another machine.
data_gfp_zrinka <- read.csv(
    file = "/Users/ale/Documents/thesis_codon_bias/Testing_the_TEH/data/data_gfp_zrinka.csv",
    header = TRUE,
    sep = ","
)

In [None]:
# Show the column names as parsed by read.csv
colnames(data_gfp_zrinka)

In [None]:
# Keep columns 1, 2 and 4 only (sequence name, strain, fluorescence value).
# The selection is positional, so it depends on the column order of the CSV.
data_gfp_zrinka <- data_gfp_zrinka[c(1, 2, 4)]

In [None]:
# Average the fluorescence value within each (sequence name, strain) pair.
# The result has the columns Group.1, Group.2 and x.
data_gfp_zrinka <- aggregate(
    x = data_gfp_zrinka$fluorescence.value..AU.,
    by = list(
        data_gfp_zrinka$X...sequence_name.,
        data_gfp_zrinka$strain.
    ),
    FUN = mean
)

In [None]:
# Replace the generic aggregate() column names with descriptive ones
colnames(data_gfp_zrinka) <- c("sequence_name", "strain", "mean_fluorescence")

In [None]:
# Drop every row whose sequence_name is pET28b_empty
data_gfp_zrinka <- data_gfp_zrinka[!(data_gfp_zrinka$sequence_name == "pET28b_empty"), ]

In [None]:
# Preview the first six rows of the cleaned table
head(data_gfp_zrinka, n = 6L)

#### Random model for comparison

In [None]:
# Random model
# Baseline that ignores the sequences: copy the ground-truth table and attach
# uniformly random predictions to it.
random_model <- data_gfp_zrinka
# Fix the RNG seed so the baseline, and every metric computed from it, is
# reproducible from one run of the notebook to the next.
set.seed(42)
# generate random fluorescence values for each sequence and strain
random_model$predicted_level <- runif(nrow(random_model), min = 0, max = 40000)

In [None]:
head(random_model)

### Define metrics

In [None]:
# Correlation (cor() defaults, i.e. Pearson) between the ground-truth column
# and the `predicted_level` column of `model`.
#   model:  data frame holding `predicted_level` and the column named by gt_col
#   gt_col: name of the ground-truth column
correlation <- function(model, gt_col = "mean_fluorescence") {
    observed <- model[, gt_col]
    predicted <- model$predicted_level
    cor(x = observed, y = predicted)
}

In [None]:
# R-squared of the linear regression of the ground-truth column on
# `predicted_level`.
#   model:  data frame holding `predicted_level` and the column named by gt_col
#   gt_col: name of the ground-truth column
r_squared <- function(model, gt_col = "mean_fluorescence") {
    observed <- model[, gt_col]
    predicted <- model$predicted_level
    fit <- lm(observed ~ predicted)
    summary(fit)[["r.squared"]]
}

In [None]:
# Mean absolute error between the ground-truth column and `predicted_level`.
#   model:  data frame holding `predicted_level` and the column named by gt_col
#   gt_col: name of the ground-truth column
mae <- function(model, gt_col = "mean_fluorescence") {
    errors <- model[, gt_col] - model$predicted_level
    mean(abs(errors))
}

In [None]:
# Root mean squared error between the ground-truth column and `predicted_level`.
#   model:  data frame holding `predicted_level` and the column named by gt_col
#   gt_col: name of the ground-truth column
rmse <- function(model, gt_col = "mean_fluorescence") {
    errors <- model[, gt_col] - model$predicted_level
    mean_squared_error <- mean(errors^2)
    sqrt(mean_squared_error)
}

### Compute metrics

In [None]:
# Empty results table: one row per evaluated model is appended later.
# `model` is a character label; the four metric columns are numeric.
results <- data.frame(
    model = character(),
    correlation = numeric(),
    r_squared = numeric(),
    mae = numeric(),
    rmse = numeric(),
    stringsAsFactors = FALSE
)

# Random model

In [None]:
# Add the random model's metrics as a one-row data frame. Building the row with
# c() would coerce every metric to a character string and, on the still-empty
# results table, discard the column names.
results <- rbind(results, data.frame(
    model = "random",
    correlation = correlation(random_model),
    r_squared = r_squared(random_model),
    mae = mae(random_model),
    rmse = rmse(random_model),
    stringsAsFactors = FALSE
))

In [None]:
# The append above already yields these names; kept as a safeguard.
names(results) <- c("model", "correlation", "r_squared", "mae", "rmse")

In [None]:
results

# Efficacy model

#### Efficacy averaged

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
efficacy_model_avg <- read.csv("dataframes/efficacy/predicted_efficacy_averaged.csv", header = TRUE, sep = ",")
names(efficacy_model_avg)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy averaged",
    correlation = correlation(efficacy_model_avg),
    r_squared = r_squared(efficacy_model_avg),
    mae = mae(efficacy_model_avg),
    rmse = rmse(efficacy_model_avg),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy averaged K12 strain 

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
efficacy_model_avg_k12 <- read.csv("dataframes/efficacy/predicted_efficacy_averaged_k12.csv", header = TRUE, sep = ",")
names(efficacy_model_avg_k12)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy averaged, K12 strain",
    correlation = correlation(efficacy_model_avg_k12),
    r_squared = r_squared(efficacy_model_avg_k12),
    mae = mae(efficacy_model_avg_k12),
    rmse = rmse(efficacy_model_avg_k12),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy averaged BL21DE3 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
efficacy_model_avg_bl21 <- read.csv("dataframes/efficacy/predicted_efficacy_averaged_bl21.csv", header = TRUE, sep = ",")
names(efficacy_model_avg_bl21)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy averaged, BL21DE3 strain",
    correlation = correlation(efficacy_model_avg_bl21),
    r_squared = r_squared(efficacy_model_avg_bl21),
    mae = mae(efficacy_model_avg_bl21),
    rmse = rmse(efficacy_model_avg_bl21),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy averaged without outliers

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
efficacy_model_avg_no_outliers <- read.csv("dataframes/efficacy/predicted_efficacy_no_outliers_averaged.csv", header = TRUE, sep = ",")
names(efficacy_model_avg_no_outliers)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy averaged without outliers",
    correlation = correlation(efficacy_model_avg_no_outliers),
    r_squared = r_squared(efficacy_model_avg_no_outliers),
    mae = mae(efficacy_model_avg_no_outliers),
    rmse = rmse(efficacy_model_avg_no_outliers),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy averaged without outliers, K12 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
efficacy_model_avg_k12_no_outliers <- read.csv("dataframes/efficacy/predicted_efficacy_no_outliers_averaged_k12.csv", header = TRUE, sep = ",")
names(efficacy_model_avg_k12_no_outliers)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy averaged without outliers, K12 strain",
    correlation = correlation(efficacy_model_avg_k12_no_outliers),
    r_squared = r_squared(efficacy_model_avg_k12_no_outliers),
    mae = mae(efficacy_model_avg_k12_no_outliers),
    rmse = rmse(efficacy_model_avg_k12_no_outliers),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy averaged without outliers, BL21DE3 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
efficacy_model_avg_bl21_no_outliers <- read.csv("dataframes/efficacy/predicted_efficacy_no_outliers_averaged_bl21.csv", header = TRUE, sep = ",")
names(efficacy_model_avg_bl21_no_outliers)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy averaged without outliers, BL21DE3 strain",
    correlation = correlation(efficacy_model_avg_bl21_no_outliers),
    r_squared = r_squared(efficacy_model_avg_bl21_no_outliers),
    mae = mae(efficacy_model_avg_bl21_no_outliers),
    rmse = rmse(efficacy_model_avg_bl21_no_outliers),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy (not averaged)

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
efficacy_model <- read.csv("dataframes/efficacy/predicted_efficacy.csv", header = TRUE, sep = ",")
names(efficacy_model)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy (not averaged)",
    correlation = correlation(efficacy_model, "value"),
    r_squared = r_squared(efficacy_model, "value"),
    mae = mae(efficacy_model, "value"),
    rmse = rmse(efficacy_model, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy (not averaged) K12 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
efficacy_model_k12 <- read.csv("dataframes/efficacy/predicted_efficacy_k12.csv", header = TRUE, sep = ",")
names(efficacy_model_k12)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy (not averaged), K12 strain",
    correlation = correlation(efficacy_model_k12, "value"),
    r_squared = r_squared(efficacy_model_k12, "value"),
    mae = mae(efficacy_model_k12, "value"),
    rmse = rmse(efficacy_model_k12, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy (not averaged) BL21DE3 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
efficacy_model_bl21 <- read.csv("dataframes/efficacy/predicted_efficacy_bl21.csv", header = TRUE, sep = ",")
names(efficacy_model_bl21)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy (not averaged), BL21DE3 strain",
    correlation = correlation(efficacy_model_bl21, "value"),
    r_squared = r_squared(efficacy_model_bl21, "value"),
    mae = mae(efficacy_model_bl21, "value"),
    rmse = rmse(efficacy_model_bl21, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy (not averaged) without outliers

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
efficacy_model_no_outliers <- read.csv("dataframes/efficacy/predicted_efficacy_no_outliers.csv", header = TRUE, sep = ",")
names(efficacy_model_no_outliers)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy (not averaged) without outliers",
    correlation = correlation(efficacy_model_no_outliers, "value"),
    r_squared = r_squared(efficacy_model_no_outliers, "value"),
    mae = mae(efficacy_model_no_outliers, "value"),
    rmse = rmse(efficacy_model_no_outliers, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy (not averaged) without outliers K12 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
efficacy_model_k12_no_outliers <- read.csv("dataframes/efficacy/predicted_efficacy_no_outliers_k12.csv", header = TRUE, sep = ",")
names(efficacy_model_k12_no_outliers)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy (not averaged) without outliers, K12 strain",
    correlation = correlation(efficacy_model_k12_no_outliers, "value"),
    r_squared = r_squared(efficacy_model_k12_no_outliers, "value"),
    mae = mae(efficacy_model_k12_no_outliers, "value"),
    rmse = rmse(efficacy_model_k12_no_outliers, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy (not averaged) without outliers BL21DE3 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
efficacy_model_bl21_no_outliers <- read.csv("dataframes/efficacy/predicted_efficacy_no_outliers_bl21.csv", header = TRUE, sep = ",")
# Inspect the data frame that was just loaded (previously this listed the
# columns of efficacy_model_bl21 by mistake).
names(efficacy_model_bl21_no_outliers)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy (not averaged) without outliers, BL21DE3 strain",
    correlation = correlation(efficacy_model_bl21_no_outliers, "value"),
    r_squared = r_squared(efficacy_model_bl21_no_outliers, "value"),
    mae = mae(efficacy_model_bl21_no_outliers, "value"),
    rmse = rmse(efficacy_model_bl21_no_outliers, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

# Accuracy model

#### Accuracy averaged

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
accuracy_model_avg <- read.csv("dataframes/accuracy/accuracy_model_predictions_averaged.csv", header = TRUE, sep = ",")
names(accuracy_model_avg)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Accuracy averaged",
    correlation = correlation(accuracy_model_avg),
    r_squared = r_squared(accuracy_model_avg),
    mae = mae(accuracy_model_avg),
    rmse = rmse(accuracy_model_avg),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Accuracy averaged, K12 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
accuracy_model_avg_k12 <- read.csv("dataframes/accuracy/accuracy_model_predictions_averaged_k12.csv", header = TRUE, sep = ",")
names(accuracy_model_avg_k12)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Accuracy averaged, K12 strain",
    correlation = correlation(accuracy_model_avg_k12),
    r_squared = r_squared(accuracy_model_avg_k12),
    mae = mae(accuracy_model_avg_k12),
    rmse = rmse(accuracy_model_avg_k12),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Accuracy averaged, BL21DE3 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
accuracy_model_avg_bl21 <- read.csv("dataframes/accuracy/accuracy_model_predictions_averaged_bl21.csv", header = TRUE, sep = ",")
names(accuracy_model_avg_bl21)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Accuracy averaged, BL21DE3 strain",
    correlation = correlation(accuracy_model_avg_bl21),
    r_squared = r_squared(accuracy_model_avg_bl21),
    mae = mae(accuracy_model_avg_bl21),
    rmse = rmse(accuracy_model_avg_bl21),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Accuracy averaged without outliers

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
accuracy_model_avg_no_outliers <- read.csv("dataframes/accuracy/accuracy_model_predictions_averaged_no_outliers.csv", header = TRUE, sep = ",")
names(accuracy_model_avg_no_outliers)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Accuracy averaged without outliers",
    correlation = correlation(accuracy_model_avg_no_outliers),
    r_squared = r_squared(accuracy_model_avg_no_outliers),
    mae = mae(accuracy_model_avg_no_outliers),
    rmse = rmse(accuracy_model_avg_no_outliers),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Accuracy averaged without outliers, K12 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
accuracy_model_avg_k12_no_outliers <- read.csv("dataframes/accuracy/accuracy_model_predictions_averaged_no_outliers_k12.csv", header = TRUE, sep = ",")
names(accuracy_model_avg_k12_no_outliers)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Accuracy averaged without outliers, K12 strain",
    correlation = correlation(accuracy_model_avg_k12_no_outliers),
    r_squared = r_squared(accuracy_model_avg_k12_no_outliers),
    mae = mae(accuracy_model_avg_k12_no_outliers),
    rmse = rmse(accuracy_model_avg_k12_no_outliers),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Accuracy averaged without outliers, BL21DE3 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
accuracy_model_avg_bl21_no_outliers <- read.csv("dataframes/accuracy/accuracy_model_predictions_averaged_no_outliers_bl21.csv", header = TRUE, sep = ",")
names(accuracy_model_avg_bl21_no_outliers)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Accuracy averaged without outliers, BL21DE3 strain",
    correlation = correlation(accuracy_model_avg_bl21_no_outliers),
    r_squared = r_squared(accuracy_model_avg_bl21_no_outliers),
    mae = mae(accuracy_model_avg_bl21_no_outliers),
    rmse = rmse(accuracy_model_avg_bl21_no_outliers),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Accuracy (not averaged)

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
accuracy_model <- read.csv("dataframes/accuracy/accuracy_model_predictions.csv", header = TRUE, sep = ",")
names(accuracy_model)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Accuracy (not averaged)",
    correlation = correlation(accuracy_model, "value"),
    r_squared = r_squared(accuracy_model, "value"),
    mae = mae(accuracy_model, "value"),
    rmse = rmse(accuracy_model, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Accuracy (not averaged), K12 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
accuracy_model_k12 <- read.csv("dataframes/accuracy/accuracy_model_predictions_k12.csv", header = TRUE, sep = ",")
names(accuracy_model_k12)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Accuracy (not averaged), K12 strain",
    correlation = correlation(accuracy_model_k12, "value"),
    r_squared = r_squared(accuracy_model_k12, "value"),
    mae = mae(accuracy_model_k12, "value"),
    rmse = rmse(accuracy_model_k12, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Accuracy (not averaged), BL21DE3 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
accuracy_model_bl21 <- read.csv("dataframes/accuracy/accuracy_model_predictions_bl21.csv", header = TRUE, sep = ",")
names(accuracy_model_bl21)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Accuracy (not averaged), BL21DE3 strain",
    correlation = correlation(accuracy_model_bl21, "value"),
    r_squared = r_squared(accuracy_model_bl21, "value"),
    mae = mae(accuracy_model_bl21, "value"),
    rmse = rmse(accuracy_model_bl21, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Accuracy (not averaged) without outliers

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
accuracy_model_no_outliers <- read.csv("dataframes/accuracy/accuracy_model_predictions_no_outliers.csv", header = TRUE, sep = ",")
names(accuracy_model_no_outliers)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Accuracy (not averaged) without outliers",
    correlation = correlation(accuracy_model_no_outliers, "value"),
    r_squared = r_squared(accuracy_model_no_outliers, "value"),
    mae = mae(accuracy_model_no_outliers, "value"),
    rmse = rmse(accuracy_model_no_outliers, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Accuracy (not averaged) without outliers, K12 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
accuracy_model_no_outliers_k12 <- read.csv("dataframes/accuracy/accuracy_model_predictions_no_outliers_k12.csv", header = TRUE, sep = ",")
names(accuracy_model_no_outliers_k12)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Accuracy (not averaged) without outliers, K12 strain",
    correlation = correlation(accuracy_model_no_outliers_k12, "value"),
    r_squared = r_squared(accuracy_model_no_outliers_k12, "value"),
    mae = mae(accuracy_model_no_outliers_k12, "value"),
    rmse = rmse(accuracy_model_no_outliers_k12, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Accuracy (not averaged) without outliers, BL21DE3 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
accuracy_model_no_outliers_bl21 <- read.csv("dataframes/accuracy/accuracy_model_predictions_no_outliers_bl21.csv", header = TRUE, sep = ",")
names(accuracy_model_no_outliers_bl21)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Accuracy (not averaged) without outliers, BL21DE3 strain",
    correlation = correlation(accuracy_model_no_outliers_bl21, "value"),
    r_squared = r_squared(accuracy_model_no_outliers_bl21, "value"),
    mae = mae(accuracy_model_no_outliers_bl21, "value"),
    rmse = rmse(accuracy_model_no_outliers_bl21, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

# Efficacy - Accuracy model

#### Efficacy - accuracy averaged

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
effi_accu_model_avg <- read.csv("dataframes/efficacy_accuracy/data_ecoli_averaged.csv", header = TRUE, sep = ",")
names(effi_accu_model_avg)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy-Accuracy averaged",
    correlation = correlation(effi_accu_model_avg),
    r_squared = r_squared(effi_accu_model_avg),
    mae = mae(effi_accu_model_avg),
    rmse = rmse(effi_accu_model_avg),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy - accuracy averaged, K12 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
effi_accu_model_avg_k12 <- read.csv("dataframes/efficacy_accuracy/data_ecoli_averaged_K12.csv", header = TRUE, sep = ",")
names(effi_accu_model_avg_k12)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy-Accuracy averaged, K12 strain",
    correlation = correlation(effi_accu_model_avg_k12),
    r_squared = r_squared(effi_accu_model_avg_k12),
    mae = mae(effi_accu_model_avg_k12),
    rmse = rmse(effi_accu_model_avg_k12),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy - accuracy averaged, BL21DE3 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
effi_accu_model_avg_bl21 <- read.csv("dataframes/efficacy_accuracy/data_ecoli_averaged_BL21DE3.csv", header = TRUE, sep = ",")
names(effi_accu_model_avg_bl21)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy-Accuracy averaged, BL21DE3 strain",
    correlation = correlation(effi_accu_model_avg_bl21),
    r_squared = r_squared(effi_accu_model_avg_bl21),
    mae = mae(effi_accu_model_avg_bl21),
    rmse = rmse(effi_accu_model_avg_bl21),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy - accuracy averaged without outliers

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
effi_accu_model_avg_no_outliers <- read.csv("dataframes/efficacy_accuracy/data_ecoli_no_outliers_averaged.csv", header = TRUE, sep = ",")
names(effi_accu_model_avg_no_outliers)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy-Accuracy averaged without outliers",
    correlation = correlation(effi_accu_model_avg_no_outliers),
    r_squared = r_squared(effi_accu_model_avg_no_outliers),
    mae = mae(effi_accu_model_avg_no_outliers),
    rmse = rmse(effi_accu_model_avg_no_outliers),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy - accuracy averaged without outliers, K12 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
effi_accu_model_avg_no_outliers_k12 <- read.csv("dataframes/efficacy_accuracy/data_ecoli_no_outliers_averaged_K12.csv", header = TRUE, sep = ",")
names(effi_accu_model_avg_no_outliers_k12)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy-Accuracy averaged without outliers, K12 strain",
    correlation = correlation(effi_accu_model_avg_no_outliers_k12),
    r_squared = r_squared(effi_accu_model_avg_no_outliers_k12),
    mae = mae(effi_accu_model_avg_no_outliers_k12),
    rmse = rmse(effi_accu_model_avg_no_outliers_k12),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy - accuracy averaged without outliers, BL21DE3 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `mean_fluorescence` and `predicted_level` columns.
effi_accu_model_avg_no_outliers_bl21 <- read.csv("dataframes/efficacy_accuracy/data_ecoli_no_outliers_averaged_BL21DE3.csv", header = TRUE, sep = ",")
names(effi_accu_model_avg_no_outliers_bl21)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy-Accuracy averaged without outliers, BL21DE3 strain",
    correlation = correlation(effi_accu_model_avg_no_outliers_bl21),
    r_squared = r_squared(effi_accu_model_avg_no_outliers_bl21),
    mae = mae(effi_accu_model_avg_no_outliers_bl21),
    rmse = rmse(effi_accu_model_avg_no_outliers_bl21),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy - accuracy (not averaged)

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
effi_accu_model <- read.csv("dataframes/efficacy_accuracy/data_ecoli.csv", header = TRUE, sep = ",")
names(effi_accu_model)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy-Accuracy (not averaged)",
    correlation = correlation(effi_accu_model, "value"),
    r_squared = r_squared(effi_accu_model, "value"),
    mae = mae(effi_accu_model, "value"),
    rmse = rmse(effi_accu_model, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy - accuracy (not averaged), K12 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
effi_accu_model_k12 <- read.csv("dataframes/efficacy_accuracy/data_ecoli_K12.csv", header = TRUE, sep = ",")
names(effi_accu_model_k12)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy-Accuracy (not averaged), K12 strain",
    correlation = correlation(effi_accu_model_k12, "value"),
    r_squared = r_squared(effi_accu_model_k12, "value"),
    mae = mae(effi_accu_model_k12, "value"),
    rmse = rmse(effi_accu_model_k12, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy - accuracy (not averaged), BL21DE3 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
effi_accu_model_bl21 <- read.csv("dataframes/efficacy_accuracy/data_ecoli_BL21DE3.csv", header = TRUE, sep = ",")
names(effi_accu_model_bl21)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy-Accuracy (not averaged), BL21DE3 strain",
    correlation = correlation(effi_accu_model_bl21, "value"),
    r_squared = r_squared(effi_accu_model_bl21, "value"),
    mae = mae(effi_accu_model_bl21, "value"),
    rmse = rmse(effi_accu_model_bl21, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy - accuracy (not averaged) without outliers

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
effi_accu_model_no_outliers <- read.csv("dataframes/efficacy_accuracy/data_ecoli_no_outliers.csv", header = TRUE, sep = ",")
names(effi_accu_model_no_outliers)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy-Accuracy (not averaged) without outliers",
    correlation = correlation(effi_accu_model_no_outliers, "value"),
    r_squared = r_squared(effi_accu_model_no_outliers, "value"),
    mae = mae(effi_accu_model_no_outliers, "value"),
    rmse = rmse(effi_accu_model_no_outliers, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy - accuracy (not averaged) without outliers, K12 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
effi_accu_model_no_outliers_k12 <- read.csv("dataframes/efficacy_accuracy/data_ecoli_no_outliers_K12.csv", header = TRUE, sep = ",")
names(effi_accu_model_no_outliers_k12)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy-Accuracy (not averaged) without outliers, K12 strain",
    correlation = correlation(effi_accu_model_no_outliers_k12, "value"),
    r_squared = r_squared(effi_accu_model_no_outliers_k12, "value"),
    mae = mae(effi_accu_model_no_outliers_k12, "value"),
    rmse = rmse(effi_accu_model_no_outliers_k12, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results

#### Efficacy - accuracy (not averaged) without outliers, BL21DE3 strain

In [None]:
# Read this variant's CSV and show its column names. The metric functions
# read its `value` (ground truth) and `predicted_level` columns.
effi_accu_model_no_outliers_bl21 <- read.csv("dataframes/efficacy_accuracy/data_ecoli_no_outliers_BL21DE3.csv", header = TRUE, sep = ",")
names(effi_accu_model_no_outliers_bl21)

In [None]:
# Append the metrics as a one-row data frame; c() would coerce the numeric
# metrics to character strings.
results <- rbind(results, data.frame(
    model = "Efficacy-Accuracy (not averaged) without outliers, BL21DE3 strain",
    correlation = correlation(effi_accu_model_no_outliers_bl21, "value"),
    r_squared = r_squared(effi_accu_model_no_outliers_bl21, "value"),
    mae = mae(effi_accu_model_no_outliers_bl21, "value"),
    rmse = rmse(effi_accu_model_no_outliers_bl21, "value"),
    stringsAsFactors = FALSE
))

In [None]:
results