In [51]:
# Programming language: R
# read file data_zfp_zrinka.csv
data_gfp_zrinka <- read.csv("/Users/ale/Documents/thesis_codon_bias/Testing_the_TEH/data/data_gfp_zrinka.csv", header = TRUE, sep = ",")

In [52]:
names(data_gfp_zrinka)

In [53]:
# keep only 'X...sequence_name...', 'strain...' and 'fluorescence.value..AU.' columns
data_gfp_zrinka <- data_gfp_zrinka[,c(1,2,4)]

In [54]:
# group by (sequence name, strain) and calculate mean fluorescence value
data_gfp_zrinka <- aggregate(data_gfp_zrinka$fluorescence.value..AU., by = list(data_gfp_zrinka$X...sequence_name., data_gfp_zrinka$strain.), FUN = mean)

In [55]:
names(data_gfp_zrinka) <- c("sequence_name", "strain", "mean_fluorescence")

In [56]:
# remove rows where sequence_name is pET28b_empty
data_gfp_zrinka <- data_gfp_zrinka[data_gfp_zrinka$sequence_name != "pET28b_empty",]

In [57]:
head(data_gfp_zrinka)

Unnamed: 0_level_0,sequence_name,strain,mean_fluorescence
Unnamed: 0_level_1,<chr>,<chr>,<dbl>
1,V015-wildtype,BL21DE3,17098.38
2,V016-AnaCoda,BL21DE3,33182.5
3,V017-GeneGA,BL21DE3,33248.62
4,V018-JCAT,BL21DE3,32378.12
5,V019-Twist (Variant 1),BL21DE3,31469.88
6,V020-Twist (Variant 2),BL21DE3,26363.5


Random model for comparison

In [58]:
# Random model
random_model <- data_gfp_zrinka
# generate random fluorescence values for each sequence and strain
random_model$predicted_level <- runif(nrow(data_gfp_zrinka), min = 0, max = 40000)

In [59]:
head(random_model)

Unnamed: 0_level_0,sequence_name,strain,mean_fluorescence,predicted_level
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>
1,V015-wildtype,BL21DE3,17098.38,5327.0
2,V016-AnaCoda,BL21DE3,33182.5,24653.93
3,V017-GeneGA,BL21DE3,33248.62,10679.42
4,V018-JCAT,BL21DE3,32378.12,15449.1
5,V019-Twist (Variant 1),BL21DE3,31469.88,27054.39
6,V020-Twist (Variant 2),BL21DE3,26363.5,16778.21


## Define metrics

In [60]:
correlation <- function(model, gt_col = "mean_fluorescence"){
    cor(model[,gt_col], model$predicted_level)
}

In [61]:
r_squared <- function(model, gt_col = "mean_fluorescence"){
    summary(lm(model[,gt_col] ~ model$predicted_level))$r.squared
}

In [62]:
mae <- function(model, gt_col = "mean_fluorescence"){
    mean(abs(model[,gt_col] - model$predicted_level))
}

In [63]:
rmse <- function(model, gt_col = "mean_fluorescence"){
    sqrt(mean((model[,gt_col] - model$predicted_level)^2))
}

### Compute metrics

In [64]:
# create dataframe for storing results for each model
results <- data.frame(model = character(), correlation = numeric(), r_squared = numeric(), mae = numeric(), rmse = numeric(), stringsAsFactors = FALSE)

#### Random

In [65]:
# add results on all metrics for random model as new row in results dataframe
results <- rbind(results, c("random", correlation(random_model), r_squared(random_model), mae(random_model), rmse(random_model)))

In [66]:
names(results) <- c("model", "correlation", "r_squared", "mae", "rmse")

In [67]:
results

model,correlation,r_squared,mae,rmse
<chr>,<chr>,<chr>,<chr>,<chr>
random,-0.255847154044229,0.0654577662325313,14422.703458968,16521.1669432497


#### Efficacy with mean values

In [68]:
#read file data_gfp_zrinka_predicted_efficiency.csv
efficacy_model <- read.csv("data_gfp_zrinka_predicted_efficiency_averaged.csv", header = TRUE, sep = ",")
names(efficacy_model)

In [69]:
results <- rbind(results, c("efficacy averaged", correlation(efficacy_model), r_squared(efficacy_model), mae(efficacy_model), rmse(efficacy_model)))

In [70]:
results

model,correlation,r_squared,mae,rmse
<chr>,<chr>,<chr>,<chr>,<chr>
random,-0.255847154044229,0.0654577662325313,14422.703458968,16521.1669432497
efficacy averaged,0.697063277094304,0.485897212273451,6490.55924593995,7546.13134273863


#### Efficacy with values not averaged

In [71]:
#read file data_gfp_zrinka_predicted_efficiency.csv
efficacy_values_model <- read.csv("data_gfp_zrinka_predicted_efficiency.csv", header = TRUE, sep = ",")

In [72]:
results <- rbind(results, c("efficacy with values", correlation(efficacy_values_model, "value"), r_squared(efficacy_values_model, "value"), mae(efficacy_values_model, "value"), rmse(efficacy_values_model, "value")))

In [73]:
results

model,correlation,r_squared,mae,rmse
<chr>,<chr>,<chr>,<chr>,<chr>
random,-0.255847154044229,0.0654577662325313,14422.703458968,16521.1669432497
efficacy averaged,0.697063277094304,0.485897212273451,6490.55924593995,7546.13134273863
efficacy with values,0.66927686160308,0.447931517477269,6633.50162672675,7807.90200974659


#### Efficacy K12 strain set

In [74]:
# read file data_gfp_zrinka_predicted_accuracy.csv
efficacy_model_k12 <- read.csv("data_gfp_zrinka_predicted_efficiency_k12.csv", header = TRUE, sep = ",")

In [75]:
names(efficacy_model_k12)

In [76]:
results <- rbind(results, c("efficacy averaged k12", correlation(efficacy_model_k12, "value"), r_squared(efficacy_model_k12, "value"), mae(efficacy_model_k12, "value"), rmse(efficacy_model_k12, "value")))

In [77]:
results

model,correlation,r_squared,mae,rmse
<chr>,<chr>,<chr>,<chr>,<chr>
random,-0.255847154044229,0.0654577662325313,14422.703458968,16521.1669432497
efficacy averaged,0.697063277094304,0.485897212273451,6490.55924593995,7546.13134273863
efficacy with values,0.66927686160308,0.447931517477269,6633.50162672675,7807.90200974659
efficacy averaged k12,0.621820720146541,0.386661008003563,3530.59047248711,4479.10427947054


#### Efficiency BL21DE3 strain set

In [78]:
# read file data_gfp_zrinka_predicted_accuracy.csv
efficacy_model_bl21 <- read.csv("data_gfp_zrinka_predicted_efficiency_bl21.csv", header = TRUE, sep = ",")

In [79]:
names(efficacy_model_bl21)

In [80]:
results <- rbind(results, c("efficacy averaged bl21", correlation(efficacy_model_bl21, "value"), r_squared(efficacy_model_bl21, "value"), mae(efficacy_model_bl21, "value"), rmse(efficacy_model_bl21, "value")))

In [81]:
results

model,correlation,r_squared,mae,rmse
<chr>,<chr>,<chr>,<chr>,<chr>
random,-0.255847154044229,0.0654577662325313,14422.703458968,16521.1669432497
efficacy averaged,0.697063277094304,0.485897212273451,6490.55924593995,7546.13134273863
efficacy with values,0.66927686160308,0.447931517477269,6633.50162672675,7807.90200974659
efficacy averaged k12,0.621820720146541,0.386661008003563,3530.59047248711,4479.10427947054
efficacy averaged bl21,0.455880002455986,0.207826576639269,8522.23015539435,9266.04389652038


#### Accuracy with mean values

In [82]:
# read file data_gfp_zrinka_predicted_accuracy.csv
accuracy_model_averaged <- read.csv("dataframes/accuracy/accuracy_model_predictions_averaged.csv", header = TRUE, sep = ",")

In [83]:
names(accuracy_model_averaged)

In [84]:
results <- rbind(results, c("accuracy averaged", correlation(accuracy_model_averaged), r_squared(accuracy_model_averaged), mae(accuracy_model_averaged), rmse(accuracy_model_averaged)))

In [85]:
results

model,correlation,r_squared,mae,rmse
<chr>,<chr>,<chr>,<chr>,<chr>
random,-0.255847154044229,0.0654577662325313,14422.703458968,16521.1669432497
efficacy averaged,0.697063277094304,0.485897212273451,6490.55924593995,7546.13134273863
efficacy with values,0.66927686160308,0.447931517477269,6633.50162672675,7807.90200974659
efficacy averaged k12,0.621820720146541,0.386661008003563,3530.59047248711,4479.10427947054
efficacy averaged bl21,0.455880002455986,0.207826576639269,8522.23015539435,9266.04389652038
accuracy averaged,0.640268125334067,0.4099432723188,11296.3975019223,12265.0092866878


#### Accuracy with values not averaged

In [86]:
# read file data_gfp_zrinka_predicted_accuracy.csv
accuracy_model_values <- read.csv("dataframes/accuracy/accuracy_model_predictions.csv", header = TRUE, sep = ",")

In [87]:
results <- rbind(results, c("accuracy with values", correlation(accuracy_model_values, "value"), r_squared(accuracy_model_values, "value"), mae(accuracy_model_values, "value"), rmse(accuracy_model_values, "value")))

In [88]:
results

model,correlation,r_squared,mae,rmse
<chr>,<chr>,<chr>,<chr>,<chr>
random,-0.255847154044229,0.0654577662325313,14422.703458968,16521.1669432497
efficacy averaged,0.697063277094304,0.485897212273451,6490.55924593995,7546.13134273863
efficacy with values,0.66927686160308,0.447931517477269,6633.50162672675,7807.90200974659
efficacy averaged k12,0.621820720146541,0.386661008003563,3530.59047248711,4479.10427947054
efficacy averaged bl21,0.455880002455986,0.207826576639269,8522.23015539435,9266.04389652038
accuracy averaged,0.640268125334067,0.4099432723188,11296.3975019223,12265.0092866878
accuracy with values,0.614745684630434,0.377912256771741,11311.8415233569,12427.7788986802


#### Accuracy K12 strain set

In [89]:
# read file data_gfp_zrinka_predicted_accuracy.csv
accuracy_model_k12 <- read.csv("dataframes/accuracy/accuracy_model_predictions_averaged_k12.csv", header = TRUE, sep = ",")

In [90]:
names(accuracy_model_k12)

In [91]:
results <- rbind(results, c("accuracy averaged k12", correlation(accuracy_model_k12), r_squared(accuracy_model_k12), mae(accuracy_model_k12), rmse(accuracy_model_k12)))

In [92]:
results

model,correlation,r_squared,mae,rmse
<chr>,<chr>,<chr>,<chr>,<chr>
random,-0.255847154044229,0.0654577662325313,14422.703458968,16521.1669432497
efficacy averaged,0.697063277094304,0.485897212273451,6490.55924593995,7546.13134273863
efficacy with values,0.66927686160308,0.447931517477269,6633.50162672675,7807.90200974659
efficacy averaged k12,0.621820720146541,0.386661008003563,3530.59047248711,4479.10427947054
efficacy averaged bl21,0.455880002455986,0.207826576639269,8522.23015539435,9266.04389652038
accuracy averaged,0.640268125334067,0.4099432723188,11296.3975019223,12265.0092866878
accuracy with values,0.614745684630434,0.377912256771741,11311.8415233569,12427.7788986802
accuracy averaged k12,0.732455051411732,0.536490402338562,7410.13940683992,8064.8560830199


#### Accuracy no outliers

In [93]:
# read file data_gfp_zrinka_predicted_accuracy.csv
accuracy_model_no_outliers <- read.csv("dataframes/accuracy/accuracy_model_predictions_averaged_no_outliers.csv", header = TRUE, sep = ",")

In [94]:
names(accuracy_model_no_outliers)

In [95]:
results <- rbind(results, c("accuracy averaged no outliers", correlation(accuracy_model_no_outliers), r_squared(accuracy_model_no_outliers), mae(accuracy_model_no_outliers), rmse(accuracy_model_no_outliers)))

In [96]:
results

model,correlation,r_squared,mae,rmse
<chr>,<chr>,<chr>,<chr>,<chr>
random,-0.255847154044229,0.0654577662325313,14422.703458968,16521.1669432497
efficacy averaged,0.697063277094304,0.485897212273451,6490.55924593995,7546.13134273863
efficacy with values,0.66927686160308,0.447931517477269,6633.50162672675,7807.90200974659
efficacy averaged k12,0.621820720146541,0.386661008003563,3530.59047248711,4479.10427947054
efficacy averaged bl21,0.455880002455986,0.207826576639269,8522.23015539435,9266.04389652038
accuracy averaged,0.640268125334067,0.4099432723188,11296.3975019223,12265.0092866878
accuracy with values,0.614745684630434,0.377912256771741,11311.8415233569,12427.7788986802
accuracy averaged k12,0.732455051411732,0.536490402338562,7410.13940683992,8064.8560830199
accuracy averaged no outliers,0.658960894444605,0.434229460407232,11308.5388527419,12307.1090952363


In [98]:
# read file data_ecoli_grouped.csv
eff_acc_grouped <- read.csv("dataframes/efficacy_accuracy/data_ecoli_grouped.csv", header = TRUE, sep = ",")

In [99]:
names(eff_acc_grouped)

In [100]:
results <- rbind(results, c("efficacy & accuracy averaged", correlation(eff_acc_grouped), r_squared(eff_acc_grouped), mae(eff_acc_grouped), rmse(eff_acc_grouped)))

In [101]:
results

model,correlation,r_squared,mae,rmse
<chr>,<chr>,<chr>,<chr>,<chr>
random,-0.255847154044229,0.0654577662325313,14422.703458968,16521.1669432497
efficacy averaged,0.697063277094304,0.485897212273451,6490.55924593995,7546.13134273863
efficacy with values,0.66927686160308,0.447931517477269,6633.50162672675,7807.90200974659
efficacy averaged k12,0.621820720146541,0.386661008003563,3530.59047248711,4479.10427947054
efficacy averaged bl21,0.455880002455986,0.207826576639269,8522.23015539435,9266.04389652038
accuracy averaged,0.640268125334067,0.4099432723188,11296.3975019223,12265.0092866878
accuracy with values,0.614745684630434,0.377912256771741,11311.8415233569,12427.7788986802
accuracy averaged k12,0.732455051411732,0.536490402338562,7410.13940683992,8064.8560830199
accuracy averaged no outliers,0.658960894444605,0.434229460407232,11308.5388527419,12307.1090952363
efficacy & accuracy averaged,0.375656653188502,0.141117921084786,55683.1643693768,119537.591264089


#### Accuracy BL21DE3 strain set

In [97]:
# read file data_gfp_zrinka_predicted_accuracy.csv
accuracy_model_bl21 <- read.csv("dataframes/accuracy/accuracy_model_predictions_bl21.csv", header = TRUE, sep = ",")

In [48]:
names(accuracy_model_bl21)

In [50]:
results <- rbind(results, c("accuracy averaged bl21", correlation(accuracy_model_bl21), r_squared(accuracy_model_bl21), mae(accuracy_model_bl21), rmse(accuracy_model_bl21)))

ERROR: Error in `[.data.frame`(model, , gt_col): colonnes non d'efinies s'electionn'ees


In [None]:
results

model,correlation,r_squared,mae,rmse
<chr>,<chr>,<chr>,<chr>,<chr>
random,-0.111843267939443,0.012508916583374,11195.6141561934,13941.210317551
efficiency averaged,0.697063277094304,0.485897212273451,6490.55924593995,7546.13134273863
efficiency with values,0.66927686160308,0.447931517477269,6633.50162672675,7807.90200974659
efficacy averaged k12,0.73667383237626,0.542688335307926,3299.53221960326,3746.2178370665
efficacy averaged bl21,0.470697035275706,0.22155569901734,8432.92352284054,9113.90137859249
accuracy averaged,0.640268125334067,0.4099432723188,11296.3975019223,12265.0092866878
accuracy with values,0.614745684630434,0.377912256771741,11311.8415233569,12427.7788986802
accuracy averaged k12,0.732455051411732,0.536490402338562,7410.13940683992,8064.8560830199
accuracy averaged bl21,0.477728210294581,0.228224242911263,13661.9459076246,14226.9494770777


In [None]:
# read file data_gfp_zrinka_predicted_accuracy.csv
accuracy_model_averaged_no_outliers <- read.csv("dataframes/accuracy/accuracy_model_predictions_averaged_no_outliers.csv", header = TRUE, sep = ",")

In [None]:
results <- rbind(results, c("accuracy averaged no outliers", correlation(accuracy_model_averaged_no_outliers), r_squared(accuracy_model_averaged_no_outliers), mae(accuracy_model_averaged_no_outliers), rmse(accuracy_model_averaged_no_outliers)))

ERROR: Error in rbind(results, c("accuracy averaged no outliers", correlation(accuracy_model_averaged_no_outliers), : objeto 'results' no encontrado


In [None]:
results