In [1]:
# This script defines functions to build a Translational Efficiency Hypothesis (TEH) - Efficacy model 
# that predicts protein expression in E. coli. 
# The focus is on the cost that derives from elongation times.


#retrieve elongation rate data of 
# E. coli from published data (Shah and Gilchrist, 2010, Plos Genetics)
# and to derive a predicted protein expression rate for a given codon sequence from them. 

# Authors: 
# Victor Garcia, November 2022, Zurich University of Applied Sciences, 
# Institute for Computational Life Sciences
# Alejandra Lopez Sosa, April 2023, Zurich University of Applied Sciences,
# Institute for Chemistry and Biotechnology

# Data description

# Elongation rates of Escherichia coli 
# Elongation rates of E.coli are taken from: 
# https://storage.googleapis.com/plos-corpus-prod/10.1371/journal.pgen.1001128/1/pgen.1001128.s007.pdf?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=wombat-sa%40plos-prod.iam.gserviceaccount.com%2F20221129%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20221129T145404Z&X-Goog-Expires=86400&X-Goog-SignedHeaders=host&X-Goog-Signature=0212a3da2a2726d356ae2c98490e155bd3bfe782d9a83fc14a9d1b66fb0f1e028a6e1aaf011d3ed8e16214e0bbb7b73737143768e37c2af69d094fd2f03a2d653ad49185883551e8324642410a500367aa831184bfc50a9eb54a9f06c867ff372775698a384cc579bcb41246f6400ee8ece362a6b8a84fa7af039a217c85db361e8ac9d7736bfc9f07c07bb7bc7dbc0ea630746e5f33642451e27b8315e488196cf6651ea83200a70184b6ca3eb00560640388c3669c93283d281a3c7e49c457c064df62a37a630f235eb09c3409baccbc66bb280e01382a65e94c4ec1f196fcbbc4b8e825b53bc588d9bf109e1aa6a7fef6e53c1ee4a86f5eb7bae99072ca28
# From the publication of Shah and Gilchrist, 2010, Plos Genetics:
# https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1001128#s5
# Data are stored locally under the file name: codon_suberin/Rscripts/data/data_ecoli_shah2010plosgen.csv


# Protein expression levels measurements (empirical data)
# These data were generated by Marco Gees, Zrinka Raguz Nakic and Christin Peters 
# from the Institute of Biotechnology, ZHAW Wädenswil, during the years 2020-2022. 
# They are stored under the name Testing_the_TEH/data/data_gfp_zrinka.csv

### Preparations

In [None]:
# First we set the corresponding working directory

# setting working path
setwd("/Users/ale/Documents/thesis_codon_bias/Testing_the_TEH/Rscripts")

# Installing all necessary packages
install.packages("RColorBrewer")
install.packages("seqinr")
install.packages("VGAM")
install.packages("doSNOW")
install.packages("coda")
install.packages("EMCluster")
install.packages("Biostrings")
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
# Need 3.14 with R version 4.1
BiocManager::install(version = "3.14")
BiocManager::install(c("GeneGA"))
BiocManager::install("sscu")
install.packages("bioseq")
install.packages("GeneGA")
install.packages("AnaCoDa")

# load necessary pacakges
library("RColorBrewer")
library("seqinr")
library("VGAM")
library("doSNOW")
library("coda")
library("EMCluster")
library("Biostrings")
library("bioseq")
library("GeneGA")
library("AnaCoDa")

# Source the functions
source("sequence_conversions.R")

### Retrieve Data

In [3]:
# Retrieve Elongation Rates
# Read the elongation rates file
# Elongation rates are stored in file "data_ecoli_shah2010plosgen.csv" under $Rc
ecoli_codon_data = read.csv(file = "../data/data_ecoli_shah2010plosgen.csv", header = TRUE)

In [4]:
head(ecoli_codon_data)

Unnamed: 0_level_0,X...AA,Codon,Cognates,Pseudo..cognates,Near.cognates,Rc,Rn,X..M,X..N
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1,A,GCA,UGC,GGC,"UCC, UAC, UGA, UGU, UGG, UUC",21.496,0.055,0.00255,0.000146
2,A,GCC,"GGC, UGC",,"GAC, GGU, GUC, GCC, GGA, GGG",27.218,0.0586,0.00215,0.000115
3,A,GCG,UGC,GGC,"CGA, CGG, CCC, CGU",13.76,0.0211,0.00153,0.000228
4,A,GCU,"GGC, UGC",,,22.061,0.0,0.0,0.000143
5,C,UGC,GCA,,"GCU, GUA, GCC, GGA, GAA, CCA",7.163,0.0542,0.00751,0.000436
6,C,UGU,GCA,,"ACG, CCA",4.584,0.0216,0.00469,0.000683


In [5]:
# Assign elongation rates (units: amino acids per second [aa/s])
elongation_rates_ecoli <- ecoli_codon_data$Rc
names(elongation_rates_ecoli) <- mrna.string.to.seq.string(ecoli_codon_data$Codon)

In [6]:
# Assign elongation times
# Elongation times = 1/elongation rates (units: seconds per amino acids [s/aa])
elongation_times_ecoli <- 1 / elongation_rates_ecoli

In [7]:
# Retrieve Protein Abundances/Protein Expression Levels
# Load measured protein expression levels Zrinka
zrinka_data <- read.csv(file = "../data/data_gfp_zrinka.csv", header = TRUE)

In [8]:
# Assign measured protein expression levels Zrinka
colnames(zrinka_data) <- c("sequence_name", "strain", "time_of_measurement", "value", "outlier_status")
prot_expr_levs <- zrinka_data$value
names(prot_expr_levs) <- zrinka_data$sequence_name

In [9]:
# Retrieve sequences
# Read sequence constructs file
sequence_constructs <- read.csv("../data/sequence_names_mRNA.csv")

# test
seq_cand <- sequence_constructs$Sequence[1]
seq_cand

### Computing elongation times of a mRNA sequence

In [10]:
source("sequence_conversions.R")

# This function computes the total elongation time 
# of an mRNA sequence from the codons of that sequence (argument: sequence) 
# and the elongation times for each individual codon of said sequence.
# Elongation times are sourced from a reference table ("elong_time_table").
# Elongation times of all threes stop codons are excluded.
total.elongation.time.of.mRNA <- function(seq_cand, elong_time_table, verbose = FALSE) {
  codons_of_sequence <- seq.string.to.cod.string(seq_cand)
  L <- length(codons_of_sequence)
  
  if (verbose) {
    cat("length candidate sequence is:", L, '\n')
  }
  
  # Exclude the last codon (stop codon)
  codons <- codons_of_sequence[1:(L-1)]
  
  # Identify positions of codons in the elongation time table
  positions_in_table <- match(codons, names(elong_time_table))
  
  # Retrieve elongation times for codons
  elong_times <- elong_time_table[positions_in_table]
  
  if (verbose) {
    cat("Elongation times of the codons:", elong_times, '\n')
  }
  
  # Calculate total elongation time
  total_elon_time <- sum(elong_times)
  
  return(total_elon_time)
}

# Test
elong_time_table <- elongation_times_ecoli
seq_cand <- sequence_constructs$Sequence[1]
total.elongation.time.of.mRNA(seq_cand, elong_time_table)

In [11]:
# ORIGINAL
source("sequence_conversions.R")

# This function computes the total elongation time 
# of an mRNA sequence from the codons of that sequence (argument: sequence) 
# and the elongation times for each individual codon of said sequence.
# Elongation times are sourced from a reference table ("elong_time_table").
# Elongation times of all three stop codons are excluded.
total.elongation.time.of.mRNA <- function(seq_cand, elong_time_table, verbose = FALSE){

  # retrieve all the codons of the sequence
  # function "seq.string.to.cod.string" is sourced from "sequence_conversions.R".
  codons_of_sequence <- seq.string.to.cod.string(seq_cand)
  
  # length of the sequence in codons
  L <- length(codons_of_sequence)
  
  if(verbose){
    cat("length candidate sequence is:", L, '\n')
  }
  
  # total elongation time
  total_elon_time <- 0
  
  # loop through all codons one by one 
  # and retrieve the codon elongation time
  # adding it to the sum total each time
  # This procedure excludes the stop codon (L-1)
  for(i in 1:(L-1)){
    
    if(verbose){
      cat("looping through codon",i, '\n')
    }
    
    ith_codon <- codons_of_sequence[i]
    
    if(verbose){
      cat("the ith codon is", ith_codon, '\n')
    }
    
    # identify location of current codon in elongation time table
    position_of_i_in_table <- which(names(elong_time_table) == ith_codon)
    
    # retrieve the elongation time of the ith codon
    elong_time_of_i <- elong_time_table[position_of_i_in_table]
    
    if(verbose){
      cat("the elongation time of the ith codon is", elong_time_of_i, '\n')
    }
    
    # add the that elongation time to the sum total
    total_elon_time <- total_elon_time + elong_time_of_i
  }
  
  # return the sum total
  return(total_elon_time)
  
}

# test 
elong_time_table <- elongation_times_ecoli
seq_cand <- sequence_constructs$Sequence[1]
total.elongation.time.of.mRNA(seq_cand, elong_time_table)

### Creating a dataframe to collect all information

In [12]:
# We build a table with the all the information combined for ease of plotting

# Extract the identifier numbers from the protein expression data
n_cand_seqs <- length(zrinka_data$value)
names_prot_expr_levels <- zrinka_data$sequence_name
identifiers <- sapply(1:n_cand_seqs, function(x) substr(names_prot_expr_levels[x], 3, 4))
identifiers <- as.numeric(identifiers)

# Assing the controls to another variable so they can be removed later, since they carry no information on any sequence
ind_controls <- (1:n_cand_seqs)[is.na(identifiers)]

# Generate a data frame only for protein expression levels and after remove the controls
prot_expr_data_frame <- cbind(id = identifiers, zrinka_data)
prot_expr_data_frame_nocontrols <- prot_expr_data_frame[-ind_controls, ]
prot_expr_data_frame_nocontrols

# Extract the identifier numbers from the sequence data
n_cand_seqs_s <- length(sequence_constructs$Sequence)
names_constructs <- sequence_constructs$Construct.name
identifiers_s <- sapply(1:n_cand_seqs_s, function(x) substr(names_constructs[x], 3, 4))
identifiers_s <- as.numeric(identifiers_s)

# Add identifiers to the data frame of sequence constructs
sequence_constructs <- cbind(id =identifiers_s, sequence_constructs)

# Compute elongation times of all the sequences
elongation_times_sequences <- sapply(1:n_cand_seqs_s, function(x) total.elongation.time.of.mRNA(sequence_constructs$Sequence[x], elongation_times_ecoli))
names(elongation_times_sequences) <- names_constructs

# Add the elongation times to the data frame of sequence constructs
sequence_constructs <- cbind(sequence_constructs, elongation_times_sequences)
dim(sequence_constructs)

# Combine sequence constructs and protein expression levels data sets via the identifiers in a new data frame
n_prot_expr_meas <- dim(prot_expr_data_frame_nocontrols)[2]
n_rows <- dim(prot_expr_data_frame_nocontrols)[1]
n_cols <- (dim(prot_expr_data_frame_nocontrols)[2] + dim(sequence_constructs)[2] + 1)
total_entries <- n_rows*n_cols

# Build the empty data frame
all_data <- data.frame(matrix(vector(mode = 'numeric', length = total_entries), nrow = n_rows, ncol = n_cols))
#all_data <- data.frame(matrix(nrow = n_rows, ncol = n_cols))
dim(all_data)

# Fill in the protein expression data with the controls removed
all_data[1:n_rows, 1:dim(prot_expr_data_frame_nocontrols)[2]]<- prot_expr_data_frame_nocontrols

# Add the information on column names
colnames(all_data) <- c(colnames(prot_expr_data_frame_nocontrols), colnames(sequence_constructs), "predicted_protein_expression_level")
all_data

"NAs introducidos por coerci'on"


Unnamed: 0_level_0,id,sequence_name,strain,time_of_measurement,value,outlier_status
Unnamed: 0_level_1,<dbl>,<chr>,<chr>,<int>,<dbl>,<chr>
9,15,V015-wildtype,K12,20,14853.5,no
10,15,V015-wildtype,K12,20,14101.5,no
11,15,V015-wildtype,K12,20,15070.5,no
12,15,V015-wildtype,K12,20,14050.5,no
13,15,V015-wildtype,K12,20,14740.0,no
14,15,V015-wildtype,K12,20,14743.0,no
15,15,V015-wildtype,K12,20,13430.0,no
16,15,V015-wildtype,K12,20,11611.0,yes
17,16,V016-AnaCoda,K12,20,21446.5,no
18,16,V016-AnaCoda,K12,20,21939.5,no


id,sequence_name,strain,time_of_measurement,value,outlier_status,id,X...Organism,Construct.name,algorithm,Sequence,elongation_times_sequences,predicted_protein_expression_level
<dbl>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<dbl>.1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
15,V015-wildtype,K12,20,14853.5,no,0,0,0,0,0,0,0
15,V015-wildtype,K12,20,14101.5,no,0,0,0,0,0,0,0
15,V015-wildtype,K12,20,15070.5,no,0,0,0,0,0,0,0
15,V015-wildtype,K12,20,14050.5,no,0,0,0,0,0,0,0
15,V015-wildtype,K12,20,14740.0,no,0,0,0,0,0,0,0
15,V015-wildtype,K12,20,14743.0,no,0,0,0,0,0,0,0
15,V015-wildtype,K12,20,13430.0,no,0,0,0,0,0,0,0
15,V015-wildtype,K12,20,11611.0,yes,0,0,0,0,0,0,0
16,V016-AnaCoda,K12,20,21446.5,no,0,0,0,0,0,0,0
16,V016-AnaCoda,K12,20,21939.5,no,0,0,0,0,0,0,0


### Compute calibration constants for K12 and BL21DE3 strains via wildtype

In [13]:
# id = 15 is the wildtype
id_wt <- which(prot_expr_data_frame_nocontrols$id == 15)
id_wt_seq <- which(sequence_constructs$id == 15)

# Write a function to calculate alpha values
calculate.alpha <- function(strain, prot_expr_data, sequence_data, id_wt_seq) {
  id_strain <- which(prot_expr_data$strain == strain)
  id_wt_strain <- intersect(id_strain, id_wt)
  avg_wt_prot_expr_strain <- mean(prot_expr_data$value[id_wt_strain])
  alpha <- avg_wt_prot_expr_strain * sequence_data$elongation_times_sequences[id_wt_seq]
  return(alpha)
}

# Calculate alpha values for K12 and BL21DE3
alpha_k12 <- calculate.alpha("K12", prot_expr_data_frame_nocontrols, sequence_constructs, id_wt_seq)
alpha_b21 <- calculate.alpha("BL21DE3", prot_expr_data_frame_nocontrols, sequence_constructs, id_wt_seq)
alpha_k12
alpha_b21

### Predict protein expression levels

In [14]:
# Find row indices in sequence constructs for all_data rows
row_ids_seq_constr <- match(all_data$id, sequence_constructs$id)

# Predict protein expression levels and store in all_data
predicted_prot_expr_levels <- ifelse(
  all_data$strain == "K12", 
  alpha_k12 / sequence_constructs$elongation_times_sequences[row_ids_seq_constr],
  ifelse(
    all_data$strain == "BL21DE3", 
    alpha_b21 / sequence_constructs$elongation_times_sequences[row_ids_seq_constr], 
    0
  )
)

all_data[, (n_prot_expr_meas + 1):n_cols] <- cbind(sequence_constructs[row_ids_seq_constr, ], predicted_prot_expr_levels)
all_data

id,sequence_name,strain,time_of_measurement,value,outlier_status,id,X...Organism,Construct.name,algorithm,Sequence,elongation_times_sequences,predicted_protein_expression_level
<dbl>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<dbl>.1,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
15,V015-wildtype,K12,20,14853.5,no,15,AtCAD5,V015_AtCAD5_wt,wt,ATGGGAATAATGGAGGCAGAGAGGAAAACAACAGGCTGGGCTGCCAGAGACCCATCTGGCATCCTCTCTCCTTACACTTACACTCTTAGAGAGACTGGACCAGAGGATGTGAACATAAGAATCATTTGCTGTGGAATCTGCCACACCGATCTTCATCAAACTAAAAATGATCTTGGCATGTCTAATTACCCCATGGTTCCTGGGCATGAAGTGGTAGGGGAAGTAGTGGAGGTGGGATCAGATGTGAGCAAGTTCACCGTAGGGGACATAGTTGGAGTTGGTTGCCTCGTTGGATGTTGCGGAGGTTGTAGCCCCTGCGAGAGAGATCTGGAACAGTATTGTCCAAAGAAGATTTGGAGCTACAATGATGTTTACATCAATGGTCAACCTACACAAGGCGGCTTCGCTAAAGCCACCGTCGTTCACCAAAAGTTTGTGGTCAAGATTCCAGAAGGAATGGCGGTTGAGCAGGCTGCGCCGCTACTGTGCGCTGGTGTGACTGTGTACAGTCCACTGAGCCACTTTGGTCTGAAACAACCAGGCCTAAGAGGAGGTATACTAGGGTTAGGTGGAGTCGGTCACATGGGTGTGAAAATAGCCAAAGCAATGGGTCACCATGTGACTGTCATAAGCTCATCAAACAAGAAGAGAGAAGAGGCATTGCAAGATCTTGGAGCTGATGATTACGTGATCGGTTCCGACCAAGCGAAGATGAGCGAATTGGCTGATTCGTTGGATTACGTAATTGACACGGTGCCTGTTCATCATGCACTTGAGCCATATTTGTCTCTGCTTAAGCTTGATGGTAAACTCATTCTCATGGGAGTTATCAACAATCCATTACAGTTTCTCACTCCTCTGCTTATGCTTGGGAGGAAAGTGATAACGGGGAGCTTCATAGGGAGCATGAAGGAGACAGAGGAGATGCTTGAGTTCTGTAAAGAAAAGGGTTTGAGTTCGATTATCGAAGTTGTGAAGATGGATTATGTTAACACTGCGTTTGAGAGACTTGAGAAGAACGATGTGCGTTATAGGTTCGTCGTTGATGTCGAAGGAAGCAATCTCGACGCTTTAATTGGCTCCGATGGAGGGTCTGGTGGCGGATCAACAAGTCGTGACCACATGGTCCTTCATGAGTACGTAAATGCTGCTGGGATTACATGA,29.15669,14075.00
15,V015-wildtype,K12,20,14101.5,no,15,AtCAD5,V015_AtCAD5_wt,wt,ATGGGAATAATGGAGGCAGAGAGGAAAACAACAGGCTGGGCTGCCAGAGACCCATCTGGCATCCTCTCTCCTTACACTTACACTCTTAGAGAGACTGGACCAGAGGATGTGAACATAAGAATCATTTGCTGTGGAATCTGCCACACCGATCTTCATCAAACTAAAAATGATCTTGGCATGTCTAATTACCCCATGGTTCCTGGGCATGAAGTGGTAGGGGAAGTAGTGGAGGTGGGATCAGATGTGAGCAAGTTCACCGTAGGGGACATAGTTGGAGTTGGTTGCCTCGTTGGATGTTGCGGAGGTTGTAGCCCCTGCGAGAGAGATCTGGAACAGTATTGTCCAAAGAAGATTTGGAGCTACAATGATGTTTACATCAATGGTCAACCTACACAAGGCGGCTTCGCTAAAGCCACCGTCGTTCACCAAAAGTTTGTGGTCAAGATTCCAGAAGGAATGGCGGTTGAGCAGGCTGCGCCGCTACTGTGCGCTGGTGTGACTGTGTACAGTCCACTGAGCCACTTTGGTCTGAAACAACCAGGCCTAAGAGGAGGTATACTAGGGTTAGGTGGAGTCGGTCACATGGGTGTGAAAATAGCCAAAGCAATGGGTCACCATGTGACTGTCATAAGCTCATCAAACAAGAAGAGAGAAGAGGCATTGCAAGATCTTGGAGCTGATGATTACGTGATCGGTTCCGACCAAGCGAAGATGAGCGAATTGGCTGATTCGTTGGATTACGTAATTGACACGGTGCCTGTTCATCATGCACTTGAGCCATATTTGTCTCTGCTTAAGCTTGATGGTAAACTCATTCTCATGGGAGTTATCAACAATCCATTACAGTTTCTCACTCCTCTGCTTATGCTTGGGAGGAAAGTGATAACGGGGAGCTTCATAGGGAGCATGAAGGAGACAGAGGAGATGCTTGAGTTCTGTAAAGAAAAGGGTTTGAGTTCGATTATCGAAGTTGTGAAGATGGATTATGTTAACACTGCGTTTGAGAGACTTGAGAAGAACGATGTGCGTTATAGGTTCGTCGTTGATGTCGAAGGAAGCAATCTCGACGCTTTAATTGGCTCCGATGGAGGGTCTGGTGGCGGATCAACAAGTCGTGACCACATGGTCCTTCATGAGTACGTAAATGCTGCTGGGATTACATGA,29.15669,14075.00
15,V015-wildtype,K12,20,15070.5,no,15,AtCAD5,V015_AtCAD5_wt,wt,ATGGGAATAATGGAGGCAGAGAGGAAAACAACAGGCTGGGCTGCCAGAGACCCATCTGGCATCCTCTCTCCTTACACTTACACTCTTAGAGAGACTGGACCAGAGGATGTGAACATAAGAATCATTTGCTGTGGAATCTGCCACACCGATCTTCATCAAACTAAAAATGATCTTGGCATGTCTAATTACCCCATGGTTCCTGGGCATGAAGTGGTAGGGGAAGTAGTGGAGGTGGGATCAGATGTGAGCAAGTTCACCGTAGGGGACATAGTTGGAGTTGGTTGCCTCGTTGGATGTTGCGGAGGTTGTAGCCCCTGCGAGAGAGATCTGGAACAGTATTGTCCAAAGAAGATTTGGAGCTACAATGATGTTTACATCAATGGTCAACCTACACAAGGCGGCTTCGCTAAAGCCACCGTCGTTCACCAAAAGTTTGTGGTCAAGATTCCAGAAGGAATGGCGGTTGAGCAGGCTGCGCCGCTACTGTGCGCTGGTGTGACTGTGTACAGTCCACTGAGCCACTTTGGTCTGAAACAACCAGGCCTAAGAGGAGGTATACTAGGGTTAGGTGGAGTCGGTCACATGGGTGTGAAAATAGCCAAAGCAATGGGTCACCATGTGACTGTCATAAGCTCATCAAACAAGAAGAGAGAAGAGGCATTGCAAGATCTTGGAGCTGATGATTACGTGATCGGTTCCGACCAAGCGAAGATGAGCGAATTGGCTGATTCGTTGGATTACGTAATTGACACGGTGCCTGTTCATCATGCACTTGAGCCATATTTGTCTCTGCTTAAGCTTGATGGTAAACTCATTCTCATGGGAGTTATCAACAATCCATTACAGTTTCTCACTCCTCTGCTTATGCTTGGGAGGAAAGTGATAACGGGGAGCTTCATAGGGAGCATGAAGGAGACAGAGGAGATGCTTGAGTTCTGTAAAGAAAAGGGTTTGAGTTCGATTATCGAAGTTGTGAAGATGGATTATGTTAACACTGCGTTTGAGAGACTTGAGAAGAACGATGTGCGTTATAGGTTCGTCGTTGATGTCGAAGGAAGCAATCTCGACGCTTTAATTGGCTCCGATGGAGGGTCTGGTGGCGGATCAACAAGTCGTGACCACATGGTCCTTCATGAGTACGTAAATGCTGCTGGGATTACATGA,29.15669,14075.00
15,V015-wildtype,K12,20,14050.5,no,15,AtCAD5,V015_AtCAD5_wt,wt,ATGGGAATAATGGAGGCAGAGAGGAAAACAACAGGCTGGGCTGCCAGAGACCCATCTGGCATCCTCTCTCCTTACACTTACACTCTTAGAGAGACTGGACCAGAGGATGTGAACATAAGAATCATTTGCTGTGGAATCTGCCACACCGATCTTCATCAAACTAAAAATGATCTTGGCATGTCTAATTACCCCATGGTTCCTGGGCATGAAGTGGTAGGGGAAGTAGTGGAGGTGGGATCAGATGTGAGCAAGTTCACCGTAGGGGACATAGTTGGAGTTGGTTGCCTCGTTGGATGTTGCGGAGGTTGTAGCCCCTGCGAGAGAGATCTGGAACAGTATTGTCCAAAGAAGATTTGGAGCTACAATGATGTTTACATCAATGGTCAACCTACACAAGGCGGCTTCGCTAAAGCCACCGTCGTTCACCAAAAGTTTGTGGTCAAGATTCCAGAAGGAATGGCGGTTGAGCAGGCTGCGCCGCTACTGTGCGCTGGTGTGACTGTGTACAGTCCACTGAGCCACTTTGGTCTGAAACAACCAGGCCTAAGAGGAGGTATACTAGGGTTAGGTGGAGTCGGTCACATGGGTGTGAAAATAGCCAAAGCAATGGGTCACCATGTGACTGTCATAAGCTCATCAAACAAGAAGAGAGAAGAGGCATTGCAAGATCTTGGAGCTGATGATTACGTGATCGGTTCCGACCAAGCGAAGATGAGCGAATTGGCTGATTCGTTGGATTACGTAATTGACACGGTGCCTGTTCATCATGCACTTGAGCCATATTTGTCTCTGCTTAAGCTTGATGGTAAACTCATTCTCATGGGAGTTATCAACAATCCATTACAGTTTCTCACTCCTCTGCTTATGCTTGGGAGGAAAGTGATAACGGGGAGCTTCATAGGGAGCATGAAGGAGACAGAGGAGATGCTTGAGTTCTGTAAAGAAAAGGGTTTGAGTTCGATTATCGAAGTTGTGAAGATGGATTATGTTAACACTGCGTTTGAGAGACTTGAGAAGAACGATGTGCGTTATAGGTTCGTCGTTGATGTCGAAGGAAGCAATCTCGACGCTTTAATTGGCTCCGATGGAGGGTCTGGTGGCGGATCAACAAGTCGTGACCACATGGTCCTTCATGAGTACGTAAATGCTGCTGGGATTACATGA,29.15669,14075.00
15,V015-wildtype,K12,20,14740.0,no,15,AtCAD5,V015_AtCAD5_wt,wt,ATGGGAATAATGGAGGCAGAGAGGAAAACAACAGGCTGGGCTGCCAGAGACCCATCTGGCATCCTCTCTCCTTACACTTACACTCTTAGAGAGACTGGACCAGAGGATGTGAACATAAGAATCATTTGCTGTGGAATCTGCCACACCGATCTTCATCAAACTAAAAATGATCTTGGCATGTCTAATTACCCCATGGTTCCTGGGCATGAAGTGGTAGGGGAAGTAGTGGAGGTGGGATCAGATGTGAGCAAGTTCACCGTAGGGGACATAGTTGGAGTTGGTTGCCTCGTTGGATGTTGCGGAGGTTGTAGCCCCTGCGAGAGAGATCTGGAACAGTATTGTCCAAAGAAGATTTGGAGCTACAATGATGTTTACATCAATGGTCAACCTACACAAGGCGGCTTCGCTAAAGCCACCGTCGTTCACCAAAAGTTTGTGGTCAAGATTCCAGAAGGAATGGCGGTTGAGCAGGCTGCGCCGCTACTGTGCGCTGGTGTGACTGTGTACAGTCCACTGAGCCACTTTGGTCTGAAACAACCAGGCCTAAGAGGAGGTATACTAGGGTTAGGTGGAGTCGGTCACATGGGTGTGAAAATAGCCAAAGCAATGGGTCACCATGTGACTGTCATAAGCTCATCAAACAAGAAGAGAGAAGAGGCATTGCAAGATCTTGGAGCTGATGATTACGTGATCGGTTCCGACCAAGCGAAGATGAGCGAATTGGCTGATTCGTTGGATTACGTAATTGACACGGTGCCTGTTCATCATGCACTTGAGCCATATTTGTCTCTGCTTAAGCTTGATGGTAAACTCATTCTCATGGGAGTTATCAACAATCCATTACAGTTTCTCACTCCTCTGCTTATGCTTGGGAGGAAAGTGATAACGGGGAGCTTCATAGGGAGCATGAAGGAGACAGAGGAGATGCTTGAGTTCTGTAAAGAAAAGGGTTTGAGTTCGATTATCGAAGTTGTGAAGATGGATTATGTTAACACTGCGTTTGAGAGACTTGAGAAGAACGATGTGCGTTATAGGTTCGTCGTTGATGTCGAAGGAAGCAATCTCGACGCTTTAATTGGCTCCGATGGAGGGTCTGGTGGCGGATCAACAAGTCGTGACCACATGGTCCTTCATGAGTACGTAAATGCTGCTGGGATTACATGA,29.15669,14075.00
15,V015-wildtype,K12,20,14743.0,no,15,AtCAD5,V015_AtCAD5_wt,wt,ATGGGAATAATGGAGGCAGAGAGGAAAACAACAGGCTGGGCTGCCAGAGACCCATCTGGCATCCTCTCTCCTTACACTTACACTCTTAGAGAGACTGGACCAGAGGATGTGAACATAAGAATCATTTGCTGTGGAATCTGCCACACCGATCTTCATCAAACTAAAAATGATCTTGGCATGTCTAATTACCCCATGGTTCCTGGGCATGAAGTGGTAGGGGAAGTAGTGGAGGTGGGATCAGATGTGAGCAAGTTCACCGTAGGGGACATAGTTGGAGTTGGTTGCCTCGTTGGATGTTGCGGAGGTTGTAGCCCCTGCGAGAGAGATCTGGAACAGTATTGTCCAAAGAAGATTTGGAGCTACAATGATGTTTACATCAATGGTCAACCTACACAAGGCGGCTTCGCTAAAGCCACCGTCGTTCACCAAAAGTTTGTGGTCAAGATTCCAGAAGGAATGGCGGTTGAGCAGGCTGCGCCGCTACTGTGCGCTGGTGTGACTGTGTACAGTCCACTGAGCCACTTTGGTCTGAAACAACCAGGCCTAAGAGGAGGTATACTAGGGTTAGGTGGAGTCGGTCACATGGGTGTGAAAATAGCCAAAGCAATGGGTCACCATGTGACTGTCATAAGCTCATCAAACAAGAAGAGAGAAGAGGCATTGCAAGATCTTGGAGCTGATGATTACGTGATCGGTTCCGACCAAGCGAAGATGAGCGAATTGGCTGATTCGTTGGATTACGTAATTGACACGGTGCCTGTTCATCATGCACTTGAGCCATATTTGTCTCTGCTTAAGCTTGATGGTAAACTCATTCTCATGGGAGTTATCAACAATCCATTACAGTTTCTCACTCCTCTGCTTATGCTTGGGAGGAAAGTGATAACGGGGAGCTTCATAGGGAGCATGAAGGAGACAGAGGAGATGCTTGAGTTCTGTAAAGAAAAGGGTTTGAGTTCGATTATCGAAGTTGTGAAGATGGATTATGTTAACACTGCGTTTGAGAGACTTGAGAAGAACGATGTGCGTTATAGGTTCGTCGTTGATGTCGAAGGAAGCAATCTCGACGCTTTAATTGGCTCCGATGGAGGGTCTGGTGGCGGATCAACAAGTCGTGACCACATGGTCCTTCATGAGTACGTAAATGCTGCTGGGATTACATGA,29.15669,14075.00
15,V015-wildtype,K12,20,13430.0,no,15,AtCAD5,V015_AtCAD5_wt,wt,ATGGGAATAATGGAGGCAGAGAGGAAAACAACAGGCTGGGCTGCCAGAGACCCATCTGGCATCCTCTCTCCTTACACTTACACTCTTAGAGAGACTGGACCAGAGGATGTGAACATAAGAATCATTTGCTGTGGAATCTGCCACACCGATCTTCATCAAACTAAAAATGATCTTGGCATGTCTAATTACCCCATGGTTCCTGGGCATGAAGTGGTAGGGGAAGTAGTGGAGGTGGGATCAGATGTGAGCAAGTTCACCGTAGGGGACATAGTTGGAGTTGGTTGCCTCGTTGGATGTTGCGGAGGTTGTAGCCCCTGCGAGAGAGATCTGGAACAGTATTGTCCAAAGAAGATTTGGAGCTACAATGATGTTTACATCAATGGTCAACCTACACAAGGCGGCTTCGCTAAAGCCACCGTCGTTCACCAAAAGTTTGTGGTCAAGATTCCAGAAGGAATGGCGGTTGAGCAGGCTGCGCCGCTACTGTGCGCTGGTGTGACTGTGTACAGTCCACTGAGCCACTTTGGTCTGAAACAACCAGGCCTAAGAGGAGGTATACTAGGGTTAGGTGGAGTCGGTCACATGGGTGTGAAAATAGCCAAAGCAATGGGTCACCATGTGACTGTCATAAGCTCATCAAACAAGAAGAGAGAAGAGGCATTGCAAGATCTTGGAGCTGATGATTACGTGATCGGTTCCGACCAAGCGAAGATGAGCGAATTGGCTGATTCGTTGGATTACGTAATTGACACGGTGCCTGTTCATCATGCACTTGAGCCATATTTGTCTCTGCTTAAGCTTGATGGTAAACTCATTCTCATGGGAGTTATCAACAATCCATTACAGTTTCTCACTCCTCTGCTTATGCTTGGGAGGAAAGTGATAACGGGGAGCTTCATAGGGAGCATGAAGGAGACAGAGGAGATGCTTGAGTTCTGTAAAGAAAAGGGTTTGAGTTCGATTATCGAAGTTGTGAAGATGGATTATGTTAACACTGCGTTTGAGAGACTTGAGAAGAACGATGTGCGTTATAGGTTCGTCGTTGATGTCGAAGGAAGCAATCTCGACGCTTTAATTGGCTCCGATGGAGGGTCTGGTGGCGGATCAACAAGTCGTGACCACATGGTCCTTCATGAGTACGTAAATGCTGCTGGGATTACATGA,29.15669,14075.00
15,V015-wildtype,K12,20,11611.0,yes,15,AtCAD5,V015_AtCAD5_wt,wt,ATGGGAATAATGGAGGCAGAGAGGAAAACAACAGGCTGGGCTGCCAGAGACCCATCTGGCATCCTCTCTCCTTACACTTACACTCTTAGAGAGACTGGACCAGAGGATGTGAACATAAGAATCATTTGCTGTGGAATCTGCCACACCGATCTTCATCAAACTAAAAATGATCTTGGCATGTCTAATTACCCCATGGTTCCTGGGCATGAAGTGGTAGGGGAAGTAGTGGAGGTGGGATCAGATGTGAGCAAGTTCACCGTAGGGGACATAGTTGGAGTTGGTTGCCTCGTTGGATGTTGCGGAGGTTGTAGCCCCTGCGAGAGAGATCTGGAACAGTATTGTCCAAAGAAGATTTGGAGCTACAATGATGTTTACATCAATGGTCAACCTACACAAGGCGGCTTCGCTAAAGCCACCGTCGTTCACCAAAAGTTTGTGGTCAAGATTCCAGAAGGAATGGCGGTTGAGCAGGCTGCGCCGCTACTGTGCGCTGGTGTGACTGTGTACAGTCCACTGAGCCACTTTGGTCTGAAACAACCAGGCCTAAGAGGAGGTATACTAGGGTTAGGTGGAGTCGGTCACATGGGTGTGAAAATAGCCAAAGCAATGGGTCACCATGTGACTGTCATAAGCTCATCAAACAAGAAGAGAGAAGAGGCATTGCAAGATCTTGGAGCTGATGATTACGTGATCGGTTCCGACCAAGCGAAGATGAGCGAATTGGCTGATTCGTTGGATTACGTAATTGACACGGTGCCTGTTCATCATGCACTTGAGCCATATTTGTCTCTGCTTAAGCTTGATGGTAAACTCATTCTCATGGGAGTTATCAACAATCCATTACAGTTTCTCACTCCTCTGCTTATGCTTGGGAGGAAAGTGATAACGGGGAGCTTCATAGGGAGCATGAAGGAGACAGAGGAGATGCTTGAGTTCTGTAAAGAAAAGGGTTTGAGTTCGATTATCGAAGTTGTGAAGATGGATTATGTTAACACTGCGTTTGAGAGACTTGAGAAGAACGATGTGCGTTATAGGTTCGTCGTTGATGTCGAAGGAAGCAATCTCGACGCTTTAATTGGCTCCGATGGAGGGTCTGGTGGCGGATCAACAAGTCGTGACCACATGGTCCTTCATGAGTACGTAAATGCTGCTGGGATTACATGA,29.15669,14075.00
16,V016-AnaCoda,K12,20,21446.5,no,16,AtCAD5,V016_AtCAD5_AnaCoda(ROC),AnaCoda,ATGGGTATCATGGAAGCTGAACGTAAAACTACTGGTTGGGCTGCTCGTGACCCGTCTGGTATCCTGTCTCCGTACACTTACACTCTGCGTGAAACTGGTCCGGAAGACGTTAACATCCGTATCATCTGCTGCGGTATCTGCCACACTGACCTGCACCAGACTAAAAACGACCTGGGTATGTCTAACTACCCGATGGTTCCGGGTCACGAAGTTGTTGGTGAAGTTGTTGAAGTTGGTTCTGACGTTTCTAAATTCACTGTTGGTGACATCGTTGGTGTTGGTTGCCTGGTTGGTTGCTGCGGTGGTTGCTCTCCGTGCGAACGTGACCTGGAACAGTACTGCCCGAAAAAAATCTGGTCTTACAACGACGTTTACATCAACGGTCAGCCGACTCAGGGTGGTTTCGCTAAAGCTACTGTTGTTCACCAGAAATTCGTTGTTAAAATCCCGGAAGGTATGGCTGTTGAACAGGCTGCTCCGCTGCTGTGCGCTGGTGTTACTGTTTACTCTCCGCTGTCTCACTTCGGTCTGAAACAGCCGGGTCTGCGTGGTGGTATCCTGGGTCTGGGTGGTGTTGGTCACATGGGTGTTAAAATCGCTAAAGCTATGGGTCACCACGTTACTGTTATCTCTTCTTCTAACAAAAAACGTGAAGAAGCTCTGCAGGACCTGGGTGCTGACGACTACGTTATCGGTTCTGACCAGGCTAAAATGTCTGAACTGGCTGACTCTCTGGACTACGTTATCGACACTGTTCCGGTTCACCACGCTCTGGAACCGTACCTGTCTCTGCTGAAACTGGACGGTAAACTGATCCTGATGGGTGTTATCAACAACCCGCTGCAGTTCCTGACTCCGCTGCTGATGCTGGGTCGTAAAGTTATCACTGGTTCTTTCATCGGTTCTATGAAAGAAACTGAAGAAATGCTGGAATTCTGCAAAGAAAAAGGTCTGTCTTCTATCATCGAAGTTGTTAAAATGGACTACGTTAACACTGCTTTCGAACGTCTGGAAAAAAACGACGTTCGTTACCGTTTCGTTGTTGACGTTGAAGGTTCTAACCTGGACGCTTTAATTGGCTCCGATGGAGGGTCTGGTGGCGGATCAACAAGTCGTGACCACATGGTCCTTCATGAGTACGTAAATGCTGCTGGGATTACATAG,20.54051,19979.08
16,V016-AnaCoda,K12,20,21939.5,no,16,AtCAD5,V016_AtCAD5_AnaCoda(ROC),AnaCoda,ATGGGTATCATGGAAGCTGAACGTAAAACTACTGGTTGGGCTGCTCGTGACCCGTCTGGTATCCTGTCTCCGTACACTTACACTCTGCGTGAAACTGGTCCGGAAGACGTTAACATCCGTATCATCTGCTGCGGTATCTGCCACACTGACCTGCACCAGACTAAAAACGACCTGGGTATGTCTAACTACCCGATGGTTCCGGGTCACGAAGTTGTTGGTGAAGTTGTTGAAGTTGGTTCTGACGTTTCTAAATTCACTGTTGGTGACATCGTTGGTGTTGGTTGCCTGGTTGGTTGCTGCGGTGGTTGCTCTCCGTGCGAACGTGACCTGGAACAGTACTGCCCGAAAAAAATCTGGTCTTACAACGACGTTTACATCAACGGTCAGCCGACTCAGGGTGGTTTCGCTAAAGCTACTGTTGTTCACCAGAAATTCGTTGTTAAAATCCCGGAAGGTATGGCTGTTGAACAGGCTGCTCCGCTGCTGTGCGCTGGTGTTACTGTTTACTCTCCGCTGTCTCACTTCGGTCTGAAACAGCCGGGTCTGCGTGGTGGTATCCTGGGTCTGGGTGGTGTTGGTCACATGGGTGTTAAAATCGCTAAAGCTATGGGTCACCACGTTACTGTTATCTCTTCTTCTAACAAAAAACGTGAAGAAGCTCTGCAGGACCTGGGTGCTGACGACTACGTTATCGGTTCTGACCAGGCTAAAATGTCTGAACTGGCTGACTCTCTGGACTACGTTATCGACACTGTTCCGGTTCACCACGCTCTGGAACCGTACCTGTCTCTGCTGAAACTGGACGGTAAACTGATCCTGATGGGTGTTATCAACAACCCGCTGCAGTTCCTGACTCCGCTGCTGATGCTGGGTCGTAAAGTTATCACTGGTTCTTTCATCGGTTCTATGAAAGAAACTGAAGAAATGCTGGAATTCTGCAAAGAAAAAGGTCTGTCTTCTATCATCGAAGTTGTTAAAATGGACTACGTTAACACTGCTTTCGAACGTCTGGAAAAAAACGACGTTCGTTACCGTTTCGTTGTTGACGTTGAAGGTTCTAACCTGGACGCTTTAATTGGCTCCGATGGAGGGTCTGGTGGCGGATCAACAAGTCGTGACCACATGGTCCTTCATGAGTACGTAAATGCTGCTGGGATTACATAG,20.54051,19979.08


### Create dataframe and .csv file for plotting and model evaluation

In [15]:
# Keep only 'sequence_name', 'strain', 'value' and 'predicted_protein_expression_level' columns
efficacy_model_df <- all_data[,c(2,3,5,13)]
# rename column 13 of efficiency_model_df to "predicted_level"
colnames(efficacy_model_df)[4] <- "predicted_level"
names(efficacy_model_df)

# create a new dataframe called "data_gfp_zrinka_k12" storing only strain == K12 from data_gfp_zrinka
data_gfp_zrinka_k12 <- efficacy_model_df[efficacy_model_df$strain == "K12",] 

# create a new dataframe called "data_gfp_zrinka_bl21" storing only strain == BL21DE3 from data_gfp_zrinka
data_gfp_zrinka_bl21 <- efficacy_model_df[efficacy_model_df$strain == "BL21DE3",]

In [16]:
# Create a dataframe without outliers
efficacy_df_no_outliers <- all_data[,c(2,3,5,6,13)]
efficacy_df_no_outliers
# Store the non-outlier data in efficacy_df_no_outliers
efficacy_df_no_outliers <- efficacy_df_no_outliers[efficacy_df_no_outliers$outlier_status == "no",]
# remove the column "outlier_status"
efficacy_df_no_outliers <- efficacy_df_no_outliers[,c(1,2,3,5)]

# rename column 13 of efficacy_df_no_outliers to "predicted_level"
colnames(efficacy_df_no_outliers)[4] <- "predicted_level"
names(efficacy_df_no_outliers)
efficacy_df_no_outliers

# Create a new dataframe with strain = K12 and without outliers
efficacy_df_no_outliers_k12 <- efficacy_df_no_outliers[efficacy_df_no_outliers$strain == "K12",]

# Create a new dataframe with strain = BL21DE3 and without outliers
efficacy_df_no_outliers_bl21 <- efficacy_df_no_outliers[efficacy_df_no_outliers$strain == "BL21DE3",]

sequence_name,strain,value,outlier_status,predicted_protein_expression_level
<chr>,<chr>,<dbl>,<chr>,<dbl>
V015-wildtype,K12,14853.5,no,14075.00
V015-wildtype,K12,14101.5,no,14075.00
V015-wildtype,K12,15070.5,no,14075.00
V015-wildtype,K12,14050.5,no,14075.00
V015-wildtype,K12,14740.0,no,14075.00
V015-wildtype,K12,14743.0,no,14075.00
V015-wildtype,K12,13430.0,no,14075.00
V015-wildtype,K12,11611.0,yes,14075.00
V016-AnaCoda,K12,21446.5,no,19979.08
V016-AnaCoda,K12,21939.5,no,19979.08


Unnamed: 0_level_0,sequence_name,strain,value,predicted_level
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>
1,V015-wildtype,K12,14853.5,14075.00
2,V015-wildtype,K12,14101.5,14075.00
3,V015-wildtype,K12,15070.5,14075.00
4,V015-wildtype,K12,14050.5,14075.00
5,V015-wildtype,K12,14740.0,14075.00
6,V015-wildtype,K12,14743.0,14075.00
7,V015-wildtype,K12,13430.0,14075.00
9,V016-AnaCoda,K12,21446.5,19979.08
10,V016-AnaCoda,K12,21939.5,19979.08
12,V016-AnaCoda,K12,20637.5,19979.08


In [17]:
# Group by (sequence name, strain, predicted level) and calculate mean fluorescence value
efficacy_model_df_grouped <- aggregate(efficacy_model_df$value, by = list(efficacy_model_df$sequence_name, efficacy_model_df$strain, efficacy_model_df$predicted_level), FUN = mean)
names(efficacy_model_df_grouped) <- c("sequence_name", "strain", "mean_fluorescence", "predicted_level")

# create a new dataframe called "data_gfp_zrinka_grouped_k12" storing only strain == K12 from data_gfp_zrinka_grouped
data_gfp_zrinka_grouped_k12 <- efficacy_model_df_grouped[efficacy_model_df_grouped$strain == "K12",]
data_gfp_zrinka_grouped_k12 

# create a new dataframe called "data_gfp_zrinka_grouped_bl21" storing only strain == BL21DE3 from data_gfp_zrinka_grouped
data_gfp_zrinka_grouped_bl21 <- efficacy_model_df_grouped[efficacy_model_df_grouped$strain == "BL21DE3",]
data_gfp_zrinka_grouped_bl21

Unnamed: 0_level_0,sequence_name,strain,mean_fluorescence,predicted_level
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>
1,V015-wildtype,K12,14075.0,14075.0
2,V032-IDT (Variant 2),K12,14570.85,17725.875
3,V019-Twist (Variant 1),K12,14835.03,18675.625
4,V035-RTCIM (Variant 1),K12,14986.92,16535.875
5,V031-IDT (Variant 1),K12,15164.43,17275.5
6,V021-Twist (Variant 3),K12,15439.28,9991.125
7,V022-GENEius (Variant 1),K12,15897.9,20263.5
8,V023-GENEius (Variant 2),K12,15897.9,17870.375
9,V026-Twist (Variant 4),K12,16539.64,21145.0
10,V025-GenScript (Variant 1),K12,16626.39,21068.375


Unnamed: 0_level_0,sequence_name,strain,mean_fluorescence,predicted_level
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>
11,V015-wildtype,BL21DE3,17098.38,17098.38
12,V032-IDT (Variant 2),BL21DE3,17700.74,28134.0
13,V033-IDT (Variant 3),BL21DE3,18012.29,29180.38
14,V019-Twist (Variant 1),BL21DE3,18021.66,31469.88
15,V035-RTCIM (Variant 1),BL21DE3,18206.18,23893.62
16,V036-RTCIM (Variant 2),BL21DE3,18242.3,25167.88
18,V031-IDT (Variant 1),BL21DE3,18421.82,28125.0
20,V021-Twist (Variant 3),BL21DE3,18755.71,19942.5
21,V020-Twist (Variant 2),BL21DE3,18985.05,26363.5
22,V022-GENEius (Variant 1),BL21DE3,19312.85,29456.62


In [18]:
# Group by (sequence name, strain, predicted level) and calculate mean fluorescence value in efficacy_df_no_outliers
efficacy_df_no_outliers_grouped <- aggregate(efficacy_df_no_outliers$value, by = list(efficacy_df_no_outliers$sequence_name, efficacy_df_no_outliers$strain, efficacy_df_no_outliers$predicted_level), FUN = mean)
names(efficacy_df_no_outliers_grouped) <- c("sequence_name", "strain", "mean_fluorescence", "predicted_level")

# create a new dataframe called "data_gfp_zrinka_grouped_no_outliers_k12" storing only strain == K12 from efficacy_df_no_outliers_grouped
efficacy_grouped_no_outliers_k12 <- efficacy_df_no_outliers_grouped[efficacy_df_no_outliers_grouped$strain == "K12",]

# create a new dataframe called "data_gfp_zrinka_grouped_no_outliers_bl21" storing only strain == BL21DE3 from efficacy_df_no_outliers_grouped
efficacy_grouped_no_outliers_bl21 <- efficacy_df_no_outliers_grouped[efficacy_df_no_outliers_grouped$strain == "BL21DE3",]

In [19]:
# write a .csv file with efficiency_model_df with the measured values (not averaged) for evaluation 
write.csv(efficacy_model_df, file = "dataframes/efficacy/predicted_efficacy.csv")

# Write csv file of efficiency model dataframe with measured values for "K12" strain for model evaluation
write.csv(data_gfp_zrinka_k12, file = "dataframes/efficacy/predicted_efficacy_k12.csv")

# Write csv file of efficiency model dataframe with measured values for "BL21DE3" strain for model evaluation
write.csv(data_gfp_zrinka_bl21, file = "dataframes/efficacy/predicted_efficacy_bl21.csv")

# Write csv file of efficiency model dataframe with averaged values for model evaluation
write.csv(efficacy_model_df_grouped, file = "dataframes/efficacy/predicted_efficacy_averaged.csv")

# Write csv file of efficiency model dataframe with averaged values for "K12" strain for model evaluation
write.csv(data_gfp_zrinka_grouped_k12, file = "dataframes/efficacy/predicted_efficacy_averaged_k12.csv")

# Write csv file of efficiency model dataframe with averaged values for "BL21DE3" strain for model evaluation
write.csv(data_gfp_zrinka_grouped_bl21, file = "dataframes/efficacy/predicted_efficacy_averaged_bl21.csv")

dataframes with no outliers

In [20]:
# dataframe without outliers
write.csv(efficacy_df_no_outliers, file = "dataframes/efficacy/predicted_efficacy_no_outliers.csv")

# dataframe without outliers strain = K12
write.csv(efficacy_df_no_outliers_k12, file = "dataframes/efficacy/predicted_efficacy_no_outliers_k12.csv")

# dataframe without outliers strain = BL21DE3
write.csv(efficacy_df_no_outliers_bl21, file = "dataframes/efficacy/predicted_efficacy_no_outliers_bl21.csv")

# dataframe without outliers and averaged
write.csv(efficacy_df_no_outliers_grouped, file = "dataframes/efficacy/predicted_efficacy_no_outliers_averaged.csv")

# dataframe without outliers and averaged strain = K12
write.csv(efficacy_grouped_no_outliers_k12, file = "dataframes/efficacy/predicted_efficacy_no_outliers_averaged_k12.csv")

# dataframe without outliers and averaged strain = BL21DE3
write.csv(efficacy_grouped_no_outliers_bl21, file = "dataframes/efficacy/predicted_efficacy_no_outliers_averaged_bl21.csv")