To compare with LTEE (Consuegra, 2021) I identify the non-essential sequences of LTEE and our study and compare the frequency of IS insertions in these sequences.
My motive is not to make a thorough analysis of the LTEE, but to use it as a reference to show the potential effect of higher population size, despite the effect of genetic draft.

# Load packages and directories
Use R for most analysis. If heavy data manipulation is required, use Julia instead.

In [1]:
# JULIA v1.9.3
import Pkg
using RCall
using Random
using DataFrames, DataFramesMeta
using Revise
using CSV
using StatsBase
using Statistics
using NPZ

In [2]:
figure_export_dir = "./exp/20240703_Compare_INS_freq_per_NonESS_LTEE/Fig"
base_dir = "exp/multiple_runs11" 
base_dir_event_classification = joinpath(base_dir, "export/classify_IS_events")

@rput figure_export_dir
@rput base_dir

R"""
Sys.setlocale("LC_COLLATE","C") # order by dictionary order
library(tidyverse)
library(cowplot)
library(latex2exp)
library(stats)
library(scales)
library(lemon)
library(gghalves)
library(ggbeeswarm)
library(ggpointdensity)
library(viridis)
library(forcats)
library(MASS)
library(sfsmisc)
library(ggnewscale)
library(ggrastr)
cbp <- c("#999999", "#E69F00", "#56B4E9", "#009E73", 
                       "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
select <- dplyr::select
filter <- dplyr::filter
lag <- dplyr::lag
library(arrow)
library(reticulate)
"""

[33m[1m│ [22m[39m✔ dplyr     1.1.3     ✔ readr     2.1.4
[33m[1m│ [22m[39m✔ forcats   1.0.0     ✔ stringr   1.5.0
[33m[1m│ [22m[39m✔ ggplot2   3.4.3     ✔ tibble    3.2.1
[33m[1m│ [22m[39m✔ lubridate 1.9.2     ✔ tidyr     1.3.0
[33m[1m│ [22m[39m✔ purrr     1.0.2     
[33m[1m│ [22m[39m── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
[33m[1m│ [22m[39m✖ dplyr::filter() masks stats::filter()
[33m[1m│ [22m[39m✖ dplyr::lag()    masks stats::lag()
[33m[1m│ [22m[39mℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
[33m[1m└ [22m[39m[90m@ RCall ~/.julia/packages/RCall/gOwEW/src/io.jl:172[39m
[33m[1m│ [22m[39mAttaching package: ‘cowplot’
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mThe following object is masked from ‘package:lubridate’:
[33m[1m│ [22m[39m
[33m[1m│ [22m[39m    stamp
[33m[1m│ [22m[39m
[33m[1m└ [22m[39m[90m@ RCall ~/.julia/packages/RCall/gO

RObject{StrSxp}
 [1] "reticulate"     "arrow"          "ggrastr"        "ggnewscale"    
 [5] "sfsmisc"        "MASS"           "viridis"        "viridisLite"   
 [9] "ggpointdensity" "ggbeeswarm"     "gghalves"       "lemon"         
[13] "scales"         "latex2exp"      "cowplot"        "lubridate"     
[17] "forcats"        "stringr"        "dplyr"          "purrr"         
[21] "readr"          "tidyr"          "tibble"         "ggplot2"       
[25] "tidyverse"      "stats"          "graphics"       "grDevices"     
[29] "utils"          "datasets"       "methods"        "base"          


# Import Data
- LTEE
- This study

The first analysis uses essential genes based on LTEE, which is not used in the paper. Go to 

## LTEE
The number of essential genes includes non-essential genes too, and much larger than the estimate in other K12 strains.

In [3]:
R"""
# Essential or almost essential genes in Couce et al. 2017 PNAS
LTEE_ess <- read_csv(file.path("../misc", "20240301_Essential_Genes_in_REL606.csv"))
LTEE_genes <- read_csv(file.path("../misc", "REL606.csv"))
LTEE_ess_couce <- LTEE_ess
LTEE_genes_couce <- LTEE_genes

LTEE_genome_size <- LTEE_genes %>% filter(end - start > 4e6) %>% pull(end) %>% max()
LTEE_gene_cnt <- LTEE_genes %>% filter(feat_type == "gene") %>% nrow()
LTEE_gene_length <- LTEE_genes %>% filter(feat_type == "gene") %>% mutate(length = end - start +1 ) %>% pull(length)
LTEE_ess_genes <- LTEE_genes %>% filter(feat_type == "gene" & gene %in% LTEE_ess$Name)
LTEE_ess_genes_length <- LTEE_ess_genes %>% mutate(length = end - start +1 ) %>% pull(length)
# genome size gene cnt, total gene length and essential gene cnt total ess gene length ess gene % of total genom
print(paste0("LTEE has ", LTEE_genome_size, " bp genome size", " and ", LTEE_gene_cnt, " genes", " with total gene length ", sum(LTEE_gene_length), " bp"))
print(paste0("Genes are ", sum(LTEE_gene_length)/LTEE_genome_size*100, "% of the genome"))
print(paste0("LTEE has ", nrow(LTEE_ess_genes), "(", LTEE_ess %>% nrow, ") essential genes", " with total essential gene length ", sum(LTEE_ess_genes_length), " bp"))
print(paste0("Essential genes are ", sum(LTEE_ess_genes_length)/LTEE_genome_size*100, "% of the genome"))
""";

[1] "LTEE has 4629812 bp genome size and 4386 genes with total gene length 4128856 bp"
[1] "Genes are 89.179776630239% of the genome"
[1] "LTEE has 542(544) essential genes with total essential gene length 500665 bp"
[1] "Essential genes are 10.8139380173536% of the genome"


[33m[1m│ [22m[39m── Column specification ────────────────────────────────────────────────────────
[33m[1m│ [22m[39mDelimiter: ","
[33m[1m│ [22m[39mchr (1): Name
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mℹ Use `spec()` to retrieve the full column specification for this data.
[33m[1m│ [22m[39mℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
[33m[1m└ [22m[39m[90m@ RCall ~/.julia/packages/RCall/gOwEW/src/io.jl:172[39m
[33m[1m│ [22m[39m── Column specification ────────────────────────────────────────────────────────
[33m[1m│ [22m[39mDelimiter: ","
[33m[1m│ [22m[39mchr (5): seq_id, feat_id, feat_type, gene, desc
[33m[1m│ [22m[39mdbl (3): start, end, strand
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mℹ Use `spec()` to retrieve the full column specification for this data.
[33m[1m│ [22m[39mℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
[33m[1m└ [22m[39m[90m@ RCall ~/.julia/packages/

In [5]:
R"""
# Loading Consuegra 2021 Nat. Commun. T1, basically as in 20231005_ins_del_analysis.ipynb
data_string <- "
Population	Clone	IS1	IS2a	IS3	IS4	IS30	IS150	IS186	IS600a	IS911a	New insertions
Ancestor	REL606	28	1	5	1	1	5	5	1	2	
Ara+1	11392	30 (2; 0)	-	4 (0; 1)	1 (0; 0)	1 (0; 0)	40 (35; 0)	6 (1; 0)	-	-	38
Ara+1	11393	32 (4; 0)	-	4 (0; 1)	1 (0; 0)	1 (0; 0)	40 (35; 0)	8 (3; 0)	-	-	41b
Ara+2	11342	31 (4; 1)	-	5 (0; 0)	1 (0; 0)	1 (0; 0)	8 (3; 0)	9 (4; 0)	-	-	10b
Ara+2	11343	31 (4; 1)	-	5 (0; 0)	1 (0; 0)	1 (0; 0)	8 (3; 0)	9 (4; 0)	-	-	10b
Ara+3	10953	28 (0; 0)	-	4 (0; 1)	2 (1; 0)	1 (0; 0)	5 (0; 0)	7 (2; 0)	-	-	3
Ara+3	10954	30 (2; 0)	-	5 (0; 0)	1 (0; 0)	1 (0; 0)	6 (1; 0)	7 (2; 0)	-	-	5
Ara+4	11348	28 (1; 1)	-	5 (0; 0)	3 (2; 0)	0 (0; 1)	7 (2; 0)	5 (0; 0)	-	-	5
Ara+4	11349	29 (2; 1)	-	5 (0; 0)	3 (2; 0)	1 (0; 0)	7 (2; 0)	6 (1; 0)	-	-	7
Ara+5	11367	31 (3; 0)	-	5 (0; 0)	1 (0; 0)	1 (0; 0)	15 (10; 0)	8 (3; 0)	-	-	16
Ara+5	11368	31 (3; 0)	-	5 (0; 0)	1 (0; 0)	1 (0; 0)	15 (10; 0)	8 (3; 0)	-	-	16
Ara+6	11370	27 (0; 1)	-	5 (0; 0)	1 (0; 0)	1 (0; 0)	6 (1; 0)	5 (1; 1)	-	-	2
Ara+6	11371	32 (4; 0)	-	5 (0; 0)	1 (0; 0)	1 (0; 0)	6 (1; 0)	6 (1; 0)	-	-	6
Ara–1	11330	30 (2; 0)	-	5 (0; 0)	1 (0; 0)	1 (0; 0)	12 (7; 0)	7 (2; 0)	-	-	11
Ara–1	11331	30 (2; 0)	-	5 (0; 0)	1 (0; 0)	1 (0; 0)	12 (7; 0)	7 (2; 0)	-	-	11
Ara–2	11335 S	29 (2; 1)	-	8 (3; 0)	1 (0; 0)	1 (0; 0)	21 (16; 0)	7 (2; 0)	-	-	23
Ara–2	11333 L	31 (5; 2)	-	5 (0; 0)	2 (1; 0)	1 (0; 0)	5 (0; 0)	4 (0; 1)	-	-	6
Ara–3	11364	25 (3; 6)	-	3 (1; 3)	0 (0; 1)	1 (0; 0)	25 (21; 1)	12 (7; 0)	-	-	30b
Ara–3	11365	25 (2; 5)	-	3 (1; 3)	1 (0; 0)	1 (0; 0)	25 (21; 1)	12 (7; 0)	-	-	29b
Ara–4	11336	26 (1; 3)	-	4 (0; 1)	1 (0; 0)	1 (0; 0)	5 (0; 0)	5 (1; 1)	-	-	2
Ara–4	11337	27 (2; 3)	-	4 (0; 1)	1 (0; 0)	1 (0; 0)	5 (0; 0)	5 (1; 1)	-	-	3
Ara–5	11339	31 (4; 1)	-	4 (0; 1)	1 (0; 0)	1 (0; 0)	19 (14; 0)	10 (5; 0)	-	-	23
Ara–5	11340	31 (4; 1)	-	4 (0; 1)	2 (1; 0)	1 (0; 0)	19 (14; 0)	10 (5; 0)	-	-	24
Ara–6	11389	30 (2; 0)	-	7 (2; 0)	2 (1; 0)	1 (0;0)	23 (18; 0)	7 (2; 0)	-	-	25
Ara–6	11390	33 (5; 0)	-	6 (1; 0)	2 (1; 0)	1 (0; 0)	12 (7; 0)	8 (3; 0)	-	-	17
"

# Replace '-' with '0 (0; 0)'
data_string <- str_replace_all(data_string, "-\t", "0 (0; 0)\t")
data_string <- str_replace_all(data_string, "-\n", "0 (0; 0)\n")

data <- read.delim(text = data_string, stringsAsFactors = FALSE)

# Process data
data_processed <- data %>%
  mutate(across(starts_with("IS"), list(
    number = ~str_extract(., "\\d+"),
    gain = ~str_extract(., "(?<=\\()\\d+"),
    loss = ~str_extract(., "(?<=;) \\d+(?=\\))")
  ))) %>%
  # starts with IS but does not end with number, gain, or loss
  select(Population, Clone, matches("^IS.*(number|gain|loss)")) %>%
  pivot_longer(-c(Population, Clone), names_to = c(".value", "type"), names_sep = "_") %>%
  pivot_longer(-c(Population, Clone, type), names_to = "IS") %>%
  mutate(value = as.numeric(value))%>%
  mutate(value = ifelse(is.na(value), 0, value)) %>%
  pivot_wider(names_from = type, values_from = value)

LTEE50000 <- data_processed %>% group_by(Population, Clone) %>% 
	summarise(Total = sum(number), Insertion = sum(gain), Loss = sum(loss)) %>% 
	mutate(Generations = 50000)

# Did not add the ancestor unlike in 20231005_ins_del_analysis.ipynb

LTEE50000 %>% tail %>% print

LTEE_INS_cnt <- LTEE50000 %>% group_by(Population) %>% summarise(INS = mean(Insertion)) %>%
  filter(Population != "Ancestor") 
print(paste0("INS count in ", nrow(LTEE_INS_cnt), " populations. Total: ", sum(LTEE_INS_cnt$INS), " Mean: ", mean(LTEE_INS_cnt$INS), " SD: ", sd(LTEE_INS_cnt$INS)))
print(paste(sum(LTEE_INS_cnt$INS)/12))
""";

# A tibble: 6 × 6
# Groups:   Population [3]
  Population Clone Total Insertion  Loss Generations
  <chr>      <chr> <dbl>     <dbl> <dbl>       <dbl>
1 Ara–4      11336    42         2     5       50000
2 Ara–4      11337    43         3     5       50000
3 Ara–5      11339    66        23     2       50000
4 Ara–5      11340    67        24     2       50000
5 Ara–6      11389    70        25     0       50000
6 Ara–6      11390    62        17     0       50000
[1] "INS count in 12 populations. Total: 185 Mean: 15.4166666666667 SD: 11.7392762153638"


[33m[1m│ [22m[39m`.groups` argument.
[33m[1m└ [22m[39m[90m@ RCall ~/.julia/packages/RCall/gOwEW/src/io.jl:172[39m


[1] "15.4166666666667"


## This study
Get IS insertion site distribution based on MDS42 coordinates.

In [6]:
R"""
# raw data IS insertion site data
is_in_mds42_df <- read_csv(file.path(base_dir, "export", "classify_IS_events_MDS", "IS_positions_in_ref_genome.csv"))

# Get only those not found in the previous generation as in 20231005_ins_del_analysis.ipynb
generations <- c("FACS", "MA08", "MA20")

is_in_mds42.df <- is_in_mds42_df %>%
separate(Line, c("Parent", "Subline"), sep = "-", remove = FALSE) %>%
mutate(Parent = str_sub(Parent, 2)) %>%
mutate(Subline = as.integer(Subline)) %>%
rename("Gen_id" = Gen) %>%
mutate(Gen = map_chr(Gen_id, ~generations[.x])) %>%
filter(genome == "Query")

is_in_df.prev <- is_in_mds42.df %>%
filter(Gen_id < length(generations)) %>%
mutate(Gen = map_chr(Gen_id, ~generations[.x+1]))

is_in_df.facs <- is_in_mds42.df %>% filter(Gen_id == 1)
is_in_df.facs <- is_in_df.facs %>% slice(rep(1:n(), 3)) %>% mutate(Gen_id = rep(c(1,2,3), n()/3)) %>%
mutate(Gen = map_chr(Gen_id, ~generations[.x])) 

is_in_mds42.df.prevgen <- is_in_mds42.df %>%
	mutate(matched_row.prev = pmap(list(Parent, Subline, Gen, pos),
	function(Parent, Subline, Gen, pos){
		matched_row <- unname(which(is_in_df.prev$Parent == Parent & 
		is_in_df.prev$Subline == Subline & 
		is_in_df.prev$Gen == Gen &
		abs(is_in_df.prev$pos - pos) <= 20))
		return(matched_row[1])
	}) %>% unlist() %>% as.integer()) 

print(is_in_mds42.df.prevgen %>% head %>% data.frame)
print(is_in_mds42.df.prevgen %>% tail %>% data.frame)
""";

[33m[1m│ [22m[39m── Column specification ────────────────────────────────────────────────────────
[33m[1m│ [22m[39mDelimiter: ","
[33m[1m│ [22m[39mchr (5): flank, genome, position_status, sv, Line
[33m[1m│ [22m[39mdbl (5): pos_id, pos, cluster_id, insert_id, Gen
[33m[1m│ [22m[39mlgl (2): IS_strand, flankMatchDir
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mℹ Use `spec()` to retrieve the full column specification for this data.
[33m[1m│ [22m[39mℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
[33m[1m└ [22m[39m[90m@ RCall ~/.julia/packages/RCall/gOwEW/src/io.jl:172[39m


  pos_id IS_strand flank     pos genome cluster_id insert_id flankMatchDir
1      0     FALSE     r  476936  Query          2         1          TRUE
2      0     FALSE     f  476923  Query          2         1         FALSE
3      1     FALSE     r  554076  Query          4         2          TRUE
4      1     FALSE     f  554063  Query          4         2         FALSE
5      2      TRUE     f 1181600  Query          3         3          TRUE
6      2     FALSE     r 1488330  Query         13         6          TRUE
  position_status               sv  Line Parent Subline Gen_id  Gen
1             new simple_insertion L01-1     01       1      1 FACS
2             new simple_insertion L01-1     01       1      1 FACS
3             new simple_insertion L01-1     01       1      1 FACS
4             new simple_insertion L01-1     01       1      1 FACS
5             new          unknown L01-1     01       1      1 FACS
6             new          unknown L01-1     01       1      1 FACS

In [7]:
R"""
is_in_mds42.df.prevgen.rmdup <- is_in_mds42.df.prevgen %>%
filter(Gen != "FACS") %>%
filter(is.na(matched_row.prev)) %>%
mutate(Type = ifelse(sv == "simple_insertion", "Simple", "Complex")) %>%
group_by(Line, Gen, insert_id) %>% ## remove duplicates due to two ends of the same insertion
summarise(pos = mean(pos), sv = ifelse(all(sv == "simple_insertion"), "simple_insertion", "unknown"))

is_in_mds42.df.prevgen.rmdup %>% head %>% print

## Note the number is based on MDS42 coordinates rather than the ancestor of MA.
print(paste0("Number of unique INS: ", nrow(is_in_mds42.df.prevgen.rmdup)))
# simple or not
print(paste0("Number of simple INS: ", sum(is_in_mds42.df.prevgen.rmdup$sv == "simple_insertion")))
# MA08 or not
print(paste0("Number of MA08 INS: ", sum(is_in_mds42.df.prevgen.rmdup$Gen == "MA08")))
""";

# A tibble: 6 × 5
# Groups:   Line, Gen [1]
  Line  Gen   insert_id      pos sv              
  <chr> <chr>     <dbl>    <dbl> <chr>           
1 L01-1 MA08          2  549440  unknown         
2 L01-1 MA08          3  554375  unknown         
3 L01-1 MA08          4  705281  simple_insertion
4 L01-1 MA08          8 1475270  unknown         
5 L01-1 MA08         11 1660676. simple_insertion
6 L01-1 MA08         12 1938080. simple_insertion
[1] "Number of unique INS: 910"
[1] "Number of simple INS: 364"
[1] "Number of MA08 INS: 481"


[33m[1m│ [22m[39m`.groups` argument.
[33m[1m└ [22m[39m[90m@ RCall ~/.julia/packages/RCall/gOwEW/src/io.jl:172[39m


### Import Gene data of MDS42

Note: Goodall 2018 mBio say PEC identified 300 essential genes, but the database contains 302 genes. Maybe the database was updated or a mistake in the paper. The paper is not citing the original source of the data nor the paper that did the analysis (but citing a related methods paper instead), so I cannot check. Here, I use 302 genes.

Source: 20240301 (but the number has been the same at least since 2021)
https://shigen.nig.ac.jp/ecoli/pec/download/files/PECData.dat

In [8]:
R""" # Here, the essential gene length does not consider overlap of essential genes.
mds42_genes <- read_csv("../misc/gene_df_ess_mds42_based_on_PEC.csv") %>%
	mutate(start = start_ori_shift, end = end_ori_shift) %>%
	select(-start_ori_shift, -end_ori_shift)
	# remove duplicated genes by selecting the one with longest length
	# (with group_by(gene) %>% filter(length == max(length)) %>% ungroup() 
mds42_genome_size = 3980425

mds42_genes %>% head %>% data.frame %>% print
mds42_ess_genes <- mds42_genes %>% filter(Essential_PEC)
mds42_ess_genes_length <- mds42_ess_genes %>% mutate(length = end - start +1 ) %>% pull(length)

print(paste0("MDS42 has ", mds42_genome_size, " bp genome size", " and ", nrow(mds42_genes), " genes", " with total gene length ", sum(mds42_genes$length), " bp"))
print(paste0("Genes are ", sum(mds42_genes$length)/mds42_genome_size*100, "% of the genome"))
print("(With some duplicates)")
print(paste0("MDS42 has ", nrow(mds42_ess_genes), " essential genes", " with total essential gene length ", sum(mds42_ess_genes_length), " bp"))
print(paste0("Essential genes are ", sum(mds42_ess_genes_length)/mds42_genome_size*100, "% of the genome"))
print("(No duplicateda genes were essential as the next cell)")
""";

  gene  start    end strand length bnumber  ECK_id Insertion.Index.Score
1 thrL 621429 621494      1     66   b0001 ECK0001             0.3939394
2 thrA 621576 624038      1   2463   b0002 ECK0002             0.2192448
3 thrB 624040 624972      1    933   b0003 ECK0003             0.2658092
4 thrC 624973 626259      1   1287   b0004 ECK0004             0.2059052
5 yaaX 626473 626769      1    297   b0005 ECK0005             0.2255892
6 yaaA 626922 627698     -1    777   b0006 ECK0006             0.2084942
  Log.Likelihood.Ratio Essential Non.essential Unclear Essential_PEC
1             31.43106     FALSE          TRUE   FALSE         FALSE
2             17.20078     FALSE          TRUE   FALSE         FALSE
3             21.23455     FALSE          TRUE   FALSE         FALSE
4             15.99492     FALSE          TRUE   FALSE         FALSE
5             17.76551     FALSE          TRUE   FALSE         FALSE
6             16.23103     FALSE          TRUE   FALSE         FALSE
[1] "M

[33m[1m│ [22m[39m── Column specification ────────────────────────────────────────────────────────
[33m[1m│ [22m[39mDelimiter: ","
[33m[1m│ [22m[39mchr (3): gene, bnumber, ECK_id
[33m[1m│ [22m[39mdbl (8): start, end, strand, length, Insertion Index Score, Log Likelihood R...
[33m[1m│ [22m[39mlgl (4): Essential, Non-essential, Unclear, Essential_PEC
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mℹ Use `spec()` to retrieve the full column specification for this data.
[33m[1m│ [22m[39mℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
[33m[1m└ [22m[39m[90m@ RCall ~/.julia/packages/RCall/gOwEW/src/io.jl:172[39m


## Analysis of overlap of genes
There shouldn't be much but there can be overlaps in essential genes.

In [9]:
## Do essential genes overlap?
@rget mds42_ess_genes
@rget mds42_genome_size

mds42_genome_size = Int(mds42_genome_size)
mds42_ess_depth = zeros(Int, mds42_genome_size)

for row in eachrow(mds42_ess_genes)
	mds42_ess_depth[Int(row.start):Int(row.end)] .= 1
end

# convert to range. DF ID, start end
mds42_ess_depth_range_df = DataFrame(ID = Int[], r_start = Int[], r_end = Int[])
id_ = 1
flag_ = 0
start_ = 1
for (i, v) in enumerate(mds42_ess_depth)
	if v == 1 && flag_ == 0
		flag_ = 1
		start_ = i
	elseif v == 0 && flag_ == 1
		flag_ = 0
		push!(mds42_ess_depth_range_df, [id_, start_, i-1])
		id_ += 1
	end
end
if flag_ == 1
	push!(mds42_ess_depth_range_df, [id_, start_, mds42_genome_size])
end

@rput mds42_ess_depth_range_df

R"""
mds42_ess_depth_range_df %>% head
print(paste("number of ranges:", nrow(mds42_ess_depth_range_df)))
print(paste("total length of ranges:", sum(mds42_ess_depth_range_df$r_end - mds42_ess_depth_range_df$r_start)))
"""

[1] "number of ranges: 274"
[1] "total length of ranges: 308314"


RObject{StrSxp}
[1] "total length of ranges: 308314"


## Found duplicated genes
Some genes have duplicated names

In [10]:
R"""
# find duplicates
mds42_genes %>% 
	select(gene, start, end, length, Essential_PEC) %>% 
	group_by(gene) %>%
	mutate(n = n()) %>%
	mutate(min_start = min(start), max_end = max(end)) %>%
	mutate(is_minmax = (start == min_start & end == max_end)) %>%
	filter(n > 1) %>%
	arrange(gene, -length) %>% ungroup() %>% data.frame %>% print
## all duplicates are non-essential genes

# LTEE
LTEE_genes %>% filter(feat_type == "gene") %>%
	select(gene, start, end) %>% 
	mutate(length = end - start +1 ) %>%
	group_by(gene) %>%
	mutate(n = n()) %>%
	mutate(min_start = min(start), max_end = max(end)) %>%
	mutate(is_minmax = (start == min_start & end == max_end)) %>%
	mutate(is_ess = gene %in% LTEE_ess$Name) %>%
	ungroup %>%
	filter(n > 1) %>%
	arrange(gene, -length) %>% data.frame %>% print
""";

   gene   start     end length Essential_PEC n min_start max_end is_minmax
1  arpB 2110191 2112090   1900         FALSE 3   2110146 2112090     FALSE
2  arpB 2110630 2112045   1416         FALSE 3   2110146 2112090     FALSE
3  arpB 2110146 2110619    474         FALSE 3   2110146 2112090     FALSE
4  gapC 1827789 1828789   1001         FALSE 3   1827744 1828789     FALSE
5  gapC 1827992 1828744    753         FALSE 3   1827744 1828789     FALSE
6  gapC 1827744 1827995    252         FALSE 3   1827744 1828789     FALSE
7  ilvG   24870   26514   1645         FALSE 3     24817   26514     FALSE
8  ilvG   24817   25800    984         FALSE 3     24817   26514     FALSE
9  ilvG   25880   26461    582         FALSE 3     24817   26514     FALSE
10 molR 2396135 2399930   3796         FALSE 4   2396090 2399930     FALSE
11 molR 2397026 2398963   1938         FALSE 4   2396090 2399930     FALSE
12 molR 2398938 2399885    948         FALSE 4   2396090 2399930     FALSE
13 molR 2396090 2396914  

Comment:
ldrD and yhhI is essential? It might be a mistake in the essential gene list by Couce et al. 2017
Their criteria is too loose. Even recA, B, C, dnaQ, ompF is considered essential.
Oh, I see their experiment was done under LTEE conditions.
In that case I may need to analyze Limdi's data under LB.
-> No Limdi's data is likely insufficient for this purpose as well.
Their data is only used for "differential" essentiality.


In [11]:
R"""
LTEE_genes_mds <- LTEE_genes %>% filter(feat_type == "gene") %>%
	left_join(mds42_genes %>% select(gene, bnumber, ECK_id, Essential_PEC), by = c("gene"))
LTEE_genes_mds %>% head %>% print

LTEE_ess_genes_mds <- LTEE_genes_mds %>% filter(Essential_PEC)
# genes identical and essential genes identical
print(paste("Out of", LTEE_gene_cnt, "genes in REL606,", nrow(LTEE_genes_mds), "genes are identical to MDS42",
"and", nrow(LTEE_ess_genes_mds), "are essential genes identical to MDS42"))
""";

# A tibble: 6 × 11
  seq_id start   end strand feat_id   feat_type gene  desc  bnumber ECK_id 
  <chr>  <dbl> <dbl>  <dbl> <chr>     <chr>     <chr> <chr> <chr>   <chr>  
1 REL606   190   255      1 ECB_00001 gene      thrL  <NA>  b0001   ECK0001
2 REL606   336  2798      1 ECB_00002 gene      thrA  <NA>  b0002   ECK0002
3 REL606  2800  3732      1 ECB_00003 gene      thrB  <NA>  b0003   ECK0003
4 REL606  3733  5019      1 ECB_00004 gene      thrC  <NA>  b0004   ECK0004
5 REL606  5232  5528      1 ECB_00005 gene      yaaX  <NA>  b0005   ECK0005
6 REL606  5681  6457     -1 ECB_00006 gene      yaaA  <NA>  b0006   ECK0006
# ℹ 1 more variable: Essential_PEC <lgl>
[1] "Out of 4386 genes in REL606, 4408 genes are identical to MDS42 and 279 are essential genes identical to MDS42"


[33m[1m│ [22m[39m  Detected an unexpected many-to-many relationship between `x` and `y`.
[33m[1m│ [22m[39mℹ Row 477 of `x` matches multiple rows in `y`.
[33m[1m│ [22m[39mℹ Row 628 of `y` matches multiple rows in `x`.
[33m[1m│ [22m[39mℹ If a many-to-many relationship is expected, set `relationship =
[33m[1m└ [22m[39m[90m@ RCall ~/.julia/packages/RCall/gOwEW/src/io.jl:172[39m


# Analysis of IS-devoid regions

Consuegra 2021.
> All regions that were nearly devoid of IS elements in the ancestor were colonized by IS elements in one or more populations, including the replication origin and termination regions, except for a 152-kbp region from 3.315 to 3.467 Mbp.


In [12]:
R"""
devoid.start <- 3315000 ## yhcP
devoid.end <- 3467000 ## yhgF
# get first two and last two in and around the devoid region
LTEE_genes %>% filter(start <= devoid.start, feat_type == "gene") %>% 
	arrange(start) %>% tail(2) %>% print
LTEE_genes %>% filter(start > devoid.start & end < devoid.end, feat_type == "gene") %>% 
	arrange(start) %>% head(2) %>% print
LTEE_genes %>% filter(start > devoid.start & end < devoid.end, , feat_type == "gene") %>% 
	arrange(desc(end)) %>% head(2) %>% print
LTEE_genes %>% filter(end >= devoid.end, feat_type == "gene") %>% 
	arrange(desc(end)) %>% tail(2) %>% print
""";

# A tibble: 2 × 8
  seq_id   start     end strand feat_id   feat_type gene  desc 
  <chr>    <dbl>   <dbl>  <dbl> <chr>     <chr>     <chr> <chr>
1 REL606 3313754 3314026     -1 ECB_03099 gene      yhcO  <NA> 
2 REL606 3314118 3316085     -1 ECB_03100 gene      yhcP  <NA> 
# A tibble: 2 × 8
  seq_id   start     end strand feat_id   feat_type gene  desc 
  <chr>    <dbl>   <dbl>  <dbl> <chr>     <chr>     <chr> <chr>
1 REL606 3316091 3317023     -1 ECB_03101 gene      yhcQ  <NA> 
2 REL606 3317031 3317234     -1 ECB_03102 gene      yhcR  <NA> 
# A tibble: 2 × 8
  seq_id   start     end strand feat_id   feat_type gene  desc 
  <chr>    <dbl>   <dbl>  <dbl> <chr>     <chr>     <chr> <chr>
1 REL606 3465496 3465972      1 ECB_03258 gene      greB  <NA> 
2 REL606 3464549 3465268     -1 ECB_03257 gene      ompR  <NA> 
# A tibble: 2 × 8
  seq_id   start     end strand feat_id   feat_type gene  desc 
  <chr>    <dbl>   <dbl>  <dbl> <chr>     <chr>     <chr> <chr>
1 REL606 3468828 3469055      1 

In [13]:
R"""
LTEE_INS_cnt 
"""

RObject{VecSxp}
# A tibble: 12 × 2
   Population   INS
   <chr>      <dbl>
 1 Ara+1       40  
 2 Ara+2       11  
 3 Ara+3        4  
 4 Ara+4        6  
 5 Ara+5       16  
 6 Ara+6        4  
 7 Ara–1       11  
 8 Ara–2       14.5
 9 Ara–3       31.5
10 Ara–4        2.5
11 Ara–5       23.5
12 Ara–6       21  


In [14]:
R"""
# Expected INS frequency per bp
LTEE.exp.all <- sum(LTEE_INS_cnt$INS) / LTEE_genome_size
LTEE.exp.ness <- sum(LTEE_INS_cnt$INS) / (LTEE_genome_size - sum(LTEE_ess_genes_length))
devoid.length.ness_ <- LTEE_ess_genes %>% filter(start > devoid.start & end < devoid.end) %>% 
	mutate(length = end - start + 1) %>% pull(length) %>% sum()
devoid.length.ness <- (devoid.end - devoid.start) - devoid.length.ness_
print(paste0("Expected INS frequency per bp: ", LTEE.exp.all, "; In devoid region: ",
	LTEE.exp.all * (devoid.end - devoid.start)))
print(paste0("In the ", (devoid.end - devoid.start)," bp devoid region, there are ",
	devoid.length.ness, " bp of non-essential sequence (", (devoid.length.ness/(devoid.end - devoid.start)*100), "%)")
)
print(paste0("Expected INS frequency per bp in non-essential region: ", LTEE.exp.ness, "; In devoid region: ",
	LTEE.exp.ness * (devoid.end - devoid.start)))
""";

[1] "Expected INS frequency per bp: 3.99584259576847e-05; In devoid region: 6.07368074556807"
[1] "In the 152000 bp devoid region, there are 124985 bp of non-essential sequence (82.2269736842105%)"
[1] "Expected INS frequency per bp in non-essential region: 4.48034424543374e-05; In devoid region: 6.81012325305929"


In [15]:
R"""
LTEE_ess_genes %>% filter(start > devoid.start & end < devoid.end) %>% mutate(length = end - start + 1)  %>% print
LTEE_ess_genes %>% filter(start > devoid.start & end < devoid.end) %>% mutate(length = end - start + 1) %>%
	pull(gene) %>% print
""";

# A tibble: 47 × 9
   seq_id   start     end strand feat_id   feat_type gene  desc  length
   <chr>    <dbl>   <dbl>  <dbl> <chr>     <chr>     <chr> <chr>  <dbl>
 1 REL606 3326278 3326766     -1 ECB_03108 gene      mreD  <NA>     489
 2 REL606 3326766 3327869     -1 ECB_03109 gene      mreC  <NA>    1104
 3 REL606 3327935 3328978     -1 ECB_03110 gene      mreB  <NA>    1044
 4 REL606 3333327 3333797      1 ECB_03113 gene      accB  <NA>     471
 5 REL606 3333808 3335157      1 ECB_03114 gene      accC  <NA>    1350
 6 REL606 3358748 3359320     -1 ECB_03133 gene      yrdC  <NA>     573
 7 REL606 3361595 3362104      1 ECB_03137 gene      def   <NA>     510
 8 REL606 3362119 3363066      1 ECB_03138 gene      fmt   <NA>     948
 9 REL606 3367521 3367904     -1 ECB_03145 gene      rplQ  <NA>     384
10 REL606 3367945 3368934     -1 ECB_03146 gene      rpoA  <NA>     990
# ℹ 37 more rows
 [1] "mreD" "mreC" "mreB" "accB" "accC" "yrdC" "def"  "fmt"  "rplQ" "rpoA"
[11] "rpsD" "rpsK" "rpsM"

#### Statistical test

In [16]:
R"""
## The number of INS=0 is significantly lower than expected from binomial distribution
binom.test(0, 12 * devoid.length.ness, LTEE.exp.ness/12, alternative = "less")
"""

RObject{VecSxp}

	Exact binomial test

data:  0 and 12 * devoid.length.ness
number of successes = 0, number of trials = 1499820, p-value = 0.003699
alternative hypothesis: true probability of success is less than 3.73362e-06
95 percent confidence interval:
 0.000000e+00 1.997393e-06
sample estimates:
probability of success 
                     0 



#### Analysis of this study

In [17]:
R"""
# Boundary genes
devoid.start.ma <- 3480424 ## aaeB = yhcP
devoid.end.ma <- 3615711 ## yhgF

# get first two and last two in and around the devoid region
mds42_genes %>% filter(start <= devoid.start.ma) %>% 
	arrange(start) %>% tail(2) %>% data.frame %>% print
mds42_genes %>% filter(start > devoid.start.ma & end < devoid.end.ma) %>%
	arrange(start) %>% head(2) %>% data.frame %>% print
mds42_genes %>% filter(start > devoid.start.ma & end < devoid.end.ma) %>%
	arrange(desc(end)) %>% head(2) %>% data.frame %>% print
mds42_genes %>% filter(end >= devoid.end.ma) %>%
	arrange(desc(end)) %>% tail(2) %>% data.frame %>% print
""";

  gene   start     end strand length bnumber  ECK_id Insertion.Index.Score
1 yhcO 3479178 3479450     -1    273   b3239 ECK3228             0.1465201
2 aaeB 3479542 3481509     -1   1968   b3240 ECK3229             0.2215447
  Log.Likelihood.Ratio Essential Non.essential Unclear Essential_PEC
1             10.22452     FALSE          TRUE   FALSE         FALSE
2             17.40612     FALSE          TRUE   FALSE         FALSE
  gene   start     end strand length bnumber  ECK_id Insertion.Index.Score
1 aaeA 3481515 3482447     -1    933   b3241 ECK3230             0.2636656
2 aaeX 3482455 3482658     -1    204   b3242 ECK3231             0.2549020
  Log.Likelihood.Ratio Essential Non.essential Unclear Essential_PEC
1             21.05394     FALSE          TRUE   FALSE         FALSE
2             20.31093     FALSE          TRUE   FALSE         FALSE
  gene   start     end strand length bnumber  ECK_id Insertion.Index.Score
1 greB 3614207 3614683      1    477   b3406 ECK3393         

There is quite a lot of insertion in the region.

In [18]:
R"""
MA.exp.all <- nrow(is_in_mds42.df.prevgen.rmdup) / mds42_genome_size # frequency genome wide / bp
MA.exp.ness <- nrow(is_in_mds42.df.prevgen.rmdup) / (mds42_genome_size - sum(mds42_ess_genes_length)) # frequency genome wide in noness (without caring overlaps)

devoid.length.ness.ma_ <- mds42_ess_genes %>% filter(start > devoid.start.ma & end < devoid.end.ma) %>% 
	mutate(length = end - start + 1) %>% pull(length) %>% sum()
devoid.length.ness.ma <- (devoid.end.ma - devoid.start.ma) - devoid.length.ness.ma_

print(paste0("Expected INS frequency per bp: ", MA.exp.all, "; In devoid region: ",
	MA.exp.all * (devoid.end.ma - devoid.start.ma)))
print(paste0("In the ", (devoid.end.ma - devoid.start.ma)," bp devoid region, there are ",
	devoid.length.ness.ma, " bp of non-essential sequence"))
print(paste0("Expected INS frequency per bp in non-essential region: ", MA.exp.ness, "; In devoid region: ",
	MA.exp.ness * devoid.length.ness.ma))

print(paste0("Observed INS frequency in devoid region: ",
	nrow(is_in_mds42.df.prevgen.rmdup %>% filter(pos >= devoid.start.ma & pos <= devoid.end.ma))))
""";

[1] "Expected INS frequency per bp: 0.000228618803268495; In devoid region: 30.9291520377849"
[1] "In the 135287 bp devoid region, there are 109775 bp of non-essential sequence"
[1] "Expected INS frequency per bp in non-essential region: 0.000247841666759721; In devoid region: 27.2068189685484"
[1] "Observed INS frequency in devoid region: 39"


## STAT: t-test raw genome size devoid region 

In [19]:
R"devoid.length.ness.ma"

RObject{RealSxp}
[1] 109775


In [20]:
R"""
binom.test(nrow(is_in_mds42.df.prevgen.rmdup %>% filter(pos >= devoid.start.ma & pos <= devoid.end.ma)), 
	#devoid.length.ness.ma * 44, MA.exp.ness / 44, alternative = "two.sided")
	devoid.length.ness.ma * 44, MA.exp.ness / 44, alternative = "less")
"""

RObject{VecSxp}

	Exact binomial test

data:  nrow(is_in_mds42.df.prevgen.rmdup %>% filter(pos >= devoid.start.ma & pos <= devoid.end.ma)) and devoid.length.ness.ma * 44
number of successes = 39, number of trials = 4830100, p-value = 0.9873
alternative hypothesis: true probability of success is less than 5.632765e-06
95 percent confidence interval:
 0.00000e+00 1.05463e-05
sample estimates:
probability of success 
          8.074367e-06 



In [21]:
# expected
R"""
(devoid.end.ma - devoid.start.ma) * MA.exp.all
"""

RObject{RealSxp}
[1] 30.92915


In [22]:
R"""
print((devoid.end.ma - devoid.start.ma) * MA.exp.all)
binom.test(nrow(is_in_mds42.df.prevgen.rmdup %>% filter(pos >= devoid.start.ma & pos <= devoid.end.ma)), 
	#devoid.length.ness.ma * 44, MA.exp.ness / 44, alternative = "two.sided")
	(devoid.end.ma - devoid.start.ma) * 44, # trials
	MA.exp.all/ 44, # expected ins / bp
	alternative = "less")
"""

[1] 30.92915


RObject{VecSxp}

	Exact binomial test

data:  nrow(is_in_mds42.df.prevgen.rmdup %>% filter(pos >= devoid.start.ma & pos <= devoid.end.ma)) and (devoid.end.ma - devoid.start.ma) * 44
number of successes = 39, number of trials = 5952628, p-value = 0.934
alternative hypothesis: true probability of success is less than 5.195882e-06
95 percent confidence interval:
 0.000000e+00 8.557512e-06
sample estimates:
probability of success 
          6.551728e-06 



In [23]:
R"""
# For instance deletions starting from an IS might be making a large contribution
is_in_mds42.df.prevgen.rmdup %>% filter(pos >= devoid.start.ma & pos <= devoid.end.ma) %>% data.frame %>%
	filter(startsWith(Line, "L05"))
"""

RObject{VecSxp}
   Line  Gen insert_id     pos      sv
1 L05-1 MA08        28 3503989 unknown
2 L05-1 MA08        29 3511933 unknown
3 L05-2 MA08        33 3510953 unknown
4 L05-2 MA08        34 3515914 unknown
5 L05-3 MA08        25 3505850 unknown
6 L05-3 MA08        26 3512685 unknown
7 L05-4 MA08        28 3511387 unknown
8 L05-4 MA08        29 3512873 unknown
9 L05-4 MA20        41 3509376 unknown


In [24]:
# limiting analysis to simple insertions still gives a significant difference
# rather unexpectedly
R"""
MA.exp.all.simple <- nrow(is_in_mds42.df.prevgen.rmdup %>% filter(sv == "simple_insertion")) / mds42_genome_size
MA.exp.ness.simple <- nrow(is_in_mds42.df.prevgen.rmdup %>% filter(sv == "simple_insertion")) / (mds42_genome_size - sum(mds42_ess_genes_length))

print(paste0("Expected simple INS frequency per bp: ", MA.exp.all.simple, "; In devoid region: ",
	MA.exp.all.simple * (devoid.end.ma - devoid.start.ma)))
print(paste0("In the ", (devoid.end.ma - devoid.start.ma)," bp devoid region, there are ",
	devoid.length.ness.ma, " bp of non-essential sequence (", devoid.length.ness.ma / (devoid.end.ma - devoid.start.ma) * 100, "%)")
)
print(paste0("Expected simple INS frequency per bp in non-essential region: ", MA.exp.ness.simple, "; In devoid region: ",
	MA.exp.ness.simple * devoid.length.ness.ma))
print(paste0("Observed simple INS frequency in devoid region: ",
	nrow(is_in_mds42.df.prevgen.rmdup %>% filter(pos >= devoid.start.ma & pos <= devoid.end.ma & sv == "simple_insertion"))))
""";

[1] "Expected simple INS frequency per bp: 9.14475213073981e-05; In devoid region: 12.371660815114"
[1] "In the 135287 bp devoid region, there are 109775 bp of non-essential sequence (81.1423122694716%)"
[1] "Expected simple INS frequency per bp in non-essential region: 9.91366667038883e-05; In devoid region: 10.8827275874193"
[1] "Observed simple INS frequency in devoid region: 12"


In [25]:
# 12 vs 10.8 so not significant as expected
R""" 
binom.test(nrow(is_in_mds42.df.prevgen.rmdup %>% filter(pos >= devoid.start.ma & pos <= devoid.end.ma, sv == "simple_insertion")),
	44*devoid.length.ness.ma, MA.exp.ness.simple/44, alternative = "two.sided")
"""

RObject{VecSxp}

	Exact binomial test

data:  nrow(is_in_mds42.df.prevgen.rmdup %>% filter(pos >= devoid.start.ma & pos <= devoid.end.ma, sv == "simple_insertion")) and 44 * devoid.length.ness.ma
number of successes = 12, number of trials = 4830100, p-value = 0.6493
alternative hypothesis: true probability of success is not equal to 2.253106e-06
95 percent confidence interval:
 1.283737e-06 4.339779e-06
sample estimates:
probability of success 
          2.484421e-06 



Considering the per base-pair rate of IS insertion in non-essential sequences,
the frequency in the zone in the LTEE is significantly lower than the average (Binomial test, one-sided, P = 3.7e-3).
In contrast, the corresponding zone in our study had a significantly higher (Binomial test, one-sided, P = 0.027) or insignificant (Binomial test, two-sided, P = 0.65) insertion frequency than the average.

Therefore, the lack of non-essential genes in our study likely does not explain the difference in IS insertion frequency in the region.
Besides, the locus is not that "rich" in essential genes.
Well, its double the average, but still just ~20 %.
The genome size changes is also not that large from 4.6 Mbp to 4.0 Mbp.

In [26]:
R"""
(sum(LTEE_INS_cnt$INS)/12 /50000) %>% print
(sum(LTEE_INS_cnt$INS)/12 / 50000/ 4.6e6 * 152000 * 1e-3  * 0.20)  %>% print
"""

[1] 0.0003083333
[1] 2.037681e-09


RObject{RealSxp}
[1] 2.037681e-09


## The rate and length of essential genes

Since the criterion of essential genes is different between the two studies,
the direct comparison does not make sense in reality.
Yet, the rate of essential genes (MDS42: 8 %, REL606: 11 %) is ~10 %,
and the difference in the size of the genome is not that large (4.6 Mbp to 4.0 Mbp)
in terms of ratio.
$N_e ~ 1e+7$ and $T ~ 5e+4$ in the LTEE, so fitness effects $s > 1e-4$ may be observed.
Given the transposition rate the difference in the essential gene ratio should be negligible.
What might make a differnece is the consecutive length of non-essential genes.

-> Plot consecutive length of non-essential genes in REL 606 and MDS42.

In [27]:
@rget mds42_genes
print(first(mds42_genes, 5))

@rget mds42_genome_size
mds42_ess_array = zeros(Int, Int(mds42_genome_size))
@rget mds42_ess_genes
for row in eachrow(mds42_ess_genes)
	if row.start > row.end
		println(row)
	end
	mds42_ess_array[Int(row.start):Int(row.end)] .= 1
end
# array to range: dataframe with start, end, count in range

mds42_ess_ranges = DataFrames.DataFrame(Start = Int[], End = Int[], length = Int[], Essential = Int[])
start = 1
cnt = 0
for (i, v) in enumerate(mds42_ess_array)
	if cnt == 0
		if v > 0
			push!(mds42_ess_ranges, (start, i-1, i-start+1, 0))
			start = i
			cnt = 1
		end
	else
		if v == 0
			push!(mds42_ess_ranges, (start, i-1, i-start+1, 1))
			start = i
			cnt = 0
		end
	end
end
push!(mds42_ess_ranges, (start, mds42_genome_size, mds42_genome_size-start+1, cnt))
first(mds42_ess_ranges, 5) |> print

## round ranges smaller than 300 bp
start = 1
max_length = mds42_ess_ranges.length[1]
threshold = 300
mds42_ess_ranges_rounded_ = mds42_ess_ranges
mds42_ess_ranges_rounded_.id = 1:length(mds42_ess_ranges_rounded_.Start)
mds42_ess_ranges_rounded_.rounded = mds42_ess_ranges_rounded_.id
mds42_ess_ranges_rounded_.Essential_rounded = mds42_ess_ranges_rounded_.Essential

start_range = 1
cnt = 0
for (i, v) in enumerate(mds42_ess_ranges.length)
	if v > threshold && cnt != mds42_ess_ranges.Essential[i]
		cnt = mds42_ess_ranges.Essential[i]
		start_range = i
	end
	mds42_ess_ranges_rounded_.rounded[i] = start_range
	mds42_ess_ranges_rounded_.Essential_rounded[i] = cnt
end
first(mds42_ess_ranges_rounded_, 5) |> print
print(size(mds42_ess_ranges_rounded_))

## merge ranges group by rounded nrow, start end essential name as N, Start, End, Essential 
mds42_ess_ranges_rounded = combine(groupby(mds42_ess_ranges_rounded_, :rounded), nrow, :Start => minimum, :End => maximum, :Essential => first)
# colnames 
mds42_ess_ranges_rounded = rename(mds42_ess_ranges_rounded, :nrow => :Count, :Start_minimum => :Start, :End_maximum => :End, :Essential_first => :Essential)
mds42_ess_ranges_rounded.length .= mds42_ess_ranges_rounded.End .- mds42_ess_ranges_rounded.Start .+ 1
print(first(mds42_ess_ranges_rounded, 20))

[1m5×13 DataFrame[0m
[1m Row [0m│[1m gene   [0m[1m start    [0m[1m end      [0m[1m strand  [0m[1m length  [0m[1m bnumber [0m[1m ECK_id  [0m[1m Insertion Index Score [0m[1m Log Likelihood Ratio [0m[1m Essential [0m[1m Non-essential [0m[1m Unclear [0m[1m Essential_PEC [0m
     │[90m String [0m[90m Float64  [0m[90m Float64  [0m[90m Float64 [0m[90m Float64 [0m[90m String  [0m[90m String  [0m[90m Float64?              [0m[90m Float64?             [0m[90m Bool?     [0m[90m Bool?         [0m[90m Bool?   [0m[90m Bool?         [0m
─────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   1 │ thrL    621429.0  621494.0      1.0     66.0  b0001    ECK0001               0.393939               31.4311      false           true    false          false
   2 │ thrA    621576.0  624038.0      1.0   2463.0  b0002    ECK0002               0

In [28]:
# same for REL606 
@rget LTEE_genes
@rget LTEE_genome_size
@rget LTEE_ess_genes
LTEE_genes = filter(row -> row.feat_type == "gene", LTEE_genes)

LTEE_ess_array = zeros(Int, Int(LTEE_genome_size))
for row in eachrow(LTEE_ess_genes)
	if row.start > row.end
		println(row)
	end
	LTEE_ess_array[Int(row.start):Int(row.end)] .= 1
end
LTEE_ess_ranges = DataFrames.DataFrame(Start = Int[], End = Int[], length = Int[], Essential = Int[])
start = 1
cnt = 0
for (i, v) in enumerate(LTEE_ess_array)
	if cnt == 0
		if v > 0
			push!(LTEE_ess_ranges, (start, i-1, i-start+1, 0))
			start = i
			cnt = 1
		end
	else
		if v == 0
			push!(LTEE_ess_ranges, (start, i-1, i-start+1, 1))
			start = i
			cnt = 0
		end
	end
end
push!(LTEE_ess_ranges, (start, LTEE_genome_size, LTEE_genome_size-start+1, cnt))
first(LTEE_ess_ranges, 5) |> print

## round ranges smaller than 300 bp
start = 1
max_length = LTEE_ess_ranges.length[1]
threshold = 300
LTEE_ess_ranges_rounded_ = LTEE_ess_ranges
LTEE_ess_ranges_rounded_.id = 1:length(LTEE_ess_ranges_rounded_.Start)
LTEE_ess_ranges_rounded_.rounded = LTEE_ess_ranges_rounded_.id
LTEE_ess_ranges_rounded_.Essential_rounded = LTEE_ess_ranges_rounded_.Essential
 
start_range = 1
cnt = 0
for (i, v) in enumerate(LTEE_ess_ranges.length)
	if v > threshold && cnt != LTEE_ess_ranges.Essential[i]
		cnt = LTEE_ess_ranges.Essential[i]
		start_range = i
	end
	LTEE_ess_ranges_rounded_.rounded[i] = start_range
	LTEE_ess_ranges_rounded_.Essential_rounded[i] = cnt
end
first(LTEE_ess_ranges_rounded_, 5) |> print
print(size(LTEE_ess_ranges_rounded_))

## merge ranges group by rounded nrow, start end essential name as N, Start, End, Essential
LTEE_ess_ranges_rounded = combine(groupby(LTEE_ess_ranges_rounded_, :rounded), nrow, :Start => minimum, :End => maximum, :Essential => first)
# colnames
LTEE_ess_ranges_rounded = rename(LTEE_ess_ranges_rounded, :nrow => :Count, :Start_minimum => :Start, :End_maximum => :End, :Essential_first => :Essential)
LTEE_ess_ranges_rounded.length .= LTEE_ess_ranges_rounded.End .- LTEE_ess_ranges_rounded.Start .+ 1
print(first(LTEE_ess_ranges_rounded, 20))

[1m5×4 DataFrame[0m
[1m Row [0m│[1m Start [0m[1m End   [0m[1m length [0m[1m Essential [0m
     │[90m Int64 [0m[90m Int64 [0m[90m Int64  [0m[90m Int64     [0m
─────┼─────────────────────────────────
   1 │     1   3732    3733          0
   2 │  3733   5019    1288          1
   3 │  5020  12160    7142          0
   4 │ 12161  14077    1918          1
   5 │ 14078  14165      89          0[1m5×7 DataFrame[0m
[1m Row [0m│[1m Start [0m[1m End   [0m[1m length [0m[1m Essential [0m[1m id    [0m[1m rounded [0m[1m Essential_rounded [0m
     │[90m Int64 [0m[90m Int64 [0m[90m Int64  [0m[90m Int64     [0m[90m Int64 [0m[90m Int64   [0m[90m Int64             [0m
─────┼────────────────────────────────────────────────────────────────────
   1 │     1   3732    3733          0      1        1                  0
   2 │  3733   5019    1288          1      2        2                  1
   3 │  5020  12160    7142          0      3        3            

In [29]:
# top large nonessential regions
threshold = 2698 
mds42_ess_ranges_rounded_noness_large = filter(row -> row.length > threshold && row.Essential == 0, mds42_ess_ranges_rounded)
sort(mds42_ess_ranges_rounded_noness_large, :length, rev=true) |> x -> first(x, 20) |> print
println()
print(size(mds42_ess_ranges_rounded_noness_large))
print(sum(mds42_ess_ranges_rounded_noness_large.length))

[1m20×6 DataFrame[0m
[1m Row [0m│[1m rounded [0m[1m Count [0m[1m Start   [0m[1m End     [0m[1m Essential [0m[1m length [0m
     │[90m Int64   [0m[90m Int64 [0m[90m Int64   [0m[90m Int64   [0m[90m Int64     [0m[90m Int64  [0m
─────┼─────────────────────────────────────────────────────
   1 │     269      1  1780738  2022999          0  242262
   2 │     209      3  1205036  1421425          0  216390
   3 │     299      1  2281669  2393915          0  112247
   4 │     515      1  3677730  3785260          0  107531
   5 │     157      1   828876   927642          0   98767
   6 │      55      3   348919   445170          0   96252
   7 │     231      3  1504673  1596270          0   91598
   8 │     311      1  2544351  2627509          0   83159
   9 │     189      3  1079436  1159955          0   80520
  10 │      19      1   124987   196636          0   71650
  11 │     375      1  3102042  3167310          0   65269
  12 │      45      1   263831   32727

In [30]:
ltee_ess_ranges_rounded_noness_large = filter(row -> row.length > threshold && row.Essential == 0, LTEE_ess_ranges_rounded)
sort(ltee_ess_ranges_rounded_noness_large, :length, rev=true) |> x -> first(x, 20) |> print
println()
print(size(ltee_ess_ranges_rounded_noness_large))
print(sum(ltee_ess_ranges_rounded_noness_large.length))

[1m20×6 DataFrame[0m
[1m Row [0m│[1m rounded [0m[1m Count [0m[1m Start   [0m[1m End     [0m[1m Essential [0m[1m length [0m
     │[90m Int64   [0m[90m Int64 [0m[90m Int64   [0m[90m Int64   [0m[90m Int64     [0m[90m Int64  [0m
─────┼─────────────────────────────────────────────────────
   1 │     239      1   760471   882541          0  122071
   2 │     973      1  4471628  4589471          0  117844
   3 │     619      1  2979668  3096915          0  117248
   4 │     119      1   248693   357660          0  108968
   5 │     377      9  1507170  1606612          0   99443
   6 │     787      7  3556203  3652785          0   96583
   7 │     939      3  4253595  4349745          0   96151
   8 │     455      7  2194891  2283492          0   88602
   9 │     279      1  1047760  1130230          0   82471
  10 │     371      3  1418525  1500276          0   81752
  11 │     643      3  3166962  3248010          0   81049
  12 │     825      1  3762866  383806

In [31]:
#IS-devoid regions
devoid_start = 3315000 ## yhcP
devoid_end = 3467000 ## yhgF

df1_ = filter(row -> (devoid_start <= row.Start <= devoid_end) .& (devoid_start <= row.End <= devoid_end), LTEE_ess_ranges_rounded)
df2_ = filter(row -> (devoid_start <= row.Start <= devoid_end) .& (devoid_start <= row.End <= devoid_end), ltee_ess_ranges_rounded_noness_large)
print(df1_)
print(df2_)
println()
println("REL606: In the ", devoid_end - devoid_start, " bp devoid region, there are ", sum(df1_[df1_.Essential .== 0, :length]),
	" bp of internal non-essential sequence",
	"(", sum(df1_[df1_.Essential .== 0, :length]) / (devoid_end - devoid_start) * 100, "%)")
print(sum(df2_.length), " bp of non-essential sequence is large")

[1m19×6 DataFrame[0m
[1m Row [0m│[1m rounded [0m[1m Count [0m[1m Start   [0m[1m End     [0m[1m Essential [0m[1m length [0m
     │[90m Int64   [0m[90m Int64 [0m[90m Int64   [0m[90m Int64   [0m[90m Int64     [0m[90m Int64  [0m
─────┼─────────────────────────────────────────────────────
   1 │     694      3  3326278  3328978          1    2701
   2 │     697      1  3328979  3333326          0    4348
   3 │     698      3  3333327  3335157          1    1831
   4 │     701      1  3335158  3358747          0   23590
   5 │     702      1  3358748  3359320          1     573
   6 │     703      1  3359321  3361594          0    2274
   7 │     704      3  3361595  3363066          1    1472
   8 │     707      1  3363067  3367520          0    4454
   9 │     708     49  3367521  3381175          1   13655
  10 │     757      1  3381176  3399304          0   18129
  11 │     758      9  3399305  3404345          1    5041
  12 │     767      1  3404346  341402

In [32]:
# same with mds42
devoid_start = 3480424 ## aaeB = yhcP
devoid_end = 3615711 ## yhgF

df1_ = filter(row -> (devoid_start <= row.Start <= devoid_end) .& (devoid_start <= row.End <= devoid_end), mds42_ess_ranges_rounded)
df2_ = filter(row -> (devoid_start <= row.Start <= devoid_end) .& (devoid_start <= row.End <= devoid_end), mds42_ess_ranges_rounded_noness_large)
print(df1_)
#print(df2_)
# sort by length and show the first 5
print(sort(df2_, :length, rev=true) |> x -> first(x, 5))

println()
println("MDS42: In the ", devoid_end - devoid_start, " bp devoid region, there are ", sum(df1_[df1_.Essential .== 0, :length]),
	" bp of internal non-essential sequence",
	"(", sum(df1_[df1_.Essential .== 0, :length]) / (devoid_end - devoid_start) * 100, "%)")
print(sum(df2_.length), " bp of non-essential sequence is large")

mds42_ess_ranges_rounded_ = filter(row -> (devoid_start <= row.Start <= devoid_end) .| (devoid_start <= row.End <= devoid_end), mds42_ess_ranges_rounded)
mds42_ess_ranges_rounded_[mds42_ess_ranges_rounded_.Start .<= devoid_start, :Start] .= devoid_start
mds42_ess_ranges_rounded_[mds42_ess_ranges_rounded_.End .>= devoid_end, :End] .= devoid_end
mds42_ess_ranges_rounded_.length = mds42_ess_ranges_rounded_.End .- mds42_ess_ranges_rounded_.Start .+ 1

mds42_ess_ranges_rounded_noness_large_ = filter(row -> (devoid_start <= row.Start <= devoid_end) .& (devoid_start <= row.End <= devoid_end), mds42_ess_ranges_rounded_noness_large)
mds42_ess_ranges_rounded_noness_large_[mds42_ess_ranges_rounded_noness_large_.Start .<= devoid_start, :Start] .= devoid_start
mds42_ess_ranges_rounded_noness_large_[mds42_ess_ranges_rounded_noness_large_.End .>= devoid_end, :End] .= devoid_end
mds42_ess_ranges_rounded_noness_large_.length = mds42_ess_ranges_rounded_noness_large_.End .- mds42_ess_ranges_rounded_noness_large_.Start .+ 1
println()
println("MDS42: In the ", devoid_end - devoid_start, " bp devoid region, there are ", sum(mds42_ess_ranges_rounded_[mds42_ess_ranges_rounded_.Essential .== 0, :length]),
	" bp of  non-essential sequence",
	"(", sum(mds42_ess_ranges_rounded_[mds42_ess_ranges_rounded_.Essential .== 0, :length]) / (devoid_end - devoid_start) * 100, "%)")
print(sum(mds42_ess_ranges_rounded_noness_large_.length), " bp of non-essential sequence is large",
	"(", sum(mds42_ess_ranges_rounded_noness_large_.length) / (devoid_end - devoid_start) * 100, "%)")

[1m15×6 DataFrame[0m
[1m Row [0m│[1m rounded [0m[1m Count [0m[1m Start   [0m[1m End     [0m[1m Essential [0m[1m length [0m
     │[90m Int64   [0m[90m Int64 [0m[90m Int64   [0m[90m Int64   [0m[90m Int64     [0m[90m Int64  [0m
─────┼─────────────────────────────────────────────────────
   1 │     436      3  3491708  3494408          1    2701
   2 │     439      1  3494409  3498756          0    4348
   3 │     440      3  3498757  3500587          1    1831
   4 │     443      1  3500588  3524163          0   23576
   5 │     444      1  3524164  3524736          1     573
   6 │     445      1  3524737  3527010          0    2274
   7 │     446      3  3527011  3528482          1    1472
   8 │     449      1  3528483  3532936          0    4454
   9 │     450     47  3532937  3546591          1   13655
  10 │     497      1  3546592  3548794          0    2203
  11 │     498      5  3548795  3551947          1    3153
  12 │     503      1  3551948  359002

## Same analysis using PEC (used in paper)
Since the essential genes of Couche and Limdi used above are overestimated, here I will use the essential genes of PEC.

- Using data curated by Limdi assign essentiality based on bnumbers
- Using data curated by Barrick assign essentiality based on gene names
- Some bnumbers of Limdi seem to assign orthologs with same gene names, remove duplicate genes manually based on sequence order and position.

In [33]:
R"""
limdi_ltee_ann <- read_tsv("../misc/LimdiLTEE2024/Metadata/all_metadata_REL606.txt")
colnames(limdi_ltee_ann) <- c("gene", "prokka", "bnumber", "start", "end", "strand", "uniprot", "protein")
head(limdi_ltee_ann) %>% print
""";

# A tibble: 6 × 8
  gene  prokka         bnumber start   end strand uniprot protein               
  <chr> <chr>          <chr>   <dbl> <dbl>  <dbl> <chr>   <chr>                 
1 thrA  FJKNNBLA_00001 b0002     336  2798      1 P00561  fused aspartate kinas…
2 thrB  FJKNNBLA_00002 b0003    2800  3732      1 P00547  homoserine kinase     
3 thrC  FJKNNBLA_00003 b0004    3733  5019      1 P00934  threonine synthase    
4 yaaX  FJKNNBLA_00004 b0005    5232  5528      1 P75616  DUF2502 domain-contai…
5 yaaA  FJKNNBLA_00005 b0006    6457  5681     -1 P0A8I3  peroxide stress resis…
6 yaaJ  FJKNNBLA_00006 b0007    7957  6527     -1 P30143  putative transporter …


[33m[1m│ [22m[39m── Column specification ────────────────────────────────────────────────────────
[33m[1m│ [22m[39mDelimiter: "\t"
[33m[1m│ [22m[39mchr (5): Gene Name, Locus Tag (prokka_output), Locus Tag (K12 reference), Un...
[33m[1m│ [22m[39mdbl (3): Start of Gene, End of Gene, Strand
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mℹ Use `spec()` to retrieve the full column specification for this data.
[33m[1m│ [22m[39mℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
[33m[1m└ [22m[39m[90m@ RCall ~/.julia/packages/RCall/gOwEW/src/io.jl:172[39m


In [34]:
# From 20230923_create_gene_list_from_genbank.ipynb
#pec_kato = pd.read_csv("../dat/shigen.nig.ac.jp_ecoli_pec_download_files_PECData.dat.txt", sep="\t")
#pec_kato['b_numbers'] = pec_kato['Alternative name'].str.extract(r'(b\d+)')
#pec_kato['ECK_id'] = pec_kato['Alternative name'].str.extract(r'(ECK\d+)')
#pec_kato['Essential_PEC'] = pec_kato['Class(1:essential 2:noessential 3:unknown)'].apply(lambda x: True if x==1 else False)

R"""
pec_kato <- read_tsv("../misc/shigen.nig.ac.jp_ecoli_pec_download_files_PECData.dat.txt")
pec_kato <- pec_kato %>%
  mutate(bnumber = str_extract(`Alternative name`, "b\\d+"),
         ECK_id = str_extract(`Alternative name`, "ECK\\d+")) %>%
		 mutate(Essential_PEC = if_else(`Class(1:essential 2:noessential 3:unknown)` == 1, TRUE, FALSE))
pec_kato %>% head %>% print

limdi_ltee_PEC <- limdi_ltee_ann %>% select(-prokka, -protein) %>%
	left_join(pec_kato %>% select(bnumber, ECK_id, Essential_PEC) %>%
		filter(Essential_PEC))
limdi_ltee_PEC %>% head %>% print

print(paste("Out of ", length(pec_kato %>% filter(Essential_PEC) %>% pull(bnumber) %>% unique),
	"essential genes in PEC,", length(limdi_ltee_PEC %>% filter(Essential_PEC) %>% pull(bnumber) %>% unique),
	"were detected"))

limdi_ltee_PEC_ <- limdi_ltee_ann %>% select(-prokka, -protein) %>%
	right_join(pec_kato %>% select(bnumber, ECK_id, Essential_PEC, Orf, `Start(BP)`) %>%
		filter(Essential_PEC))
print("There are some genes missing")
limdi_ltee_PEC_ %>% filter(is.na(gene)) %>% select(-uniprot, end) %>% print

# add genes based on names too
missing_genes <- limdi_ltee_PEC_ %>% filter(is.na(gene)) %>% select(Orf)
found_in_couce <- LTEE_genes_couce %>% filter(feat_type == "gene") %>% filter(gene %in% missing_genes$Orf) %>% 
	pull(gene) %>% unique
still_missing <- setdiff(missing_genes$Orf, found_in_couce)
print(paste("Out of ", length(missing_genes$Orf), "missing genes,", length(found_in_couce), "were found in Couce",
	"and", length(still_missing), "are still missing"))
print(pec_kato %>% filter(Orf %in% still_missing) %>% select(Orf, `Start(BP)`, `Length(BP)`) %>% print)
""";

[33m[1m│ [22m[39m── Column specification ────────────────────────────────────────────────────────
[33m[1m│ [22m[39mDelimiter: "\t"
[33m[1m│ [22m[39mchr (4): Orf, Alternative name, Product, PMID
[33m[1m│ [22m[39mdbl (9): Orf ID, Feature Type(1:gene 2:rRNA 3:tRNA 4:ncRNA 7:tmRNA 8:sRNA), ...
[33m[1m│ [22m[39m
[33m[1m│ [22m[39mℹ Use `spec()` to retrieve the full column specification for this data.
[33m[1m│ [22m[39mℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
[33m[1m└ [22m[39m[90m@ RCall ~/.julia/packages/RCall/gOwEW/src/io.jl:172[39m


# A tibble: 6 × 16
  `Orf ID` Feature Type(1:gene 2:rRNA 3:t…¹ Orf   `Alternative name` MapPosition
     <dbl>                            <dbl> <chr> <chr>                    <dbl>
1        1                                1 thrL  b0001,ECK0001,JW4…     0.00410
2        2                                1 thrA  b0002,ECK0002,Hs,…     0.00726
3        3                                1 thrB  b0003,ECK0003,JW0…     0.0604 
4        4                                1 thrC  b0004,ECK0004,JW0…     0.0805 
5        5                                1 yaaX  b0005,ECK0005,JW0…     0.113  
6        6                                1 yaaA  b0006,ECK0006,JW0…     0.122  
# ℹ abbreviated name:
#   ¹​`Feature Type(1:gene 2:rRNA 3:tRNA 4:ncRNA 7:tmRNA 8:sRNA)`
# ℹ 11 more variables: `Start(BP)` <dbl>, `End(BP)` <dbl>, `Length(BP)` <dbl>,
#   `Direction(0:+ 1:-)` <dbl>,
#   `Class(1:essential 2:noessential 3:unknown)` <dbl>, PID <dbl>,
#   Product <chr>, PMID <chr>, bnumber <chr>, ECK_id <chr>, Essenti

[33m[1m└ [22m[39m[90m@ RCall ~/.julia/packages/RCall/gOwEW/src/io.jl:172[39m
[33m[1m└ [22m[39m[90m@ RCall ~/.julia/packages/RCall/gOwEW/src/io.jl:172[39m


In [35]:
R"""
LTEE_ess <- limdi_ltee_PEC %>% filter(Essential_PEC) %>% pull(gene)
LTEE_ess <- unique(c(LTEE_ess, found_in_couce))
LTEE_ess <- data.frame(Name = LTEE_ess)
LTEE_genes <- limdi_ltee_PEC %>% mutate(feat_type = "gene") %>% mutate(start_ = ifelse(start < end, start, end), end_ = ifelse(start < end, end, start)) %>% select(-start, -end) %>% rename(start = start_, end = end_)
LTEE_genes <- rbind(LTEE_genes %>% select(gene, start, end, feat_type),
	LTEE_genes_couce %>% filter(feat_type == "gene", gene %in% found_in_couce) %>%
	select(gene, start, end, feat_type))
## Manually add genes. ffs and rnpB were annotated in MDS42 but not in REL606.
## for yceQ the gene name was different.
LTEE_genes <- LTEE_genes %>% rbind(
	# ffs: 448044, 450061;  yceQ = ECB_01081; rnpB: 1159071, 1159391
	data.frame(gene = c("ffs", "rnpB"), start = c(448044, 1159071), end = c(450061, 1159391), feat_type = "gene")
) %>%
rbind(
	LTEE_genes_couce %>% filter(feat_type == "gene", gene %in% c("ECB_01081")) %>%
		select(gene, start, end, feat_type)
)
LTEE_ess <- LTEE_ess %>% rbind(data.frame(Name = c("ffs", "rnpB", "ECB_01081")))
LTEE_ess_pec <- LTEE_ess
LTEE_genes_pec <- LTEE_genes

#LTEE_genome_size <- LTEE_genes %>% filter(end - start > 4e6) %>% pull(end) %>% max()
LTEE_gene_cnt <- LTEE_genes %>% filter(feat_type == "gene") %>% nrow()
LTEE_gene_length <- LTEE_genes %>% filter(feat_type == "gene") %>% mutate(length = end - start +1 ) %>% pull(length)
LTEE_ess_genes <- LTEE_genes %>% filter(feat_type == "gene" & gene %in% LTEE_ess$Name)
LTEE_ess_genes_length <- LTEE_ess_genes %>% mutate(length = end - start +1 ) %>% pull(length)
# genome size gene cnt, total gene length and essential gene cnt total ess gene length ess gene % of total genom
print(paste0("LTEE has ", LTEE_genome_size, " bp genome size", " and ", LTEE_gene_cnt, " genes", " with total gene length ", sum(LTEE_gene_length), " bp"))
print(paste0("Genes are ", sum(LTEE_gene_length)/LTEE_genome_size*100, "% of the genome"))
print(paste0("LTEE has ", nrow(LTEE_ess_genes), "(", LTEE_ess %>% nrow, ") essential genes", " with total essential gene length ", sum(LTEE_ess_genes_length), " bp"))
print(paste0("Essential genes are ", sum(LTEE_ess_genes_length)/LTEE_genome_size*100, "% of the genome"))
""";

[1] "LTEE has 4629812 bp genome size and 4034 genes with total gene length 3906825 bp"
[1] "Genes are 84.3840959416927% of the genome"
[1] "LTEE has 307(302) essential genes with total essential gene length 316209 bp"
[1] "Essential genes are 6.82984535873163% of the genome"


Remove duplicated essential genes

In [36]:
R"""
#There seems to be some duplicated essential genes
# thereseems to be difference in annotation. for instance, gapC was annotated as gapA in limdi data.
LTEE_ess_genes %>% group_by(gene) %>% mutate(n = n()) %>% filter(n > 1) %>% arrange(gene, -start) %>%
	ungroup() %>% mutate(length = end - start + 1) %>% select(gene, start, end, length, n) %>% print
# lets change the names of orthologs in limdi curated data according to names in REL606.gbk
# gapA -> gapC length 1002; acpP -> ECB_02853 length 249; 
# kdsB -> kpsU length 741; lptG -> ECB_02855 1080;
# make polA unique
LTEE_ess_genes <- LTEE_ess_genes %>% mutate(length = end - start + 1) %>%
	mutate(gene = ifelse(gene == "gapA" & length == 1002, "gapC", gene)) %>%
	mutate(gene = ifelse(gene == "acpP" & length == 249, "ECB_02853", gene)) %>%
	mutate(gene = ifelse(gene == "kdsB" & length == 741, "kpsU", gene)) %>%
	mutate(gene = ifelse(gene == "lptG" & length == 1080, "ECB_02855", gene)) %>%
	# drop duplicated polA
	distinct(gene, start, end, .keep_all = TRUE) %>%
	filter(!(gene %in% c("gapC", "ECB_02853", "kpsU", "ECB_02855")))

print("After changing the names")
LTEE_ess_genes %>% group_by(gene) %>% mutate(n = n()) %>% filter(n > 1) %>% arrange(gene, -start) %>%
	ungroup() %>% mutate(length = end - start + 1) %>% select(gene, start, end, length, n) %>% print

write_csv(LTEE_ess_genes %>% select(-feat_type), "../misc/output/REL606_ess_genes.csv")
""";

# A tibble: 10 × 5
   gene    start     end length     n
   <chr>   <dbl>   <dbl>  <dbl> <int>
 1 acpP  3060118 3060366    249     2
 2 acpP  1166184 1166420    237     2
 3 gapA  1840000 1840995    996     2
 4 gapA  1459044 1460045   1002     2
 5 kdsB  3016266 3017006    741     2
 6 kdsB   987944  988690    747     2
 7 lptG  4470545 4471627   1083     2
 8 lptG  3061573 3062652   1080     2
 9 polA  4025263 4028049   2787     2
10 polA  4025263 4028049   2787     2
[1] "After changing the names"
# A tibble: 0 × 5
# ℹ 5 variables: gene <chr>, start <dbl>, end <dbl>, length <dbl>, n <int>


In [37]:
R"""
#LTEE_genome_size <- LTEE_genes %>% filter(end - start > 4e6) %>% pull(end) %>% max()
#LTEE_gene_cnt <- LTEE_genes %>% filter(feat_type == "gene") %>% nrow()
#LTEE_gene_length <- LTEE_genes %>% filter(feat_type == "gene") %>% mutate(length = end - start +1 ) %>% pull(length)
#LTEE_ess_genes <- LTEE_genes %>% filter(feat_type == "gene" & gene %in% LTEE_ess$Name)
LTEE_ess_genes_length <- LTEE_ess_genes %>% mutate(length = end - start +1 ) %>% pull(length)
# genome size gene cnt, total gene length and essential gene cnt total ess gene length ess gene % of total genom
print(paste0("LTEE has ", LTEE_genome_size, " bp genome size", " and ", LTEE_gene_cnt, " genes", " with total gene length ", sum(LTEE_gene_length), " bp"))
print(paste0("Genes are ", sum(LTEE_gene_length)/LTEE_genome_size*100, "% of the genome"))
print(paste0("LTEE has ", nrow(LTEE_ess_genes), "(", LTEE_ess %>% nrow, ") essential genes", " with total essential gene length ", sum(LTEE_ess_genes_length), " bp"))
print(paste0("Essential genes are ", sum(LTEE_ess_genes_length)/LTEE_genome_size*100, "% of the genome"))
""";

[1] "LTEE has 4629812 bp genome size and 4034 genes with total gene length 3906825 bp"
[1] "Genes are 84.3840959416927% of the genome"
[1] "LTEE has 302(302) essential genes with total essential gene length 310350 bp"
[1] "Essential genes are 6.70329594376618% of the genome"


In [38]:
R""" # rough analysis
# Expected INS frequency per bp
LTEE.exp.all <- sum(LTEE_INS_cnt$INS) / LTEE_genome_size
LTEE.exp.ness <- sum(LTEE_INS_cnt$INS) / (LTEE_genome_size - sum(LTEE_ess_genes_length))
devoid.length.ness_ <- LTEE_ess_genes %>% filter(start > devoid.start & end < devoid.end) %>% 
	mutate(length = end - start + 1) %>% pull(length) %>% sum()
devoid.length.ness <- (devoid.end - devoid.start) - devoid.length.ness_
print(paste0("Expected INS frequency per bp: ", LTEE.exp.all, "; In devoid region: ",
	LTEE.exp.all * (devoid.end - devoid.start)))
print(paste0("In the ", (devoid.end - devoid.start)," bp devoid region, there are ",
	devoid.length.ness, " bp of non-essential sequence (", (devoid.length.ness/(devoid.end - devoid.start)*100), "%)")
)
print(paste0("Expected INS frequency per bp in non-essential region: ", LTEE.exp.ness, "; In devoid region: ",
	LTEE.exp.ness * (devoid.end - devoid.start)))

binom.test(0, 12 * devoid.length.ness, LTEE.exp.ness/12, alternative = "less")
"""

[1] "Expected INS frequency per bp: 3.99584259576847e-05; In devoid region: 6.07368074556807"
[1] "In the 152000 bp devoid region, there are 126569 bp of non-essential sequence (83.2690789473684%)"
[1] "Expected INS frequency per bp in non-essential region: 4.28294079216347e-05; In devoid region: 6.51007000408847"


RObject{VecSxp}

	Exact binomial test

data:  0 and 12 * devoid.length.ness
number of successes = 0, number of trials = 1518828, p-value = 0.004423
alternative hypothesis: true probability of success is less than 3.569117e-06
95 percent confidence interval:
 0.000000e+00 1.972395e-06
sample estimates:
probability of success 
                     0 



In [39]:
# same for REL606 
@rget LTEE_genes
@rget LTEE_genome_size
@rget LTEE_ess_genes
LTEE_genes = filter(row -> row.feat_type == "gene", LTEE_genes)

LTEE_ess_array = zeros(Int, Int(LTEE_genome_size))
for row in eachrow(LTEE_ess_genes)
	if row.start > row.end
		println(row)
	end
	LTEE_ess_array[Int(row.start):Int(row.end)] .= 1
end
LTEE_ess_ranges = DataFrames.DataFrame(Start = Int[], End = Int[], length = Int[], Essential = Int[])
start = 1
cnt = 0
for (i, v) in enumerate(LTEE_ess_array)
	if cnt == 0
		if v > 0
			push!(LTEE_ess_ranges, (start, i-1, i-start+1, 0))
			start = i
			cnt = 1
		end
	else
		if v == 0
			push!(LTEE_ess_ranges, (start, i-1, i-start+1, 1))
			start = i
			cnt = 0
		end
	end
end
push!(LTEE_ess_ranges, (start, LTEE_genome_size, LTEE_genome_size-start+1, cnt))
first(LTEE_ess_ranges, 5) |> print

## round ranges smaller than 300 bp
start = 1
max_length = LTEE_ess_ranges.length[1]
threshold = 300
LTEE_ess_ranges_rounded_ = LTEE_ess_ranges
LTEE_ess_ranges_rounded_.id = 1:length(LTEE_ess_ranges_rounded_.Start)
LTEE_ess_ranges_rounded_.rounded = LTEE_ess_ranges_rounded_.id
LTEE_ess_ranges_rounded_.Essential_rounded = LTEE_ess_ranges_rounded_.Essential
 
start_range = 1
cnt = 0
for (i, v) in enumerate(LTEE_ess_ranges.length)
	if v > threshold && cnt != LTEE_ess_ranges.Essential[i]
		cnt = LTEE_ess_ranges.Essential[i]
		start_range = i
	end
	LTEE_ess_ranges_rounded_.rounded[i] = start_range
	LTEE_ess_ranges_rounded_.Essential_rounded[i] = cnt
end
first(LTEE_ess_ranges_rounded_, 5) |> print
print(size(LTEE_ess_ranges_rounded_))

## merge ranges group by rounded nrow, start end essential name as N, Start, End, Essential
LTEE_ess_ranges_rounded = combine(groupby(LTEE_ess_ranges_rounded_, :rounded), nrow, :Start => minimum, :End => maximum, :Essential => first)
# colnames
LTEE_ess_ranges_rounded = rename(LTEE_ess_ranges_rounded, :nrow => :Count, :Start_minimum => :Start, :End_maximum => :End, :Essential_first => :Essential)
LTEE_ess_ranges_rounded.length .= LTEE_ess_ranges_rounded.End .- LTEE_ess_ranges_rounded.Start .+ 1
print(first(LTEE_ess_ranges_rounded, 5))


[1m5×4 DataFrame[0m
[1m Row [0m│[1m Start [0m[1m End   [0m[1m length [0m[1m Essential [0m
     │[90m Int64 [0m[90m Int64 [0m[90m Int64  [0m[90m Int64     [0m
─────┼─────────────────────────────────
   1 │     1  25478   25479          0
   2 │ 25479  26420     943          1
   3 │ 26421  26462      43          0
   4 │ 26463  29773    3312          1
   5 │ 29774  30348     576          0[1m5×7 DataFrame[0m
[1m Row [0m│[1m Start [0m[1m End   [0m[1m length [0m[1m Essential [0m[1m id    [0m[1m rounded [0m[1m Essential_rounded [0m
     │[90m Int64 [0m[90m Int64 [0m[90m Int64  [0m[90m Int64     [0m[90m Int64 [0m[90m Int64   [0m[90m Int64             [0m
─────┼────────────────────────────────────────────────────────────────────
   1 │     1  25478   25479          0      1        1                  0
   2 │ 25479  26420     943          1      2        2                  1
   3 │ 26421  26462      43          1      2        2            

In [40]:
ltee_ess_ranges_rounded_noness_large = filter(row -> row.length > threshold && row.Essential == 0, LTEE_ess_ranges_rounded)
sort(ltee_ess_ranges_rounded_noness_large, :length, rev=true) |> x -> first(x, 5) |> print
println()
print(size(ltee_ess_ranges_rounded_noness_large))
print(sum(ltee_ess_ranges_rounded_noness_large.length))

[1m5×6 DataFrame[0m
[1m Row [0m│[1m rounded [0m[1m Count [0m[1m Start   [0m[1m End     [0m[1m Essential [0m[1m length [0m
     │[90m Int64   [0m[90m Int64 [0m[90m Int64   [0m[90m Int64   [0m[90m Int64     [0m[90m Int64  [0m
─────┼─────────────────────────────────────────────────────
   1 │     189      1  1349409  1693141          0  343733
   2 │     129      3   693561   950281          0  256721
   3 │     219      1  1970811  2145496          0  174686
   4 │      77      1   221617   357660          0  136044
   5 │     109      3   529850   647784          0  117935
(157, 6)4315656

In [41]:
filter(row -> row.Essential != 0, LTEE_ess_ranges_rounded).length |> sum

314156

In [42]:
#IS-devoid regions
devoid_start = 3315000 ## yhcP
devoid_end = 3467000 ## yhgF

df1_ = filter(row -> (devoid_start <= row.Start <= devoid_end) .& (devoid_start <= row.End <= devoid_end), LTEE_ess_ranges_rounded)
df2_ = filter(row -> (devoid_start <= row.Start <= devoid_end) .& (devoid_start <= row.End <= devoid_end), ltee_ess_ranges_rounded_noness_large)
print(df1_)
#print(df2_)
# sort by length and show the first 5
print(sort(df2_, :length, rev=true) |> x -> first(x, 5))
println()
devoid_length_REL = devoid_end - devoid_start
devoid_length_REL_ness = sum(df1_[df1_.Essential .== 0, :length])
println("REL606: In the ", devoid_length_REL, " bp devoid region, there are ",
	devoid_length_REL_ness,
	" bp of internal non-essential sequence",
	"(", sum(df1_[df1_.Essential .== 0, :length]) / (devoid_end - devoid_start) * 100, "%)")
print(sum(df2_.length), " bp of non-essential sequence is large")


LTEE_ess_ranges_rounded_ = filter(row -> (devoid_start <= row.Start <= devoid_end) .| (devoid_start <= row.End <= devoid_end), LTEE_ess_ranges_rounded)
LTEE_ess_ranges_rounded_[LTEE_ess_ranges_rounded_.Start .< devoid_start, :Start] .= devoid_start
LTEE_ess_ranges_rounded_[LTEE_ess_ranges_rounded_.End .> devoid_end, :End] .= devoid_end
LTEE_ess_ranges_rounded_.length .= LTEE_ess_ranges_rounded_.End .- LTEE_ess_ranges_rounded_.Start .+ 1

ltee_ess_ranges_rounded_noness_large_ = filter(row -> (devoid_start <= row.Start <= devoid_end) .| (devoid_start <= row.End <= devoid_end), ltee_ess_ranges_rounded_noness_large)
ltee_ess_ranges_rounded_noness_large_[ltee_ess_ranges_rounded_noness_large_.Start .< devoid_start, :Start] .= devoid_start
ltee_ess_ranges_rounded_noness_large_[ltee_ess_ranges_rounded_noness_large_.End .> devoid_end, :End] .= devoid_end
ltee_ess_ranges_rounded_noness_large_.length .= ltee_ess_ranges_rounded_noness_large_.End .- ltee_ess_ranges_rounded_noness_large_.Start .+ 1
println()

devoid_length_REL_ness_rounded = sum(LTEE_ess_ranges_rounded_[LTEE_ess_ranges_rounded_.Essential .== 0, :length])
println("REL606: In the ", devoid_length_REL, " bp devoid region, there are ",
	devoid_length_REL_ness_rounded,
	" bp of non-essential sequence",
	"( ", devoid_length_REL_ness_rounded / devoid_length_REL * 100, "%)")
print(sum(ltee_ess_ranges_rounded_noness_large_.length), " bp of non-essential sequence is large",
	"(", sum(ltee_ess_ranges_rounded_noness_large_.length) / (devoid_end - devoid_start) * 100, "%)")

[1m15×6 DataFrame[0m
[1m Row [0m│[1m rounded [0m[1m Count [0m[1m Start   [0m[1m End     [0m[1m Essential [0m[1m length [0m
     │[90m Int64   [0m[90m Int64 [0m[90m Int64   [0m[90m Int64   [0m[90m Int64     [0m[90m Int64  [0m
─────┼─────────────────────────────────────────────────────
   1 │     354      3  3326278  3328978          1    2701
   2 │     357      1  3328979  3333326          0    4348
   3 │     358      3  3333327  3335157          1    1831
   4 │     361      1  3335158  3358747          0   23590
   5 │     362      1  3358748  3359320          1     573
   6 │     363      1  3359321  3361594          0    2274
   7 │     364      3  3361595  3363066          1    1472
   8 │     367      1  3363067  3367520          0    4454
   9 │     368     47  3367521  3381175          1   13655
  10 │     415      1  3381176  3399304          0   18129
  11 │     416      5  3399305  3402457          1    3153
  12 │     421      1  3402458  344131

# STAT: Comparison with T-test

In [43]:
@rput mds42_ess_ranges_rounded_noness_large
@rput mds42_ess_ranges_rounded_
R"""
mds42.exp.ness <- nrow(is_in_mds42.df.prevgen.rmdup) / sum(mds42_ess_ranges_rounded_noness_large$length)/44
mds42.exp.ness.devoid <- mds42.exp.ness * sum(mds42_ess_ranges_rounded_ %>% filter(Essential == 0) %>% select(length))
print(paste("Expected INS frequency per bp in non-essential region: ", mds42.exp.ness))
print(paste("Expected INS frequency in the IS-devoid region", mds42.exp.ness.devoid))
mds42.exp.ness.devoid.prob <-
is_in_mds42.df.prevgen.rmdup %>% 
	group_by(Line) %>% summarise(n = sum(pos >= devoid.start.ma & pos <= devoid.end.ma)) %>% select(-Line) %>%
	mutate(Study = "This", p = n/mds42.exp.ness.devoid) 
"""

[1] "Expected INS frequency per bp in non-essential region:  5.71762559751072e-06"
[1] "Expected INS frequency in the IS-devoid region 0.621860395236461"


RObject{VecSxp}
# A tibble: 44 × 3
       n Study     p
   <int> <chr> <dbl>
 1     0 This   0   
 2     2 This   3.22
 3     0 This   0   
 4     0 This   0   
 5     1 This   1.61
 6     0 This   0   
 7     1 This   1.61
 8     3 This   4.82
 9     0 This   0   
10     0 This   0   
# ℹ 34 more rows


## STAT: t-test nonessential non-overlap version of IS-devoid region analysis

In [44]:
R"""
mds42_ess_ranges_rounded_ %>% filter(Essential == 0) %>% select(length) %>% sum %>% print
mds42_ess_ranges_rounded_ %>% filter(Essential == 1) %>% select(length) %>% sum %>% print
""";

[1] 108762
[1] 26526


In [45]:
# one sided binomial test , this study
R"""
print(
	sum(mds42_ess_ranges_rounded_ %>% filter(Essential == 0) %>% select(length)) *44 *
	mds42.exp.ness
)
binom.test(is_in_mds42.df.prevgen.rmdup %>%
	filter(pos >= devoid.start.ma & pos <= devoid.end.ma) %>% nrow(),
	sum(mds42_ess_ranges_rounded_ %>% filter(Essential == 0) %>% select(length)) *44,
	mds42.exp.ness, alternative = "less")
"""

[1] 27.36186


RObject{VecSxp}

	Exact binomial test

data:  is_in_mds42.df.prevgen.rmdup %>% filter(pos >= devoid.start.ma & pos <= devoid.end.ma) %>% nrow() and sum(mds42_ess_ranges_rounded_ %>% filter(Essential == 0) %>% select(length)) * 44
number of successes = 39, number of trials = 4785528, p-value = 0.9863
alternative hypothesis: true probability of success is less than 5.717626e-06
95 percent confidence interval:
 0.000000e+00 1.064453e-05
sample estimates:
probability of success 
          8.149571e-06 



In [46]:
#(One-sided binomial test, LTEE)
LTEE_ness_genome_size_rounded = sum(LTEE_ess_ranges_rounded[LTEE_ess_ranges_rounded.Essential .== 0, :length])
@rput devoid_length_REL_ness
@rput LTEE_ness_genome_size_rounded
R"""
# nonessential genome size, nonessential length in devoid region, expected INS frequency per bp in nonessential region, expected INS count in devoid region
c(LTEE_ness_genome_size_rounded, devoid_length_REL_ness, sum(LTEE_INS_cnt$INS)/LTEE_ness_genome_size_rounded/12,
	devoid_length_REL_ness * sum(LTEE_INS_cnt$INS)/LTEE_ness_genome_size_rounded) %>% print
#binom.test(0, 12 * devoid_length_REL_ness, sum(LTEE_INS_cnt$INS)/LTEE_ness_genome_size_rounded/12, alternative = "two.sided")
print( devoid_length_REL_ness *sum(LTEE_INS_cnt$INS)/LTEE_ness_genome_size_rounded)
binom.test(0, 12 * devoid_length_REL_ness, sum(LTEE_INS_cnt$INS)/LTEE_ness_genome_size_rounded/12, alternative = "less")
"""

[1] 4.315656e+06 1.044970e+05 3.572265e-06 4.479492e+00
[1] 4.479492


RObject{VecSxp}

	Exact binomial test

data:  0 and 12 * devoid_length_REL_ness
number of successes = 0, number of trials = 1253964, p-value = 0.01134
alternative hypothesis: true probability of success is less than 3.572265e-06
95 percent confidence interval:
 0.000000e+00 2.389007e-06
sample estimates:
probability of success 
                     0 

