In [12]:
%matplotlib inline

# Imports

In [13]:
import pandas as pd
import numpy as np

# Load in the clean `.tsv` file


In [14]:
df = pd.read_csv('../Data/host_genomes/562.clean.tsv', sep='\t', index_col=0)
df.head()

Unnamed: 0,genome_id,source,type,start,stop,idk,strand,trash,qualifiers,coding_sequence,upstream_sequence,RBS_energy,RBS_energy_upstream,iCUB,GC_cds,GC_upstream,locus_tag,secondary_structure,secondary_structure_internal
0,NC_000913.3,RefSeq,CDS,190,255,.,+,0,ID=cds-NP_414542.1;Parent=gene-b0001;Dbxref=Un...,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,CAGATAAAAATTACAGAGTACACAACATCC,-2.45,-2.46,32.046035,0.515152,0.333333,b0001,-1.56,
1,NC_000913.3,RefSeq,CDS,337,2799,.,+,0,ID=cds-NP_414543.1;Parent=gene-b0002;Dbxref=Un...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,TTTTCGACCAAAGGTAACGAGGTAACAACC,-5.42,-2.52,55.949262,0.530654,0.433333,b0002,-9.78,-22.31
2,NC_000913.3,RefSeq,CDS,2801,3733,.,+,0,ID=cds-NP_414544.1;Parent=gene-b0003;Dbxref=Un...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,GTACCCTCTCATGGAAGTTAGGAGTCTGAC,-6.51,-3.6,56.062386,0.562701,0.5,b0003,-14.65,-19.59
3,NC_000913.3,RefSeq,CDS,3734,5020,.,+,0,ID=cds-NP_414545.1;Parent=gene-b0004;Dbxref=Un...,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,ACGGCGGGCGCACGAGTACTGGAAAACTAA,-3.4,-2.54,53.052776,0.528361,0.566667,b0004,-4.86,-21.46
4,NC_000913.3,RefSeq,CDS,5234,5530,.,+,0,ID=cds-NP_414546.1;Parent=gene-b0005;Dbxref=Un...,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,CATAACGGGCAATGATAAAAGGAGTAACCT,-6.51,-1.42,50.70553,0.538721,0.4,b0005,-7.8,-15.29


**Some later data will require the "Gene" merge so make it a dedicated column and filter any duplicates**

In [15]:
print(df.shape)
df['Gene'] = df['qualifiers'].str.split(';gene=', n=1, expand=True)[1]\
                                .str.split(';', n=1, expand=True)[0]
df = df.drop_duplicates('Gene', keep=False)
print(df.shape)

(4060, 19)
(4060, 20)


# Add in knowledge of protein abundances and join the dataframes


In [16]:
prot_df = pd.read_csv('../Data/562_associated_data/511145-WHOLE_ORGANISM-integrated.txt',\
                      sep='\t', comment='#', header=None)
print(prot_df.shape)

###Add in a dedicated `locus_tag` column by splitting column 1
prot_df['locus_tag']= prot_df[1].str.split(".", n = 1, expand = True)[1]
assert prot_df['locus_tag'].value_counts().max() == 1

###Rename the columns and drop the unnecessary bits
prot_df = prot_df.drop([0, 1], axis=1)
prot_df.columns = ['prot_abundance', 'locus_tag']

###Merge with the original dataframe on `locus_tag`
combined_df = df.merge(prot_df, on='locus_tag', how='left')
print(combined_df.shape)

###Not sure how 0.0 protein abundances make sense so let's change them to NaN
print(combined_df[combined_df['prot_abundance']==0.0].shape)
combined_df.at[combined_df[combined_df['prot_abundance']==0.0].index, 'prot_abundance'] = np.nan
print(combined_df[combined_df['prot_abundance'].isnull()].shape)
combined_df.head()

(4096, 3)
(4060, 21)
(317, 21)
(484, 21)


Unnamed: 0,genome_id,source,type,start,stop,idk,strand,trash,qualifiers,coding_sequence,...,RBS_energy,RBS_energy_upstream,iCUB,GC_cds,GC_upstream,locus_tag,secondary_structure,secondary_structure_internal,Gene,prot_abundance
0,NC_000913.3,RefSeq,CDS,190,255,.,+,0,ID=cds-NP_414542.1;Parent=gene-b0001;Dbxref=Un...,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,...,-2.45,-2.46,32.046035,0.515152,0.333333,b0001,-1.56,,thrL,
1,NC_000913.3,RefSeq,CDS,337,2799,.,+,0,ID=cds-NP_414543.1;Parent=gene-b0002;Dbxref=Un...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,...,-5.42,-2.52,55.949262,0.530654,0.433333,b0002,-9.78,-22.31,thrA,317.0
2,NC_000913.3,RefSeq,CDS,2801,3733,.,+,0,ID=cds-NP_414544.1;Parent=gene-b0003;Dbxref=Un...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,...,-6.51,-3.6,56.062386,0.562701,0.5,b0003,-14.65,-19.59,thrB,76.3
3,NC_000913.3,RefSeq,CDS,3734,5020,.,+,0,ID=cds-NP_414545.1;Parent=gene-b0004;Dbxref=Un...,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,...,-3.4,-2.54,53.052776,0.528361,0.566667,b0004,-4.86,-21.46,thrC,530.0
4,NC_000913.3,RefSeq,CDS,5234,5530,.,+,0,ID=cds-NP_414546.1;Parent=gene-b0005;Dbxref=Un...,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,...,-6.51,-1.42,50.70553,0.538721,0.4,b0005,-7.8,-15.29,yaaX,


# Add in translation efficiency

In [17]:
###I really am skeptical of the dataset from Li et al. 2014 and would prefer to not use it
# teff_df = pd.read_excel('../Data/562_associated_data/Li_et_al_2014.xlsx')
# print(teff_df.shape)
# teff_df = teff_df[teff_df['MOPS complete'].str.contains('\[')!=True]
# print(teff_df.shape)
# teff_df = teff_df[teff_df['MOPS minimal'].str.contains('\[')!=True]
# print(teff_df.shape)
# teff_df['MOPS complete'] = teff_df['MOPS complete'].astype(float)
# teff_df['MOPS minimal'] = teff_df['MOPS minimal'].astype(float)
# teff_df = teff_df[teff_df['MOPS complete']>0.]
# print(teff_df.shape)
# teff_df = teff_df[teff_df['MOPS minimal']>0.]
# print(teff_df.shape)
# teff_df = teff_df[np.abs(np.log2(teff_df['MOPS complete']/teff_df['MOPS minimal']))<=1.0]
# print(teff_df.shape)
# teff_df = teff_df.drop(['MOPS minimal', 'MOPS complete without methionine'], axis=1)
# teff_df.columns = ['Gene', 'trans_eff_1']

# combined_df = combined_df.merge(teff_df, on='Gene', how='left')
# print(combined_df.shape)
# combined_df.head()

In [18]:
import json
with open('../Data/562_associated_data/Buskirk1_teff.json', 'r') as infile:
          teff_1 = json.load(infile)
with open('../Data/562_associated_data/Buskirk1_teff.json', 'r') as infile:
          teff_2 = json.load(infile)
teff_df = pd.DataFrame.from_dict(teff_1, orient='index')
print(teff_df.shape)
teff_df[1] = teff_df.index.map(teff_2) 
print(teff_df.shape)
teff_df['locus_tag'] = teff_df.index
teff_df = teff_df[teff_df[0].isnull()==False]
print(teff_df.shape)
teff_df = teff_df[teff_df[1].isnull()==False]
print(teff_df.shape)
teff_df = teff_df[np.abs(np.log2(teff_df[0]/teff_df[1]))<=1.0]
print(teff_df.shape)
teff_df['trans_eff'] = np.mean([teff_df[0], teff_df[1]], axis=0)
teff_df = teff_df.drop([0, 1], axis=1)
combined_df = combined_df.merge(teff_df, on='locus_tag', how='left')
print(combined_df.shape)
combined_df.head()

(1278, 1)
(1278, 2)
(1278, 3)
(1278, 3)
(1278, 3)
(4060, 22)


Unnamed: 0,genome_id,source,type,start,stop,idk,strand,trash,qualifiers,coding_sequence,...,RBS_energy_upstream,iCUB,GC_cds,GC_upstream,locus_tag,secondary_structure,secondary_structure_internal,Gene,prot_abundance,trans_eff
0,NC_000913.3,RefSeq,CDS,190,255,.,+,0,ID=cds-NP_414542.1;Parent=gene-b0001;Dbxref=Un...,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,...,-2.46,32.046035,0.515152,0.333333,b0001,-1.56,,thrL,,
1,NC_000913.3,RefSeq,CDS,337,2799,.,+,0,ID=cds-NP_414543.1;Parent=gene-b0002;Dbxref=Un...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,...,-2.52,55.949262,0.530654,0.433333,b0002,-9.78,-22.31,thrA,317.0,0.696278
2,NC_000913.3,RefSeq,CDS,2801,3733,.,+,0,ID=cds-NP_414544.1;Parent=gene-b0003;Dbxref=Un...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,...,-3.6,56.062386,0.562701,0.5,b0003,-14.65,-19.59,thrB,76.3,0.472059
3,NC_000913.3,RefSeq,CDS,3734,5020,.,+,0,ID=cds-NP_414545.1;Parent=gene-b0004;Dbxref=Un...,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,...,-2.54,53.052776,0.528361,0.566667,b0004,-4.86,-21.46,thrC,530.0,0.934708
4,NC_000913.3,RefSeq,CDS,5234,5530,.,+,0,ID=cds-NP_414546.1;Parent=gene-b0005;Dbxref=Un...,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,...,-1.42,50.70553,0.538721,0.4,b0005,-7.8,-15.29,yaaX,,


# Add in knowledge from two essential gene sets

## First, using annotated essential genes from:
https://mbio.asm.org/content/9/1/e02096-17

In [19]:
essential_df = pd.read_csv('../Data/562_associated_data/ecoli_essential.txt', sep='\t')
print('All:', essential_df.shape)
print('Essential:', essential_df[essential_df['Essential'] == True].shape)
print('Non-essential:', essential_df[essential_df['Non-essential'] == True].shape)
print('Unclear:', essential_df[essential_df['Unclear'] == True].shape)

###Re-factoring this dataframe a bit
essential_df['essentiality_ds1'] = ''
indices = essential_df[essential_df['Essential']==True].index
essential_df.loc[indices, 'essentiality_ds1'] = 'Essential'
indices = essential_df[essential_df['Non-essential']==True].index
essential_df.loc[indices, 'essentiality_ds1'] = 'Non-essential'
indices = essential_df[essential_df['Unclear']==True].index
essential_df.loc[indices, 'essentiality_ds1'] = 'Unclear'
print(essential_df['essentiality_ds1'].value_counts())

###Drop the unnecessary bits
essential_df = essential_df.drop(set(essential_df.columns)-set(['Gene', 'essentiality_ds1']), axis=1)

###And combine
combined_df = combined_df.merge(essential_df, on="Gene", how='left')
print(combined_df.shape)
combined_df.head()

All: (4313, 6)
Essential: (358, 6)
Non-essential: (3793, 6)
Unclear: (162, 6)
Non-essential    3793
Essential         358
Unclear           162
Name: essentiality_ds1, dtype: int64
(4060, 23)


Unnamed: 0,genome_id,source,type,start,stop,idk,strand,trash,qualifiers,coding_sequence,...,iCUB,GC_cds,GC_upstream,locus_tag,secondary_structure,secondary_structure_internal,Gene,prot_abundance,trans_eff,essentiality_ds1
0,NC_000913.3,RefSeq,CDS,190,255,.,+,0,ID=cds-NP_414542.1;Parent=gene-b0001;Dbxref=Un...,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,...,32.046035,0.515152,0.333333,b0001,-1.56,,thrL,,,Non-essential
1,NC_000913.3,RefSeq,CDS,337,2799,.,+,0,ID=cds-NP_414543.1;Parent=gene-b0002;Dbxref=Un...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,...,55.949262,0.530654,0.433333,b0002,-9.78,-22.31,thrA,317.0,0.696278,Non-essential
2,NC_000913.3,RefSeq,CDS,2801,3733,.,+,0,ID=cds-NP_414544.1;Parent=gene-b0003;Dbxref=Un...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,...,56.062386,0.562701,0.5,b0003,-14.65,-19.59,thrB,76.3,0.472059,Non-essential
3,NC_000913.3,RefSeq,CDS,3734,5020,.,+,0,ID=cds-NP_414545.1;Parent=gene-b0004;Dbxref=Un...,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,...,53.052776,0.528361,0.566667,b0004,-4.86,-21.46,thrC,530.0,0.934708,Non-essential
4,NC_000913.3,RefSeq,CDS,5234,5530,.,+,0,ID=cds-NP_414546.1;Parent=gene-b0005;Dbxref=Un...,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,...,50.70553,0.538721,0.4,b0005,-7.8,-15.29,yaaX,,,Non-essential


## And from:
https://www.nature.com/articles/s41586-018-0124-0

In [20]:
essential_df = pd.read_csv('../Data/562_associated_data/ecoli_essential_Price.txt', sep='\t', skiprows=13)
print(essential_df.shape)

###Drop the unnecessary bits
essential_df['essentiality_ds2'] = 'Essential'
essential_df = essential_df.drop(set(essential_df.columns) - set(['sysName', 'essentiality_ds2']), axis=1)
essential_df.columns = ['locus_tag', 'essentiality_ds2']

###And merge
combined_df = combined_df.merge(essential_df, on='locus_tag', how='left')
print(combined_df.shape)
indices = combined_df[combined_df['essentiality_ds2']!='Essential'].index
combined_df.loc[indices, 'essentiality_ds2'] = 'Non-essential'
combined_df.head()

(324, 19)
(4060, 24)


Unnamed: 0,genome_id,source,type,start,stop,idk,strand,trash,qualifiers,coding_sequence,...,GC_cds,GC_upstream,locus_tag,secondary_structure,secondary_structure_internal,Gene,prot_abundance,trans_eff,essentiality_ds1,essentiality_ds2
0,NC_000913.3,RefSeq,CDS,190,255,.,+,0,ID=cds-NP_414542.1;Parent=gene-b0001;Dbxref=Un...,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,...,0.515152,0.333333,b0001,-1.56,,thrL,,,Non-essential,Non-essential
1,NC_000913.3,RefSeq,CDS,337,2799,.,+,0,ID=cds-NP_414543.1;Parent=gene-b0002;Dbxref=Un...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,...,0.530654,0.433333,b0002,-9.78,-22.31,thrA,317.0,0.696278,Non-essential,Non-essential
2,NC_000913.3,RefSeq,CDS,2801,3733,.,+,0,ID=cds-NP_414544.1;Parent=gene-b0003;Dbxref=Un...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,...,0.562701,0.5,b0003,-14.65,-19.59,thrB,76.3,0.472059,Non-essential,Non-essential
3,NC_000913.3,RefSeq,CDS,3734,5020,.,+,0,ID=cds-NP_414545.1;Parent=gene-b0004;Dbxref=Un...,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,...,0.528361,0.566667,b0004,-4.86,-21.46,thrC,530.0,0.934708,Non-essential,Non-essential
4,NC_000913.3,RefSeq,CDS,5234,5530,.,+,0,ID=cds-NP_414546.1;Parent=gene-b0005;Dbxref=Un...,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,...,0.538721,0.4,b0005,-7.8,-15.29,yaaX,,,Non-essential,Non-essential


## Finally add a consensus column of gene essentiality

In [21]:
combined_df['essentiality'] = 'Unclear'
indices = combined_df[(combined_df['essentiality_ds1']=='Essential')&
                      (combined_df['essentiality_ds2']=='Essential')].index
combined_df.at[indices, 'essentiality'] = 'Essential'
indices = combined_df[(combined_df['essentiality_ds1']!='Essential')&
                      (combined_df['essentiality_ds2']!='Essential')].index
combined_df.at[indices, 'essentiality'] = 'Non-essential'

print(combined_df['essentiality'].value_counts())
combined_df.head()

Non-essential    3669
Essential         269
Unclear           122
Name: essentiality, dtype: int64


Unnamed: 0,genome_id,source,type,start,stop,idk,strand,trash,qualifiers,coding_sequence,...,GC_upstream,locus_tag,secondary_structure,secondary_structure_internal,Gene,prot_abundance,trans_eff,essentiality_ds1,essentiality_ds2,essentiality
0,NC_000913.3,RefSeq,CDS,190,255,.,+,0,ID=cds-NP_414542.1;Parent=gene-b0001;Dbxref=Un...,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...,...,0.333333,b0001,-1.56,,thrL,,,Non-essential,Non-essential,Non-essential
1,NC_000913.3,RefSeq,CDS,337,2799,.,+,0,ID=cds-NP_414543.1;Parent=gene-b0002;Dbxref=Un...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,...,0.433333,b0002,-9.78,-22.31,thrA,317.0,0.696278,Non-essential,Non-essential,Non-essential
2,NC_000913.3,RefSeq,CDS,2801,3733,.,+,0,ID=cds-NP_414544.1;Parent=gene-b0003;Dbxref=Un...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,...,0.5,b0003,-14.65,-19.59,thrB,76.3,0.472059,Non-essential,Non-essential,Non-essential
3,NC_000913.3,RefSeq,CDS,3734,5020,.,+,0,ID=cds-NP_414545.1;Parent=gene-b0004;Dbxref=Un...,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,...,0.566667,b0004,-4.86,-21.46,thrC,530.0,0.934708,Non-essential,Non-essential,Non-essential
4,NC_000913.3,RefSeq,CDS,5234,5530,.,+,0,ID=cds-NP_414546.1;Parent=gene-b0005;Dbxref=Un...,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,...,0.4,b0005,-7.8,-15.29,yaaX,,,Non-essential,Non-essential,Non-essential


# Save file

In [22]:
combined_df.to_csv('../Data/562_associated_data/ecoli.combined.tsv', sep='\t')