In [1]:
%matplotlib inline

# Imports

In [2]:
import pandas as pd
import numpy as np

# Load in the clean *E. coli* `.tsv` file


In [3]:
###Tab-separated table of cleaned CDS records; the first file column becomes the row index
clean_genome_tsv = '../Data/MVP_data/host_genomes/562.clean.tsv'
df = pd.read_csv(clean_genome_tsv, sep='\t', index_col=0)
df.head()

Unnamed: 0,genome_id,source,type,start,stop,idk,strand,trash,qualifiers,coding_sequence,upstream_sequence,aSD_binding,sec_struct,sec_struct_bound,iCUB,GC_cds,GC_upstream,locus_tag
1,NC_000913.3,RefSeq,CDS,337,2799,.,+,0,ID=cds-NP_414543.1;Parent=gene-b0002;Dbxref=Un...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,TTTTCGACCAAAGGTAACGAGGTAACAACC,-5.42,-21.31,-12.81,55.949262,0.530654,0.433333,b0002
2,NC_000913.3,RefSeq,CDS,2801,3733,.,+,0,ID=cds-NP_414544.1;Parent=gene-b0003;Dbxref=Un...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,GTACCCTCTCATGGAAGTTAGGAGTCTGAC,-6.51,-21.87,-14.05,56.062386,0.562701,0.5,b0003
3,NC_000913.3,RefSeq,CDS,3734,5020,.,+,0,ID=cds-NP_414545.1;Parent=gene-b0004;Dbxref=Un...,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,ACGGCGGGCGCACGAGTACTGGAAAACTAA,-3.4,-24.44,-20.71,53.052776,0.528361,0.566667,b0004
4,NC_000913.3,RefSeq,CDS,5234,5530,.,+,0,ID=cds-NP_414546.1;Parent=gene-b0005;Dbxref=Un...,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,CATAACGGGCAATGATAAAAGGAGTAACCT,-6.51,-17.15,-7.38,50.70553,0.538721,0.4,b0005
5,NC_000913.3,RefSeq,CDS,5683,6459,.,-,0,ID=cds-NP_414547.1;Parent=gene-b0006;Dbxref=Un...,ATGCTGATTCTTATTTCACCTGCGAAAACGCTTGATTACCAAAGCC...,GTCGGCATAAATTTCCTGCAAGGACTGGAT,-4.0,-18.6,-8.52,53.158862,0.496782,0.466667,b0006


**Some of the later datasets are merged on gene name, so extract "Gene" from `qualifiers` into a dedicated column and drop any gene names that occur more than once**

In [4]:
print(df.shape)

###Pull the gene name out of the `qualifiers` string in two steps:
###  1) keep whatever follows the first ';gene=' tag
###  2) cut that remainder at the next ';'
after_gene_tag = df['qualifiers'].str.split(';gene=', n=1, expand=True)[1]
df['Gene'] = after_gene_tag.str.split(';', n=1, expand=True)[0]

###keep=False removes *every* row whose gene name is shared with another row,
###so "Gene" can safely be used as a merge key further down
df = df.drop_duplicates(subset='Gene', keep=False)
print(df.shape)

(4016, 18)
(4016, 19)


# Add in knowledge of protein abundances and join the dataframes

This comes from PaxDB (release 4.1). See:

https://pax-db.org/

and the associated paper:

https://doi.org/10.1002/pmic.201400441

In [5]:
###Headerless, tab-separated PaxDB table; lines starting with '#' are skipped as comments
prot_df = pd.read_csv('../Data/MVP_data/562_associated_data/511145-WHOLE_ORGANISM-integrated.txt',
                      sep='\t', comment='#', header=None)
print(prot_df.shape)

###Add in a dedicated `locus_tag` column by splitting column 1
###(everything after the first "." in column 1 is taken as the locus tag)
prot_df['locus_tag'] = prot_df[1].str.split(".", n=1, expand=True)[1]
###Each locus tag must appear at most once, otherwise the left merge below would duplicate rows
assert prot_df['locus_tag'].value_counts().max() == 1

###Rename the columns and drop the unnecessary bits
prot_df = prot_df.drop([0, 1], axis=1)
prot_df.columns = ['prot_abundance', 'locus_tag']

###Merge with the original dataframe on `locus_tag`
###(left join: every row of `df` is kept, genes without an abundance entry get NaN)
combined_df = df.merge(prot_df, on='locus_tag', how='left')
print(combined_df.shape)

###Not sure how 0.0 protein abundances are determined / make sense so let's change them to NaN
###`.at` only addresses a single scalar cell, so use a boolean mask with `.loc`
###to overwrite all of the zero-abundance rows at once
zero_abundance = combined_df['prot_abundance'] == 0.0
print(combined_df[zero_abundance].shape)
combined_df.loc[zero_abundance, 'prot_abundance'] = np.nan
print(combined_df[combined_df['prot_abundance'].isnull()].shape)
combined_df.head()

(4096, 3)
(4016, 20)
(310, 20)
(439, 20)


Unnamed: 0,genome_id,source,type,start,stop,idk,strand,trash,qualifiers,coding_sequence,upstream_sequence,aSD_binding,sec_struct,sec_struct_bound,iCUB,GC_cds,GC_upstream,locus_tag,Gene,prot_abundance
0,NC_000913.3,RefSeq,CDS,337,2799,.,+,0,ID=cds-NP_414543.1;Parent=gene-b0002;Dbxref=Un...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,TTTTCGACCAAAGGTAACGAGGTAACAACC,-5.42,-21.31,-12.81,55.949262,0.530654,0.433333,b0002,thrA,317.0
1,NC_000913.3,RefSeq,CDS,2801,3733,.,+,0,ID=cds-NP_414544.1;Parent=gene-b0003;Dbxref=Un...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,GTACCCTCTCATGGAAGTTAGGAGTCTGAC,-6.51,-21.87,-14.05,56.062386,0.562701,0.5,b0003,thrB,76.3
2,NC_000913.3,RefSeq,CDS,3734,5020,.,+,0,ID=cds-NP_414545.1;Parent=gene-b0004;Dbxref=Un...,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,ACGGCGGGCGCACGAGTACTGGAAAACTAA,-3.4,-24.44,-20.71,53.052776,0.528361,0.566667,b0004,thrC,530.0
3,NC_000913.3,RefSeq,CDS,5234,5530,.,+,0,ID=cds-NP_414546.1;Parent=gene-b0005;Dbxref=Un...,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,CATAACGGGCAATGATAAAAGGAGTAACCT,-6.51,-17.15,-7.38,50.70553,0.538721,0.4,b0005,yaaX,
4,NC_000913.3,RefSeq,CDS,5683,6459,.,-,0,ID=cds-NP_414547.1;Parent=gene-b0006;Dbxref=Un...,ATGCTGATTCTTATTTCACCTGCGAAAACGCTTGATTACCAAAGCC...,GTCGGCATAAATTTCCTGCAAGGACTGGAT,-4.0,-18.6,-8.52,53.158862,0.496782,0.466667,b0006,yaaA,47.9


# Add in translation efficiency

This data comes from two separate data sources:

https://doi.org/10.1016/j.cell.2014.02.033

and more recently:

https://doi.org/10.15252/msb.20188719

In [6]:
###Li et al. (2014) spreadsheet
li_2014_xlsx = '../Data/MVP_data/562_associated_data/Li_et_al_2014.xlsx'
teff_df = pd.read_excel(li_2014_xlsx)
print(teff_df.shape)

###The mRNA level column is not used here; the two remaining columns are renamed by position
teff_df = teff_df.drop(columns=['mRNA level (RPKM)'])
teff_df.columns = ['Gene', 'trans_eff_2014']

###Left join on gene name keeps every row of the combined table
combined_df = combined_df.merge(teff_df, how='left', on='Gene')
print(combined_df.shape)
combined_df.head()

(4095, 3)
(4016, 21)


Unnamed: 0,genome_id,source,type,start,stop,idk,strand,trash,qualifiers,coding_sequence,...,aSD_binding,sec_struct,sec_struct_bound,iCUB,GC_cds,GC_upstream,locus_tag,Gene,prot_abundance,trans_eff_2014
0,NC_000913.3,RefSeq,CDS,337,2799,.,+,0,ID=cds-NP_414543.1;Parent=gene-b0002;Dbxref=Un...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,...,-5.42,-21.31,-12.81,55.949262,0.530654,0.433333,b0002,thrA,317.0,0.98
1,NC_000913.3,RefSeq,CDS,2801,3733,.,+,0,ID=cds-NP_414544.1;Parent=gene-b0003;Dbxref=Un...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,...,-6.51,-21.87,-14.05,56.062386,0.562701,0.5,b0003,thrB,76.3,0.63
2,NC_000913.3,RefSeq,CDS,3734,5020,.,+,0,ID=cds-NP_414545.1;Parent=gene-b0004;Dbxref=Un...,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,...,-3.4,-24.44,-20.71,53.052776,0.528361,0.566667,b0004,thrC,530.0,1.4
3,NC_000913.3,RefSeq,CDS,5234,5530,.,+,0,ID=cds-NP_414546.1;Parent=gene-b0005;Dbxref=Un...,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,...,-6.51,-17.15,-7.38,50.70553,0.538721,0.4,b0005,yaaX,,0.28
4,NC_000913.3,RefSeq,CDS,5683,6459,.,-,0,ID=cds-NP_414547.1;Parent=gene-b0006;Dbxref=Un...,ATGCTGATTCTTATTTCACCTGCGAAAACGCTTGATTACCAAAGCC...,...,-4.0,-18.6,-8.52,53.158862,0.496782,0.466667,b0006,yaaA,47.9,2.07


In [7]:
###Both experiments live in the same workbook, one sheet each
dataset_ev1_xlsx = '../Data/MVP_data/562_associated_data/msb188719-sup-0002-datasetev1.xlsx'
teff_df1 = pd.read_excel(dataset_ev1_xlsx, sheet_name='LacZ')
teff_df2 = pd.read_excel(dataset_ev1_xlsx, sheet_name='PK-LacZ')

###Give both sheets the same positional column names
for sheet_df in (teff_df1, teff_df2):
    sheet_df.columns = ['Gene', 'teff1', 'teff2']

###Inner join on gene: only genes present in both sheets are kept.
###The merge suffixes the clashing columns with _x (LacZ) and _y (PK-LacZ).
teff_df = teff_df1.merge(teff_df2, on='Gene')

###trans_eff_2019 is the row-wise mean of the `teff1` column of each sheet
###NOTE(review): the `teff2` columns are discarded without being used -- confirm this is intended
teff_df['trans_eff_2019'] = teff_df[['teff1_x', 'teff1_y']].mean(axis=1)

###Append to full dataframe (only the gene name and the averaged value are carried over)
teff_df = teff_df[['Gene', 'trans_eff_2019']]
combined_df = combined_df.merge(teff_df, how='left', on='Gene')
print(combined_df.shape)
combined_df.head()

(4016, 22)


Unnamed: 0,genome_id,source,type,start,stop,idk,strand,trash,qualifiers,coding_sequence,...,sec_struct,sec_struct_bound,iCUB,GC_cds,GC_upstream,locus_tag,Gene,prot_abundance,trans_eff_2014,trans_eff_2019
0,NC_000913.3,RefSeq,CDS,337,2799,.,+,0,ID=cds-NP_414543.1;Parent=gene-b0002;Dbxref=Un...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,...,-21.31,-12.81,55.949262,0.530654,0.433333,b0002,thrA,317.0,0.98,
1,NC_000913.3,RefSeq,CDS,2801,3733,.,+,0,ID=cds-NP_414544.1;Parent=gene-b0003;Dbxref=Un...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,...,-21.87,-14.05,56.062386,0.562701,0.5,b0003,thrB,76.3,0.63,
2,NC_000913.3,RefSeq,CDS,3734,5020,.,+,0,ID=cds-NP_414545.1;Parent=gene-b0004;Dbxref=Un...,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,...,-24.44,-20.71,53.052776,0.528361,0.566667,b0004,thrC,530.0,1.4,
3,NC_000913.3,RefSeq,CDS,5234,5530,.,+,0,ID=cds-NP_414546.1;Parent=gene-b0005;Dbxref=Un...,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,...,-17.15,-7.38,50.70553,0.538721,0.4,b0005,yaaX,,0.28,
4,NC_000913.3,RefSeq,CDS,5683,6459,.,-,0,ID=cds-NP_414547.1;Parent=gene-b0006;Dbxref=Un...,ATGCTGATTCTTATTTCACCTGCGAAAACGCTTGATTACCAAAGCC...,...,-18.6,-8.52,53.158862,0.496782,0.466667,b0006,yaaA,47.9,2.07,0.410931


# Add in knowledge from two essential gene sets

Using two separate datasets for robustness. First:

https://doi.org/10.1128/mBio.02096-17

And then:

https://doi.org/10.1038/s41586-018-0124-0

In [8]:
essential_df = pd.read_csv('../Data/MVP_data/562_associated_data/ecoli_essential.txt', sep='\t')

###The file carries one flag column per category; report how many rows fall in each
flag_columns = ('Essential', 'Non-essential', 'Unclear')
print('All:', essential_df.shape)
for flag in flag_columns:
    print(flag + ':', essential_df[essential_df[flag] == True].shape)

###Collapse the flag columns into a single label column.
###Flags are applied in order, so a later flag overwrites an earlier one on the same row.
essential_df['essentiality_ds1'] = ''
for flag in flag_columns:
    essential_df.loc[essential_df[flag] == True, 'essentiality_ds1'] = flag
print(essential_df['essentiality_ds1'].value_counts())

###Only the gene name and the new label are needed for the merge
essential_df = essential_df[['Gene', 'essentiality_ds1']]

###Left join on gene name; genes absent from this dataset get NaN
combined_df = combined_df.merge(essential_df, how='left', on="Gene")
print(combined_df.shape)
combined_df.head()

All: (4313, 6)
Essential: (358, 6)
Non-essential: (3793, 6)
Unclear: (162, 6)
Non-essential    3793
Essential         358
Unclear           162
Name: essentiality_ds1, dtype: int64
(4016, 23)


Unnamed: 0,genome_id,source,type,start,stop,idk,strand,trash,qualifiers,coding_sequence,...,sec_struct_bound,iCUB,GC_cds,GC_upstream,locus_tag,Gene,prot_abundance,trans_eff_2014,trans_eff_2019,essentiality_ds1
0,NC_000913.3,RefSeq,CDS,337,2799,.,+,0,ID=cds-NP_414543.1;Parent=gene-b0002;Dbxref=Un...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,...,-12.81,55.949262,0.530654,0.433333,b0002,thrA,317.0,0.98,,Non-essential
1,NC_000913.3,RefSeq,CDS,2801,3733,.,+,0,ID=cds-NP_414544.1;Parent=gene-b0003;Dbxref=Un...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,...,-14.05,56.062386,0.562701,0.5,b0003,thrB,76.3,0.63,,Non-essential
2,NC_000913.3,RefSeq,CDS,3734,5020,.,+,0,ID=cds-NP_414545.1;Parent=gene-b0004;Dbxref=Un...,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,...,-20.71,53.052776,0.528361,0.566667,b0004,thrC,530.0,1.4,,Non-essential
3,NC_000913.3,RefSeq,CDS,5234,5530,.,+,0,ID=cds-NP_414546.1;Parent=gene-b0005;Dbxref=Un...,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,...,-7.38,50.70553,0.538721,0.4,b0005,yaaX,,0.28,,Non-essential
4,NC_000913.3,RefSeq,CDS,5683,6459,.,-,0,ID=cds-NP_414547.1;Parent=gene-b0006;Dbxref=Un...,ATGCTGATTCTTATTTCACCTGCGAAAACGCTTGATTACCAAAGCC...,...,-8.52,53.158862,0.496782,0.466667,b0006,yaaA,47.9,2.07,0.410931,Non-essential


In [9]:
###The first 13 lines of the file are skipped before the header row
essential_df = pd.read_csv('../Data/MVP_data/562_associated_data/ecoli_essential_Price.txt', sep='\t', skiprows=13)
print(essential_df.shape)

###Every row in this file is labelled essential; keep only the identifier and that label,
###renaming `sysName` so it can be merged on `locus_tag`
essential_df['essentiality_ds2'] = 'Essential'
essential_df = essential_df[['sysName', 'essentiality_ds2']].rename(columns={'sysName': 'locus_tag'})

###Left join, then label everything that did not match the essential list as non-essential
combined_df = combined_df.merge(essential_df, how='left', on='locus_tag')
print(combined_df.shape)
listed_as_essential = combined_df['essentiality_ds2'] == 'Essential'
combined_df.loc[~listed_as_essential, 'essentiality_ds2'] = 'Non-essential'
combined_df.head()

(324, 19)
(4016, 24)


Unnamed: 0,genome_id,source,type,start,stop,idk,strand,trash,qualifiers,coding_sequence,...,iCUB,GC_cds,GC_upstream,locus_tag,Gene,prot_abundance,trans_eff_2014,trans_eff_2019,essentiality_ds1,essentiality_ds2
0,NC_000913.3,RefSeq,CDS,337,2799,.,+,0,ID=cds-NP_414543.1;Parent=gene-b0002;Dbxref=Un...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,...,55.949262,0.530654,0.433333,b0002,thrA,317.0,0.98,,Non-essential,Non-essential
1,NC_000913.3,RefSeq,CDS,2801,3733,.,+,0,ID=cds-NP_414544.1;Parent=gene-b0003;Dbxref=Un...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,...,56.062386,0.562701,0.5,b0003,thrB,76.3,0.63,,Non-essential,Non-essential
2,NC_000913.3,RefSeq,CDS,3734,5020,.,+,0,ID=cds-NP_414545.1;Parent=gene-b0004;Dbxref=Un...,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,...,53.052776,0.528361,0.566667,b0004,thrC,530.0,1.4,,Non-essential,Non-essential
3,NC_000913.3,RefSeq,CDS,5234,5530,.,+,0,ID=cds-NP_414546.1;Parent=gene-b0005;Dbxref=Un...,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,...,50.70553,0.538721,0.4,b0005,yaaX,,0.28,,Non-essential,Non-essential
4,NC_000913.3,RefSeq,CDS,5683,6459,.,-,0,ID=cds-NP_414547.1;Parent=gene-b0006;Dbxref=Un...,ATGCTGATTCTTATTTCACCTGCGAAAACGCTTGATTACCAAAGCC...,...,53.158862,0.496782,0.466667,b0006,yaaA,47.9,2.07,0.410931,Non-essential,Non-essential


**Finally, add a consensus column of gene essentiality**

In [10]:
###Consensus label across the two essentiality datasets:
###  'Essential'     -> both datasets say 'Essential'
###  'Non-essential' -> neither dataset says 'Essential' (this includes 'Unclear' or
###                     missing entries in dataset 1)
###  'Unclear'       -> the datasets disagree (the default set here)
combined_df['essentiality'] = 'Unclear'

ds1_essential = combined_df['essentiality_ds1'] == 'Essential'
ds2_essential = combined_df['essentiality_ds2'] == 'Essential'

###`.at` only addresses a single scalar cell; `.loc` with a boolean mask is the
###supported way to assign one value to many rows
combined_df.loc[ds1_essential & ds2_essential, 'essentiality'] = 'Essential'
combined_df.loc[~ds1_essential & ~ds2_essential, 'essentiality'] = 'Non-essential'

print(combined_df['essentiality'].value_counts())
combined_df.head()

Non-essential    3630
Essential         270
Unclear           116
Name: essentiality, dtype: int64


Unnamed: 0,genome_id,source,type,start,stop,idk,strand,trash,qualifiers,coding_sequence,...,GC_cds,GC_upstream,locus_tag,Gene,prot_abundance,trans_eff_2014,trans_eff_2019,essentiality_ds1,essentiality_ds2,essentiality
0,NC_000913.3,RefSeq,CDS,337,2799,.,+,0,ID=cds-NP_414543.1;Parent=gene-b0002;Dbxref=Un...,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...,...,0.530654,0.433333,b0002,thrA,317.0,0.98,,Non-essential,Non-essential,Non-essential
1,NC_000913.3,RefSeq,CDS,2801,3733,.,+,0,ID=cds-NP_414544.1;Parent=gene-b0003;Dbxref=Un...,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...,...,0.562701,0.5,b0003,thrB,76.3,0.63,,Non-essential,Non-essential,Non-essential
2,NC_000913.3,RefSeq,CDS,3734,5020,.,+,0,ID=cds-NP_414545.1;Parent=gene-b0004;Dbxref=Un...,ATGAAACTCTACAATCTGAAAGATCACAACGAGCAGGTCAGCTTTG...,...,0.528361,0.566667,b0004,thrC,530.0,1.4,,Non-essential,Non-essential,Non-essential
3,NC_000913.3,RefSeq,CDS,5234,5530,.,+,0,ID=cds-NP_414546.1;Parent=gene-b0005;Dbxref=Un...,GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGG...,...,0.538721,0.4,b0005,yaaX,,0.28,,Non-essential,Non-essential,Non-essential
4,NC_000913.3,RefSeq,CDS,5683,6459,.,-,0,ID=cds-NP_414547.1;Parent=gene-b0006;Dbxref=Un...,ATGCTGATTCTTATTTCACCTGCGAAAACGCTTGATTACCAAAGCC...,...,0.496782,0.466667,b0006,yaaA,47.9,2.07,0.410931,Non-essential,Non-essential,Non-essential


# Save file

In [11]:
###Write the merged table as tab-separated text (the row index is written as the first column)
combined_tsv = '../Data/MVP_data/562_associated_data/ecoli.combined.tsv'
combined_df.to_csv(combined_tsv, sep='\t')