In [None]:
%matplotlib inline

# Imports

In [None]:
import pandas as pd
import numpy as np

# Load in the clean *E. coli* `.tsv` file


In [None]:
# Read the tab-separated host genome table and preview its first rows.
host_genome_tsv = '../Data/NCBI_phage_db/host_genomes/562.clean.tsv'
df = pd.read_csv(host_genome_tsv, sep='\t')
df.head()

**Some later datasets are merged on the gene name (`gene_id`) and others on `locus_tag`, so both need to be unique keys: remove every row whose value in either column is duplicated**

In [None]:
# Report the table size before filtering and again after each key is de-duplicated.
# keep=False removes *every* row sharing a duplicated value, not just the extra copies.
print(df.shape)
for key_column in ('gene_id', 'locus_tag'):
    df = df.drop_duplicates(subset=key_column, keep=False)
    print(df.shape)

# Add in knowledge of protein abundances and join the dataframes

This comes from PaxDB (release 4.1). See:

https://pax-db.org/

and the associated paper:

https://doi.org/10.1002/pmic.201400441

In [None]:
# Load the PaxDB abundance table. Text after '#' is treated as a comment and the
# file is read without a header row, so the columns are labelled 0, 1, 2.
prot_df = pd.read_csv('../Data/562_associated_data/511145-WHOLE_ORGANISM-integrated.txt',
                      sep='\t', comment='#', header=None)
print(prot_df.shape)

###Add in a dedicated `locus_tag` column by splitting column 1 on its first "."
###and keeping the part after it
prot_df['locus_tag'] = prot_df[1].str.split(".", n=1, expand=True)[1]
# Each locus tag must appear at most once, otherwise the left merge below would
# duplicate rows of the gene table.
assert prot_df['locus_tag'].value_counts().max() == 1

###Rename the columns and drop the unnecessary bits
prot_df = prot_df.drop([0, 1], axis=1)
prot_df.columns = ['prot_abundance', 'locus_tag']

###Merge with the original dataframe on `locus_tag`
combined_df = df.merge(prot_df, on='locus_tag', how='left')
print(combined_df.shape)

###Not sure how 0.0 protein abundances are determined / make sense so let's change them to NaN
zero_abundance = combined_df['prot_abundance'] == 0.0
print(combined_df[zero_abundance].shape)
# `.at` addresses exactly one scalar cell and rejects an Index of row labels on
# current pandas; `.loc` with a boolean mask is the supported way to set many rows.
combined_df.loc[zero_abundance, 'prot_abundance'] = np.nan
print(combined_df[combined_df['prot_abundance'].isnull()].shape)
combined_df.head()

# Add in translation efficiency data

This data comes from two separate data sources:

https://doi.org/10.1016/j.cell.2014.02.033

and more recently:

https://doi.org/10.15252/msb.20188719

In [None]:
# Load the 2014 translation-efficiency spreadsheet.
li_2014_xlsx = '../Data/562_associated_data/Li_et_al_2014.xlsx'
teff_df = pd.read_excel(li_2014_xlsx)
print(teff_df.shape)

# Discard the mRNA level column, then name the two remaining columns.
teff_df = teff_df.drop(columns=['mRNA level (RPKM)'])
teff_df.columns = ['gene_id', 'trans_eff_2014']

# Left merge so every gene already in the combined table is retained.
combined_df = combined_df.merge(teff_df, how='left', on='gene_id')
print(combined_df.shape)
combined_df.head()

In [None]:
###Read in data from two separate experiments (two sheets of the same workbook)
msb_2019_xlsx = '../Data/562_associated_data/msb188719-sup-0002-datasetev1.xlsx'
teff_df1 = pd.read_excel(msb_2019_xlsx, sheet_name='LacZ')
teff_df2 = pd.read_excel(msb_2019_xlsx, sheet_name='PK-LacZ')

###Give both sheets identical column names so the merge suffixes them with _x / _y
for sheet_df in (teff_df1, teff_df2):
    sheet_df.columns = ['gene_id', 'teff1', 'teff2']
teff_df = teff_df1.merge(teff_df2, on='gene_id')

###trans_eff_2019 is the row-wise mean of the `teff1` column from each sheet
teff_df['trans_eff_2019'] = teff_df[['teff1_x', 'teff1_y']].mean(axis=1)

###Keep only the key and the averaged value, then append to full dataframe
teff_df = teff_df[['gene_id', 'trans_eff_2019']]
combined_df = combined_df.merge(teff_df, how='left', on='gene_id')
print(combined_df.shape)
combined_df.head()

# Add in knowledge from two essential gene sets

Using two separate datasets for robustness. First:

https://doi.org/10.1128/mBio.02096-17

And then:

https://doi.org/10.1038/s41586-018-0124-0

In [None]:
# Load the first essentiality table and rename its `Gene` column to the merge key.
essential_df = (
    pd.read_csv('../Data/562_associated_data/ecoli_essential.txt', sep='\t')
    .rename(columns={'Gene': 'gene_id'})
)
flag_columns = ('Essential', 'Non-essential', 'Unclear')

# Report the table size and how many rows carry each flag.
print('All:', essential_df.shape)
for flag in flag_columns:
    print(flag + ':', essential_df[essential_df[flag].eq(True)].shape)

###Re-factoring this dataframe a bit: collapse the three flag columns into one label.
###Flags are applied in order, so a later flag overwrites an earlier one on the same row;
###rows with no flag set keep the empty string.
essential_df['essentiality_ds1'] = ''
for flag in flag_columns:
    essential_df.loc[essential_df[flag].eq(True), 'essentiality_ds1'] = flag
print(essential_df['essentiality_ds1'].value_counts())

###Drop the unnecessary bits
essential_df = essential_df[['gene_id', 'essentiality_ds1']]

###And combine
combined_df = combined_df.merge(essential_df, how='left', on='gene_id')
print(combined_df.shape)
combined_df.head()

In [None]:
# Load the second essentiality table, skipping the 13 lines that precede its header.
price_essential_txt = '../Data/562_associated_data/ecoli_essential_Price.txt'
essential_df = pd.read_csv(price_essential_txt, sep='\t', skiprows=13)
print(essential_df.shape)

###Drop the unnecessary bits: every row of this table is labelled 'Essential',
###and only `sysName` (renamed to `locus_tag`) is kept alongside that label
essential_df = (
    essential_df
    .assign(essentiality_ds2='Essential')
    .loc[:, ['sysName', 'essentiality_ds2']]
    .rename(columns={'sysName': 'locus_tag'})
)

###And merge
combined_df = combined_df.merge(essential_df, how='left', on='locus_tag')
print(combined_df.shape)

# Genes without a match in this table come out of the left merge without a label;
# mark everything that is not 'Essential' as 'Non-essential'.
not_essential = combined_df['essentiality_ds2'].ne('Essential')
combined_df.loc[not_essential, 'essentiality_ds2'] = 'Non-essential'
combined_df.head()

**Finally, add a consensus column of gene essentiality**

In [None]:
# Consensus label across the two essentiality datasets:
#   'Essential'     - both datasets say 'Essential'
#   'Non-essential' - neither dataset says 'Essential' (for dataset 1 this includes
#                     'Unclear', the empty label and genes missing from the table)
#   'Unclear'       - the two datasets disagree
ds1_essential = combined_df['essentiality_ds1'] == 'Essential'
ds2_essential = combined_df['essentiality_ds2'] == 'Essential'

combined_df['essentiality'] = 'Unclear'
# `.at` addresses exactly one scalar cell and rejects an Index of row labels on
# current pandas; `.loc` with a boolean mask is the supported way to set many rows.
combined_df.loc[ds1_essential & ds2_essential, 'essentiality'] = 'Essential'
combined_df.loc[~ds1_essential & ~ds2_essential, 'essentiality'] = 'Non-essential'

print(combined_df['essentiality'].value_counts())
combined_df.head()

# Save file

In [None]:
# Write the combined table as tab-separated text; the row index is written as the
# first (unnamed) column because `index` is left at its default.
combined_tsv = '../Data/562_associated_data/ecoli.combined.tsv'
combined_df.to_csv(combined_tsv, sep='\t')