In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from scipy import stats

import statsmodels.formula.api as smf

from matplotlib import pyplot as plt

# Import data

In [3]:
df = pd.ExcelFile(r'../Data/raw_data/41467_2019_13483_MOESM4_ESM.xlsx')

## Basic metadata clean up and data subsetting

In [4]:
meta_df = df.parse('Metadata')
print(meta_df.shape)
meta_df = meta_df[meta_df['Growth Rate (1/hr)'].isnull() == False]
print(meta_df.shape)
meta_df = meta_df[meta_df['Growth Rate (1/hr)'] > 0.0]
print(meta_df.shape)
meta_df = meta_df[meta_df['Alignment'] > 80]
print(meta_df.shape)
meta_df.head()

(278, 26)
(195, 26)
(179, 26)
(173, 26)


Unnamed: 0,Sample ID,Study,Project ID,Condition ID,Replicate #,Strain Description,Strain,Base Media,Carbon Source (g/L),Nitrogen Source (g/L),...,Culture Type,Growth Rate (1/hr),Evolved Sample,Isolate Type,Sequencing Machine,Additional Details,Biological Replicates,Alignment,DOI,GEO
4,fur__wt_fe__1,Fur,fur,wt_fe,1,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),...,Batch,1.060606,No,,MiSeq,,2,93.35,doi.org/10.1038/ncomms5910,GSE54900
5,fur__wt_fe__2,Fur,fur,wt_fe,2,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),...,Batch,1.060606,No,,MiSeq,,2,92.38,doi.org/10.1038/ncomms5910,GSE54900
8,fur__delfur_fe2__1,Fur,fur,delfur_fe2,1,Escherichia coli K-12 MG1655 del_fur,MG1655,M9,glucose(2),NH4Cl(1),...,Batch,0.619469,No,,MiSeq,,2,92.8,doi.org/10.1038/ncomms5910,GSE54900
9,fur__delfur_fe2__2,Fur,fur,delfur_fe2,2,Escherichia coli K-12 MG1655 del_fur,MG1655,M9,glucose(2),NH4Cl(1),...,Batch,0.619469,No,,MiSeq,,2,93.24,doi.org/10.1038/ncomms5910,GSE54900
55,omics__bw_ac__1,Omics,omics,bw_ac,1,Escherichia coli BW25113,BW25113,M9,acetate(3.5),NH4Cl(1),...,Batch,0.203,No,,MiSeq,,2,97.8,doi.org/10.1038/ncomms13091,GSE59759


## And the same for the expression data

In [5]:
exp_df = df.parse('Expression Data', index_col='log-TPM')
print(exp_df.shape)
exp_df = exp_df[meta_df['Sample ID']]
print(exp_df.shape)
assert list(meta_df['Sample ID']) == list(exp_df.columns)
exp_df.head()

(3923, 278)
(3923, 173)


Unnamed: 0_level_0,fur__wt_fe__1,fur__wt_fe__2,fur__delfur_fe2__1,fur__delfur_fe2__2,omics__bw_ac__1,omics__bw_ac__2,omics__bw_fum__1,omics__bw_fum__2,omics__bw_glc__1,omics__bw_glc__2,...,efeU__menFentC_ale29__1,efeU__menFentC_ale29__2,efeU__menFentC_ale30__1,efeU__menFentC_ale30__2,efeU__menFentCubiC_ale36__1,efeU__menFentCubiC_ale36__2,efeU__menFentCubiC_ale37__1,efeU__menFentCubiC_ale37__2,efeU__menFentCubiC_ale38__1,efeU__menFentCubiC_ale38__2
log-TPM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b0002,11.06481,10.779071,11.229767,11.214065,9.257348,9.182322,9.709213,9.672126,10.208587,10.218351,...,10.271327,10.276565,11.148538,11.170578,11.676604,11.726097,11.881529,11.923237,11.49641,11.552762
b0003,10.776984,10.59781,10.897938,10.861157,8.983408,8.943151,9.436004,9.394573,9.609637,9.677931,...,10.160291,10.116861,10.314322,10.392251,10.916426,10.909277,11.023924,11.038426,10.624301,10.764195
b0004,10.394971,10.11395,10.185151,10.164655,8.76169,8.77992,9.532673,9.53437,9.883558,9.870356,...,10.475069,10.434352,10.679541,10.723953,11.14331,11.112721,11.184795,11.241845,10.953206,11.001006
b0005,6.716069,6.410864,6.527653,6.136168,4.474204,4.72049,5.782102,5.326669,5.846675,5.972022,...,5.979079,5.705586,6.30612,6.29134,5.058537,4.83555,5.448097,5.757951,5.873964,5.808618
b0006,6.761813,6.816532,6.862147,6.81748,6.536457,6.439917,6.408731,6.276017,6.9102,6.843384,...,8.371287,8.32239,8.137515,8.071837,7.354131,7.365536,7.328101,7.314761,8.05775,8.105213


In [6]:
###I noticed a weird duplicate, so later we're going to delete these
exp_df[[col for col in exp_df.columns if 'pal__lyx_ale' in col]].head()

Unnamed: 0_level_0,pal__lyx_ale2_f6__1,pal__lyx_ale2__1,pal__lyx_ale2__2,pal__lyx_ale4__1,pal__lyx_ale4__2
log-TPM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
b0002,9.627287,10.130315,10.155462,10.130315,10.155462
b0003,9.250534,9.708944,9.831138,9.708944,9.831138
b0004,9.203814,10.049444,10.190627,10.049444,10.190627
b0005,4.807384,5.772047,5.933463,5.772047,5.933463
b0006,6.398236,6.435048,6.220552,6.435048,6.220552


# Averaging between replicates to clean up the data

## First getting gene expression averages between replicates and creating a new `dataframe`

In [7]:
unique_cols = set(exp_df.columns.str[:-3])
new_exp_df = pd.DataFrame()

for i in unique_cols:
    new_exp_df[i] = exp_df[[col for col in exp_df.columns if col[:-3]==i]].mean(axis=1)
print(new_exp_df.shape)
new_exp_df.head()

(3923, 105)


Unnamed: 0_level_0,ytf__delybaQ,efeU__menFentC_ale30,pgi__pgi_ale8,ica__met_glc,cra_crp__delcra_glc,omics__bw_ac,ica__leu_glcr,cra_crp__wt_ac,ytf__wt_ph5,pal__5drib_ale10_f18,...,ica__arg_sbt,ica__tyr_glcn,ssw__glc_ac_ac2,pgi__pgi_ale7,ica__gth,ssw__wt_glc,ica__ade_glc,omics__bw_glc,ytf__delybaO,pal__tartr_ale28
log-TPM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b0002,11.097921,11.159558,10.482452,11.003212,11.230252,9.219835,12.361644,9.568949,11.06991,7.979729,...,12.000295,11.031828,9.119442,10.862762,8.319211,10.445776,10.126944,10.213469,9.435166,8.134838
b0003,10.479954,10.353287,9.744255,10.310934,11.106092,8.96328,11.403576,8.831856,10.408418,8.026912,...,11.462286,10.394478,8.573671,10.194297,7.711763,9.787075,9.667656,9.643784,8.993282,7.992993
b0004,10.692369,10.701747,9.761211,10.705726,10.486687,8.770805,11.844689,8.680003,10.535924,7.996297,...,11.582998,10.75212,8.614467,10.1049,7.557831,9.916621,10.31969,9.876957,9.11808,8.173352
b0005,5.768176,6.29873,6.083744,6.111748,5.818731,4.597347,6.207579,4.576969,5.676045,4.368704,...,5.891487,5.703923,4.287681,6.461725,4.999832,4.968974,5.597313,5.909349,4.545074,4.050593
b0006,7.085122,8.104676,6.296399,6.721225,6.937212,6.488187,6.780668,6.273174,7.108237,6.302862,...,6.149227,6.866697,6.611613,6.92454,6.235536,6.996502,7.266482,6.876792,6.777675,6.675508


**Double checking the work**

Just making sure somethings add up here

In [8]:
for i in unique_cols:
    if len([col for col in exp_df.columns if col[:-3]==i]) > 1:
        print(i)
        break

ytf__delybaQ


In [9]:
exp_df[[col for col in exp_df.columns if col[:-3]==i]].head()

Unnamed: 0_level_0,ytf__delybaQ__1,ytf__delybaQ__2
log-TPM,Unnamed: 1_level_1,Unnamed: 2_level_1
b0002,11.105751,11.090092
b0003,10.492515,10.467394
b0004,10.685734,10.699005
b0005,5.861062,5.675291
b0006,7.011231,7.159013


In [10]:
new_exp_df[[i]].head()

Unnamed: 0_level_0,ytf__delybaQ
log-TPM,Unnamed: 1_level_1
b0002,11.097921
b0003,10.479954
b0004,10.692369
b0005,5.768176
b0006,7.085122


**Kill any completely identical columns, these must be bugs on the data end and even though I could in theory keep one, their growth rate value is unclear (since it differs)**

In [11]:
temp_corr = new_exp_df.corr(method='spearman')

In [12]:
temp_df = pd.DataFrame(
    np.where(np.equal(*np.indices(temp_corr.shape)), np.nan, temp_corr.values),
    temp_corr.index, temp_corr.columns
)
print(temp_df.shape)
temp_df.head()

(105, 105)


Unnamed: 0,ytf__delybaQ,efeU__menFentC_ale30,pgi__pgi_ale8,ica__met_glc,cra_crp__delcra_glc,omics__bw_ac,ica__leu_glcr,cra_crp__wt_ac,ytf__wt_ph5,pal__5drib_ale10_f18,...,ica__arg_sbt,ica__tyr_glcn,ssw__glc_ac_ac2,pgi__pgi_ale7,ica__gth,ssw__wt_glc,ica__ade_glc,omics__bw_glc,ytf__delybaO,pal__tartr_ale28
ytf__delybaQ,,0.948962,0.930857,0.921744,0.978433,0.924051,0.895565,0.911883,0.981701,0.861697,...,0.882757,0.913355,0.914027,0.924207,0.865194,0.940781,0.912856,0.965108,0.891961,0.921565
efeU__menFentC_ale30,0.948962,,0.929403,0.916283,0.937972,0.878564,0.884835,0.868528,0.928755,0.834579,...,0.868936,0.909888,0.869565,0.914815,0.790476,0.891712,0.91251,0.924271,0.870125,0.884007
pgi__pgi_ale8,0.930857,0.929403,,0.899754,0.926066,0.908016,0.868288,0.874839,0.908328,0.8605,...,0.858045,0.895362,0.884129,0.978946,0.816662,0.909861,0.89356,0.932734,0.834416,0.904287
ica__met_glc,0.921744,0.916283,0.899754,,0.904891,0.872485,0.956993,0.859826,0.899197,0.80543,...,0.938643,0.972383,0.856065,0.910656,0.757806,0.91619,0.959795,0.894957,0.824143,0.874159
cra_crp__delcra_glc,0.978433,0.937972,0.926066,0.904891,,0.910402,0.877161,0.913441,0.972327,0.842011,...,0.872234,0.899706,0.893035,0.918873,0.864279,0.927465,0.891048,0.952289,0.88614,0.896915


In [13]:
temp_df[temp_df.values==1]

Unnamed: 0,ytf__delybaQ,efeU__menFentC_ale30,pgi__pgi_ale8,ica__met_glc,cra_crp__delcra_glc,omics__bw_ac,ica__leu_glcr,cra_crp__wt_ac,ytf__wt_ph5,pal__5drib_ale10_f18,...,ica__arg_sbt,ica__tyr_glcn,ssw__glc_ac_ac2,pgi__pgi_ale7,ica__gth,ssw__wt_glc,ica__ade_glc,omics__bw_glc,ytf__delybaO,pal__tartr_ale28
pal__lyx_ale4,0.901845,0.887248,0.905686,0.893834,0.88225,0.924265,0.869209,0.891766,0.869489,0.907829,...,0.885533,0.881834,0.91008,0.911686,0.766783,0.872025,0.875274,0.895297,0.805745,0.924159
pal__lyx_ale2,0.901845,0.887248,0.905686,0.893834,0.88225,0.924265,0.869209,0.891766,0.869489,0.907829,...,0.885533,0.881834,0.91008,0.911686,0.766783,0.872025,0.875274,0.895297,0.805745,0.924159


In [14]:
new_exp_df.drop(['pal__lyx_ale2', 'pal__lyx_ale4'], axis=1, inplace=True)
print(new_exp_df.shape)

(3923, 103)


## Average the growth rates across these replicates in the metadata as well

In [15]:
meta_df['Simple_sample_id'] = meta_df['Sample ID'].str[:-3]
print(meta_df.shape)
group_cols = ['Simple_sample_id']
metric_cols_a = ['Temperature (C)', 'pH', 'Growth Rate (1/hr)', 'Alignment']
aggs_a = meta_df.groupby(group_cols)[metric_cols_a].mean()
metric_cols_b = ['Replicate #', 'Biological Replicates']
aggs_b = meta_df.groupby(group_cols)['Replicate #'].count()

meta_df.drop(metric_cols_a, axis=1, inplace=True)
meta_df.drop(metric_cols_b, axis=1, inplace=True)
meta_df.drop_duplicates(subset=group_cols, keep='first', inplace=True)
meta_df = meta_df.merge(right=aggs_a, right_index=True, left_on=group_cols, how='right')
print(meta_df.shape)
meta_df = meta_df.merge(right=aggs_b, right_index=True, left_on=group_cols, how='right')
print(meta_df.shape)
meta_df.head()

(173, 27)
(105, 25)
(105, 26)


Unnamed: 0,Sample ID,Study,Project ID,Condition ID,Strain Description,Strain,Base Media,Carbon Source (g/L),Nitrogen Source (g/L),Electron Acceptor,...,Sequencing Machine,Additional Details,DOI,GEO,Simple_sample_id,Temperature (C),pH,Growth Rate (1/hr),Alignment,Replicate #
4,fur__wt_fe__1,Fur,fur,wt_fe,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),O2,...,MiSeq,,doi.org/10.1038/ncomms5910,GSE54900,fur__wt_fe,37,7.0,1.060606,92.865,2
8,fur__delfur_fe2__1,Fur,fur,delfur_fe2,Escherichia coli K-12 MG1655 del_fur,MG1655,M9,glucose(2),NH4Cl(1),O2,...,MiSeq,,doi.org/10.1038/ncomms5910,GSE54900,fur__delfur_fe2,37,7.0,0.619469,93.02,2
55,omics__bw_ac__1,Omics,omics,bw_ac,Escherichia coli BW25113,BW25113,M9,acetate(3.5),NH4Cl(1),O2,...,MiSeq,,doi.org/10.1038/ncomms13091,GSE59759,omics__bw_ac,37,7.0,0.203,97.92,2
57,omics__bw_fum__1,Omics,omics,bw_fum,Escherichia coli BW25113,BW25113,M9,fumarate(2.8),NH4Cl(1),O2,...,MiSeq,,doi.org/10.1038/ncomms13091,GSE59759,omics__bw_fum,37,7.0,0.309,98.13,2
59,omics__bw_glc__1,Omics,omics,bw_glc,Escherichia coli BW25113,BW25113,M9,glucose(5),NH4Cl(1),O2,...,MiSeq,,doi.org/10.1038/ncomms13091,GSE59759,omics__bw_glc,37,7.0,0.6075,97.875,2


In [16]:
print(meta_df.shape)
###Drop that problematic row
meta_df = meta_df[meta_df['Sample ID'].str.contains('pal__lyx_ale2__')==False]
print(meta_df.shape)
meta_df = meta_df[meta_df['Sample ID'].str.contains('pal__lyx_ale4__')==False]
print(meta_df.shape)

(105, 26)
(104, 26)
(103, 26)


**Sum should equal the original shape!**

In [17]:
meta_df['Replicate #'].value_counts()

2    61
1    41
6     1
Name: Replicate #, dtype: int64

In [18]:
new_exp_df = new_exp_df[meta_df['Simple_sample_id']]

In [19]:
all(new_exp_df.columns == meta_df['Simple_sample_id'])

True

In [20]:
meta_df['Doubling_time'] = np.log(2)/meta_df['Growth Rate (1/hr)']

# Construct a third `dataframe` containing expression data summary stats

In [21]:
exp_summary_df = new_exp_df.apply(pd.DataFrame.describe, axis=1)

In [22]:
col_listy = []
for col in exp_summary_df.columns:
    if '%' not in col:
        col_listy.append(col)
    else:
        col_listy.append(col.replace('%', '_percentile'))
print(col_listy)
exp_summary_df.columns = col_listy

['count', 'mean', 'std', 'min', '25_percentile', '50_percentile', '75_percentile', 'max']


In [23]:
exp_summary_df['cv'] = exp_summary_df['std']/exp_summary_df['mean']
exp_summary_df['noise'] = exp_summary_df['std'].pow(2)/exp_summary_df['mean'].pow(2)

In [24]:
slopes = []
pearsons = []
spearmans = []
for gene in exp_summary_df.index:
    a, b, c, d, e = stats.linregress(new_exp_df.loc[gene], meta_df['Growth Rate (1/hr)'])
    slopes.append(a)
    pearsons.append(c)
    rho, p = stats.spearmanr(new_exp_df.loc[gene], meta_df['Growth Rate (1/hr)'])
    spearmans.append(rho)
    
exp_summary_df['lin_slope'] = slopes
exp_summary_df['lin_r'] = pearsons
exp_summary_df['spearmans_rho'] = spearmans

# Save some files

In [25]:
print(new_exp_df.shape)
print(meta_df.shape)
print(exp_summary_df.shape)
new_exp_df.to_csv('../Data/processed_data/processed_expression.tsv', sep='\t')
meta_df.to_csv('../Data/processed_data/processed_metadata.tsv', sep='\t')
exp_summary_df.to_csv('../Data/processed_data/processed_expression_summary.tsv', sep='\t')

(3923, 103)
(103, 27)
(3923, 13)


# Get a thinned down set as a robustness check

Since conditions are correlated with one another, I'm constructing a more sparsely populated dataset where inter-condition correlations are minimized using a greedy algorithm. 

The algorithm works as follows:
1. Find which two conditions are the most highly correlated across the entire all-to-all correlation matrix
2. Randomly delete one of the two conditions in question
3. Iterate to (1)
4. Stop after reaching a pre-defined final dataset size

In [26]:
import random

In [27]:
cond_corr_mat = new_exp_df.corr(method='spearman')
cond_corr_mat.head()

Unnamed: 0,fur__wt_fe,fur__delfur_fe2,omics__bw_ac,omics__bw_fum,omics__bw_glc,omics__bw_pyr,cra_crp__wt_ac,cra_crp__wt_fru,cra_crp__delcra_ac,cra_crp__delcra_fru,...,pal__tartr_ale26_f10,pal__tartr_ale26,pal__tartr_ale28_f10,pal__tartr_ale28,efeU__menFentC_ale39,efeU__menFentC_ale29,efeU__menFentC_ale30,efeU__menFentCubiC_ale36,efeU__menFentCubiC_ale37,efeU__menFentCubiC_ale38
fur__wt_fe,1.0,0.96464,0.923339,0.917715,0.962571,0.935009,0.901038,0.926609,0.904316,0.943159,...,0.866013,0.866432,0.858613,0.918781,0.873224,0.850732,0.928566,0.93477,0.939126,0.927869
fur__delfur_fe2,0.96464,1.0,0.913742,0.903783,0.952763,0.92447,0.898731,0.914061,0.924542,0.961016,...,0.854125,0.849878,0.858827,0.901197,0.840142,0.834696,0.896473,0.942488,0.946356,0.904614
omics__bw_ac,0.923339,0.913742,1.0,0.967975,0.942483,0.972111,0.943399,0.930222,0.931166,0.885828,...,0.907638,0.897741,0.903667,0.949034,0.839838,0.817468,0.878564,0.884267,0.884292,0.869539
omics__bw_fum,0.917715,0.903783,0.967975,1.0,0.956253,0.974362,0.902824,0.916864,0.892215,0.869514,...,0.867798,0.873566,0.850602,0.940838,0.842448,0.820857,0.897118,0.879182,0.874671,0.884392
omics__bw_glc,0.962571,0.952763,0.942483,0.956253,1.0,0.966576,0.885214,0.913576,0.897124,0.932456,...,0.856878,0.85861,0.856949,0.92533,0.867961,0.851744,0.924271,0.932579,0.932818,0.925066


In [29]:
final_data_size = 30
n_to_prune = cond_corr_mat.shape[1] - final_data_size

In [30]:
np.fill_diagonal(cond_corr_mat.values, np.nan)
to_prune = []
for i in range(n_to_prune):
    tempy = cond_corr_mat.loc[[i for i in cond_corr_mat.columns if i not in to_prune]][[i for i in cond_corr_mat.columns if i not in to_prune]].max()
    to_prune.append(random.choice([tempy.sort_values(ascending=False).index[0],\
                                   tempy.sort_values(ascending=False).index[1]]))
np.fill_diagonal(cond_corr_mat.values, 1.)

In [31]:
to_keep = [i for i in cond_corr_mat.columns if i not in to_prune]
new_exp_df = new_exp_df[to_keep]
meta_df = meta_df[meta_df['Simple_sample_id'].isin(to_keep)]

In [32]:
exp_summary_df = new_exp_df.apply(pd.DataFrame.describe, axis=1)
col_listy = []
for col in exp_summary_df.columns:
    if '%' not in col:
        col_listy.append(col)
    else:
        col_listy.append(col.replace('%', '_percentile'))
print(col_listy)
exp_summary_df.columns = col_listy

exp_summary_df['cv'] = exp_summary_df['std']/exp_summary_df['mean']
exp_summary_df['noise'] = exp_summary_df['std'].pow(2)/exp_summary_df['mean'].pow(2)

slopes = []
pearsons = []
spearmans = []
for gene in exp_summary_df.index:
    a, b, c, d, e = stats.linregress(new_exp_df.loc[gene], meta_df['Growth Rate (1/hr)'])
    slopes.append(a)
    pearsons.append(c)
    rho, p = stats.spearmanr(new_exp_df.loc[gene], meta_df['Growth Rate (1/hr)'])
    spearmans.append(rho)
    
exp_summary_df['lin_slope'] = slopes
exp_summary_df['lin_r'] = pearsons
exp_summary_df['spearmans_rho'] = spearmans

['count', 'mean', 'std', 'min', '25_percentile', '50_percentile', '75_percentile', 'max']


In [33]:
print(new_exp_df.shape)
print(meta_df.shape)
print(exp_summary_df.shape)
new_exp_df.to_csv('../Data/processed_data/processed_expression_SPARSE.tsv', sep='\t')
meta_df.to_csv('../Data/processed_data/processed_metadata_SPARSE.tsv', sep='\t')
exp_summary_df.to_csv('../Data/processed_data/processed_expression_summary_SPARSE.tsv', sep='\t')

(3923, 30)
(30, 27)
(3923, 13)


# Scratch