In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from scipy import stats

import statsmodels.formula.api as smf

from matplotlib import pyplot as plt

# Import data

This data comes from this paper: https://www.nature.com/articles/s41467-019-13483-w

It's a big excel file with expression and growth rate data in different sheets and this code will use a few of these sheets.

In [3]:
df = pd.ExcelFile(r'../Data/raw_data/41467_2019_13483_MOESM4_ESM.xlsx')

## Basic metadata clean up and data subsetting

After some manual inspection, looking at distributions, etc. we decided to exclude samples where:

1. The growth rate data was unknown


2. The growth rate data was reported as zero. This one might seem strange but it's is a little unclear in general if that's possible/true to have zero growth rate. It is possible that these were stationary phase cultures but equally likely from my stand-point that these are errors in the table.


3. Really poor alignment (perhaps indicating some overall contamination)

In [4]:
meta_df = df.parse('Metadata') ###This grabs the sheet that contains information about the samples
print(meta_df.shape)
meta_df = meta_df[meta_df['Growth Rate (1/hr)'].isnull() == False]
print(meta_df.shape)
meta_df = meta_df[meta_df['Growth Rate (1/hr)'] > 0.0]
print(meta_df.shape)
meta_df = meta_df[meta_df['Alignment'] > 80]
print(meta_df.shape)
meta_df.head()

(278, 26)
(195, 26)
(179, 26)
(173, 26)


Unnamed: 0,Sample ID,Study,Project ID,Condition ID,Replicate #,Strain Description,Strain,Base Media,Carbon Source (g/L),Nitrogen Source (g/L),...,Culture Type,Growth Rate (1/hr),Evolved Sample,Isolate Type,Sequencing Machine,Additional Details,Biological Replicates,Alignment,DOI,GEO
4,fur__wt_fe__1,Fur,fur,wt_fe,1,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),...,Batch,1.060606,No,,MiSeq,,2,93.35,doi.org/10.1038/ncomms5910,GSE54900
5,fur__wt_fe__2,Fur,fur,wt_fe,2,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),...,Batch,1.060606,No,,MiSeq,,2,92.38,doi.org/10.1038/ncomms5910,GSE54900
8,fur__delfur_fe2__1,Fur,fur,delfur_fe2,1,Escherichia coli K-12 MG1655 del_fur,MG1655,M9,glucose(2),NH4Cl(1),...,Batch,0.619469,No,,MiSeq,,2,92.8,doi.org/10.1038/ncomms5910,GSE54900
9,fur__delfur_fe2__2,Fur,fur,delfur_fe2,2,Escherichia coli K-12 MG1655 del_fur,MG1655,M9,glucose(2),NH4Cl(1),...,Batch,0.619469,No,,MiSeq,,2,93.24,doi.org/10.1038/ncomms5910,GSE54900
55,omics__bw_ac__1,Omics,omics,bw_ac,1,Escherichia coli BW25113,BW25113,M9,acetate(3.5),NH4Cl(1),...,Batch,0.203,No,,MiSeq,,2,97.8,doi.org/10.1038/ncomms13091,GSE59759


## Read in the expression data

In [6]:
exp_df = df.parse('Expression Data', index_col='log-TPM')
print(exp_df.shape)
exp_df = exp_df[meta_df['Sample ID']]###Only grab the columns corresponding to the samples identified above
print(exp_df.shape)
assert list(meta_df['Sample ID']) == list(exp_df.columns) ###Check our work
exp_df.head()

(3923, 278)
(3923, 173)


Unnamed: 0_level_0,fur__wt_fe__1,fur__wt_fe__2,fur__delfur_fe2__1,fur__delfur_fe2__2,omics__bw_ac__1,omics__bw_ac__2,omics__bw_fum__1,omics__bw_fum__2,omics__bw_glc__1,omics__bw_glc__2,...,efeU__menFentC_ale29__1,efeU__menFentC_ale29__2,efeU__menFentC_ale30__1,efeU__menFentC_ale30__2,efeU__menFentCubiC_ale36__1,efeU__menFentCubiC_ale36__2,efeU__menFentCubiC_ale37__1,efeU__menFentCubiC_ale37__2,efeU__menFentCubiC_ale38__1,efeU__menFentCubiC_ale38__2
log-TPM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b0002,11.06481,10.779071,11.229767,11.214065,9.257348,9.182322,9.709213,9.672126,10.208587,10.218351,...,10.271327,10.276565,11.148538,11.170578,11.676604,11.726097,11.881529,11.923237,11.49641,11.552762
b0003,10.776984,10.59781,10.897938,10.861157,8.983408,8.943151,9.436004,9.394573,9.609637,9.677931,...,10.160291,10.116861,10.314322,10.392251,10.916426,10.909277,11.023924,11.038426,10.624301,10.764195
b0004,10.394971,10.11395,10.185151,10.164655,8.76169,8.77992,9.532673,9.53437,9.883558,9.870356,...,10.475069,10.434352,10.679541,10.723953,11.14331,11.112721,11.184795,11.241845,10.953206,11.001006
b0005,6.716069,6.410864,6.527653,6.136168,4.474204,4.72049,5.782102,5.326669,5.846675,5.972022,...,5.979079,5.705586,6.30612,6.29134,5.058537,4.83555,5.448097,5.757951,5.873964,5.808618
b0006,6.761813,6.816532,6.862147,6.81748,6.536457,6.439917,6.408731,6.276017,6.9102,6.843384,...,8.371287,8.32239,8.137515,8.071837,7.354131,7.365536,7.328101,7.314761,8.05775,8.105213


**Manual inspection found some weird and highly implausible/impossible duplicate column/s, so we'll make a note of that here and deal with it later**

In [7]:
exp_df[[col for col in exp_df.columns if 'pal__lyx_ale' in col]].head()

Unnamed: 0_level_0,pal__lyx_ale2_f6__1,pal__lyx_ale2__1,pal__lyx_ale2__2,pal__lyx_ale4__1,pal__lyx_ale4__2
log-TPM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
b0002,9.627287,10.130315,10.155462,10.130315,10.155462
b0003,9.250534,9.708944,9.831138,9.708944,9.831138
b0004,9.203814,10.049444,10.190627,10.049444,10.190627
b0005,4.807384,5.772047,5.933463,5.772047,5.933463
b0006,6.398236,6.435048,6.220552,6.435048,6.220552


# Averaging gene expression values across replicates to clean up/simplify the data

## First getting gene expression averages between replicates and creating a new `dataframe` to hold this information

Thus just exploits the fact that replicates are denoted by "__x" in the sample names so we identify these and average them when possible. 

In [8]:
unique_cols = set(exp_df.columns.str[:-3])
new_exp_df = pd.DataFrame()

for i in unique_cols:
    new_exp_df[i] = exp_df[[col for col in exp_df.columns if col[:-3]==i]].mean(axis=1)
print(new_exp_df.shape)
new_exp_df.head()

(3923, 105)


Unnamed: 0_level_0,ytf__delyafC,ytf__delyddM,pgi__glu4_delpgi,pgi__pgi_ale7,fur__delfur_fe2,pgi__pgi_ale8,pal__lyx_ale4,ica__leu_glcr,rpoB__rpoBE546V_glc,efeU__menFentC_ale39,...,42c__42c_ale5,pgi__pgi_ale6,ytf__delyheO,pal__arab_ale14_f9,cra_crp__wt_fru,glu__glu_ale5,ssw__glc_xyl_glc,ica__thm_gal,pal__arab_ale16,ytf__wt_ph8
log-TPM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b0002,9.312825,11.11283,9.846088,10.862762,11.221916,10.482452,10.142889,12.361644,10.927354,10.344302,...,10.627727,9.859055,9.401156,10.227775,11.19804,11.139383,10.079777,11.09047,8.970781,11.135983
b0003,8.680662,10.412806,9.25333,10.194297,10.879548,9.744255,9.770041,11.403576,10.293011,10.265772,...,10.015906,9.214803,8.933743,9.808444,10.182029,10.650042,9.672696,10.402663,8.56464,10.507847
b0004,8.735598,10.728984,9.273931,10.1049,10.174903,9.761211,10.120036,11.844689,10.246893,10.712971,...,10.239207,9.12837,9.164724,9.854127,10.03774,10.828312,9.735267,10.633509,8.87595,10.65049
b0005,4.190217,5.831692,4.381576,6.461725,6.331911,6.083744,5.852755,6.207579,7.551687,6.245881,...,5.799398,6.575517,4.21512,6.178483,6.058802,6.753118,6.280117,4.686068,5.094244,5.651241
b0006,6.769807,7.205476,7.260331,6.92454,6.839814,6.296399,6.3278,6.780668,6.882767,7.161943,...,6.945713,6.865126,6.748235,6.589387,6.555468,6.723685,6.89557,6.663905,6.655641,7.034334


**Double checking the work**

Just making sure somethings add up here by taking an example column/condition and looking at the replicate values

In [14]:
example_col = list(unique_cols)[0]

exp_df[[col for col in exp_df.columns if col[:-3]==example_col]].head()

Unnamed: 0_level_0,ytf__delyafC__1,ytf__delyafC__2
log-TPM,Unnamed: 1_level_1,Unnamed: 2_level_1
b0002,9.378698,9.246952
b0003,8.727473,8.633851
b0004,8.745293,8.725903
b0005,4.154798,4.225636
b0006,6.826286,6.713329


And their average

In [15]:
new_exp_df[[example_col]].head()

Unnamed: 0_level_0,ytf__delyafC
log-TPM,Unnamed: 1_level_1
b0002,9.312825
b0003,8.680662
b0004,8.735598
b0005,4.190217
b0006,6.769807


## Dealing with the weird duplicate column/s

Time to kill any completely identical columns, these must be bugs on the data end and even though I could in theory keep one, their growth rate value is unclear (since it differs)

In [16]:
###Get an all-to-all correlation matrix between gene expression values
temp_corr = new_exp_df.corr(method='spearman')

In [17]:
###And make this into a symmetric dataframe
temp_df = pd.DataFrame(
    np.where(np.equal(*np.indices(temp_corr.shape)), np.nan, temp_corr.values),
    temp_corr.index, temp_corr.columns
)
print(temp_df.shape)
temp_df.head()

(105, 105)


Unnamed: 0,ytf__delyafC,ytf__delyddM,pgi__glu4_delpgi,pgi__pgi_ale7,fur__delfur_fe2,pgi__pgi_ale8,pal__lyx_ale4,ica__leu_glcr,rpoB__rpoBE546V_glc,efeU__menFentC_ale39,...,42c__42c_ale5,pgi__pgi_ale6,ytf__delyheO,pal__arab_ale14_f9,cra_crp__wt_fru,glu__glu_ale5,ssw__glc_xyl_glc,ica__thm_gal,pal__arab_ale16,ytf__wt_ph8
ytf__delyafC,,0.917379,0.822837,0.824888,0.856039,0.829152,0.799936,0.791314,0.849476,0.938303,...,0.803504,0.815242,0.990424,0.872424,0.852303,0.812769,0.892314,0.822446,0.806385,0.886383
ytf__delyddM,0.917379,,0.886031,0.904277,0.928504,0.911653,0.871243,0.885556,0.929653,0.90719,...,0.893792,0.898835,0.915217,0.936786,0.923216,0.880051,0.894655,0.884198,0.883831,0.959916
pgi__glu4_delpgi,0.822837,0.886031,,0.94396,0.914664,0.934235,0.89333,0.881568,0.913203,0.849512,...,0.905331,0.934927,0.819924,0.904554,0.905393,0.92059,0.848005,0.890937,0.873158,0.909069
pgi__pgi_ale7,0.824888,0.904277,0.94396,,0.916229,0.978946,0.911686,0.880287,0.948027,0.850431,...,0.939316,0.979767,0.8236,0.914642,0.904791,0.940711,0.892932,0.87335,0.889021,0.909327
fur__delfur_fe2,0.856039,0.928504,0.914664,0.916229,,0.915113,0.878752,0.8514,0.948601,0.840142,...,0.899094,0.914963,0.844202,0.935461,0.914061,0.899217,0.866064,0.845009,0.883927,0.965289


**Identify columns that contain a value of "1." since this indicates they have a perfect correlation**

In [18]:
temp_df[temp_df.values==1]

Unnamed: 0,ytf__delyafC,ytf__delyddM,pgi__glu4_delpgi,pgi__pgi_ale7,fur__delfur_fe2,pgi__pgi_ale8,pal__lyx_ale4,ica__leu_glcr,rpoB__rpoBE546V_glc,efeU__menFentC_ale39,...,42c__42c_ale5,pgi__pgi_ale6,ytf__delyheO,pal__arab_ale14_f9,cra_crp__wt_fru,glu__glu_ale5,ssw__glc_xyl_glc,ica__thm_gal,pal__arab_ale16,ytf__wt_ph8
pal__lyx_ale4,0.799936,0.871243,0.89333,0.911686,0.878752,0.905686,,0.869209,0.897721,0.829072,...,0.890743,0.907159,0.8044,0.907604,0.90089,0.909602,0.859288,0.883647,0.900231,0.884983
pal__lyx_ale2,0.799936,0.871243,0.89333,0.911686,0.878752,0.905686,1.0,0.869209,0.897721,0.829072,...,0.890743,0.907159,0.8044,0.907604,0.90089,0.909602,0.859288,0.883647,0.900231,0.884983


In [21]:
temp_df.loc[['pal__lyx_ale4', 'pal__lyx_ale2'], ['pal__lyx_ale4', 'pal__lyx_ale2']]

Unnamed: 0,pal__lyx_ale4,pal__lyx_ale2
pal__lyx_ale4,,1.0
pal__lyx_ale2,1.0,


Get rid of them both since there is obviously an error here somewhere

In [22]:
new_exp_df.drop(['pal__lyx_ale2', 'pal__lyx_ale4'], axis=1, inplace=True)
print(new_exp_df.shape)

(3923, 103)


# Average the growth rates across these replicates in the metadata as well

In [23]:
###Assign a unique id that removes the replicate information
meta_df['Simple_sample_id'] = meta_df['Sample ID'].str[:-3]
print(meta_df.shape)
###Group according to this new id
group_cols = ['Simple_sample_id']
###For these numeric columns I'll take the mean
metric_cols_a = ['Temperature (C)', 'pH', 'Growth Rate (1/hr)', 'Alignment']
aggs_a = meta_df.groupby(group_cols)[metric_cols_a].mean()
###And for these I'll just grab the count
metric_cols_b = ['Replicate #', 'Biological Replicates']
aggs_b = meta_df.groupby(group_cols)['Replicate #'].count()

###Drop the columns from the original dataframe (we'll add them back in later)
meta_df.drop(metric_cols_a, axis=1, inplace=True)
meta_df.drop(metric_cols_b, axis=1, inplace=True)
###And duplicates
meta_df.drop_duplicates(subset=group_cols, keep='first', inplace=True)

###Now merge the main dataframe with the grouped ones
meta_df = meta_df.merge(right=aggs_a, right_index=True, left_on=group_cols, how='right')
print(meta_df.shape)
meta_df = meta_df.merge(right=aggs_b, right_index=True, left_on=group_cols, how='right')
print(meta_df.shape)
meta_df.head()

(173, 27)
(105, 25)
(105, 26)


Unnamed: 0,Sample ID,Study,Project ID,Condition ID,Strain Description,Strain,Base Media,Carbon Source (g/L),Nitrogen Source (g/L),Electron Acceptor,...,Sequencing Machine,Additional Details,DOI,GEO,Simple_sample_id,Temperature (C),pH,Growth Rate (1/hr),Alignment,Replicate #
4,fur__wt_fe__1,Fur,fur,wt_fe,Escherichia coli K-12 MG1655,MG1655,M9,glucose(2),NH4Cl(1),O2,...,MiSeq,,doi.org/10.1038/ncomms5910,GSE54900,fur__wt_fe,37,7.0,1.060606,92.865,2
8,fur__delfur_fe2__1,Fur,fur,delfur_fe2,Escherichia coli K-12 MG1655 del_fur,MG1655,M9,glucose(2),NH4Cl(1),O2,...,MiSeq,,doi.org/10.1038/ncomms5910,GSE54900,fur__delfur_fe2,37,7.0,0.619469,93.02,2
55,omics__bw_ac__1,Omics,omics,bw_ac,Escherichia coli BW25113,BW25113,M9,acetate(3.5),NH4Cl(1),O2,...,MiSeq,,doi.org/10.1038/ncomms13091,GSE59759,omics__bw_ac,37,7.0,0.203,97.92,2
57,omics__bw_fum__1,Omics,omics,bw_fum,Escherichia coli BW25113,BW25113,M9,fumarate(2.8),NH4Cl(1),O2,...,MiSeq,,doi.org/10.1038/ncomms13091,GSE59759,omics__bw_fum,37,7.0,0.309,98.13,2
59,omics__bw_glc__1,Omics,omics,bw_glc,Escherichia coli BW25113,BW25113,M9,glucose(5),NH4Cl(1),O2,...,MiSeq,,doi.org/10.1038/ncomms13091,GSE59759,omics__bw_glc,37,7.0,0.6075,97.875,2


**And get rid of those problematic samples from this dataframe as well**

In [24]:
print(meta_df.shape)
meta_df = meta_df[meta_df['Sample ID'].str.contains('pal__lyx_ale2__')==False]
print(meta_df.shape)
meta_df = meta_df[meta_df['Sample ID'].str.contains('pal__lyx_ale4__')==False]
print(meta_df.shape)

(105, 26)
(104, 26)
(103, 26)


**Sum should equal the original shape!**

In [25]:
meta_df['Replicate #'].value_counts()

2    61
1    41
6     1
Name: Replicate #, dtype: int64

**Make sure that the columns line up when matching across these two dataframes** 

In [26]:
new_exp_df = new_exp_df[meta_df['Simple_sample_id']]

In [27]:
assert all(new_exp_df.columns == meta_df['Simple_sample_id'])

**And add the doubling time just for good measure**

Which is just a slight transformation of growth rate

In [28]:
meta_df['Doubling_time'] = np.log(2)/meta_df['Growth Rate (1/hr)']

# Construct a third `dataframe` containing gene expression data summary stats

Strictly speaking this isn't super necessary but might as well do it now to get it done and over with

In [29]:
exp_summary_df = new_exp_df.apply(pd.DataFrame.describe, axis=1)

**The % signs seem to cause some problems down the road so lets remove them**

In [30]:
col_listy = []
for col in exp_summary_df.columns:
    if '%' not in col:
        col_listy.append(col)
    else:
        col_listy.append(col.replace('%', '_percentile'))
print(col_listy)
exp_summary_df.columns = col_listy

['count', 'mean', 'std', 'min', '25_percentile', '50_percentile', '75_percentile', 'max']


**And add some other dispersion metrics**

In [31]:
exp_summary_df['cv'] = exp_summary_df['std']/exp_summary_df['mean']
exp_summary_df['noise'] = exp_summary_df['std'].pow(2)/exp_summary_df['mean'].pow(2)

**Finally, adding some of (what we think are) the cool new variables to consider**

In [32]:
slopes = []
pearsons = []
spearmans = []
for gene in exp_summary_df.index:
    a, b, c, d, e = stats.linregress(new_exp_df.loc[gene], meta_df['Growth Rate (1/hr)'])
    slopes.append(a)
    pearsons.append(c)
    rho, p = stats.spearmanr(new_exp_df.loc[gene], meta_df['Growth Rate (1/hr)'])
    spearmans.append(rho)
    
exp_summary_df['lin_slope'] = slopes
exp_summary_df['lin_r'] = pearsons
exp_summary_df['spearmans_rho'] = spearmans

# Save some files

This was the whole point of all the code above. Should have some straightforward data tables now

In [33]:
print(new_exp_df.shape)
print(meta_df.shape)
print(exp_summary_df.shape)
new_exp_df.to_csv('../Data/processed_data/processed_expression_ecoli.tsv', sep='\t')
meta_df.to_csv('../Data/processed_data/processed_metadata_ecoli.tsv', sep='\t')
exp_summary_df.to_csv('../Data/processed_data/processed_expression_summary_ecoli.tsv', sep='\t')

(3923, 103)
(103, 27)
(3923, 13)


# Get a thinned down dataset as a robustness check

Since conditions are correlated with one another, I'm constructing a more sparsely populated dataset where inter-condition correlations are minimized using a greedy algorithm. 

The algorithm works as follows:
1. Find which two conditions are the most highly correlated across the entire all-to-all correlation matrix
2. Randomly delete one of the two conditions in question
3. Iterate to (1)
4. Stop after reaching a pre-defined final dataset size

In [34]:
import random
random.seed(42)

In [35]:
cond_corr_mat = new_exp_df.corr(method='spearman')
cond_corr_mat.head()

Unnamed: 0,fur__wt_fe,fur__delfur_fe2,omics__bw_ac,omics__bw_fum,omics__bw_glc,omics__bw_pyr,cra_crp__wt_ac,cra_crp__wt_fru,cra_crp__delcra_ac,cra_crp__delcra_fru,...,pal__tartr_ale26_f10,pal__tartr_ale26,pal__tartr_ale28_f10,pal__tartr_ale28,efeU__menFentC_ale39,efeU__menFentC_ale29,efeU__menFentC_ale30,efeU__menFentCubiC_ale36,efeU__menFentCubiC_ale37,efeU__menFentCubiC_ale38
fur__wt_fe,1.0,0.96464,0.923339,0.917715,0.962571,0.935009,0.901038,0.926609,0.904316,0.943159,...,0.866013,0.866432,0.858613,0.918781,0.873224,0.850732,0.928566,0.93477,0.939126,0.927869
fur__delfur_fe2,0.96464,1.0,0.913742,0.903783,0.952763,0.92447,0.898731,0.914061,0.924542,0.961016,...,0.854125,0.849878,0.858827,0.901197,0.840142,0.834696,0.896473,0.942488,0.946356,0.904614
omics__bw_ac,0.923339,0.913742,1.0,0.967975,0.942483,0.972111,0.943399,0.930222,0.931166,0.885828,...,0.907638,0.897741,0.903667,0.949034,0.839838,0.817468,0.878564,0.884267,0.884292,0.869539
omics__bw_fum,0.917715,0.903783,0.967975,1.0,0.956253,0.974362,0.902824,0.916864,0.892215,0.869514,...,0.867798,0.873566,0.850602,0.940838,0.842448,0.820857,0.897118,0.879182,0.874671,0.884392
omics__bw_glc,0.962571,0.952763,0.942483,0.956253,1.0,0.966576,0.885214,0.913576,0.897124,0.932456,...,0.856878,0.85861,0.856949,0.92533,0.867961,0.851744,0.924271,0.932579,0.932818,0.925066


**The `final_data_size` is of course completely arbitrary. But the point is to get rid of some correlated data so it does the trick.**

In [37]:
final_data_size = 30
n_to_prune = cond_corr_mat.shape[1] - final_data_size

In [38]:
np.fill_diagonal(cond_corr_mat.values, np.nan)
to_prune = []
for i in range(n_to_prune):
    tempy = cond_corr_mat.loc[[i for i in cond_corr_mat.columns if i not in to_prune]][[i for i in cond_corr_mat.columns if i not in to_prune]].max()
    to_prune.append(random.choice([tempy.sort_values(ascending=False).index[0],\
                                   tempy.sort_values(ascending=False).index[1]]))
np.fill_diagonal(cond_corr_mat.values, 1.)

**Remove relevant rows from two of the dataframes**

In [39]:
to_keep = [i for i in cond_corr_mat.columns if i not in to_prune]
new_exp_df = new_exp_df[to_keep]
meta_df = meta_df[meta_df['Simple_sample_id'].isin(to_keep)]

**And regenerate the summary dataframe to only use this thinned-down set of samples**

In [40]:
exp_summary_df = new_exp_df.apply(pd.DataFrame.describe, axis=1)
col_listy = []
for col in exp_summary_df.columns:
    if '%' not in col:
        col_listy.append(col)
    else:
        col_listy.append(col.replace('%', '_percentile'))
print(col_listy)
exp_summary_df.columns = col_listy

exp_summary_df['cv'] = exp_summary_df['std']/exp_summary_df['mean']
exp_summary_df['noise'] = exp_summary_df['std'].pow(2)/exp_summary_df['mean'].pow(2)

slopes = []
pearsons = []
spearmans = []
for gene in exp_summary_df.index:
    a, b, c, d, e = stats.linregress(new_exp_df.loc[gene], meta_df['Growth Rate (1/hr)'])
    slopes.append(a)
    pearsons.append(c)
    rho, p = stats.spearmanr(new_exp_df.loc[gene], meta_df['Growth Rate (1/hr)'])
    spearmans.append(rho)
    
exp_summary_df['lin_slope'] = slopes
exp_summary_df['lin_r'] = pearsons
exp_summary_df['spearmans_rho'] = spearmans

['count', 'mean', 'std', 'min', '25_percentile', '50_percentile', '75_percentile', 'max']


In [41]:
print(new_exp_df.shape)
print(meta_df.shape)
print(exp_summary_df.shape)
new_exp_df.to_csv('../Data/processed_data/processed_expression_ecoli_SPARSE.tsv', sep='\t')
meta_df.to_csv('../Data/processed_data/processed_metadata_ecoli_SPARSE.tsv', sep='\t')
exp_summary_df.to_csv('../Data/processed_data/processed_expression_summary_ecoli_SPARSE.tsv', sep='\t')

(3923, 30)
(30, 27)
(3923, 13)


# Scratch