In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from scipy import stats

import statsmodels.formula.api as smf

from matplotlib import pyplot as plt

In [4]:
# Which measurement type to process: 'RNA' or 'PROTEIN'. The value selects the
# expression columns used further down and is embedded in the output file names.
data_to_analyze = 'RNA'
# data_to_analyze = 'PROTEIN'

# Import data

In [5]:
# Open the supplementary workbook once; individual sheets are parsed from this
# handle in the following cells. Note that `df` is a pd.ExcelFile, not a DataFrame.
df = pd.ExcelFile(r'../Data/raw_data/elife-65722-supp1-v2.xlsx')

## Basic metadata clean up and data subsetting

In [6]:
# Per-sample metadata sheet; header=1 takes the column names from the second row of the sheet.
meta_df = df.parse('Supplementary File 1a', header=1)
print(meta_df.shape)
meta_df.head()

(42, 11)


Unnamed: 0,Sample,Nitrogen source,dilution rate (/h),glucose.H2O concentration (g/L),nitrogen source concentration (g/L),limiting nutrient,C/N ratio,experiment subset,total RNA content (g/gDW),total protein content (g/gDW),extra notes
0,1,NH4,0.05,7.5,0.5,N,30,GR,0.021734,0.272515,
1,2,NH4,0.05,7.5,0.5,N,30,GR,0.024941,0.25082,
2,3,NH4,0.05,7.5,0.5,N,30,GR,0.025831,0.261097,
3,4,NH4,0.1,7.5,5.0,C,3,NM,0.053444,0.359674,
4,5,NH4,0.1,7.5,5.0,C,3,NM,0.050594,0.377943,


## And the same for the expression data

In [7]:
# Expression sheet; header=1 takes the column names from the second row and
# index_col=1 uses the second column (shown as "Gene" in the output) as the row index.
exp_df = df.parse('Supplementary File 1b', header=1, index_col=1)
print(exp_df.shape)
exp_df.head()

(3127, 94)


Unnamed: 0_level_0,Accession,protein length,prot.1,prot.2,prot.3,prot.4,prot.5,prot.6,prot.7,prot.8,...,RNA.41,RNA.42,GR.cluster,NM.cluster,GR.Pearson.r,NM.Pearson.r,all.Pearson.r,GR.Spearman.rho,NM.Spearman.rho,all.Spearman.rho
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAL1,D6W196,494,4.840234,4.240349,4.525756,8.088604,7.2658,7.831575,4.735927,4.787078,...,0.079206,0.096258,3,1,0.895107,0.105455,0.275134,0.923633,-0.064103,0.273317
CET1,O13297,549,157.097283,156.674508,169.097093,198.678218,219.735915,186.590269,117.466448,136.670311,...,0.098678,0.115185,3,1,0.890444,0.08957,0.351427,0.828689,-0.037241,0.252897
FOB1,O13329,566,1.534616,1.24527,1.309993,1.551638,1.723677,1.545066,0.76684,1.199747,...,0.06963,0.092075,1,1,0.850031,-0.224327,0.63686,0.874097,-0.161783,0.343165
RPS9A,O13516,197,11824.166906,9997.847519,9111.935208,15070.518625,14398.262706,16898.705557,8990.481368,9073.912674,...,0.389067,0.616458,1,1,0.952229,0.26534,0.822659,0.931889,0.03663,0.434892
COQ4,O13525,335,86.217705,83.954237,0.0,80.682606,88.779863,79.627994,79.29722,70.707023,...,0.206854,0.186202,2,1,-0.025761,-0.362301,-0.261312,0.105811,-0.331294,-0.295012


**Select the columns of interest, normalize each sample to its column total, and log-transform**

In [8]:
# Keep only the per-sample columns of the requested data type, ordered like the
# rows of meta_df. An unknown data type is rejected here rather than failing
# later with an unrelated column-length error.
column_prefixes = {'PROTEIN': 'prot', 'RNA': 'RNA'}
if data_to_analyze not in column_prefixes:
    raise ValueError('data_to_analyze must be one of {}, got {!r}'.format(
        sorted(column_prefixes), data_to_analyze))
column_prefix = column_prefixes[data_to_analyze]
# .copy() gives an independent frame so the modifications below never act on a
# view of the parsed sheet (avoids SettingWithCopyWarning).
exp_df = exp_df[['{}.{}'.format(column_prefix, i) for i in meta_df['Sample']]].copy()

# Zeros cannot be log-transformed; if any are present, substitute the smallest
# strictly positive value found anywhere in the table.
current_minimum = exp_df.min().min()
print('Smallest initial value found:', current_minimum)
if current_minimum == 0.:
    print('Making minor adjustments to minimum values to play nice with zeros')
    nonzero_minimum = exp_df[exp_df>0.].min().min()
    exp_df = exp_df.replace(0., nonzero_minimum)
    print('New minimum value is {}'.format(nonzero_minimum))
else:
    print('No replacement of / dealing with zeros necessary')
print(exp_df.shape)

# Relabel the columns with the bare sample numbers from the metadata.
exp_df.columns = list(meta_df['Sample'])
assert list(meta_df['Sample']) == list(exp_df.columns)

# Express every value as a fraction of its sample (column) total, then take the
# natural log. Dividing by exp_df.sum() aligns on column labels, so each column
# is divided by its own total. (Drop the np.log call for the non-log variant.)
exp_df = np.log(exp_df / exp_df.sum())

print(exp_df.shape)
exp_df.head()

Smallest initial value found: 0.0218306749020062
No replacement of / dealing with zeros necessary
(3127, 42)
(3127, 42)


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,33,34,35,36,37,38,39,40,41,42
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAL1,-9.466213,-9.623964,-9.499459,-9.488138,-9.463457,-9.436847,-9.360202,-9.353389,-9.363357,-9.289401,...,-9.276378,-9.252359,-9.394205,-9.251105,-9.292674,-9.328147,-9.333018,-9.384583,-9.391602,-9.315278
CET1,-9.174732,-9.245791,-9.086542,-9.117258,-9.164575,-9.135901,-9.120377,-9.01656,-9.099182,-8.937224,...,-9.302579,-9.158626,-9.20775,-9.174623,-9.185647,-9.164998,-9.102172,-9.121368,-9.171787,-9.13577
FOB1,-9.451646,-9.868206,-9.315312,-9.370994,-9.299975,-9.382131,-9.261659,-9.310037,-9.311688,-9.214607,...,-9.639062,-9.324432,-9.334874,-9.471322,-9.233043,-9.24907,-9.318233,-9.268187,-9.520448,-9.359702
RPS9A,-7.656629,-8.882885,-7.662886,-7.500015,-7.456057,-7.500801,-7.485037,-7.468796,-7.40453,-7.290969,...,-7.908027,-7.658115,-7.606991,-7.672596,-7.371383,-7.430768,-7.514734,-7.356361,-7.799897,-7.458315
COQ4,-8.566159,-7.897785,-8.446632,-8.444021,-8.546427,-8.435599,-8.654467,-8.59824,-8.59191,-8.715096,...,-8.569254,-8.625919,-8.691886,-8.655975,-8.771559,-8.738678,-8.580202,-8.736154,-8.431636,-8.655472


In [9]:
# Sanity check: no two samples may have identical expression profiles.
# Transposing turns every sample into a row, so `duplicated` compares whole
# profiles in one vectorized pass instead of a Python-level pairwise loop.
# Unlike an element-wise list comparison, this also flags identical columns
# that contain missing values.
duplicated_samples = exp_df.T.duplicated(keep=False)
assert not duplicated_samples.any(), \
    'Duplicate expression columns found: {}'.format(
        list(duplicated_samples[duplicated_samples].index))

# Averaging between replicates to clean up the data

## First, average gene expression across replicates and store the result in a new `DataFrame`

In [10]:
# Metadata columns whose values jointly identify an experimental condition;
# samples that agree on all four end up in the same replicate group.
condition_cols = ['Nitrogen source',
                  'dilution rate (/h)',
                  'glucose.H2O concentration (g/L)',
                  'nitrogen source concentration (g/L)']

# Maps "<N source>_<dilution rate>_<glucose conc>_<N conc>" to the list of
# sample numbers recorded under that condition, in metadata row order.
replicate_groups = {}
for row_label in meta_df.index:
    row = meta_df.loc[row_label]
    condition_key = '{}_{}_{}_{}'.format(*row[condition_cols])
    replicate_groups.setdefault(condition_key, []).append(row['Sample'])

for condition_key, sample_ids in replicate_groups.items():
    print(condition_key, sample_ids)

NH4_0.05_7.5_0.5 [1, 2, 3]
NH4_0.1_7.5_5.0 [4, 5, 6]
NH4_0.1_7.5_0.5 [7, 8, 9]
NH4_0.13_7.5_0.5 [10, 11, 12]
NH4_0.18_7.5_0.5 [13, 14, 15]
NH4_0.3_7.5_0.5 [16, 17, 18]
NH4_0.35_7.5_0.5 [19, 20, 21]
Gln_0.1_7.5_5.53 [22, 23, 24]
Gln_0.1_1.21_5.53 [25, 26, 27]
Gln_0.1_6.87_0.55 [28, 29, 30]
Phe_0.1_7.5_12.35 [31, 32, 33]
Phe_0.1_7.5_1.24 [34, 35, 36]
Ile_0.1_7.5_9.81 [37, 38, 39]
Ile_0.1_7.5_0.98 [40, 41, 42]


In [11]:
# One column per replicate group: the row-wise mean over that group's sample
# columns. exp_df already holds log-transformed values at this point, so this
# is a mean of logs.
new_exp_df = pd.DataFrame({
    condition_key: exp_df[list(sample_ids)].mean(axis=1)
    for condition_key, sample_ids in replicate_groups.items()
})
print(new_exp_df.shape)
new_exp_df.head()

(3127, 14)


Unnamed: 0_level_0,NH4_0.05_7.5_0.5,NH4_0.1_7.5_5.0,NH4_0.1_7.5_0.5,NH4_0.13_7.5_0.5,NH4_0.18_7.5_0.5,NH4_0.3_7.5_0.5,NH4_0.35_7.5_0.5,Gln_0.1_7.5_5.53,Gln_0.1_1.21_5.53,Gln_0.1_6.87_0.55,Phe_0.1_7.5_12.35,Phe_0.1_7.5_1.24,Ile_0.1_7.5_9.81,Ile_0.1_7.5_0.98
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
SAL1,-9.529879,-9.462814,-9.358983,-9.34931,-9.222958,-9.351982,-9.339676,-9.418705,-9.529775,-9.331296,-9.294119,-9.299223,-9.317947,-9.363821
CET1,-9.169022,-9.139245,-9.078706,-9.031129,-9.07117,-9.227227,-9.302235,-9.106038,-9.220858,-9.136797,-9.2768,-9.180333,-9.150939,-9.142975
FOB1,-9.545055,-9.351033,-9.294461,-9.243162,-9.149694,-9.159748,-9.130156,-9.3708,-9.551954,-9.412692,-9.515242,-9.376876,-9.266782,-9.382779
RPS9A,-8.067467,-7.485624,-7.452787,-7.326448,-7.246923,-7.15872,-7.136058,-7.483618,-7.766808,-7.592273,-7.685154,-7.645901,-7.438962,-7.538191
COQ4,-8.303525,-8.475349,-8.614872,-8.736973,-8.886612,-9.258626,-9.339141,-8.471156,-8.456111,-8.633659,-8.572922,-8.657927,-8.696813,-8.607754


## Collapse the metadata to one row per replicate group, averaging the measured RNA and protein content

In [12]:
# Tag every metadata row with the label of the replicate group it belongs to.
for group_label, sample_ids in replicate_groups.items():
    for sample_id in sample_ids:
        matching_rows = meta_df.index[meta_df['Sample'] == sample_id]
        # Each sample number must identify exactly one metadata row.
        assert len(matching_rows) == 1
        meta_df.at[matching_rows[0], 'Simple_sample_id'] = group_label
print(meta_df.shape)
meta_df.head()

(42, 12)


Unnamed: 0,Sample,Nitrogen source,dilution rate (/h),glucose.H2O concentration (g/L),nitrogen source concentration (g/L),limiting nutrient,C/N ratio,experiment subset,total RNA content (g/gDW),total protein content (g/gDW),extra notes,Simple_sample_id
0,1,NH4,0.05,7.5,0.5,N,30,GR,0.021734,0.272515,,NH4_0.05_7.5_0.5
1,2,NH4,0.05,7.5,0.5,N,30,GR,0.024941,0.25082,,NH4_0.05_7.5_0.5
2,3,NH4,0.05,7.5,0.5,N,30,GR,0.025831,0.261097,,NH4_0.05_7.5_0.5
3,4,NH4,0.1,7.5,5.0,C,3,NM,0.053444,0.359674,,NH4_0.1_7.5_5.0
4,5,NH4,0.1,7.5,5.0,C,3,NM,0.050594,0.377943,,NH4_0.1_7.5_5.0


In [13]:
print(meta_df.shape)
group_cols = ['Simple_sample_id']
metric_cols_a = ['total RNA content (g/gDW)', 'total protein content (g/gDW)']
# Mean RNA / protein content per replicate group (indexed by group label;
# groupby sorts these labels alphabetically).
aggs_a = meta_df.groupby(group_cols)[metric_cols_a].mean()

# Keep the first metadata row of every group, replace the per-sample
# measurements with the group means, and attach them by group label.
# how='left' keeps the rows in their original metadata order, which the later
# order assertion against new_exp_df.columns relies on; every group appears on
# both sides, so the set of rows is the same as with a right join.
meta_df = (
    meta_df
    .drop(metric_cols_a, axis=1)
    .drop_duplicates(subset=group_cols, keep='first')
    .merge(right=aggs_a, right_index=True, left_on=group_cols, how='left')
)
print(meta_df.shape)
meta_df.head()

(42, 12)
(14, 12)


Unnamed: 0,Sample,Nitrogen source,dilution rate (/h),glucose.H2O concentration (g/L),nitrogen source concentration (g/L),limiting nutrient,C/N ratio,experiment subset,extra notes,Simple_sample_id,total RNA content (g/gDW),total protein content (g/gDW)
0,1,NH4,0.05,7.5,0.5,N,30,GR,,NH4_0.05_7.5_0.5,0.024169,0.261477
3,4,NH4,0.1,7.5,5.0,C,3,NM,,NH4_0.1_7.5_5.0,0.05,0.358024
6,7,NH4,0.1,7.5,0.5,N,30,"GR, NM",,NH4_0.1_7.5_0.5,0.025297,0.225573
9,10,NH4,0.13,7.5,0.5,N,30,GR,,NH4_0.13_7.5_0.5,0.025831,0.275686
12,13,NH4,0.18,7.5,0.5,N,30,GR,,NH4_0.18_7.5_0.5,0.038539,0.28647


In [14]:
# Doubling time from the dilution rate: ln(2) / rate.
meta_df['Doubling_time'] = np.log(2)/meta_df['dilution rate (/h)']
# The dilution rate is copied unchanged into a growth-rate column.
meta_df['Growth Rate (1/hr)'] = meta_df['dilution rate (/h)']
# Metadata rows must be in the same order as the expression columns, because
# the per-gene statistics computed later pair the two by position.
assert list(meta_df['Simple_sample_id']) == list(new_exp_df.columns)

# Construct a third `dataframe` containing expression data summary stats

In [15]:
# Per-gene descriptive statistics (count, mean, std, min, quartiles, max)
# computed across the condition columns of each row.
exp_summary_df = new_exp_df.apply(lambda gene_row: gene_row.describe(), axis=1)

In [16]:
# Rename the quartile columns ('25%' -> '25_percentile', ...); labels without
# a '%' pass through str.replace unchanged.
col_listy = [col.replace('%', '_percentile') for col in exp_summary_df.columns]
print(col_listy)
exp_summary_df.columns = col_listy

['count', 'mean', 'std', 'min', '25_percentile', '50_percentile', '75_percentile', 'max']


In [17]:
# Coefficient of variation (std / mean) and "noise" (std^2 / mean^2) per gene.
# NOTE(review): with the log-transform enabled upstream the per-gene means are
# negative, so 'cv' comes out negative - confirm this is intended.
summary_std = exp_summary_df['std']
summary_mean = exp_summary_df['mean']
exp_summary_df['cv'] = summary_std / summary_mean
exp_summary_df['noise'] = (summary_std ** 2) / (summary_mean ** 2)

In [18]:
# Relate each gene's expression profile to the dilution rate across conditions.
# Expression is passed as x and dilution rate as y, so 'lin_slope' is the
# change in dilution rate per unit of (log) expression; the Pearson r and
# Spearman rho do not depend on that ordering.
dilution_rate = meta_df['dilution rate (/h)']

slopes = []
pearsons = []
spearmans = []
for gene in exp_summary_df.index:
    expression = new_exp_df.loc[gene]
    fit = stats.linregress(expression, dilution_rate)
    slopes.append(fit.slope)
    pearsons.append(fit.rvalue)
    rho, _ = stats.spearmanr(expression, dilution_rate, nan_policy='omit')
    spearmans.append(rho)

exp_summary_df['lin_slope'] = slopes
exp_summary_df['lin_r'] = pearsons
exp_summary_df['spearmans_rho'] = spearmans

In [20]:
# Display the per-condition metadata table (one row per replicate group).
meta_df

Unnamed: 0,Sample,Nitrogen source,dilution rate (/h),glucose.H2O concentration (g/L),nitrogen source concentration (g/L),limiting nutrient,C/N ratio,experiment subset,extra notes,Simple_sample_id,total RNA content (g/gDW),total protein content (g/gDW),Doubling_time,Growth Rate (1/hr)
0,1,NH4,0.05,7.5,0.5,N,30,GR,,NH4_0.05_7.5_0.5,0.024169,0.261477,13.862944,0.05
3,4,NH4,0.1,7.5,5.0,C,3,NM,,NH4_0.1_7.5_5.0,0.05,0.358024,6.931472,0.1
6,7,NH4,0.1,7.5,0.5,N,30,"GR, NM",,NH4_0.1_7.5_0.5,0.025297,0.225573,6.931472,0.1
9,10,NH4,0.13,7.5,0.5,N,30,GR,,NH4_0.13_7.5_0.5,0.025831,0.275686,5.331901,0.13
12,13,NH4,0.18,7.5,0.5,N,30,GR,,NH4_0.18_7.5_0.5,0.038539,0.28647,3.850818,0.18
15,16,NH4,0.3,7.5,0.5,N,30,GR,,NH4_0.3_7.5_0.5,0.048219,0.414481,2.310491,0.3
18,19,NH4,0.35,7.5,0.5,N,30,GR,,NH4_0.35_7.5_0.5,0.077494,0.494314,1.980421,0.35
21,22,Gln,0.1,7.5,5.53,C,5,NM,"labeled ""Q"" or ""Gln""",Gln_0.1_7.5_5.53,0.064727,0.558604,6.931472,0.1
24,25,Gln,0.1,1.21,5.53,C,3,NM,"labeled ""Q*"" or ""Gln*""",Gln_0.1_1.21_5.53,0.054038,0.387712,6.931472,0.1
27,28,Gln,0.1,6.87,0.55,N,30,NM,,Gln_0.1_6.87_0.55,0.02215,0.306769,6.931472,0.1


# Save some files

In [34]:
# Frames to persist, each paired with its output path template; the template
# placeholder is filled with the data type chosen at the top of the notebook.
frames_to_save = [
    (new_exp_df, '../Data/processed_data/processed_expression_scer_{}.tsv'),
    (meta_df, '../Data/processed_data/processed_metadata_scer_{}.tsv'),
    (exp_summary_df, '../Data/processed_data/processed_expression_summary_scer_{}.tsv'),
]

# Report all shapes first, then write the tab-separated files.
for frame, _ in frames_to_save:
    print(frame.shape)
for frame, path_template in frames_to_save:
    frame.to_csv(path_template.format(data_to_analyze), sep='\t')

(3127, 14)
(14, 14)
(3127, 13)


# Get a thinned down set as a robustness check

Since conditions are correlated with one another, I'm constructing a more sparsely populated dataset where inter-condition correlations are minimized using a greedy algorithm. 

The algorithm works as follows:
1. Find which two conditions are the most highly correlated across the entire all-to-all correlation matrix
2. Randomly delete one of the two conditions in question
3. Repeat from step 1 using the remaining conditions
4. Stop after reaching a pre-defined final dataset size

In [None]:
import random

In [None]:
# Spearman correlation between every pair of condition columns of new_exp_df.
cond_corr_mat = new_exp_df.corr(method='spearman')
cond_corr_mat.head()

In [None]:
# Target number of conditions to keep in the thinned dataset.
# NOTE(review): the averaged expression table in this notebook has 14
# condition columns, so a target of 30 removes nothing - choose a value
# below the number of conditions for the thinning to have any effect.
final_data_size = 30
n_conditions = cond_corr_mat.shape[1]
# Never request a negative number of removals; make the no-op case visible.
n_to_prune = max(0, n_conditions - final_data_size)
if n_to_prune == 0:
    print('WARNING: final_data_size ({}) is not smaller than the number of conditions ({}); '
          'no conditions will be pruned'.format(final_data_size, n_conditions))

In [None]:
# Greedy thinning: repeatedly locate the most highly correlated pair among the
# conditions still present and drop one member of the pair at random.
# A dedicated, seeded generator makes the selection reproducible across runs
# without touching the global `random` state.
prune_rng = random.Random(0)

condition_labels = list(cond_corr_mat.columns)
# Work on a private copy so cond_corr_mat itself is never modified; the
# diagonal (self-correlation) is masked so it cannot be picked as a "pair".
masked_corr = cond_corr_mat.to_numpy(dtype=float, copy=True)
np.fill_diagonal(masked_corr, np.nan)

to_prune = []
for _ in range(n_to_prune):
    if np.isnan(masked_corr).all():
        raise ValueError('Cannot prune {} conditions: fewer than two conditions remain '
                         'after removing {}'.format(n_to_prune, len(to_prune)))
    # Position of the largest remaining correlation -> the two conditions involved.
    pair = np.unravel_index(np.nanargmax(masked_corr), masked_corr.shape)
    dropped = prune_rng.choice([int(position) for position in pair])
    to_prune.append(condition_labels[dropped])
    # Mask the dropped condition so it is ignored in later iterations.
    masked_corr[dropped, :] = np.nan
    masked_corr[:, dropped] = np.nan

In [None]:
# Conditions that survived pruning, in their original column order.
pruned_conditions = set(to_prune)
to_keep = [condition for condition in cond_corr_mat.columns
           if condition not in pruned_conditions]

# Restrict both the expression table and the metadata to the kept conditions.
# Both names are rebound here, so from this point on they refer to the thinned data.
new_exp_df = new_exp_df[to_keep]
kept_rows = meta_df['Simple_sample_id'].isin(to_keep)
meta_df = meta_df[kept_rows]

In [None]:
# Recompute the per-gene summary table for the thinned dataset, using the same
# steps as for the full dataset.

# Descriptive statistics across the remaining condition columns of each gene.
exp_summary_df = new_exp_df.apply(lambda gene_row: gene_row.describe(), axis=1)
# Rename the quartile columns ('25%' -> '25_percentile', ...).
col_listy = [col.replace('%', '_percentile') for col in exp_summary_df.columns]
print(col_listy)
exp_summary_df.columns = col_listy

# Coefficient of variation (std / mean) and "noise" (std^2 / mean^2) per gene.
exp_summary_df['cv'] = exp_summary_df['std']/exp_summary_df['mean']
exp_summary_df['noise'] = exp_summary_df['std'].pow(2)/exp_summary_df['mean'].pow(2)

# Relate each gene's expression profile to growth rate across conditions.
# Expression is passed as x and growth rate as y, so 'lin_slope' is the change
# in growth rate per unit of (log) expression.
growth_rate = meta_df['Growth Rate (1/hr)']
slopes = []
pearsons = []
spearmans = []
for gene in exp_summary_df.index:
    expression = new_exp_df.loc[gene]
    fit = stats.linregress(expression, growth_rate)
    slopes.append(fit.slope)
    pearsons.append(fit.rvalue)
    # nan_policy='omit' matches the full-dataset computation, so both summary
    # tables treat missing values the same way.
    rho, _ = stats.spearmanr(expression, growth_rate, nan_policy='omit')
    spearmans.append(rho)

exp_summary_df['lin_slope'] = slopes
exp_summary_df['lin_r'] = pearsons
exp_summary_df['spearmans_rho'] = spearmans

In [None]:
# Thinned frames paired with their output paths.
# NOTE(review): unlike the full-dataset outputs, these file names do not include
# data_to_analyze, so an RNA run and a PROTEIN run write to the same files -
# confirm whether that is intended.
sparse_outputs = [
    (new_exp_df, '../Data/processed_data/processed_expression_SPARSE.tsv'),
    (meta_df, '../Data/processed_data/processed_metadata_SPARSE.tsv'),
    (exp_summary_df, '../Data/processed_data/processed_expression_summary_SPARSE.tsv'),
]

# Report all shapes first, then write the tab-separated files.
for frame, _ in sparse_outputs:
    print(frame.shape)
for frame, output_path in sparse_outputs:
    frame.to_csv(output_path, sep='\t')

# Scratch