In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from scipy import stats

import statsmodels.formula.api as smf

from matplotlib import pyplot as plt

**This dataset contains two measurement types (RNA and protein), so the notebook is meant to be run once for each of them separately**

To do this, make sure exactly one of the two lines below is uncommented and run all the code. Then switch which line is commented and re-run the whole notebook.

In [3]:
# Which measurement type this run processes. The value picks the matching
# columns of the expression sheet and is embedded in the output file names.
# Swap in the alternative below to process the other data type:
# data_to_analyze = "PROTEIN"
data_to_analyze = "RNA"

# Import data

This data comes from here: https://elifesciences.org/articles/65722/figures

In [4]:
# Open the supplementary workbook once; individual sheets are parsed from this
# handle in the cells below. Note that `df` is a pd.ExcelFile, not a DataFrame.
raw_workbook_path = r'../Data/raw_data/elife-65722-supp1-v2.xlsx'
df = pd.ExcelFile(raw_workbook_path)

## Basic metadata clean up and data subsetting

In [5]:
# Sample metadata sheet: one row per sample. header=1 takes the column names
# from the sheet's second row.
metadata_sheet = 'Supplementary File 1a'
meta_df = df.parse(metadata_sheet, header=1)
print(meta_df.shape)
meta_df.head()

(42, 11)


Unnamed: 0,Sample,Nitrogen source,dilution rate (/h),glucose.H2O concentration (g/L),nitrogen source concentration (g/L),limiting nutrient,C/N ratio,experiment subset,total RNA content (g/gDW),total protein content (g/gDW),extra notes
0,1,NH4,0.05,7.5,0.5,N,30,GR,0.021734,0.272515,
1,2,NH4,0.05,7.5,0.5,N,30,GR,0.024941,0.25082,
2,3,NH4,0.05,7.5,0.5,N,30,GR,0.025831,0.261097,
3,4,NH4,0.1,7.5,5.0,C,3,NM,0.053444,0.359674,
4,5,NH4,0.1,7.5,5.0,C,3,NM,0.050594,0.377943,


## And the same for the expression data

In [6]:
# Expression sheet: one row per gene. header=1 takes the column names from the
# sheet's second row and index_col=1 uses the sheet's second column as the row index.
expression_sheet = 'Supplementary File 1b'
exp_df = df.parse(expression_sheet, header=1, index_col=1)
print(exp_df.shape)
exp_df.head()

(3127, 94)


Unnamed: 0_level_0,Accession,protein length,prot.1,prot.2,prot.3,prot.4,prot.5,prot.6,prot.7,prot.8,...,RNA.41,RNA.42,GR.cluster,NM.cluster,GR.Pearson.r,NM.Pearson.r,all.Pearson.r,GR.Spearman.rho,NM.Spearman.rho,all.Spearman.rho
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAL1,D6W196,494,4.840234,4.240349,4.525756,8.088604,7.2658,7.831575,4.735927,4.787078,...,0.079206,0.096258,3,1,0.895107,0.105455,0.275134,0.923633,-0.064103,0.273317
CET1,O13297,549,157.097283,156.674508,169.097093,198.678218,219.735915,186.590269,117.466448,136.670311,...,0.098678,0.115185,3,1,0.890444,0.08957,0.351427,0.828689,-0.037241,0.252897
FOB1,O13329,566,1.534616,1.24527,1.309993,1.551638,1.723677,1.545066,0.76684,1.199747,...,0.06963,0.092075,1,1,0.850031,-0.224327,0.63686,0.874097,-0.161783,0.343165
RPS9A,O13516,197,11824.166906,9997.847519,9111.935208,15070.518625,14398.262706,16898.705557,8990.481368,9073.912674,...,0.389067,0.616458,1,1,0.952229,0.26534,0.822659,0.931889,0.03663,0.434892
COQ4,O13525,335,86.217705,83.954237,0.0,80.682606,88.779863,79.627994,79.29722,70.707023,...,0.206854,0.186202,2,1,-0.025761,-0.362301,-0.261312,0.105811,-0.331294,-0.295012


**Select the columns of interest, normalize so that values are ~molecules per million, and then log-scale**

The only tricky thing here is dealing with zero values (these only exist in the protein dataset). I considered a number of solutions; ideally we'd drop them, but for a lot of things this becomes a huge pain. Instead, we can just set all 0 values to the smallest non-zero value observed anywhere in the dataset. Poking around, this didn't seem to affect things very much.

In [7]:
# Scale factor that turns per-sample fractions into parts per million.
million = 1.0e6

In [8]:
# Select the measurement columns for the chosen data type, replace zeros if
# any are present, then convert every sample (column) to natural-log parts per million.
#
# NOTE: this cell rebinds `exp_df` and renames its columns, so it has to be run
# on the freshly parsed expression sheet; re-run the parsing cell before
# re-running this one.

# Fail loudly on an unrecognised setting instead of silently carrying every
# column of the raw sheet forward.
column_prefixes = {'PROTEIN': 'prot', 'RNA': 'RNA'}
if data_to_analyze not in column_prefixes:
    raise ValueError(
        "data_to_analyze must be 'RNA' or 'PROTEIN', got {!r}".format(data_to_analyze))
column_prefix = column_prefixes[data_to_analyze]

# One column per sample, in metadata order. The .copy() makes the result an
# independent frame, so the edits below never act on a slice of the parsed sheet.
exp_df = exp_df[['{}.{}'.format(column_prefix, i) for i in meta_df['Sample']]].copy()

#####
#####Dealing with zero values here, if necessary
current_minimum = exp_df.min().min()
print('Smallest initial value found:', current_minimum)
if current_minimum == 0.:
    print('Making minor adjustments to minimum values to play nice with zeros')
    # Smallest strictly positive value anywhere in the table stands in for the
    # zeros, so the log below is defined for those entries.
    nonzero_minimum = exp_df[exp_df > 0.].min().min()
    exp_df = exp_df.replace(0., nonzero_minimum)
    print('New minimum value is {}'.format(nonzero_minimum))
else:
    print('No replacement of / dealing with zeros necessary')
print(exp_df.shape)
#####
#####

# Relabel the columns with the bare sample numbers used in the metadata.
exp_df.columns = list(meta_df['Sample'])
assert list(meta_df['Sample']) == list(exp_df.columns)

# Divide each column by its own total, scale to parts per million, and take the
# natural log -- all columns at once instead of one column per loop iteration.
column_totals = exp_df.sum(axis=0)
exp_df = np.log(exp_df.div(column_totals, axis='columns') * million)

print(exp_df.shape)
exp_df.head()

Smallest initial value found: 0.0218306749020062
No replacement of / dealing with zeros necessary
(3127, 42)
(3127, 42)


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,33,34,35,36,37,38,39,40,41,42
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAL1,4.349298,4.191546,4.316052,4.327372,4.352053,4.378664,4.455308,4.462122,4.452153,4.52611,...,4.539132,4.563151,4.421305,4.564405,4.522836,4.487363,4.482492,4.430928,4.423909,4.500233
CET1,4.640779,4.56972,4.728969,4.698252,4.650936,4.67961,4.695134,4.798951,4.716328,4.878287,...,4.512932,4.656884,4.607761,4.640888,4.629864,4.650512,4.713339,4.694143,4.643723,4.679741
FOB1,4.363864,3.947304,4.500199,4.444517,4.515536,4.433379,4.553852,4.505473,4.503823,4.600904,...,4.176448,4.491079,4.480637,4.344189,4.582468,4.566441,4.497277,4.547323,4.295063,4.455809
RPS9A,6.158881,4.932626,6.152624,6.315496,6.359453,6.31471,6.330474,6.346715,6.410981,6.524541,...,5.907484,6.157396,6.20852,6.142914,6.444128,6.384742,6.300777,6.45915,6.015614,6.357195
COQ4,5.249352,5.917725,5.368879,5.371489,5.269084,5.379912,5.161043,5.217271,5.223601,5.100415,...,5.246257,5.189591,5.123624,5.159536,5.043952,5.076833,5.235309,5.079357,5.383874,5.160038


In [9]:
###Sanity check (slow, pairwise): no two sample columns may hold exactly the same values
sample_columns = list(exp_df.columns)
for position, first_col in enumerate(sample_columns):
    first_values = list(exp_df[first_col])
    for second_col in sample_columns[position + 1:]:
        assert first_values != list(exp_df[second_col])

# Averaging between replicates to clean up the data

## First getting gene expression averages between replicates and creating a new `dataframe`

In [10]:
# Group sample numbers into replicate sets: samples whose nitrogen source,
# dilution rate, glucose concentration and nitrogen-source concentration all
# match share a single identifier string.
condition_cols = ['Nitrogen source',
                  'dilution rate (/h)',
                  'glucose.H2O concentration (g/L)',
                  'nitrogen source concentration (g/L)']

replicate_groups = {}
for _, sample_row in meta_df.iterrows():
    ###Unique identifier for this sample's growth condition
    condition_id = '{}_{}_{}_{}'.format(*sample_row[condition_cols])
    # setdefault starts a new list the first time a condition is seen
    replicate_groups.setdefault(condition_id, []).append(sample_row['Sample'])

for condition_id, sample_numbers in replicate_groups.items():
    print(condition_id, sample_numbers)

NH4_0.05_7.5_0.5 [1, 2, 3]
NH4_0.1_7.5_5.0 [4, 5, 6]
NH4_0.1_7.5_0.5 [7, 8, 9]
NH4_0.13_7.5_0.5 [10, 11, 12]
NH4_0.18_7.5_0.5 [13, 14, 15]
NH4_0.3_7.5_0.5 [16, 17, 18]
NH4_0.35_7.5_0.5 [19, 20, 21]
Gln_0.1_7.5_5.53 [22, 23, 24]
Gln_0.1_1.21_5.53 [25, 26, 27]
Gln_0.1_6.87_0.55 [28, 29, 30]
Phe_0.1_7.5_12.35 [31, 32, 33]
Phe_0.1_7.5_1.24 [34, 35, 36]
Ile_0.1_7.5_9.81 [37, 38, 39]
Ile_0.1_7.5_0.98 [40, 41, 42]


In [11]:
# One column per replicate group: the row-wise (per-gene) mean of that group's
# sample columns, with columns in the order the groups were recorded.
new_exp_df = pd.DataFrame({
    group_id: exp_df[list(sample_numbers)].mean(axis=1)
    for group_id, sample_numbers in replicate_groups.items()
})
print(new_exp_df.shape)
new_exp_df.head()

(3127, 14)


Unnamed: 0_level_0,NH4_0.05_7.5_0.5,NH4_0.1_7.5_5.0,NH4_0.1_7.5_0.5,NH4_0.13_7.5_0.5,NH4_0.18_7.5_0.5,NH4_0.3_7.5_0.5,NH4_0.35_7.5_0.5,Gln_0.1_7.5_5.53,Gln_0.1_1.21_5.53,Gln_0.1_6.87_0.55,Phe_0.1_7.5_12.35,Phe_0.1_7.5_1.24,Ile_0.1_7.5_9.81,Ile_0.1_7.5_0.98
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
SAL1,4.285632,4.352696,4.456528,4.466201,4.592553,4.463528,4.475834,4.396805,4.285736,4.484215,4.521392,4.516287,4.497564,4.45169
CET1,4.646489,4.676266,4.736804,4.784382,4.744341,4.588284,4.513276,4.709473,4.594653,4.678713,4.53871,4.635178,4.664572,4.672536
FOB1,4.270456,4.464477,4.521049,4.572348,4.665816,4.655762,4.685354,4.444711,4.263557,4.402818,4.300268,4.438635,4.548728,4.432732
RPS9A,5.748044,6.329886,6.362723,6.489063,6.568587,6.656791,6.679452,6.331893,6.048702,6.223237,6.130356,6.16961,6.376549,6.27732
COQ4,5.511985,5.340162,5.200638,5.078538,4.928899,4.556884,4.476369,5.344354,5.359399,5.181852,5.242588,5.157584,5.118698,5.207757


## Average the total RNA and protein content across these replicates in the metadata as well

In [12]:
# Label every metadata row with the identifier of the replicate group that its
# sample number belongs to.
sample_to_group = {}
for group_id, sample_numbers in replicate_groups.items():
    for numeric_id in sample_numbers:
        # every sample number must identify exactly one metadata row
        assert (meta_df['Sample'] == numeric_id).sum() == 1
        sample_to_group[numeric_id] = group_id
meta_df['Simple_sample_id'] = meta_df['Sample'].map(sample_to_group)
print(meta_df.shape)
meta_df.head()

(42, 12)


Unnamed: 0,Sample,Nitrogen source,dilution rate (/h),glucose.H2O concentration (g/L),nitrogen source concentration (g/L),limiting nutrient,C/N ratio,experiment subset,total RNA content (g/gDW),total protein content (g/gDW),extra notes,Simple_sample_id
0,1,NH4,0.05,7.5,0.5,N,30,GR,0.021734,0.272515,,NH4_0.05_7.5_0.5
1,2,NH4,0.05,7.5,0.5,N,30,GR,0.024941,0.25082,,NH4_0.05_7.5_0.5
2,3,NH4,0.05,7.5,0.5,N,30,GR,0.025831,0.261097,,NH4_0.05_7.5_0.5
3,4,NH4,0.1,7.5,5.0,C,3,NM,0.053444,0.359674,,NH4_0.1_7.5_5.0
4,5,NH4,0.1,7.5,5.0,C,3,NM,0.050594,0.377943,,NH4_0.1_7.5_5.0


In [13]:
# Collapse the metadata to one row per replicate group: keep the first row of
# each group for the descriptive columns and replace the two measured content
# columns with their replicate means.
print(meta_df.shape)
group_cols = ['Simple_sample_id']
metric_cols_a = ['total RNA content (g/gDW)', 'total protein content (g/gDW)']
# sort=False keeps the groups in order of first appearance rather than sorting
# the identifiers alphabetically.
aggs_a = meta_df.groupby(group_cols, sort=False)[metric_cols_a].mean()

# how='left' preserves the row order of the de-duplicated metadata. A later
# cell asserts that this order matches the columns of the averaged expression
# table, so the order must not depend on how a right join arranges its keys.
meta_df = (
    meta_df
    .drop(columns=metric_cols_a)
    .drop_duplicates(subset=group_cols, keep='first')
    .merge(right=aggs_a, right_index=True, left_on=group_cols, how='left')
)
print(meta_df.shape)
meta_df.head()

(42, 12)
(14, 12)


Unnamed: 0,Sample,Nitrogen source,dilution rate (/h),glucose.H2O concentration (g/L),nitrogen source concentration (g/L),limiting nutrient,C/N ratio,experiment subset,extra notes,Simple_sample_id,total RNA content (g/gDW),total protein content (g/gDW)
0,1,NH4,0.05,7.5,0.5,N,30,GR,,NH4_0.05_7.5_0.5,0.024169,0.261477
3,4,NH4,0.1,7.5,5.0,C,3,NM,,NH4_0.1_7.5_5.0,0.05,0.358024
6,7,NH4,0.1,7.5,0.5,N,30,"GR, NM",,NH4_0.1_7.5_0.5,0.025297,0.225573
9,10,NH4,0.13,7.5,0.5,N,30,GR,,NH4_0.13_7.5_0.5,0.025831,0.275686
12,13,NH4,0.18,7.5,0.5,N,30,GR,,NH4_0.18_7.5_0.5,0.038539,0.28647


In [14]:
# Doubling time is ln(2) divided by the dilution rate; the growth-rate column
# is a copy of the dilution rate under a second name.
dilution_rate = meta_df['dilution rate (/h)']
meta_df['Doubling_time'] = np.log(2) / dilution_rate
meta_df['Growth Rate (1/hr)'] = dilution_rate
# Metadata rows must line up, in order, with the columns of the averaged expression table.
assert list(new_exp_df.columns) == list(meta_df['Simple_sample_id'])

# Construct a third `dataframe` containing expression data summary stats

In [15]:
# Row-wise describe(): one row of summary statistics per gene, taken across the condition columns.
exp_summary_df = new_exp_df.apply(lambda gene_row: gene_row.describe(), axis=1)

In [16]:
# Spell out the percentile labels ('25%' -> '25_percentile'); labels without a '%' pass through unchanged.
col_listy = [col.replace('%', '_percentile') for col in exp_summary_df.columns]
print(col_listy)
exp_summary_df.columns = col_listy

['count', 'mean', 'std', 'min', '25_percentile', '50_percentile', '75_percentile', 'max']


In [17]:
# Per-gene dispersion across conditions: cv is std / mean, noise is std^2 / mean^2.
# Both are computed from the summary statistics of the log-scaled values.
gene_std = exp_summary_df['std']
gene_mean = exp_summary_df['mean']
exp_summary_df['cv'] = gene_std / gene_mean
exp_summary_df['noise'] = gene_std.pow(2) / gene_mean.pow(2)

In [18]:
# Per-gene association between averaged expression and dilution rate.
# The two inputs are paired by position, not by label, so this relies on the
# metadata rows being in the same order as the columns of new_exp_df.
# NOTE(review): expression is passed as x and dilution rate as y, so the slope
# is the change in dilution rate per unit of expression -- confirm this is the
# intended direction.
growth_rates = meta_df['dilution rate (/h)']

slopes, pearsons, spearmans = [], [], []
for gene in exp_summary_df.index:
    gene_expression = new_exp_df.loc[gene]

    linear_fit = stats.linregress(gene_expression, growth_rates)
    slopes.append(linear_fit.slope)
    pearsons.append(linear_fit.rvalue)

    # only the correlation coefficient is kept; the p-value is discarded
    rho, _ = stats.spearmanr(gene_expression, growth_rates, nan_policy='omit')
    spearmans.append(rho)

exp_summary_df['lin_slope'] = slopes
exp_summary_df['lin_r'] = pearsons
exp_summary_df['spearmans_rho'] = spearmans

# Save some files

In [19]:
# Report the shapes, then write each table as a tab-separated file whose name
# carries the data type processed in this run.
print(new_exp_df.shape)
print(meta_df.shape)
print(exp_summary_df.shape)
for frame, path_template in (
    (new_exp_df, '../Data/processed_data/processed_expression_scer_{}.tsv'),
    (meta_df, '../Data/processed_data/processed_metadata_scer_{}.tsv'),
    (exp_summary_df, '../Data/processed_data/processed_expression_summary_scer_{}.tsv'),
):
    frame.to_csv(path_template.format(data_to_analyze), sep='\t')

(3127, 14)
(14, 14)
(3127, 13)


# Get a thinned down set as a robustness check?

Too few samples to do this step here.