In [1]:
from bokeh.plotting import output_notebook
from bokeh.resources import INLINE

In [2]:
# Send Bokeh output to the notebook, passing the INLINE resources object so
# the plotting assets are embedded rather than fetched (per Bokeh's docs).
output_notebook(resources=INLINE)

### Read the counts with pandas.

**pandas** is an efficient library to handle tabular data.
It provides a DataFrame class similar to *R* data.frame.

The data are read counts normalized by gene length. This is only an example of how to use the code; in real-life analyses the input should be RPKM or other proper expression values.

In [3]:
from pandas import DataFrame, read_csv

In [5]:
# Load the normalized counts, using the first CSV column as the row index.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 is the supported replacement. Date parsing of the
# index (a from_csv default) is intentionally not requested here.
samples = read_csv('data/GSE71562_normalized.csv', index_col=0)

In [6]:
# Preview the first rows of the loaded table.
samples.head()

Unnamed: 0_level_0,E14R012a01,E14R012a02,E14R012a03,E14R012a04,E14R012a05,E14R012a06,E14R012b01,E14R012b02,E14R012b03,E14R012b04,E14R012b05,E14R012b06,E14R012c01,E14R012c02,E14R012c03,E14R012c04,E14R012c05,E14R012c06
bnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
b0001,6.923077,9.692308,10.276923,9.676923,7.184615,5.815385,4.984615,9.153846,11.030769,16.107692,19.861538,22.446154,5.969231,13.830769,19.415385,16.030769,7.046154,13.261538
b0002,9.127539,7.154346,8.335906,14.721771,5.248172,2.367181,4.750609,4.254265,11.614541,12.418359,16.333063,12.347685,2.469943,7.501625,14.976442,12.170593,6.330626,7.261576
b0003,5.783262,5.356223,5.854077,11.162017,3.140558,2.314378,3.947425,3.517167,6.687768,8.754292,10.740343,9.646996,2.036481,6.729614,11.674893,8.254292,3.769313,5.448498
b0004,8.189736,6.174184,6.796267,11.091757,3.699844,2.967341,4.258165,3.727838,8.034215,7.951788,10.874806,10.173406,2.287714,8.028771,13.246501,9.359253,4.303266,6.218507
b0005,0.317568,0.219595,0.297297,0.418919,0.192568,0.206081,0.212838,0.185811,0.358108,0.300676,0.435811,0.469595,0.097973,0.402027,0.625,0.506757,0.179054,0.222973


### Load *E. coli* model

In [7]:
from cameo import models

In [9]:
# Take a copy of the iJO1366 model exposed under cameo's models.bigg namespace.
ecoli = models.bigg.iJO1366.copy()

### Keep metabolic genes only

In [10]:
# Ids of the model's genes that also have a row in the expression table,
# listed in the order the model reports its genes.
measured_ids = samples.index
model_filter = [gene.id for gene in ecoli.genes if gene.id in measured_ids]

# Restrict the table to those rows (row order follows model_filter).
samples = samples.loc[model_filter, :]

### Aggregate the replicates using the mean

In [11]:
# Map each time-point label to its three replicate column names.
# Column names follow the pattern E14R012<series><NN>, where <series> is the
# replicate series letter and <NN> is the 1-based position of the time point.
replicate_series = ("a", "b", "c")
timepoint_labels = ("t0", "t05", "t1", "t2", "t5", "t10")

timepoints = {
    label: ["E14R012{}{:02d}".format(series, position) for series in replicate_series]
    for position, label in enumerate(timepoint_labels, start=1)
}

In [12]:
# For every time point, add a column named after it that holds the row-wise
# mean of that time point's replicate columns.
for t in timepoints:
    columns = timepoints[t]
    replicate_values = samples[columns]
    samples[t] = replicate_values.mean(axis=1)

In [13]:
# Keep only the aggregated per-time-point columns, in the dict's key order.
samples = samples[list(timepoints)]

### Build the expression profile from the data

In [14]:
from driven.data_sets import ExpressionProfile

In [15]:
# Build a driven ExpressionProfile from the per-time-point table; the bare
# name on the last line displays it as the cell output.
data = ExpressionProfile.from_data_frame(samples)
data

Unnamed: 0,t2,t1,t5,t10,t0,t05
b2215,76.829556,73.844968,39.760351,28.883348,37.355999,46.451496
b1377,0.013828,0.008826,0.013239,0.020594,0.010591,0.011180
b0241,0.106477,0.073934,0.086256,0.086256,0.018957,0.022749
b0929,2.592525,2.416360,1.610600,1.485907,1.137561,1.535846
...,...,...,...,...,...,...
b4031,0.119774,0.089040,0.070282,0.074802,0.038870,0.049492
b1857,0.439914,0.409871,0.296853,0.300072,0.232117,0.323677
b1859,0.177495,0.189384,0.109979,0.140127,0.108280,0.126115
b1858,0.393377,0.390728,0.275938,0.245916,0.229581,0.258720


### Visualize the expression data

In [16]:
# Draw a box plot of the expression profile (rendered as a Bokeh chart).
data.boxplot()

<bokeh.charts.chart.Chart at 0x117382550>

In [17]:
# Histogram with 50 bins for conditions t0 and t10.
# NOTE(review): the filter string presumably restricts the plot to expression
# values below 2 — confirm against driven's ExpressionProfile.histogram.
data.histogram(bins=50, filter="value < 2", conditions=["t0", "t10"])

<bokeh.charts.chart.Chart at 0x10c95d400>

In [18]:
from numpy import log2

In [19]:
# Scatter t0 against t10 after a log2(x + 1) transform; the +1 maps a zero
# expression value to 0 instead of -inf.
data.scatter("t0", "t10", color="orange", transform=lambda x: log2(x+1))

<bokeh.charts.chart.Chart at 0x1173616d8>

In [20]:
# Same log2(x + 1) scatter, this time comparing t0 with the next time point t05.
data.scatter("t0", "t05", color="green", transform=lambda x: log2(x+1))

<bokeh.charts.chart.Chart at 0x117ea1550>

### Run GIMME

GIMME is an algorithm for integrating expression data into a genome-scale model. It computes the flux distribution that minimizes the inconsistency between the data and a feasible flux distribution.

In [21]:
from driven.flux_analysis.transcriptomics import gimme

Anaerobic conditions (t0)

In [22]:
# Run GIMME on the model using the t0 column of the expression profile.
# NOTE(review): cutoff is presumably the expression threshold below which a
# reaction counts as inconsistent, and fraction_of_optimum the required share
# of the optimal objective — confirm both against driven's gimme documentation.
anaerobic_result = gimme(ecoli, data, cutoff=0.2, condition="t0", fraction_of_optimum=0.1)

In [23]:
# Display the GIMME result computed for t0.
anaerobic_result

In [24]:
# Distance reported by the t0 result (presumably the total inconsistency that
# GIMME minimized — confirm in driven's documentation).
anaerobic_result.distance

617.9216631100784

In [25]:
# Show only the rows of the t0 result table with a positive inconsistency score.
anaerobic_result.data_frame.query("inconsistency_scores > 0")

Unnamed: 0,gimme_fluxes,fba_fluxes,expression,inconsistency_scores
ADCL,6.572067e-05,0.000657,0.109600,5.941127e-06
ADCS,6.572067e-05,0.000657,0.091770,7.112930e-06
AMAOTr,1.964744e-07,0.000002,0.136799,1.241746e-08
AOXSr2,1.964744e-07,0.000002,0.171866,5.527615e-09
...,...,...,...,...
ORPT,-3.249637e-02,-0.324964,0.042642,5.113573e-03
PNTK,5.658462e-05,0.000566,0.195088,2.779595e-07
PTPATi,5.658462e-05,0.000566,0.121781,4.425964e-06
TDSK,1.911303e-03,0.019113,0.122042,1.490015e-04


Then for aerobic conditions (t10)

In [26]:
# Run GIMME again with the same cutoff and fraction_of_optimum, now using the
# t10 column of the expression profile.
aerobic_result = gimme(ecoli, data, cutoff=0.2, condition="t10", fraction_of_optimum=0.1)

In [27]:
# Display the GIMME result computed for t10. The original cell showed
# anaerobic_result here, a copy-paste slip from the t0 section.
aerobic_result

In [28]:
# Distance reported by the t10 result (presumably the total inconsistency that
# GIMME minimized — confirm in driven's documentation).
aerobic_result.distance

622.7544787019549

In [29]:
# Show only the rows of the t10 result table with a positive inconsistency score.
aerobic_result.data_frame.query("inconsistency_scores > 0")

Unnamed: 0,gimme_fluxes,fba_fluxes,expression,inconsistency_scores
ADCL,0.000066,0.000657,0.154100,3.016598e-06
ADCS,0.000066,0.000657,0.146240,3.533119e-06
APRAUR,0.000044,0.000438,0.051677,6.498581e-06
BMOGDS1,0.000012,0.000120,0.140982,7.073301e-07
...,...,...,...,...
GCALDD,0.000066,0.000657,0.182071,1.178314e-06
MPTG,0.001365,0.013649,0.127687,9.870014e-05
ORPT,-0.032496,-0.324964,0.062923,4.454520e-03
TDSK,0.001911,0.019113,0.169709,5.789476e-05


### Visualize the results

In [30]:
# Render the t0 result on the map named "iJO1366.Central metabolism"
# (presumably an Escher pathway map — confirm).
anaerobic_result.display_on_map("iJO1366.Central metabolism")

In [31]:
# Render the t10 result on the same map for side-by-side comparison.
aerobic_result.display_on_map("iJO1366.Central metabolism")

## Compare flux distributions

In [32]:
# Subtracting one result from the other yields a comparison object; with this
# operand order the t0 result is the left operand and the t10 result the right.
diff = anaerobic_result - aerobic_result

In [33]:
# Euclidean distance reported by the comparison of the two results.
diff.euclidean_distance

28.450274770009997

In [34]:
# Manhattan distance reported by the comparison of the two results.
diff.manhattan_distance

159.35142578810013

In [35]:
# Show only the rows whose fold change is larger than 1 in absolute value.
diff.data_frame.query("abs(fold_change) > 1")

Unnamed: 0,fluxes_A,fluxes_B,manhattan_distance,euclidean_distance,activity_profile,fold_change
ACONTa,0.105624,0.766437,0.660814,0.436675,0.0,-6.256293
ACONTb,0.105624,0.766437,0.660814,0.436675,0.0,-6.256293
ATPS4rpp,3.612235,8.237845,4.625610,21.396270,0.0,-1.280540
CS,0.105624,0.766437,0.660814,0.436675,0.0,-6.256293
...,...,...,...,...,...,...
RPI,-0.091755,-0.200782,0.109027,0.011887,0.0,-1.188250
SUCOAS,0.051561,-0.609253,0.660814,0.436675,0.0,12.816210
TKT2,-0.037462,0.071609,0.109071,0.011897,0.0,2.911519
VPAMTr,0.041570,-0.057111,0.098681,0.009738,0.0,2.373838
