In [1]:
# Send Bokeh output to the notebook so the plots below render inline.
from bokeh.plotting import output_notebook
output_notebook()

### Read the counts with pandas.

**pandas** is an efficient library to handle tabular data.
It provides a DataFrame class similar to *R* data.frame.

The data are read counts normalized by gene length. This is only an example of how to use the code; in real-life analyses the input should be RPKM or other expression values.

In [2]:
from pandas import DataFrame

In [3]:
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in pandas 1.0.
# read_csv with index_col=0 keeps the first column (the gene b-numbers) as the
# index, which is what from_csv did by default. parse_dates is left off on
# purpose: the index holds gene identifiers, not dates.
from pandas import read_csv

samples = read_csv('data/GSE71562_normalized.csv', index_col=0)

In [4]:
# Preview the first rows to check that the table loaded with genes as the index
# and one column per sample.
samples.head()

Unnamed: 0_level_0,E14R012a01,E14R012a02,E14R012a03,E14R012a04,E14R012a05,E14R012a06,E14R012b01,E14R012b02,E14R012b03,E14R012b04,E14R012b05,E14R012b06,E14R012c01,E14R012c02,E14R012c03,E14R012c04,E14R012c05,E14R012c06
bnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
b0001,6.9231,9.6923,10.2769,9.6769,7.1846,5.8154,4.9846,9.1538,11.0308,16.1077,19.8615,22.4462,5.9692,13.8308,19.4154,16.0308,7.0462,13.2615
b0002,9.1275,7.1543,8.3359,14.7218,5.2482,2.3672,4.7506,4.2543,11.6145,12.4184,16.3331,12.3477,2.4699,7.5016,14.9764,12.1706,6.3306,7.2616
b0003,5.7833,5.3562,5.8541,11.162,3.1406,2.3144,3.9474,3.5172,6.6878,8.7543,10.7403,9.647,2.0365,6.7296,11.6749,8.2543,3.7693,5.4485
b0004,8.1897,6.1742,6.7963,11.0918,3.6998,2.9673,4.2582,3.7278,8.0342,7.9518,10.8748,10.1734,2.2877,8.0288,13.2465,9.3593,4.3033,6.2185
b0005,0.3176,0.2196,0.2973,0.4189,0.1926,0.2061,0.2128,0.1858,0.3581,0.3007,0.4358,0.4696,0.098,0.402,0.625,0.5068,0.1791,0.223


### Load *E. coli* model

In [5]:
from cameo import models

The bokeh.charts API has moved to a separate 'bkcharts' package.

This compatibility shim will remain until Bokeh 1.0 is released.
After that, if you want to use this API you will have to install
the bkcharts package explicitly.



In [6]:
# Look up the iJO1366 model through cameo's BiGG model collection; this is the
# E. coli model used by every simulation below.
ecoli = models.bigg.iJO1366

### Keep metabolic genes only

In [7]:
# Keep only the genes that exist in the model AND have a row in the expression
# table; the resulting rows follow the order of the model's gene list.
model_gene_ids = (gene.id for gene in ecoli.genes)
model_filter = [gene_id for gene_id in model_gene_ids if gene_id in samples.index]
samples = samples.loc[model_filter]

### Aggregate the replicates using the mean

In [8]:
# Map each time-point label to its three replicate columns.
# Sample ids follow the pattern E14R012<replicate><position>, where the
# replicate is a/b/c and the two-digit position runs 01..06 in time order.
timepoints = {
    label: ["E14R012{}{:02d}".format(replicate, position) for replicate in "abc"]
    for position, label in enumerate(["t0", "t05", "t1", "t2", "t5", "t10"], start=1)
}

In [9]:
# Add one column per time point holding the row-wise mean of its replicate
# columns. The replicate columns stay in the frame until the next cell.
for label in timepoints:
    replicate_columns = timepoints[label]
    samples[label] = samples[replicate_columns].mean(axis=1)

In [10]:
# Drop the per-replicate columns: keep only the aggregated time-point columns,
# in the order the labels appear in `timepoints`.
samples = samples[list(timepoints)]

### Build the expression profile from the data

In [11]:
from driven.data_sets import ExpressionProfile


pandas.tslib is deprecated and will be removed in a future version.
You can access Timestamp as pandas.Timestamp


The pandas.lib module is deprecated and will be removed in a future version. These are private functions and can be accessed from pandas._libs.lib instead


The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.



In [12]:
# Wrap the aggregated table (genes x time points) in driven's ExpressionProfile.
# The bare name on the last line displays the profile as the cell output.
data = ExpressionProfile.from_data_frame(samples)
data

Unnamed: 0,t0,t05,t1,t2,t5,t10
b2215,37.3560,46.4515,73.8450,76.8296,39.7604,28.8833
b1377,0.0106,0.0112,0.0088,0.0138,0.0132,0.0206
b0241,0.0190,0.0227,0.0739,0.1065,0.0863,0.0863
b0929,1.1376,1.5358,2.4164,2.5925,1.6106,1.4859
b4035,0.0239,0.0173,0.0326,0.0164,0.0114,0.0179
...,...,...,...,...,...,...
b3568,0.0319,0.0262,0.0356,0.0325,0.0305,0.0409
b4031,0.0389,0.0495,0.0890,0.1198,0.0703,0.0748
b1858,0.2296,0.2587,0.3907,0.3934,0.2759,0.2459
b1859,0.1083,0.1261,0.1894,0.1775,0.1100,0.1401


### Visualize the expression data

In [13]:
# Box plot of the expression profile.
data.boxplot()

In [14]:
# Histogram restricted to conditions t0 and t10, using 50 bins.
# The filter string is handed to driven unchanged; presumably it keeps only
# values below 2 — confirm against driven's documentation.
data.histogram(bins=50, filter="value < 2", conditions=["t0", "t10"])

In [15]:
from numpy import log2

In [16]:
# Scatter of t0 against t10. The transform is log2(x + 1), so a value of zero
# maps to 0 instead of -inf.
data.scatter("t0", "t10", color="orange", transform=lambda x: log2(x+1))

## Exercise 1

Scatter plots can be used to get a feeling for the changes in the data.

1. Find out which time points are most similar and which are furthest apart.
2. Between t0 and t5, which genes changed the most?

You can change the axis data using the column ids in the data.

`data.scatter(<column 1>, <column 2>, color="green", transform=lambda x: log2(x+1))`

### Run GIMME

GIMME is an algorithm for integrating expression data into a genome-scale model. It computes the flux distribution that minimizes the inconsistency between the data and a feasible flux distribution.

In [17]:
from driven.flux_analysis.transcriptomics import gimme

Anaerobic conditions (t0)

In [18]:
# Run GIMME against the t0 expression values with an expression cutoff of 0.2.
# NOTE(review): fraction_of_optimum is 0.1 here but 0.8 in the other gimme
# calls in this notebook — confirm the difference is intentional.
anaerobic_result = gimme(ecoli, data, cutoff=0.2, condition="t0", fraction_of_optimum=0.1)

In [19]:
# Display the result object of the t0 run.
anaerobic_result

In [20]:
# Distance value reported by the t0 GIMME result.
anaerobic_result.distance

618.7956947887177

In [21]:
# Keep only the reactions whose inconsistency score is positive.
anaerobic_result.data_frame.query("inconsistency_scores > 0")

Unnamed: 0,gimme_fluxes,fba_fluxes,expression,inconsistency_scores
ADCL,6.5721e-05,6.5721e-04,0.1096,5.9411e-06
ADCS,6.5721e-05,6.5721e-04,0.0918,7.1129e-06
AMAOTr,1.9647e-07,1.9647e-06,0.1368,1.2417e-08
AOXSr2,1.9647e-07,1.9647e-06,0.1719,5.5276e-09
APRAUR,4.3814e-05,4.3814e-04,0.0623,6.0352e-06
...,...,...,...,...
OPHBDC,2.1907e-05,2.1907e-04,0.1394,1.3270e-06
ORPT,-3.2496e-02,-3.2496e-01,0.0426,5.1136e-03
PNTK,5.6585e-05,5.6585e-04,0.1951,2.7796e-07
PTPATi,5.6585e-05,5.6585e-04,0.1218,4.4260e-06


Then for aerobic conditions (t10)

In [22]:
# Run GIMME against the t10 expression values: same cutoff (0.2) as the t0 run,
# with fraction_of_optimum set to 0.8.
aerobic_result = gimme(ecoli, data, cutoff=0.2, condition="t10", fraction_of_optimum=0.8)

In [23]:
# Display the result of the aerobic (t10) run computed in the previous cell.
# The original cell showed anaerobic_result again, a copy-paste slip.
aerobic_result

In [24]:
# Distance value reported by the t10 GIMME result.
aerobic_result.distance

150.59379985802906

In [25]:
# Keep only the reactions whose inconsistency score is positive.
aerobic_result.data_frame.query("inconsistency_scores > 0")

Unnamed: 0,gimme_fluxes,fba_fluxes,expression,inconsistency_scores
ADCL,5.2577e-04,0.0007,0.1541,2.4133e-05
ADCS,5.2577e-04,0.0007,0.1462,2.8265e-05
APRAUR,3.5051e-04,0.0004,0.0517,5.1989e-05
BMOGDS1,9.5879e-05,0.0001,0.1410,5.6586e-06
BMOGDS2,9.5879e-05,0.0001,0.1410,5.6586e-06
...,...,...,...,...
DHPPDA2,3.5051e-04,0.0004,0.0517,5.1989e-05
GCALDD,5.2577e-04,0.0007,0.1821,9.4265e-06
MPTG,1.0919e-02,0.0136,0.1277,7.8960e-04
ORPT,-2.5997e-01,-0.3250,0.0629,3.5636e-02


### Visualize the results

In [26]:
# Draw the t0 result on the map named "iJO1366.Central metabolism".
anaerobic_result.display_on_map("iJO1366.Central metabolism")

In [27]:
# Draw the t10 result on the same map for a side-by-side comparison.
aerobic_result.display_on_map("iJO1366.Central metabolism")

## Compare flux distributions

In [28]:
# Subtract the aerobic (t10) result from the anaerobic (t0) one. The cells
# below read distances and fold changes from the resulting object.
diff = anaerobic_result - aerobic_result

In [29]:
# Normalize the comparison using the model's core biomass reaction.
diff.normalize(ecoli.reactions.BIOMASS_Ec_iJO1366_core_53p95M)

In [30]:
# Euclidean distance reported by the comparison object.
diff.euclidean_distance

440.75038540607784

In [31]:
# Manhattan distance reported by the comparison object.
diff.manhattan_distance

2600.5167213695272

In [32]:
# Keep the reactions whose absolute fold change is at least 1.
diff.data_frame.query("abs(fold_change) >= 1")

Unnamed: 0,fluxes_A,fluxes_B,manhattan_distance,euclidean_distance,activity_profile,fold_change
EX_ac_e,57.7157,0.0000,57.7157,3.3311e+03,1.0,1.0000
EX_quin_e,18.7843,0.0000,18.7843,3.5285e+02,1.0,1.0000
ACONTa,1.0752,4.9403,3.8651,1.4939e+01,0.0,-3.5948
ACONTb,1.0752,4.9403,3.8651,1.4939e+01,0.0,-3.5948
ADPT,0.0011,0.0000,0.0011,1.2522e-06,1.0,1.0000
...,...,...,...,...,...,...
SUCOAS,0.5249,-3.3402,3.8651,1.4939e+01,0.0,7.3640
THD2pp,75.4744,0.0000,75.4744,5.6964e+03,1.0,1.0000
TRDR,1.2909,0.0000,1.2909,1.6665e+00,1.0,1.0000
TRPS2,0.0568,0.0000,0.0568,3.2311e-03,1.0,1.0000


## Exercise 2

Assess the effect of the different RNA-seq data points.

1. Change the condition used to constrain the model and simulate it.
2. Visualize the data.
3. Explore the fluxes that change compared with the aerobic conditions.

## Add physiological data

Anaerobic conditions (t0)

Oxygen uptake rate is 0

In [33]:
# Set the lower bound of the O2 exchange reaction to zero, then rerun GIMME on
# t0. This rebinds anaerobic_result, replacing the earlier t0 result.
# NOTE(review): the bound is changed on the shared model object and is not
# restored afterwards, so any gimme call executed after this cell (including a
# re-run of an earlier cell) sees the modified bound.
oxygen_exchange = ecoli.reactions.get_by_id("EX_o2_e")
oxygen_exchange.lower_bound = 0
anaerobic_result = gimme(
    ecoli,
    data,
    cutoff=0.2,
    condition="t0",
    fraction_of_optimum=0.8,
)

In [34]:
# Display the t0 result obtained with the O2 exchange lower bound set to 0.
anaerobic_result

In [35]:
# Distance value of the t0 run with the O2 exchange lower bound set to 0.
anaerobic_result.distance

29.569655625425845

In [36]:
# Reactions that carry flux in the GIMME solution (|flux| >= 1e-6) but are
# effectively zero in the FBA solution (|flux| < 1e-6).
anaerobic_result.data_frame.query("abs(gimme_fluxes) >= 1e-6 and abs(fba_fluxes) < 1e-6")

Unnamed: 0,gimme_fluxes,fba_fluxes,expression,inconsistency_scores
3HAD100,0.0688,0.0,3.2667,0.0
3HAD120,0.0397,0.0,3.2667,0.0
3HAD121,0.0291,0.0,3.2667,0.0
3HAD140,0.0247,0.0,3.2667,0.0
3HAD141,0.0291,0.0,3.2667,0.0
...,...,...,...,...
SUCOAS,0.1014,0.0,0.2885,0.0
T2DECAI,0.0291,0.0,2.5006,0.0
THRt2pp,0.0010,0.0,0.3806,0.0
THRt4pp,0.0010,0.0,3.4566,0.0


In [37]:
# Draw the t0 result (O2 exchange lower bound set to 0) on the central
# metabolism map.
anaerobic_result.display_on_map("iJO1366.Central metabolism")

In [38]:
# Same filter applied to the aerobic result: reactions active in the GIMME
# solution but effectively zero in the FBA solution.
# NOTE(review): aerobic_result is still the object from the earlier t10 run; it
# was not recomputed after the O2 exchange bound was changed.
aerobic_result.data_frame.query("abs(gimme_fluxes) >= 1e-6 and abs(fba_fluxes) < 1e-6")

Unnamed: 0,gimme_fluxes,fba_fluxes,expression,inconsistency_scores
EX_glyclt_e,0.0005,0.0,,0.0
EX_hom__L_e,0.6505,-0.0,,0.0
3HAD100,0.2797,0.0,4.5161,0.0
3HAD120,0.1615,0.0,4.5161,0.0
3HAD121,0.1182,0.0,4.5161,0.0
...,...,...,...,...
PRPPS,0.7332,0.0,3.5354,0.0
PUNP1,-0.0009,0.0,1.9226,0.0
T2DECAI,0.1182,0.0,4.5161,0.0
THRt2pp,0.0041,0.0,0.4570,0.0
