In [1]:
import json
import sys
from collections import defaultdict
from pathlib import Path

import colorlover as cl
import numpy as np
import pandas
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from sklearn.preprocessing import QuantileTransformer
from tqdm import tqdm_notebook as tqdm

# Project-local modules are resolved relative to `base`, so the path must be
# extended before they are imported.
base = "./../../"
sys.path.append(base)
import helper
import pairs_flat_v2 as pairs

init_notebook_mode(connected=True)

In [2]:
# Load the GSE64016 expression matrix and use its first (unnamed) CSV column as
# the row index. No in-place mutation, so re-running the cell is idempotent.
gencounts_oscope = pandas.read_csv(
    Path(base + "data/GSE64016_H1andFUCCI_normalized_EC_human.csv")
).set_index("Unnamed: 0")

# Keep only the columns whose name carries a "G1_", "G2_" or "S_" tag
# (presumably the phase-sorted cells -- TODO confirm against the dataset).
phase_tagged_columns = [
    c for c in gencounts_oscope.columns
    if "G1_" in c or "G2_" in c or "S_" in c
]
gencounts_oscope_sorted = gencounts_oscope.loc[:, phase_tagged_columns]

# Column positions (within gencounts_oscope_sorted) of the cells of each phase.
is_G1 = [pos for pos, c in enumerate(gencounts_oscope_sorted.columns) if "G1_" in c]
is_S = [pos for pos, c in enumerate(gencounts_oscope_sorted.columns) if "S_" in c]
is_G2M = [pos for pos, c in enumerate(gencounts_oscope_sorted.columns) if "G2_" in c]

# Phase name -> list of column positions; passed to sandbag as `phases`.
annotation = {
    "G1": list(is_G1),
    "S": list(is_S),
    "G2M": list(is_G2M)
}

# Candidate genes: every line of the GO:0007049 file (line endings stripped)
# plus the first tab-separated field of every cyclebase line except the first.
# `with` guarantees both file handles are closed again.
with open(base + "data/go_0007049_homoSapiens.csv", "r") as go_file:
    go_0007049 = [line.replace("\n", "").replace("\r", "") for line in go_file]
with open(base + "data/cyclebase_top1000_genes.tsv", "r") as cyclebase_file:
    cycle_base = [line.split("\t")[0] for i, line in enumerate(cyclebase_file) if 0 < i]
cycle_genes = np.unique(np.concatenate((go_0007049, cycle_base), 0))

# Identify marker pairs on the annotated training matrix, restricted to the
# candidate genes above.
cc_marker = pairs.sandbag(
    gencounts_oscope_sorted,
    phases=annotation,
    subset_genes=list(cycle_genes),
    fraction=0.6,
    processes=10,
    verbose=True,
)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Removed 16689 genes that were not in 'subset_genes'. 2395 genes remaining.
[__set_matrix] Removed 61 genes that were not expressed in any samples. 2334 genes remaining.
[__set_matrix] Removed 0 samples that were not annotated in 'phases'. 247 samples remaining.
[__set_matrix] Matrix truncation done. Working with 2334 genes for 247 samples.
[sandbag] Identifying marker pairs...Processing in parallel with 10 processes...
 Done!
[sandbag] Identified 8146 marker pairs (phase: count): {'G1': 2575, 'S': 4101, 'G2M': 1470}


## Human preimplantation embryos
Single-cell RNA-seq data of human preimplantation embryos (ArrayExpress accession E-MTAB-3929) from [Single-Cell RNA-Seq Reveals Lineage and X Chromosome Dynamics in Human Preimplantation Embryos](http://www.cell.com/cell/fulltext/S0092-8674(16)30280-X?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS009286741630280X%3Fshowall%3Dtrue)

## Counts

In [3]:
# Read the tab-separated E-MTAB-3929 count table and index it by its first,
# unnamed column.
gencounts_EMTAB3929_counts = pandas.read_csv(
    Path(base + "data/E-MTAB-3929.processed.1_counts.txt"),
    sep="\t",
)
gencounts_EMTAB3929_counts.set_index("Unnamed: 0", inplace=True)

# Preview the first ten rows.
gencounts_EMTAB3929_counts.head(10)

Unnamed: 0_level_0,E5.5.101,E5.5.100,E6.2.114,E6.2.104,E6.2.107,E6.2.116,E7.2.138,E6.2.118,E6.2.105,E7.2.144,...,E3.50.3415,E3.51.3421,E3.53.3437,E3.51.3423,E3.52.3429,E3.49.3407,E3.51.3426,E3.47.3391,E3.52.3431,E3.53.3438
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0,0,0,0,0,16,0,0,0,0,...,327,2167,170,451,104,446,2517,473,104,116
A1BG-AS1,0,0,0,0,0,0,0,0,0,0,...,0,88,0,0,0,0,1,7,0,0
A1CF,0,0,0,0,0,0,0,0,0,0,...,39,86,0,0,15,11,94,40,66,34
A2M,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M-AS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2ML1,3,0,0,0,0,0,1,0,0,0,...,26,158,0,0,137,3,0,0,124,10
A2MP1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A3GALT2,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
A4GALT,50,123,20,18,54,291,111,349,232,343,...,44,120,33,12,68,0,96,0,12,13
A4GNT,0,0,0,0,0,0,0,0,0,0,...,0,18,0,0,0,0,0,0,0,0


In [4]:
# Transpose so that each row of `x` is one column of the count table, then
# quantile-transform the float-cast matrix.
x = gencounts_EMTAB3929_counts.T.values
X_std = QuantileTransformer().fit_transform(x.astype(float))

# Transpose back and restore the original row/column labels.
gencounts_EMTAB3929_counts_Qnorm = pandas.DataFrame(
    X_std.T,
    index=gencounts_EMTAB3929_counts.index,
    columns=gencounts_EMTAB3929_counts.columns,
)

# NOTE(review): the prediction below runs on the raw counts, not on
# gencounts_EMTAB3929_counts_Qnorm, which is unused in the cells shown here --
# confirm this is intended.
EMTAB3929_counts_prediction = pairs.cyclone(
    gencounts_EMTAB3929_counts,
    cc_marker,
    verbose=True,
)

# Tabulate the prediction and render it.
EMTAB3929_counts_prediction_table = helper.get_prediction_table(
    EMTAB3929_counts_prediction
)
helper.DataTable(EMTAB3929_counts_prediction_table)

[__set_matrix] Original Matrix 'x' has shape 26178 x 1529
[__set_matrix] Matrix truncation done. Working with 26178 genes for 1529 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 229 marker pairs. 8146 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G2M: 953, G1: 195, S: 381


Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
E5.5.101,0.177,1.0,0.0,0.150382,0.849618,0.0,G2M
E5.5.100,0.024,0.997,0.0,0.023506,0.976494,0.0,G2M
E6.2.114,0.02,0.969,0.233,0.016367,0.792962,0.190671,G2M
E6.2.104,0.664,0.008,0.082,0.880637,0.01061,0.108753,G1
E6.2.107,0.06,0.905,0.111,0.055762,0.841078,0.10316,G2M
E6.2.116,0.152,0.817,0.006,0.155897,0.837949,0.006154,G2M
E7.2.138,0.496,1.0,0.0,0.331551,0.668449,0.0,G2M
E6.2.118,0.083,0.553,0.685,0.062831,0.418622,0.518547,G2M
E6.2.105,0.186,0.675,0.06,0.201954,0.732899,0.065147,G2M
E7.2.144,0.009,0.997,0.0,0.008946,0.991054,0.0,G2M


In [5]:
# Stage prefixes to look for in the sample IDs. Order matters: the longer
# "E4.late" / "E5.early" must be tried before their prefixes "E4" / "E5".
classes = ["E3","E4.late","E4","E5.early","E5","E6","E7"]


def stage_of(sample_id, stages=classes):
    """Return the first entry of `stages` that `sample_id` starts with.

    Returns None when no stage matches, so the caller can keep one label per
    sample instead of silently dropping the sample.
    """
    for stage in stages:
        if sample_id.startswith(stage):
            return stage
    return None


# One label per row of the prediction table, in row order. Later cells select
# rows by position via enumerate(labels), so `labels` must stay aligned with
# the table even when an ID matches no stage (it then gets None).
labels = [stage_of(idx) for idx in EMTAB3929_counts_prediction_table.index]

In [9]:
# Display the prediction rows of all samples labelled "E3".
EMTAB3929_counts_prediction_table.iloc[
    [position for position, stage in enumerate(labels) if stage == "E3"]
]

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
E3.1.444,0.368,0.959,0.000,0.277317,0.722683,0.000000,G2M
E3.4.463,0.501,0.667,0.085,0.399840,0.532322,0.067837,G2M
E3.2.454,0.620,0.822,0.003,0.429066,0.568858,0.002076,G2M
E3.3.456,0.778,0.842,0.088,0.455504,0.492974,0.051522,G2M
E3.1.448,0.457,0.989,0.001,0.315826,0.683483,0.000691,G2M
E3.2.453,0.926,0.761,0.001,0.548578,0.450829,0.000592,G1
E3.3.457,0.154,0.983,0.005,0.134851,0.860771,0.004378,G2M
E3.1.447,0.529,0.943,0.111,0.334176,0.595704,0.070120,G2M
E3.2.467,0.943,0.884,0.015,0.511944,0.479913,0.008143,G1
E3.2.450,0.739,0.939,0.017,0.435988,0.553982,0.010029,G2M


In [7]:
# Positional columns of the prediction table holding the raw phase scores
# (the table is displayed with columns G1, G2M, S, ... in that order).
G1_COLUMN = 0
G2M_COLUMN = 1


def stage_scores(stage):
    """Return the (G1, G2M) score arrays of all samples labelled `stage`.

    Rows are selected by position, so `labels` must be aligned with the rows
    of EMTAB3929_counts_prediction_table.
    """
    rows = [position for position, label in enumerate(labels) if label == stage]
    g1_scores = EMTAB3929_counts_prediction_table.iloc[rows, G1_COLUMN].values
    g2m_scores = EMTAB3929_counts_prediction_table.iloc[rows, G2M_COLUMN].values
    return g1_scores, g2m_scores


e3_g1, e3_g2m = stage_scores("E3")
e4_g1, e4_g2m = stage_scores("E4")
e4late_g1, e4late_g2m = stage_scores("E4.late")
e5early_g1, e5early_g2m = stage_scores("E5.early")
e5_g1, e5_g2m = stage_scores("E5")
e6_g1, e6_g2m = stage_scores("E6")
e7_g1, e7_g2m = stage_scores("E7")

# Report the number of samples per stage.
for stage_name, stage_g2m in [
    ("E3", e3_g2m),
    ("E4", e4_g2m),
    ("E4 late", e4late_g2m),
    ("E5 early", e5early_g2m),
    ("E5", e5_g2m),
    ("E6", e6_g2m),
    ("E7", e7_g2m),
]:
    print("{} {}".format(stage_name, len(stage_g2m)))

E3 81
E4 142
E4 late 48
E5 early 24
E5 353
E6 415
E7 466


[0.368,
 0.501,
 0.62,
 0.778,
 0.457,
 0.926,
 0.154,
 0.529,
 0.943,
 0.739,
 0.182,
 0.995,
 0.847,
 0.236,
 0.519,
 0.34,
 0.51,
 0.238,
 0.894,
 0.099,
 0.895,
 0.461,
 0.73,
 0.609,
 0.464,
 0.683,
 0.661,
 0.642,
 0.706,
 0.731,
 0.69,
 0.617,
 0.475,
 0.914,
 0.585,
 0.755,
 0.402,
 0.511,
 0.531,
 0.66,
 0.741,
 0.563,
 0.649,
 0.899,
 0.717,
 0.846,
 0.793,
 0.679,
 0.783,
 0.7,
 0.815,
 0.791,
 0.828,
 0.758,
 0.616,
 0.778,
 0.789,
 0.714,
 0.88,
 0.822,
 0.868,
 0.802,
 0.624,
 0.622,
 0.676,
 0.742,
 0.411,
 0.761,
 0.863,
 0.605,
 0.75,
 0.7,
 0.566,
 0.372,
 0.655,
 0.653,
 0.744,
 0.532,
 0.385,
 0.743,
 0.417]

In [70]:
# colorlover's 9-colour qualitative 'Set1' scale; entries 1..7 colour the stages.
set1 = cl.scales['9']['qual']['Set1']


def stage_trace(g1_scores, g2m_scores, name, color):
    """Build the scatter trace (G1 score on x, G2M score on y) for one stage."""
    return go.Scatter(
        x=g1_scores,
        y=g2m_scores,
        mode='markers+text',
        marker=dict(
            symbol='circle',
            size=10,
            color=color,
        ),
        name=name,
    )


# One trace per stage. The E5-E7 traces are not drawn in this figure, but a
# later cell plots them, so they are built here as well.
e3_trace = stage_trace(e3_g1, e3_g2m, 'E3', set1[1])
e4_trace = stage_trace(e4_g1, e4_g2m, 'E4', set1[2])
e4late_trace = stage_trace(e4late_g1, e4late_g2m, 'E4 late', set1[3])
e5early_trace = stage_trace(e5early_g1, e5early_g2m, 'E5 early', set1[4])
e5_trace = stage_trace(e5_g1, e5_g2m, 'E5', set1[5])
e6_trace = stage_trace(e6_g1, e6_g2m, 'E6', set1[6])
e7_trace = stage_trace(e7_g1, e7_g2m, 'E7', set1[7])

# Text-only trace that names the three shaded phase regions.
lbls = go.Scatter(
    x=[0.8, 0.4, 0.25],
    y=[0.4, 0.8, 0.25],
    text=['G1',
          'G2M',
          'S'],
    mode='text',
    showlegend=False,
    hoverinfo='none'
)

data = [e3_trace, e4_trace, e4late_trace, e5early_trace, lbls]

# Pool the scores of the plotted stages once; the outline below spans their
# 5th to 95th percentile on each axis.
early_g1 = np.concatenate([e3_g1, e4_g1, e4late_g1, e5early_g1])
early_g2m = np.concatenate([e3_g2m, e4_g2m, e4late_g2m, e5early_g2m])

layout = {
    'title': "Predicted cell cycle clustering for E3 - early E5",
    'xaxis': {
        'title': "G1 Score",
        'range': [-0.1, 1.1],
    },
    'yaxis': {
        'title': "G2M Score",
        'range': [-0.1, 1.1]
    },
    'shapes': [
        # G1
        {
            'type': 'path',
            'path': ' M 0.5,0 L1,0 L1,1 L0.5,0.5 Z',
            'fillcolor': 'rgba(255,0,0,0.1)',
            'line': {
                'width': 1,
                'dash': 'dash'
            }
        },
        # S
        {
            'type': 'path',
            'path': ' M 0,0 L0.5,0 L0.5,0.5 L0,0.5 Z',
            'fillcolor': 'rgba(255,255,0,0.1)',
            'line': {
                'width': 1,
                'dash': 'dash'
            }
        },
        # G2M
        {
            'type': 'path',
            'path': ' M 0,0.5 L0,1 L1,1 L0.5,0.5 Z',
            'fillcolor': 'rgba(0,0,255,0.1)',
            'line': {
                'width': 1,
                'dash': 'dash'
            }
        },
        # Dotted red outline around the bulk of the plotted samples.
        {
            'type': 'circle',
            'xref': 'x',
            'yref': 'y',
            'x0': np.percentile(early_g1, 5),
            'y0': np.percentile(early_g2m, 5),
            'x1': np.percentile(early_g1, 95),
            'y1': np.percentile(early_g2m, 95),
            'opacity': 0.9,
            'line': {
                'color': 'red',
                'width': 5,
                'dash': 'dot'
            },
        },
    ]
}
fig = {
    'data': data,
    'layout': layout,
}

iplot(fig, image="svg")

In [71]:
# Second figure: same phase-region layout, restricted to stages E5, E6 and E7.
data = [e5_trace, e6_trace, e7_trace, lbls]

# Pooled scores of the plotted stages, used for the percentile outline.
late_g1 = list(e5_g1) + list(e6_g1) + list(e7_g1)
late_g2m = list(e5_g2m) + list(e6_g2m) + list(e7_g2m)

# (SVG path, fill colour) of the shaded regions, in the order G1, S, G2M.
phase_regions = [
    (' M 0.5,0 L1,0 L1,1 L0.5,0.5 Z', 'rgba(255,0,0,0.1)'),
    (' M 0,0 L0.5,0 L0.5,0.5 L0,0.5 Z', 'rgba(255,255,0,0.1)'),
    (' M 0,0.5 L0,1 L1,1 L0.5,0.5 Z', 'rgba(0,0,255,0.1)'),
]
shapes = [
    dict(
        type='path',
        path=region_path,
        fillcolor=region_fill,
        line=dict(width=1, dash='dash'),
    )
    for region_path, region_fill in phase_regions
]

# Dotted red outline from the 5th to the 95th percentile of the pooled scores.
shapes.append(
    dict(
        type='circle',
        xref='x',
        yref='y',
        x0=np.percentile(late_g1, 5),
        y0=np.percentile(late_g2m, 5),
        x1=np.percentile(late_g1, 95),
        y1=np.percentile(late_g2m, 95),
        opacity=0.9,
        line=dict(color='red', width=5, dash='dot'),
    )
)

layout = dict(
    title="Predicted cell cycle clustering for E5 - E7",
    xaxis=dict(title="G1 Score", range=[-0.1, 1.1]),
    yaxis=dict(title="G2M Score", range=[-0.1, 1.1]),
    shapes=shapes,
)
fig = dict(data=data, layout=layout)

iplot(fig, image="svg")

In [33]:
# Rows of the prediction table for the samples labelled "E6".
e6_table = EMTAB3929_counts_prediction_table.iloc[
    [position for position, label in enumerate(labels) if label == "E6"], :
]

# Group the E6 samples by their ID with the last "."-separated field removed
# (e.g. "E6.1", "E6.10"). Each entry keeps the values at column positions 0, 1
# and 6, which the table display shows as G1, G2M and prediction.
sub = defaultdict(list)
for sample_id, row in e6_table.iterrows():
    group_key = sample_id.rsplit(".", 1)[0]
    # Explicit positional access: plain row[0] on a label-indexed Series is
    # deprecated in pandas.
    sub[group_key].append((row.iloc[0], row.iloc[1], row.iloc[6]))

sub

defaultdict(list,
            {'E6.1': [(0.136, 0.178, 'S'),
              (0.45, 0.178, 'S'),
              (0.128, 0.062, 'S'),
              (0.287, 0.942, 'G2M'),
              (0.025, 0.111, 'S'),
              (0.876, 0.078, 'G1'),
              (0.113, 0.079, 'S'),
              (0.064, 0.593, 'G2M'),
              (0.112, 0.816, 'G2M'),
              (0.024, 0.015, 'S'),
              (0.072, 1.0, 'G2M'),
              (0.075, 0.999, 'G2M'),
              (0.727, 0.001, 'G1'),
              (0.796, 0.018, 'G1'),
              (0.552, 0.007, 'G1'),
              (0.049, 0.243, 'S')],
             'E6.10': [(0.901, 0.08, 'G1'),
              (0.97, 0.153, 'G1'),
              (0.308, 0.021, 'S'),
              (0.305, 0.234, 'S'),
              (0.337, 0.185, 'S'),
              (0.647, 0.018, 'G1'),
              (0.516, 0.214, 'G1'),
              (0.97, 0.001, 'G1'),
              (0.034, 0.527, 'G2M'),
              (0.51, 0.0, 'G1'),
              (0.213, 0.0, 'S'),
        

In [34]:
# colorlover's 7-colour qualitative 'Set1' scale, interpolated with a target
# length of 20 to obtain more distinct group colours.
set1_base = cl.scales['7']['qual']['Set1']
group_palette = cl.interp(set1_base, 20)

# One scatter trace per group in `sub`: first tuple element on x, second on y.
traces = []
for position, (key, val) in enumerate(sub.items()):
    trace = go.Scatter(
        x=[scores[0] for scores in val],
        y=[scores[1] for scores in val],
        mode='markers+text',
        marker=dict(
            symbol='circle',
            size=10,
            # Wrap around the palette so that more groups than colours
            # re-use colours instead of raising IndexError.
            color=group_palette[position % len(group_palette)],
        ),
        name=key
    )
    traces.append(trace)

layout = {
    'xaxis': {
        'title': "G1 Score",
        'range': [-0.1, 1.1],
    },
    'yaxis': {
        'title': "G2M Score",
        'range': [-0.1, 1.1]
    },
    'shapes': [
        # G1
        {
            'type': 'path',
            'path': ' M 0.5,0 L1,0 L1,1 L0.5,0.5 Z',
            'fillcolor': 'rgba(255,0,0,0.1)',
            'line': {
                'width': 1,
                'dash': 'dash'
            }
        },
        # S
        {
            'type': 'path',
            'path': ' M 0,0 L0.5,0 L0.5,0.5 L0,0.5 Z',
            'fillcolor': 'rgba(255,255,0,0.1)',
            'line': {
                'width': 1,
                'dash': 'dash'
            }
        },
        # G2M
        {
            'type': 'path',
            'path': ' M 0,0.5 L0,1 L1,1 L0.5,0.5 Z',
            'fillcolor': 'rgba(0,0,255,0.1)',
            'line': {
                'width': 1,
                'dash': 'dash'
            }
        }
    ]
}
fig = {
    'data': traces,
    'layout': layout,
}

iplot(fig)