# Using pretrained mouse marker pairs on the Oscope dataset

In [17]:
import sys
# Relative path pointing three directories above the working directory.
# The cells below build their data file paths as `base + "data/..."`.
base = "./../../../"
# Make modules located under `base` importable
# (presumably where `pairs_flat_v2` and `helper` live -- TODO confirm).
sys.path.append(base)

In [18]:
import json
import pandas
import pairs_flat_v2 as pairs
import helper
from sklearn.preprocessing import QuantileTransformer
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import numpy as np

init_notebook_mode(connected=True)

## Loading Mouse-Human Orthologues

In [19]:
# Build a mouse -> human gene mapping from the comma-separated orthologue export.
# Column 0 is used as the mouse gene key and column 2 as the human gene value
# (NOTE(review): column meaning is inferred from usage -- confirm against the file header).
genes_mouse_to_human = {}
count = 0           # rows whose mouse gene had already been seen before
ambigious = set()   # mouse genes that occur with more than one mapping row

# Use a context manager so the file handle is closed deterministically,
# even when a malformed row raises while parsing.
with open(base + "data/biomart_mouse-human-orthologs.txt") as orthologs_file:
    next(orthologs_file, None)  # skip the header row
    for line in orthologs_file:
        infos = line.split(",")
        if infos[2] != "":
            if infos[0] in genes_mouse_to_human:
                # Second (or later) mapping for the same mouse gene: mark it so
                # the gene can be dropped entirely once the file is read.
                ambigious.add(infos[0])
                count += 1
            genes_mouse_to_human[infos[0]] = infos[2]

# Genes with more than one mapping are removed instead of picking one arbitrarily.
for gene in ambigious:
    del genes_mouse_to_human[gene]

print("{} ambiguous mappings skipped".format(count))
print("{} genes remain".format(len(genes_mouse_to_human)))

6912 ambiguous mappings skipped
18962 genes remain


## Loading Mouse Marker Pairs

In [20]:
# Load the pretrained mouse marker pairs. The JSON is indexed per phase and
# holds two parallel lists, "first" and "second", that together form the pairs.
# The context manager closes the file handle once parsing is done.
with open(base + 'data/mouse_pretrained-pairs.json') as marker_file:
    marker_json = json.load(marker_file)

# Pair up the parallel lists for each phase (mouse gene identifiers).
mouse_marker_pairs = {
    phase: list(zip(marker_json[phase]["first"], marker_json[phase]["second"]))
    for phase in ("G1", "S", "G2M")
}

# Translate every pair to human genes; a pair is kept only when BOTH genes have
# an entry in the mouse -> human mapping. The comprehension variable is named
# `phase_pairs` so it does not shadow the imported `pairs` module.
mm_marker_pairs = {
    phase: [
        (genes_mouse_to_human[first], genes_mouse_to_human[second])
        for first, second in phase_pairs
        if first in genes_mouse_to_human and second in genes_mouse_to_human
    ]
    for phase, phase_pairs in mouse_marker_pairs.items()
}

# Report how many translated pairs remain overall and per phase.
phase_counts = {phase: len(kept) for phase, kept in mm_marker_pairs.items()}

print("The {} marker pairs loaded".format(
          phase_counts["G1"] + phase_counts["S"] + phase_counts["G2M"]
      )
     )
print("Split up into: "
      "{} G1 , {} S and {} G2M pairs".format(
          phase_counts["G1"],
          phase_counts["S"],
          phase_counts["G2M"]
      )
     )

The 26545 marker pairs loaded
Split up into: 11259 G1 , 5920 S and 9366 G2M pairs


## Loading Oscope Dataset

In [21]:
# Read the expression table and index it by its first (unnamed) column,
# which holds the gene symbols as shown in the preview below.
oscope_csv = base + "data/GSE64016_H1andFUCCI_normalized_EC_human.csv"
gencounts_oscope = pandas.read_csv(oscope_csv).set_index("Unnamed: 0")

# Keep only the columns whose name carries a phase tag, in their original order.
phase_tags = ("G1_", "G2_", "S_")
phase_column_positions = [
    position
    for position, column in enumerate(gencounts_oscope.columns)
    if any(tag in column for tag in phase_tags)
]
gencounts_oscope_sorted = gencounts_oscope.iloc[:, phase_column_positions]

gencounts_oscope.head(10)

Unnamed: 0_level_0,H1_Exp1.001,H1_Exp1.002,H1_Exp1.003,H1_Exp1.004,H1_Exp1.006,H1_Exp1.007,H1_Exp1.008,H1_Exp1.009,H1_Exp1.010,H1_Exp1.011,...,G1_Exp1.008,G1_Exp1.055,G1_Exp1.050,G1_Exp1.076,G1_Exp1.011,G1_Exp1.063,G1_Exp1.083,G1_Exp1.030,G1_Exp1.018,G1_Exp1.046
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MKL2,24.148634,285.530829,6.481959,107.12962,0.0,5.316709,42.809004,0.0,267.202286,2.838761,...,3.887628,84.337868,69.192927,1.126491,1.13733,0.0,36.741767,11.218839,152.79286,123.041274
CD109,2.414863,2.238421,341.512799,14.896119,16.807235,115.372585,7.991014,154.389316,16.663439,12.022155,...,4.956726,7.208501,4.299846,7.626347,5.936864,2.797575,149.063512,2.80471,15.996667,7.077119
ABTB1,0.0,49.351007,0.0,2.550705,0.0,23.92519,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MAST2,0.0,234.417285,88.586769,0.0,0.0,5.316709,0.0,0.0,0.0,11.355046,...,0.971907,15.069427,0.0,60.830538,22.746604,0.0,0.720427,0.0,5.589983,33.230125
KAT5,0.0,12.443504,114.341752,51.422218,0.0,16.72105,0.0,0.0,151.218876,173.817364,...,0.0,96.041736,0.0,0.0,0.0,0.0,1.03021,55.09151,4.826018,0.0
WWC2,205.118496,8.81268,658.999139,104.70645,374.666774,394.180805,0.0,219.069622,539.029228,725.303552,...,169.11182,233.122597,656.509085,412.712674,1075.186455,661.889223,104.641994,322.527589,47.514853,67.124853
CD163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MYL2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UBE2Z,43.467541,257.41838,230.606485,22.956347,36.938978,295.077349,23.78278,43.813924,39.6399,102.195412,...,138.010796,58.87738,538.955243,5.632457,71.651801,0.656708,2.881707,610.984967,0.0,3.215239
RGPD4,0.0,8.759804,0.0,0.0,0.0,0.0,0.0,8.708017,3.083103,0.0,...,0.0,0.0,1.91928,3.559713,0.0,0.0,2.845686,0.0,17.505962,0.0


## Predicting

In [22]:
# Run cyclone on the phase-tagged samples with the translated marker pairs,
# using 10 processes and verbose progress output.
prediction = pairs.cyclone(
    gencounts_oscope_sorted, mm_marker_pairs, processes=10, verbose=True
)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Matrix truncation done. Working with 19084 genes for 247 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 2836 marker pairs. 26545 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): S: 99, G2M: 4, G1: 144


In [23]:
# Turn the raw cyclone result into a table and show it through the helper's viewer.
prediction_table = helper.get_prediction_table(
    prediction
)
helper.DataTable(
    prediction_table
)

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
G2_Exp1.059,0.48,0.011,0.396,0.54115,0.012401,0.446449,S
G2_Exp1.069,0.062,0.504,0.053,0.100162,0.814216,0.085622,G2M
G2_Exp1.075,0.004,0.258,0.633,0.004469,0.288268,0.707263,S
G2_Exp1.063,0.043,0.426,0.136,0.071074,0.704132,0.224793,S
G2_Exp1.029,0.281,0.051,0.248,0.484483,0.087931,0.427586,S
G2_Exp1.076,0.042,0.414,0.219,0.062222,0.613333,0.324444,S
G2_Exp1.013,0.367,0.032,0.416,0.450307,0.039264,0.510429,S
G2_Exp1.037,0.835,0.003,0.235,0.778192,0.002796,0.219012,G1
G2_Exp1.057,0.646,0.0,0.233,0.734926,0.0,0.265074,G1
G2_Exp1.018,0.088,0.5,0.097,0.128467,0.729927,0.141606,G2M


In [24]:
# Pull the three per-phase score columns out of the table, then plot them as a scatter.
g1_scores, s_scores, g2m_scores = (
    prediction_table.loc[:, phase] for phase in ("G1", "S", "G2M")
)
sample_names = list(prediction_table.index)

helper.plot_prediction(
    g1_scores, s_scores, g2m_scores,
    samples=sample_names, t="scatter", title="Phase assignment"
)

{'data': [{'marker': {'color': 'black', 'size': 10, 'symbol': 'circle'},
   'mode': 'markers+text',
   'name': 'Sample0',
   'text': ['G2_Exp1.059',
    'G2_Exp1.069',
    'G2_Exp1.075',
    'G2_Exp1.063',
    'G2_Exp1.029',
    'G2_Exp1.076',
    'G2_Exp1.013',
    'G2_Exp1.037',
    'G2_Exp1.057',
    'G2_Exp1.018',
    'G2_Exp1.015',
    'G2_Exp1.019',
    'G2_Exp1.050',
    'G2_Exp1.004',
    'G2_Exp1.061',
    'G2_Exp1.042',
    'G2_Exp1.060',
    'G2_Exp1.058',
    'G2_Exp1.065',
    'G2_Exp1.002',
    'G2_Exp1.044',
    'G2_Exp1.051',
    'G2_Exp1.073',
    'G2_Exp1.030',
    'G2_Exp1.028',
    'G2_Exp1.022',
    'G2_Exp1.034',
    'G2_Exp1.017',
    'G2_Exp1.047',
    'G2_Exp1.072',
    'G2_Exp1.074',
    'G2_Exp1.054',
    'G2_Exp1.024',
    'G2_Exp1.032',
    'G2_Exp1.020',
    'G2_Exp1.064',
    'G2_Exp1.045',
    'G2_Exp1.038',
    'G2_Exp1.001',
    'G2_Exp1.049',
    'G2_Exp1.031',
    'G2_Exp1.039',
    'G2_Exp1.070',
    'G2_Exp1.007',
    'G2_Exp1.021',
    'G2_Exp1.03

In [25]:
p = ["G1","S","G2M"]

# Ordered (column tag, phase label) rules; the first tag found in a column name wins,
# and a column matching no tag contributes no label.
tag_to_phase = (("G1_", "G1"), ("S_", "S"), ("G2_", "G2M"))

label = []
for column in gencounts_oscope_sorted.columns:
    for tag, phase in tag_to_phase:
        if tag in column:
            label.append(phase)
            break

In [26]:
# Compare the predictions on the raw counts with the labels derived from the column names.
evaluation = helper.evaluate_prediction(prediction_table, label)

# Build the score figure first, then render it inline.
evaluation_figure = helper.plot_evaluation(
    *evaluation,
    average=True,
    xaxislbl=["G1","S","G2M"],
    title="Prediction Scores per Phase for unnormalized gene counts"
)
iplot(evaluation_figure, image='svg')

F1 Score: G1: 0.4936170212765958, S: 0.13407821229050276, G2M: 0.1
Reacall: G1: 0.6373626373626373, S: 0.15, G2M: 0.05263157894736842 
Precision: G1: 0.4027777777777778, S: 0.12121212121212122, G2M: 1.0 


In [27]:
# Quantile-transform the expression matrix. The frame is transposed before fitting
# (samples become rows, genes become columns) and transposed back afterwards,
# so the result keeps the original gene index and sample columns.
x = gencounts_oscope.T.values
quantile_transformer = QuantileTransformer()
X_std = quantile_transformer.fit_transform(x.astype(float))

gencounts_oscope_normalized = pandas.DataFrame(
    X_std.T,
    index=gencounts_oscope.index,
    columns=gencounts_oscope.columns
)

# Restrict to the phase-tagged columns, keeping their original order.
normalized_phase_positions = [
    position
    for position, column in enumerate(gencounts_oscope_normalized.columns)
    if any(tag in column for tag in ("G1_", "G2_", "S_"))
]
gencounts_oscope_normalized_sorted = gencounts_oscope_normalized.iloc[
    :, normalized_phase_positions
]
gencounts_oscope_normalized_sorted.head(10)

Unnamed: 0_level_0,G2_Exp1.059,G2_Exp1.069,G2_Exp1.075,G2_Exp1.063,G2_Exp1.029,G2_Exp1.076,G2_Exp1.013,G2_Exp1.037,G2_Exp1.057,G2_Exp1.018,...,G1_Exp1.008,G1_Exp1.055,G1_Exp1.050,G1_Exp1.076,G1_Exp1.011,G1_Exp1.063,G1_Exp1.083,G1_Exp1.030,G1_Exp1.018,G1_Exp1.046
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MKL2,0.5948365,0.4251952,0.2353241,0.45062,0.3248365,0.8494862,0.6035429,1e-07,0.564266,0.9781442,...,0.3512828,0.7757691,0.7407407,0.2723152,0.2772233,1e-07,0.6386019,0.4684572,0.868922,0.8305803
CD109,0.100305,0.3201135,0.7909013,0.03080467,0.08501255,0.5030089,0.7447545,0.9152499,0.05667066,0.357342,...,0.2592593,0.3856201,0.2323796,0.40253,0.3136761,0.126431,0.8998772,0.1290721,0.5925926,0.3766069
ABTB1,0.9020715,1e-07,1e-07,0.9211207,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,...,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07
MAST2,0.4792734,0.4836901,0.8627706,1e-07,0.7818149,1e-07,0.8561407,1e-07,1e-07,0.9150147,...,0.5272325,0.7318324,1e-07,0.845494,0.7754613,1e-07,0.5014795,1e-07,0.6666667,0.8039964
KAT5,0.5337311,0.4465861,1e-07,0.9829656,1e-07,1e-07,0.7804196,0.509615,0.975995,1e-07,...,1e-07,0.853874,1e-07,1e-07,1e-07,1e-07,0.4663023,0.766857,0.5425416,1e-07
WWC2,0.1460928,0.8388357,0.7306529,0.1270348,0.5951589,0.5510902,0.8825024,0.3772122,0.5226564,0.8779768,...,0.2723594,0.346951,0.8539372,0.6712327,0.9738533,0.8583905,0.157137,0.5029423,0.06746602,0.1111111
CD163,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,...,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07
MYL2,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,...,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07
UBE2Z,0.9478934,0.6929424,0.1960402,0.1851852,0.4357608,0.5512878,0.919791,0.6493849,0.6947714,0.2418189,...,0.6818743,0.4377827,0.9805805,0.1266152,0.477079,0.06317808,0.1021025,0.9890268,1e-07,0.1048298
RGPD4,1e-07,0.6426691,1e-07,1e-07,0.9545437,0.8388375,1e-07,1e-07,0.5947432,1e-07,...,1e-07,1e-07,0.6145896,0.6882182,1e-07,1e-07,0.6585361,1e-07,0.9129127,1e-07


In [28]:
# Repeat the cyclone prediction, this time on the quantile-transformed counts.
prediction = pairs.cyclone(
    gencounts_oscope_normalized_sorted,
    mm_marker_pairs,
    verbose=True
)

[__set_matrix] Original Matrix 'x' has shape 19084 x 247
[__set_matrix] Matrix truncation done. Working with 19084 genes for 247 samples.
[cyclone] Preparing marker pairs, where at least one gene was not present in 'x'... Done!
[cyclone] Removed 2836 marker pairs. 26545 marker pairs remaining.
[cyclone] Calculating scores and predicting cell cycle phase... Done!
[cyclone] Calculated scores and prediction (phase: count): G2M: 94, S: 29, G1: 124


In [29]:
# Tabulate the prediction made on the quantile-transformed counts and display it.
prediction_table = helper.get_prediction_table(
    prediction
)
helper.DataTable(
    prediction_table
)

Unnamed: 0_level_0,G1,G2M,S,G1_norm,G2M_norm,S_norm,prediction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
G2_Exp1.059,0.185,0.705,0.508,0.132332,0.504292,0.363376,G2M
G2_Exp1.069,0.003,1.0,0.174,0.002549,0.849618,0.147833,G2M
G2_Exp1.075,0.0,0.995,0.921,0.0,0.519311,0.480689,G2M
G2_Exp1.063,0.022,0.989,0.094,0.01991,0.895023,0.085068,G2M
G2_Exp1.029,0.167,0.914,0.388,0.113683,0.622192,0.264125,G2M
G2_Exp1.076,0.01,1.0,0.263,0.007855,0.785546,0.206599,G2M
G2_Exp1.013,0.446,0.473,0.455,0.3246,0.34425,0.33115,S
G2_Exp1.037,0.866,0.194,0.593,0.523896,0.117362,0.358742,G1
G2_Exp1.057,0.693,0.017,0.71,0.488028,0.011972,0.5,G1
G2_Exp1.018,0.033,0.993,0.212,0.026656,0.8021,0.171244,G2M


In [30]:
# Pull the three per-phase score columns out of the table, then plot them in "pie" mode.
g1_scores, s_scores, g2m_scores = (
    prediction_table.loc[:, phase] for phase in ("G1", "S", "G2M")
)
sample_names = list(prediction_table.index)

helper.plot_prediction(
    g1_scores, s_scores, g2m_scores,
    samples=sample_names,
    t="pie", title="Phase assignment", width=600, height=600
)

{'data': [{'marker': {'color': 'black', 'size': 10, 'symbol': 'circle'},
   'mode': 'markers+text',
   'name': 'Sample0',
   'text': [' 0',
    ' 1',
    ' 2',
    ' 3',
    ' 4',
    ' 5',
    ' 6',
    ' 7',
    ' 8',
    ' 9',
    ' 10',
    ' 11',
    ' 12',
    ' 13',
    ' 14',
    ' 15',
    ' 16',
    ' 17',
    ' 18',
    ' 19',
    ' 20',
    ' 21',
    ' 22',
    ' 23',
    ' 24',
    ' 25',
    ' 26',
    ' 27',
    ' 28',
    ' 29',
    ' 30',
    ' 31',
    ' 32',
    ' 33',
    ' 34',
    ' 35',
    ' 36',
    ' 37',
    ' 38',
    ' 39',
    ' 40',
    ' 41',
    ' 42',
    ' 43',
    ' 44',
    ' 45',
    ' 46',
    ' 47',
    ' 48',
    ' 49',
    ' 50',
    ' 51',
    ' 52',
    ' 53',
    ' 54',
    ' 55',
    ' 56',
    ' 57',
    ' 58',
    ' 59',
    ' 60',
    ' 61',
    ' 62',
    ' 63',
    ' 64',
    ' 65',
    ' 66',
    ' 67',
    ' 68',
    ' 69',
    ' 70',
    ' 71',
    ' 72',
    ' 73',
    ' 74',
    ' 75',
    ' 76',
    ' 77',
    ' 78',
    ' 79',


In [31]:
# Score the prediction made on the quantile-transformed counts against the
# labels derived from the column names.
evaluation = helper.evaluate_prediction(prediction_table, label=label)

# The title names the normalized input: this cell evaluates the prediction from
# `gencounts_oscope_normalized_sorted`, not the raw counts evaluated earlier.
iplot(
    helper.plot_evaluation(
        *evaluation,
        average=True,
        xaxislbl=["G1","S","G2M"],
        title="Prediction Scores per Phase for quantile-normalized gene counts"
    ),
    image='svg'
)

F1 Score: G1: 0.4837209302325581, S: 0.20183486238532108, G2M: 0.6705882352941177
Reacall: G1: 0.5714285714285714, S: 0.1375, G2M: 0.75 
Precision: G1: 0.41935483870967744, S: 0.3793103448275862, G2M: 0.6063829787234043 
