__Author:__ Bram Van de Sande

__Date:__ 22 MAR 2018

__Outline:__ Assess the AUCell implementation.

In [1]:
import pandas as pd
import seaborn as sns
from pyscenic.genesig import GeneSignature
from pyscenic.aucell import aucell, derive_auc_threshold, create_rankings

In [2]:
# Load the memory_profiler IPython extension, which provides the %memit magic used below.
%load_ext memory_profiler

In [3]:
# Gene signature collection in GMT format ("a module from GeneSigDB (C6)" per the
# original note).
# NOTE(review): the signature names shown in the AUCell output further down
# (e.g. KRAS.LUNG_UP.V1_UP) look like MSigDB's C6 collection -- confirm the source.
GMT_FNAME = "signatures.hgnc.gmt"
# Expression matrix from GEO. It is transposed when loaded, so the matrix used by the
# rest of the notebook is oriented as (cell, gene).
EXPRESSION_MTX_FNAME = "GSE103322.mtx.tsv"

In [4]:
# Parse the GMT file into gene signatures; both the field and the gene separator are tabs.
# 'HGNC' is the second positional argument -- presumably the gene nomenclature; confirm
# against GeneSignature.from_gmt.
signatures = GeneSignature.from_gmt(
    GMT_FNAME,
    'HGNC',
    field_separator='\t',
    gene_separator='\t',
)
# Number of signatures that were loaded.
len(signatures)

189

In [5]:
# Read the tab-separated expression table (first row is the header, first column the
# index) and transpose it so that rows are cells and columns are genes.
ex_matrix = pd.read_csv(
    EXPRESSION_MTX_FNAME,
    sep='\t',
    header=0,
    index_col=0,
).transpose()
# (n_cells, n_genes)
ex_matrix.shape

(5902, 20684)

In [6]:
# Profile the peak memory of one create_rankings call on the full expression matrix.
%memit create_rankings(ex_matrix)

peak memory: 5733.45 MiB, increment: 4447.32 MiB


In [7]:
# Time create_rankings exactly once (-n1: one loop, -r1: one repeat), so the reported
# figure is a single measurement without a meaningful standard deviation.
%timeit -n1 -r1 create_rankings(ex_matrix)

12.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [8]:
# Derive candidate AUC thresholds from the expression matrix. The result is a Series
# indexed by 0.01, 0.05, 0.10, 0.50 and 1.00 (see the output below).
percentiles = derive_auc_threshold(ex_matrix)
percentiles

0.01    0.092294
0.05    0.102497
0.10    0.112067
0.50    0.170712
1.00    0.632421
dtype: float64

In [9]:
# Run AUCell for all signatures on the full expression matrix. The AUC threshold is the
# value stored under the 0.01 label of `percentiles`; num_cores=8 is hard-coded here.
aucs_mtx = aucell(
    ex_matrix,
    signatures,
    auc_threshold=percentiles[0.01],
    num_cores=8,
)
# Preview the result: one row per cell, one column per signature.
aucs_mtx.head()

Regulome,GLI1_UP.V1_DN,GLI1_UP.V1_UP,E2F1_UP.V1_DN,E2F1_UP.V1_UP,EGFR_UP.V1_DN,EGFR_UP.V1_UP,ERB2_UP.V1_DN,ERB2_UP.V1_UP,GCNP_SHH_UP_EARLY.V1_DN,GCNP_SHH_UP_EARLY.V1_UP,...,KRAS.KIDNEY_UP.V1_DN,KRAS.KIDNEY_UP.V1_UP,KRAS.LUNG_UP.V1_DN,KRAS.LUNG_UP.V1_UP,KRAS.LUNG.BREAST_UP.V1_DN,KRAS.LUNG.BREAST_UP.V1_UP,KRAS.PROSTATE_UP.V1_DN,KRAS.PROSTATE_UP.V1_UP,LEF1_UP.V1_DN,LEF1_UP.V1_UP
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HN28_P15_D06_S330_comb,0.099154,0.133946,0.100587,0.065074,0.069673,0.107095,0.051168,0.066695,0.080446,0.025027,...,0.021262,0.051473,0.026772,0.00819,0.045178,0.033161,0.028602,0.024628,0.054168,0.076753
HN28_P6_G05_S173_comb,0.079136,0.118619,0.105842,0.028522,0.054226,0.083728,0.04688,0.070569,0.090524,0.056925,...,0.016059,0.062782,0.015966,0.013072,0.037452,0.034941,0.02511,0.022847,0.042237,0.07031
HN26_P14_D11_S239_comb,0.063197,0.098985,0.058575,0.044917,0.047802,0.051754,0.033969,0.03031,0.058901,0.045749,...,0.010676,0.010213,0.017624,0.005473,0.026113,0.008704,0.007087,0.004038,0.056571,0.012875
HN26_P14_H05_S281_comb,0.04853,0.064955,0.11151,0.036309,0.081354,0.090282,0.065022,0.065199,0.058168,0.056469,...,0.0169,0.046019,0.007634,0.018416,0.02459,0.046674,0.029752,0.026695,0.044156,0.095498
HN26_P25_H09_S189_comb,0.103888,0.106164,0.0904,0.033373,0.063693,0.098212,0.056039,0.056896,0.043951,0.032995,...,0.03652,0.022551,0.048584,0.018322,0.04658,0.0203,0.026427,0.011657,0.044799,0.034251


In [10]:
# Build the per-cell gene rankings that the sanity checks below inspect.
# The result has the same (cell, gene) layout as ex_matrix.
df_rnk = create_rankings(ex_matrix)
df_rnk.head()

Unnamed: 0,FASTKD2,IFNA1,SACS,TAS2R14,SPIC,BRD1,TPM4,LAMC1,TTTY1B,PPYR1,...,FBLN5,BBS12,CDK7,HDAC4,SLC23A1,TEDDM1,FOXQ1,FAM168B,NPLOC4,DCP2
HN28_P15_D06_S330_comb,3941,3942,3943,3944,3945,3946,34,1265,3947,3948,...,20674,20675,20676,20677,20678,20679,20680,20681,20682,20683
HN28_P6_G05_S173_comb,2511,2512,2513,2514,2515,2516,1156,1187,2517,2518,...,20675,20676,444,20677,20678,20679,20680,20681,20682,20683
HN26_P14_D11_S239_comb,2017,6430,4038,6431,6432,3049,6433,6434,6435,6436,...,20678,20679,5210,3965,20680,20681,20682,3218,4353,20683
HN26_P14_H05_S281_comb,2560,3162,3163,3164,3165,3166,246,1500,3167,3168,...,20675,20676,20677,20678,20679,20680,20681,20682,20683,1555
HN26_P25_H09_S189_comb,4777,6819,6820,6821,6822,6823,1608,6824,6825,6826,...,20676,20677,1234,3074,20678,20679,20680,20681,20682,20683


In [11]:
# The ranking matrix should have the same shape as the expression matrix.
df_rnk.shape

(5902, 20684)

In [12]:
# Preview the expression matrix in its original column order, for comparison with the rankings.
ex_matrix.head()

Unnamed: 0,SNORD113-9,MAGEB16,SNORA49,MIR26A1,MIR485,MIR3909,TTTY23,MIR519E,C10orf53,MIR3684,...,GAPDH,UBB,FTL,RPL7,MTRNR2L8,FTH1,ACTB,TMSB4X,MTRNR2L2,B2M
HN28_P15_D06_S330_comb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.945653,0.842134,0.833173,0.824638,1.093898,0.831537,1.072929,0.917554,1.167422,1.061845
HN28_P6_G05_S173_comb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.971751,0.862773,0.968909,0.953399,1.124328,0.923339,0.996584,0.954457,1.170053,0.940295
HN26_P14_D11_S239_comb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.10017,0.821669,0.956934,0.953593,1.119356,0.952363,0.821784,0.819039,1.194907,0.820894
HN26_P14_H05_S281_comb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.705589,0.977141,0.941196,0.881868,1.049143,0.954018,0.916003,0.984108,1.171271,1.080044
HN26_P25_H09_S189_comb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.001442,0.850375,0.874529,0.973545,1.156655,0.938911,0.720506,0.85775,1.227803,0.934479


In [13]:
# Shape of the expression matrix: (n_cells, n_genes).
ex_matrix.shape

(5902, 20684)

In [14]:
# Shuffle all columns (frac=1.0 without replacement along axis=1) and preview the result.
# NOTE(review): no random_state is given, so the column order differs on every run and
# the recorded output is not reproducible.
ex_matrix.sample(frac=1.0, replace=False, axis=1).head()

Unnamed: 0,LRRC10,C12orf40,DBF4,NIPA2,BLOC1S1,BMP8A,CSF2RA,DDN,GTF2IRD2P1,KRIT1,...,PFN3,POLL,GRM5,DNASE1L1,ESR1,MIR3646,MRPL24,RPL34,RPAIN,SPAG4
HN28_P15_D06_S330_comb,0.0,0.0,0.0,0.574693,0.0,0.0,0.056355,0.0,0.0,0.386392,...,0.0,0.0,0.0,0.0,0.0,0.0,0.498271,0.557611,0.0,0.0
HN28_P6_G05_S173_comb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.440144,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.446616,0.0,0.0
HN26_P14_D11_S239_comb,0.0,0.0,0.0,0.0,0.319468,0.326089,0.0,0.0,0.0,0.207755,...,0.0,0.0,0.0,0.0,0.0,0.0,0.344226,0.45172,0.273874,0.0
HN26_P14_H05_S281_comb,0.0,0.0,0.0,0.0,0.487497,0.020447,0.0,0.0,0.012077,0.0,...,0.0,0.0,0.0,0.0,0.256479,0.0,0.0,0.575303,0.0,0.0
HN26_P25_H09_S189_comb,0.010119,0.0,0.0,0.0,0.34136,0.08436,0.0,0.0,0.0,0.011687,...,0.0,0.377368,0.0,0.018955,0.0,0.0,0.063789,0.559325,0.322401,0.0


In [15]:
# Repeat the unseeded column shuffle. The recorded output shows a different column order
# than the preceding cell, i.e. each call yields a new permutation.
ex_matrix.sample(frac=1.0, replace=False, axis=1).head()

Unnamed: 0,GDF6,MARVELD2,RNF123,IL20RB,C1orf106,EVC2,TWSG1,OR2A20P,TNFRSF10A,N4BP1,...,ISG20,TYR,MCOLN3,ZNF428,PTPRG,NDUFA9,RIN2,OR5C1,SLC22A13,CTSG
HN28_P15_D06_S330_comb,0.0,0.047957,0.0,0.0,0.0,0.0,0.687455,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.32496,0.559639,0.362015,0.0,0.0,0.0
HN28_P6_G05_S173_comb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.441016,...,0.0,0.0,0.0,0.0,0.0,0.401925,0.0,0.0,0.0,0.0
HN26_P14_D11_S239_comb,0.0,0.345828,0.0,0.054676,0.210875,0.0,0.204591,0.0,0.336877,0.0,...,0.414081,0.0,0.0,0.245861,0.0,0.624896,0.272417,0.0,0.0,0.0
HN26_P14_H05_S281_comb,0.0,0.0,0.0,0.0,0.015358,0.0,0.0,0.0,0.0,0.0,...,0.704146,0.0,0.0,0.0,0.0,0.0,0.565675,0.0,0.0,0.0
HN26_P25_H09_S189_comb,0.0,0.0,0.0,0.0,0.385884,0.0,0.0,0.0,0.0,0.0,...,0.420455,0.0,0.0,0.0,0.0,0.6179,0.261471,0.0,0.0,0.0


In [16]:
# Count the distinct column labels that remain after a full column shuffle.
len(
    ex_matrix
    .sample(frac=1.0, replace=False, axis=1)
    .columns
    .unique()
)

20684

In [17]:
# Count the distinct column labels of the unshuffled matrix.
# A result equal to ex_matrix.shape[1] means there are no duplicate gene names.
len(ex_matrix.columns.unique())

20684

In [25]:
# Sanity check on the rankings: shift the ranks by one (they start at 0, as the sorted
# ranks shown later in the notebook indicate) and sum them per cell. A single unique
# value equal to n*(n+1)/2, with n = number of genes, is consistent with every row
# holding each rank 0..n-1 exactly once.
(df_rnk + 1).sum(axis=1).unique()

array([213924270])

In [23]:
# Expected value of 1 + 2 + ... + n, where n is the number of columns (genes) of ex_matrix.
ex_matrix.shape[1] * (ex_matrix.shape[1] + 1) / 2.0

213924270.0

In [29]:
# Ranks of all genes for one cell, ordered from the lowest rank value upward.
# Despite the name, .loc selects a row (cell) of df_rnk here, not a column.
col1 = df_rnk.loc['HN28_P15_D06_S330_comb'].sort_values(ascending=True)

In [30]:
# Expression values of all genes for the same cell, ordered from highest to lowest.
# Despite the name, .loc selects a row (cell) of ex_matrix here, not a column.
col2 = ex_matrix.loc['HN28_P15_D06_S330_comb'].sort_values(ascending=False)

In [39]:
# The 25 genes with the lowest rank values for this cell.
# Their order should match the genes with the highest expression values.
col1.head(25)

MTRNR2L2     0
ACTA2        1
MTRNR2L8     2
ACTB         3
B2M          4
SPARCL1      5
TAGLN        6
VIM          7
IGFBP7       8
A2M          9
CTGF        10
GAPDH       11
TPM1        12
MYL6        13
CSRP2       14
EEF1A1      15
IFITM3      16
IFITM1      17
S100A4      18
TMSB4X      19
UBC         20
RPL9        21
FOS         22
ACTG2       23
TPM2        24
Name: HN28_P15_D06_S330_comb, dtype: uint32

In [40]:
# The 25 most highly expressed genes for this cell.
# Their order should match the genes with the lowest rank values.
col2.head(25)

MTRNR2L2    1.167422
ACTA2       1.146134
MTRNR2L8    1.093898
ACTB        1.072929
B2M         1.061845
SPARCL1     1.047538
TAGLN       1.041804
VIM         1.034850
IGFBP7      0.980954
A2M         0.976950
CTGF        0.952751
GAPDH       0.945653
TPM1        0.941099
MYL6        0.940129
CSRP2       0.938331
EEF1A1      0.925513
IFITM3      0.924951
IFITM1      0.921931
S100A4      0.919569
TMSB4X      0.917554
UBC         0.914924
RPL9        0.914358
FOS         0.912780
ACTG2       0.909681
TPM2        0.904457
Name: HN28_P15_D06_S330_comb, dtype: float64