In [1]:
from IPython.display import HTML, display

import h5py
import numpy as np
import random

import tabulate

In [2]:
# link for file download: 
# javascript:downloadFile('https://s3.amazonaws.com/mssm-seq-matrix/human_transcript_v8.h5','human_transcript.h5','8')
f = h5py.File('./data/human_transcript_v8.h5', 'r')

# list of all sample sources
all_sources = list(f['meta']['Sample_source_name_ch1'][()])
# list of all transcript names
all_transcripts = list(f['meta']['transcripts'][()])

print('No of sources: {}'.format(len(all_sources)))
print('No of transcripts: {}'.format(len(all_transcripts)))

No of sources: 238522
No of transcripts: 178136


In [3]:
# find indices of samples with "AML" in their source names
ind_ = []

for k in range(len(all_sources)):
    if 'AML' in all_sources[k].decode('utf-8'):
        ind_.append(k)

print('Sources with "AML" in the name: {}'.format(len(ind_)))

Sources with "AML" in the name: 678


In [4]:
# create a table with all the sample expression arrays (AML): 
# rows - samples, cols - transcripts, elements - No of reads, not normalised
AML_expr = []

for i in ind_:
    AML_expr.append(list(f['data']['expression'][i]))

AML_expr = np.array(AML_expr)
print(np.shape(AML_expr))

# find normalised average of all th expr levels in AML samples
AML_expr_average = np.sum(AML_expr, axis=0)/np.shape(AML_expr)[0]

# create a table with all the sample expression arrays (random): 
# rows - samples, cols - transcripts, elements - No of reads, not normalised
rand_expr = []

ind = [random.randrange(0, len(all_sources)) for x in range(0,1000)]
for i in ind:
    rand_expr.append(list(f['data']['expression'][i]))
    
print(np.shape(rand_expr))

# find normalised average of all th expr levels in 1000 random samples
rand_expr_average = np.sum(rand_expr, axis=0)/np.shape(rand_expr)[0]

# find the difference between AML and random sets expression
diff_expr = AML_expr_average - rand_expr_average

(678, 178136)
(1000, 178136)


In [5]:
# find the 1000 transcripts with the highest +- diff expression
ind_max = np.argsort(diff_expr, axis=-1)[-1000:]
ind_min = np.argsort(diff_expr, axis=-1)[:1000]

pos_dif_tr = [all_transcripts[i].decode('utf-8') for i in ind_max]
neg_dif_tr = [all_transcripts[i].decode('utf-8') for i in ind_min]

# find respective gene names from GENCODE v34 comprehensive, hg38 edition
comprehensive = np.genfromtxt('./data/GENCODE_v34_hg38_comprehensive', usecols=(1, 12), dtype='str')

pos_dif_gen = []
neg_dif_gen = []

for tr in pos_dif_tr:
    for row in comprehensive:
        if tr[:15] in row[0]:
            pos_dif_gen.append(row[1])
            break
        if row[0]==comprehensive[-1][0]:
            pos_dif_gen.append('not found')   
            
for tr in neg_dif_tr:
    for row in comprehensive:
        if tr[:15] in row[0]:
            neg_dif_gen.append(row[1])
            break
        if row[0]==comprehensive[-1][0]:
            neg_dif_gen.append('not found')  

# print a table with the highest +- diff expression transcripts and genes between AML and random
table = []

table.append(pos_dif_tr)
table.append(pos_dif_gen)
table.append(['+'+str(int(diff_expr[i])) for i in ind_max][::-1])
table.append(neg_dif_tr)
table.append(neg_dif_gen)
table.append([int(diff_expr[i]) for i in ind_min])

display(HTML(tabulate.tabulate(np.array(table).T, tablefmt='html')))

0,1,2,3,4,5
ENST00000357103.4,ADIPOR2,251040,ENST00000251595.10,HBA2,-38068
ENST00000375048.7,DDOST,201941,ENST00000320868.9,HBA1,-27626
ENST00000305885.2,FEN1,123128,ENST00000307407.7,CXCL8,-20454
ENST00000535572.5,WNK1,71733,ENST00000225964.9,COL1A1,-17604
ENST00000336095.10,RNF24,60039,ENST00000335295.4,HBB,-16080
ENST00000394668.2,RPL34,57898,ENST00000446046.5,FN1,-14698
ENST00000341068.7,ANAPC1,53574,ENST00000443816.5,FN1,-14542
ENST00000454306.6,PRRC2A,50879,ENST00000445125.2,not found,-13772
ENST00000398571.6,USP34,47664,ENST00000390237.2,IGKC,-8567
ENST00000583389.5,EIF4A1,45860,ENST00000620041.4,FTH1,-7987


In [6]:
table_diff_rand = []

table_diff_rand.append(pos_dif_tr+neg_dif_tr)
table_diff_rand.append(pos_dif_gen+neg_dif_gen)
table_diff_rand.append(['+'+str(int(diff_expr[i])) for i in ind_max][::-1] +
             [int(diff_expr[i]) for i in ind_min])

np.savetxt('./lists/diff_AML_rand', np.array(table_diff_rand).T, fmt='%s', delimiter='\t')

In [7]:
genes_AML_23 = np.loadtxt('./lists/clinSEQ_AML_23', dtype='str', delimiter='\t')
genes_AML_716 = np.loadtxt('./lists/TCGA-AML_AS_716', dtype='str', delimiter='\t')
genes_AML_222 = np.loadtxt('./lists/TCGA-AML_clinSEQ_AS_common_222', dtype='str', delimiter='\t')

# check how many of the AML-predictive genes we'll find in 
# the differential expr sets of genes

i = 0
for gene in genes_AML_23:
    if gene in pos_dif_gen+neg_dif_gen:
        i += 1
print('For ClinSEQ AML gene set {} out of {} are in the diff_AML_rand list'
     .format(i, len(genes_AML_23)))

i = 0
for gene in genes_AML_716:
    if gene in pos_dif_gen+neg_dif_gen:
        i += 1
print('For TCGA-AML gene set {} out of {} are in the diff_AML_rand list'
     .format(i, len(genes_AML_716)))

i = 0
for gene in genes_AML_222:
    if gene in pos_dif_gen+neg_dif_gen:
        i += 1
print('For TCGA-AML + ClinSEQ gene set {} out of {} are in the diff_AML_rand list'
     .format(i, len(genes_AML_222)))

For ClinSEQ AML gene set 14 out of 23 are in the diff_AML list
For TCGA-AML gene set 115 out of 716 are in the diff_AML list
For TCGA-AML + ClinSEQ gene set 56 out of 222 are in the diff_AML list


In [8]:
# find indices of samples with blood cell keywords in their source names
ind_ = []

for k in range(len(all_sources)):
    if 'peripheral blood' in all_sources[k].decode('utf-8').lower():
        ind_.append(k)
    if 'marrow' in all_sources[k].decode('utf-8').lower():
        ind_.append(k)
    if 'hematopoietic' in all_sources[k].decode('utf-8').lower():
        ind_.append(k)

print('Sources with "peripheral blood", "marrow" and "hematopoietic" in their name: {}'.format(len(ind_)))

Sources with "peripheral blood", "marrow" and "hematopoietic" in their name: 13007


In [9]:
blood_expr = []

for i in ind_[:2000]:
    blood_expr.append(list(f['data']['expression'][i]))

blood_expr = np.array(blood_expr)
print(np.shape(blood_expr))

# find normalised average of all the expr levels in AML samples
blood_expr_average = np.sum(blood_expr, axis=0)/np.shape(blood_expr)[0]

blood_diff_expr = AML_expr_average - blood_expr_average

ind_max_b = np.argsort(blood_diff_expr, axis=-1)[-1000:]
ind_min_b = np.argsort(blood_diff_expr, axis=-1)[:1000]

pos_dif_tr_b = [all_transcripts[i].decode('utf-8') for i in ind_max_b]
neg_dif_tr_b = [all_transcripts[i].decode('utf-8') for i in ind_min_b]


(2000, 178136)


In [10]:
pos_dif_gen_b = []
neg_dif_gen_b = []

for tr in pos_dif_tr_b:
    for row in comprehensive:
        if tr[:15] in row[0]:
            pos_dif_gen_b.append(row[1])
            break
        if row[0]==comprehensive[-1][0]:
            pos_dif_gen_b.append('not found')   
            
for tr in neg_dif_tr_b:
    for row in comprehensive:
        if tr[:15] in row[0]:
            neg_dif_gen_b.append(row[1])
            break
        if row[0]==comprehensive[-1][0]:
            neg_dif_gen_b.append('not found')  

# print a table with the highest +- diff expression transcripts and genes between AML and random
table_bl = []

table_bl.append(pos_dif_tr_b)
table_bl.append(pos_dif_gen_b)
table_bl.append(['+'+str(int(blood_diff_expr[i])) for i in ind_max_b][::-1])
table_bl.append(neg_dif_tr_b)
table_bl.append(neg_dif_gen_b)
table_bl.append([int(blood_diff_expr[i]) for i in ind_min_b])

display(HTML(tabulate.tabulate(np.array(table_bl).T, tablefmt='html')))

0,1,2,3,4,5
ENST00000264028.4,ARCN1,335176,ENST00000335295.4,HBB,-45423
ENST00000450554.6,U2AF2,211583,ENST00000251595.10,HBA2,-33854
ENST00000199389.10,EIF2AK1,174646,ENST00000320868.9,HBA1,-33306
ENST00000257829.7,NAT10,130390,ENST00000307407.7,CXCL8,-17145
ENST00000435706.6,TOP2B,80163,ENST00000390237.2,IGKC,-16603
ENST00000265896.9,SQLE,72381,ENST00000546260.5,SOD2,-15389
ENST00000301785.5,HNRNPUL2,68104,ENST00000368738.3,S100A9,-14285
ENST00000349736.9,CSNK2A1,62803,ENST00000394936.7,PSAP,-10505
ENST00000263238.6,ACTR3,59760,ENST00000263341.6,IL1B,-8184
ENST00000393043.5,CLTC,58928,ENST00000582401.5,TXNIP,-7669


In [11]:
table_diff_bl = []

table_diff_bl.append(pos_dif_tr_b+neg_dif_tr_b)
table_diff_bl.append(pos_dif_gen_b+neg_dif_gen_b)
table_diff_bl.append(['+'+str(int(blood_diff_expr[i])) for i in ind_max_b][::-1] +
             [int(blood_diff_expr[i]) for i in ind_min_b])

np.savetxt('./lists/diff_AML_blood', np.array(table_diff_bl).T, fmt='%s', delimiter='\t')

In [12]:
# check how many of the AML-predictive genes we'll find in 
# the differential (blood) expr sets of genes

i = 0
for gene in genes_AML_23:
    if gene in pos_dif_gen_b+neg_dif_gen_b:
        i += 1
print('For ClinSEQ AML gene set {} out of {} are in the diff_AML_blood list'
     .format(i, len(genes_AML_23)))

i = 0
for gene in genes_AML_716:
    if gene in pos_dif_gen_b+neg_dif_gen_b:
        i += 1
print('For TCGA-AML gene set {} out of {} are in the diff_AML_blood list'
     .format(i, len(genes_AML_716)))

i = 0
for gene in genes_AML_222:
    if gene in pos_dif_gen_b+neg_dif_gen_b:
        i += 1
print('For TCGA-AML + ClinSEQ gene set {} out of {} are in the diff_AML_blood list'
     .format(i, len(genes_AML_222)))

For ClinSEQ AML gene set 14 out of 23 are in the diff_AML_blood list
For TCGA-AML gene set 118 out of 716 are in the diff_AML_blood list
For TCGA-AML + ClinSEQ gene set 55 out of 222 are in the diff_AML_blood list
