In [11]:
from IPython.display import HTML, display

import h5py
import numpy as np
import random

import tabulate

In [12]:
# link for file download: 
# javascript:downloadFile('https://s3.amazonaws.com/mssm-seq-matrix/human_transcript_v8.h5','human_transcript.h5','8')
f = h5py.File('./data/human_transcript_v8.h5', 'r')

# list of all sample sources
all_sources = list(f['meta']['Sample_source_name_ch1'][()])
# list of all transcript names
all_transcripts = list(f['meta']['transcripts'][()])

print('No of sources: {}'.format(len(all_sources)))
print('No of transcripts: {}'.format(len(all_transcripts)))

No of sources: 238522
No of transcripts: 178136


In [13]:
# find indices of samples with "AML" in their source names
ind_ = []

for k in range(len(all_sources)):
    if 'AML' in all_sources[k].decode('utf-8'):
        ind_.append(k)        

print('Sources with "AML": {}'.format(len(ind_)))

Sources with "AML": 678


In [14]:
# create a table with all the sample expression arrays (AML): 
# rows - samples, cols - transcripts, elements - No of reads, not normalised
AML_expr = []

for i in ind_:
    AML_expr.append(list(f['data']['expression'][i]))

AML_expr = np.array(AML_expr)
print(np.shape(AML_expr))

# find normalised average of all th expr levels in AML samples
AML_expr_average = np.sum(AML_expr, axis=0)/np.shape(AML_expr)[0]

# create a table with all the sample expression arrays (random): 
# rows - samples, cols - transcripts, elements - No of reads, not normalised
rand_expr = []

ind = [random.randrange(0, len(all_sources)) for x in range(0,1000)]
for i in ind:
    rand_expr.append(list(f['data']['expression'][i]))
    
print(np.shape(rand_expr))

# find normalised average of all th expr levels in 1000 random samples
rand_expr_average = np.sum(rand_expr, axis=0)/np.shape(rand_expr)[0]

(678, 178136)
(1000, 178136)


In [15]:
# select 1000 highest expressed transcripts from both sets
ind_max = np.argsort(AML_expr_average, axis=-1)[-1000:]
ind_max_rand = np.argsort(rand_expr_average, axis=-1)[-1000:]

ma_tr = [all_transcripts[i].decode('utf-8') for i in ind_max]
ma_tr_rand = [all_transcripts[i].decode('utf-8') for i in ind_max_rand]

# find respective gene names from GENCODEv34 table
comprehensive = np.genfromtxt('./data/GENCODE_v34_hg38_comprehensive', usecols=(1, 12), dtype='str')

ma_gen = []
for tr in ma_tr:
    for row in comprehensive:
        if tr[:15] in row[0]:
            ma_gen.append(row[1])
            break
        if row[0]==comprehensive[-1][0]:
            ma_gen.append('not found') 
            
ma_gen_rand = []
for tr in ma_tr_rand:
    for row in comprehensive:
        if tr[:15] in row[0]:
            ma_gen_rand.append(row[1])
            break
        if row[0]==comprehensive[-1][0]:
            ma_gen_rand.append('not found')

# print a table with the most abundant AML set transcripts and genes
table = []

table.append(ma_tr)
table.append(ma_gen)
table.append([str(int(AML_expr_average[i])) for i in ind_max][::-1])

display(HTML(tabulate.tabulate(np.array(table).T, tablefmt='html')))

0,1,2
ENST00000340368.8,INSIG1,404122
ENST00000263239.6,DDX18,291257
ENST00000361050.3,MPEG1,287857
ENST00000438806.5,PNISR,153418
ENST00000611208.4,ARL8B,116656
ENST00000357304.8,PRRC2B,111875
ENST00000313368.7,TAF7,105589
ENST00000264161.8,DARS1,99752
ENST00000526562.5,RPL27A,98084
ENST00000549690.1,LYZ,80331


In [16]:
genes_AML_23 = np.loadtxt('./lists/clinSEQ_AML_23', dtype='str', delimiter='\t')
genes_AML_716 = np.loadtxt('./lists/TCGA-AML_AS_716', dtype='str', delimiter='\t')
genes_AML_222 = np.loadtxt('./lists/TCGA-AML_clinSEQ_AS_common_222', dtype='str', delimiter='\t')

# check how many of the AML-predictive genes we'll find in 
# the 1000 highest expressed genes in AML-samples: 
# does expression level correlate with predictive value

i = 0
for gene in genes_AML_23:
    if gene in ma_gen:
        i += 1
print('For ClinSEQ AML gene set {} out of {} are in the ma_AML list'
     .format(i, len(genes_AML_23)))

i = 0
for gene in genes_AML_716:
    if gene in ma_gen:
        i += 1
print('For TCGA-AML gene set {} out of {} are in the ma_AML list'
     .format(i, len(genes_AML_716)))

i = 0
for gene in genes_AML_222:
    if gene in ma_gen:
        i += 1
print('For TCGA-AML + ClinSEQ gene set {} out of {} are in the ma_AML list'
     .format(i, len(genes_AML_222)))

For ClinSEQ AML gene set 12 out of 23 are in the ma_AML list
For TCGA-AML gene set 90 out of 716 are in the ma_AML list
For TCGA-AML + ClinSEQ gene set 50 out of 222 are in the ma_AML list


In [17]:
# check how many of the AML-predictive genes we'll find in 
# the 1000 highest expressed genes in 1000 random samples

i = 0
for gene in genes_AML_23:
    if gene in ma_gen_rand:
        i += 1
print('For ClinSEQ AML gene set {} out of {} are in the ma_rand list'
     .format(i, len(genes_AML_23)))

i = 0
for gene in genes_AML_716:
    if gene in ma_gen_rand:
        i += 1
print('For TCGA-AML gene set {} out of {} are in the ma_rand list'
     .format(i, len(genes_AML_716)))

i = 0
for gene in genes_AML_222:
    if gene in ma_gen_rand:
        i += 1
print('For TCGA-AML + ClinSEQ gene set {} out of {} are in the ma_rand list'
     .format(i, len(genes_AML_222)))

For ClinSEQ AML gene set 5 out of 23 are in the ma_rand list
For TCGA-AML gene set 70 out of 716 are in the ma_rand list
For TCGA-AML + ClinSEQ gene set 37 out of 222 are in the ma_rand list
