In [2]:
import h5py
import numpy as np

In [4]:
# Input file (download link):
# javascript:downloadFile('https://s3.amazonaws.com/mssm-seq-matrix/human_transcript_v8.h5','human_transcript.h5','8')
# The handle is intentionally left open: a later cell reads
# f['data']['expression'] from it.
f = h5py.File('./data/human_transcript_v8.h5', 'r')


def _meta_column(h5_file, key):
    """Return the whole dataset `key` of the file's 'meta' group as a list."""
    return list(h5_file['meta'][key][()])


# Source name of every sample; entries are decoded with .decode('utf-8')
# by the cells that consume them.
all_sources = _meta_column(f, 'Sample_source_name_ch1')
# Name of every transcript.
all_transcripts = _meta_column(f, 'transcripts')

print('No of sources: {}'.format(len(all_sources)))
print('No of transcripts: {}'.format(len(all_transcripts)))

No of sources: 238522
No of transcripts: 178136


In [6]:
# Positions (within all_sources) of every sample whose decoded source name
# contains the substring "AML"; the match is case-sensitive.
ind_ = [
    position
    for position, raw_name in enumerate(all_sources)
    if 'AML' in raw_name.decode('utf-8')
]

print('Sources with "AML": {}'.format(len(ind_)))

Sources with "AML": 678


In [7]:
# Create a table with the expression arrays of the selected samples:
# rows - samples (in the order of ind_), cols - transcripts,
# elements - No of reads, not normalised.
#
# The dataset handle is looked up once rather than on every iteration, and
# each row is stacked as read from the dataset instead of first being
# expanded into a Python list of scalars.
expression = f['data']['expression']
AML_expr = np.array([expression[i] for i in ind_])
print(np.shape(AML_expr))

#np.savetxt('./data/AML_expr', AML_expr, fmt='%i', delimiter='\t')

(678, 178136)


In [8]:
# For every sample, take the 100 transcripts with the highest read counts
# (the last 100 positions of an ascending argsort) and pool their decoded
# names in a single list; names may repeat across samples.
most_common = [
    all_transcripts[column].decode('utf-8')
    for counts in AML_expr
    for column in np.argsort(counts, axis=-1)[-100:]
]

# Collapse the pooled names to the distinct transcripts.
most_common_set = set(most_common)
print('Unique transcript set: {}'.format(len(most_common_set)))

#np.savetxt('./lists/most_common_transcripts_AML', np.array(list(most_common_set)), fmt='%s', delimiter='\t')

Unique transcript set: 3186


In [9]:
# Find the respective gene names from GENCODE v34 comprehensive, hg38 edition.
# Two columns of the table are loaded: the first is used below as the
# transcript name, the second as the gene name.

comprehensive = np.genfromtxt('./data/GENCODE_v34_hg38_comprehensive', usecols=(1, 12), dtype='str')

# Unpack the table once into (transcript name, gene name) pairs so the search
# below walks a plain list instead of re-indexing the array for every transcript.
gencode_pairs = [(row[0], row[1]) for row in comprehensive]

most_common_genes = []
nf = []  # transcripts for which no annotation row matched

for tr in most_common_set:
    # Only the first 15 characters of the transcript name form the search key,
    # and it is matched as a substring of the annotation's transcript name
    # (presumably to ignore a version suffix - TODO confirm).
    key = tr[:15]
    for transcript_name, gene_name in gencode_pairs:
        if key in transcript_name:
            # The first matching row wins.
            most_common_genes.append(gene_name)
            break
    else:
        # for/else: reached only when the scan finished without a match.
        # This records each unmatched transcript exactly once, including
        # when the table is empty, and does not rely on the last row's
        # name being unique.
        nf.append(tr)

most_common_genes = set(most_common_genes)
print('Unique gene set: {}, transcript-gene pairs not found in GENCODEv34 comprehensive: {}'
      .format(len(most_common_genes), len(nf)))

#np.savetxt('./lists/most_common_genes_AML', np.array(list(most_common_genes)), fmt='%s', delimiter='\t')

Unique gene set: 2198, transcript-gene pairs not found in GENCODEv34 comprehensive: 131


In [11]:
def count_genes_in(genes, reference):
    """Return how many entries of `genes` are members of `reference`.

    Parameters:
        genes: iterable of gene names to look up.
        reference: container supporting the `in` operator.

    Returns:
        int: number of entries of `genes` found in `reference`
        (repeated entries are counted each time they occur).
    """
    return sum(1 for gene in genes if gene in reference)


# Gene lists to compare against the most_common gene set.
# ndmin=1 keeps the result iterable even when a list file holds a single entry
# (np.loadtxt would otherwise return a 0-d array).
genes_AML_23 = np.loadtxt('./lists/clinSEQ_AML_23', dtype='str', delimiter='\t', ndmin=1)
genes_AML_716 = np.loadtxt('./lists/TCGA-AML_AS_716', dtype='str', delimiter='\t', ndmin=1)
genes_AML_222 = np.loadtxt('./lists/TCGA-AML_clinSEQ_AS_common_222', dtype='str', delimiter='\t', ndmin=1)

# Report, for each list, how many of its genes occur in most_common_genes.
for label, genes in (
    ('ClinSEQ AML', genes_AML_23),
    ('TCGA-AML', genes_AML_716),
    ('TCGA-AML + ClinSEQ', genes_AML_222),
):
    print('For {} gene set {} out of {} are in the most_common list'
          .format(label, count_genes_in(genes, most_common_genes), len(genes)))

For ClinSEQ AML gene set 17 out of 23 are in the most_common list
For TCGA-AML gene set 186 out of 716 are in the most_common list
For TCGA-AML + ClinSEQ gene set 80 out of 222 are in the most_common list
