In [2]:
from IPython.display import HTML, display

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_dna
import numpy as np
from utils import *

import h5py
import numpy as np
import random

import tabulate

Loading the differentially expressed transcript names and sequences from a file. This list was obtained using limma-voom R packages for RNA-seq data analysis that uses an empirical Bayes model to calculate the most statistically significant (in terms of p-value) differentially expressed genes. 1000 random sequences were used as a baseline. 

50 top-DE (differentially expressed) transcripts were saved into the DE_AML_transcripts file (exonic sequences and the respective identifiers -- transcript names).

Tutorial: https://ucdavis-bioinformatics-training.github.io/2018-June-RNA-Seq-Workshop/thursday/DE.html


In [6]:
# load the names and sequences of differentially expressed transcripts
# and store them in a dictionary, so dict[name] = seq

path_to_file = './data/DE_AML_transcripts.fa'

transcript_dict = {}

with open(path_to_file, mode='r') as handle:

    for record in SeqIO.parse(handle, 'fasta'):

        identifier = record.id
        description = record.description
        sequence = record.seq
        
        transcript_dict[identifier] = sequence

dict_keys(['ENST00000257818.2', 'ENST00000241453.11', 'ENST00000216336.2', 'ENST00000284509.10', 'ENST00000592205.5', 'ENST00000633060.1', 'ENST00000427103.5', 'ENST00000360121.4', 'ENST00000378962.3', 'ENST00000612677.4', 'ENST00000309017.7', 'ENST00000304625.2', 'ENST00000380987.2', 'ENST00000598473.1', 'ENST00000233997.3', 'ENST00000620695.2', 'ENST00000367279.8', 'ENST00000376581.9', 'ENST00000448387.6', 'ENST00000537784.5', 'ENST00000563039.2', 'ENST00000381297.9', 'ENST00000611771.1', 'ENST00000430686.2', 'ENST00000304639.3', 'ENST00000393118.6', 'ENST00000554578.5', 'ENST00000400007.8', 'ENST00000245479.2', 'ENST00000561385.5', 'ENST00000215855.6', 'ENST00000293373.10', 'ENST00000468385.1', 'ENST00000477988.1', 'ENST00000282026.1', 'ENST00000346128.10', 'ENST00000261233.8', 'ENST00000359135.7', 'ENST00000367814.8', 'ENST00000515859.5', 'ENST00000507316.1', 'ENST00000355530.6', 'ENST00000531348.5', 'ENST00000262262.4', 'ENST00000264824.4', 'ENST00000527615.5', 'ENST00000381501.7'

In [8]:
# to retrieve the data from the archs4 database the sample-expression matrix must be downloaded
# to the ./data folder; link for file download: 
# javascript:downloadFile('https://s3.amazonaws.com/mssm-seq-matrix/human_transcript_v8.h5','human_transcript.h5','8')

f = h5py.File('./data/human_transcript_v8.h5', 'r')

# list of all sample sources
all_sources = list(f['meta']['Sample_source_name_ch1'][()])
# list of all transcript names
all_transcripts = list(f['meta']['transcripts'][()])

print('No of sources: {}'.format(len(all_sources)))
print('No of transcripts: {}'.format(len(all_transcripts)))

No of sources: 238522
No of transcripts: 178136


In [9]:
# find indices of samples with "AML" in their source names
ind_ = []

for k in range(len(all_sources)):
    if 'AML' in all_sources[k].decode('utf-8'):
        ind_.append(k)

print('Sources with "AML" in the name: {}'.format(len(ind_)))

Sources with "AML" in the name: 678


In [10]:
# create a table with all the sample expression arrays (AML): 
# rows - samples, cols - transcripts, elements - No of reads, not normalised
AML_expr = []

for i in ind_:
    AML_expr.append(list(f['data']['expression'][i]))

AML_expr = np.array(AML_expr)
print(np.shape(AML_expr))

# find normalised average of all th expr levels in AML samples
AML_expr_average = np.sum(AML_expr, axis=0)/np.shape(AML_expr)[0]

# create a table with all the sample expression arrays (random): 
# rows - samples, cols - transcripts, elements - No of reads, not normalised
rand_expr = []

ind = [random.randrange(0, len(all_sources)) for x in range(0,1000)]
for i in ind:
    rand_expr.append(list(f['data']['expression'][i]))
    
print(np.shape(rand_expr))

(678, 178136)
(1000, 178136)


Transcript counts should be normalised, as different samples contain different overall numbers of reads. One of the ways of normalising the counts is to calculate a cpm (counts per million) normalising factor first for each sample, and then additionally divide each count by the respective transcript length.

To obtain cpm, one should find the overall size of the sample library (sum of all the reads from one sample) and divide this number by 10^6. This is the normalising factor F. Then one should divide every count in the sample by this normalising factor F to obtain reads in cpm.

In [88]:
# find normalizing factor arrays for each group of samples (cpm)
# each row will be summarised into one element and divided by 10^6
F_AML = np.sum(AML_expr, axis=1)/10**6
F_rand = np.sum(rand_expr, axis=1)/10**6

# convert to cpm: divide by normalising factors
AML_expr_ = [row/s for row,s in zip(AML_expr, F_AML)]
rand_expr_ = [row/s for row,s in zip(rand_expr, F_rand)]

In [None]:
# find indices of the DE transcripts
ind_DE = {}

for k in range(len(all_transcripts)):
    if all_transcripts[k].decode('utf-8') in transcript_dict.keys():
        ind_DE[k] = all_transcripts[k].decode('utf-8')

In [89]:
# only retain the DE transcripts for the set of AML / rand samples
AML_DE_expr = []
rand_DE_expr = []
            
for row in AML_expr_:
    AML_DE_expr.append([row[x] for x in ind_DE.keys()])
    
for row in rand_expr_:
    rand_DE_expr.append([row[x] for x in ind_DE.keys()])

The "table" matrix contains the ultimate dataset with samples as rows and selected differentially expressed transcripts and respective expression levels (rounded to 3 decimals) as matrix values.

Sequences of the DE transcripts (only exons) can be accessed from the "transcript_dict" dictionary under the respective key, i.e. sequence = transcript_dict['ENST00000477988.1']

In [96]:
table = []
table.extend(np.array(AML_DE_expr))
table.extend(np.array(rand_DE_expr))
table = np.array(table)
transcript_row = np.array([ind_DE[key] for key in ind_DE.keys()]).reshape(len(ind_DE.keys()),1)
table = np.concatenate((transcript_row.T, table), axis=0)

Now from counts per million normalising read counts by the length of the transcripts, as longer transcripts will be covered more --> need to account for this.

In [97]:
for j in range(1, np.shape(table)[1]):
    for i in range(1, np.shape(table)[0]):
        table[i,j] = float(table[i,j])/len(transcript_dict[table[0,j]])

The 'table' variable contains a matrix with all the differentially expressed (DE) transcript read counts, normalised. Both groups of samples (AML/rand) are stacked into the matrix, and the first column contains the sample marker (AMLN/randN). The first row contains the transcript names. Respective transcript sequences (just exonic sequences, ligated) can be obtained from the 'transcript_dict' dictionary with the transcript name as the key. 

In [98]:
a = ['-']
for i in range(len(AML_DE_expr)):
    a.append('AML'+str(i+1))
for i in range(len(rand_DE_expr)):
    a.append('rand'+str(i+1))
a = np.array(a).reshape(len(AML_DE_expr)+len(rand_DE_expr)+1,1)

table = np.concatenate((a, table), axis=1)

display(HTML(tabulate.tabulate(table, tablefmt='html')))

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
-,ENST00000477988.1,ENST00000561385.5,ENST00000284509.10,ENST00000262262.4,ENST00000598473.1,ENST00000378962.3,ENST00000515859.5,ENST00000507316.1,ENST00000537784.5,ENST00000611771.1,ENST00000592205.5,ENST00000233997.3,ENST00000367279.8,ENST00000309017.7,ENST00000468385.1,ENST00000304639.3,ENST00000381501.7,ENST00000376581.9,ENST00000373304.3,ENST00000393118.6,ENST00000531348.5,ENST00000216336.2,ENST00000257818.2,ENST00000215855.6,ENST00000281938.6,ENST00000304625.2,ENST00000282026.1,ENST00000261233.8,ENST00000448387.6,ENST00000359135.7,ENST00000381297.9,ENST00000427103.5,ENST00000367814.8,ENST00000527615.5,ENST00000430686.2,ENST00000264824.4,ENST00000241453.11,ENST00000380987.2,ENST00000346128.10,ENST00000400007.8,ENST00000245479.2,ENST00000355530.6,ENST00000633060.1,ENST00000620695.2,ENST00000612677.4,ENST00000554578.5,ENST00000293373.10,ENST00000563039.2,ENST00000360121.4
AML1,0.0,0.0013589183509810086,0.016990068836177354,0.03054908768427388,0.008996417759847344,0.020375504303272587,0.0,0.0,0.00956472898069939,0.004739288504189794,4.436041668439177e-05,0.001089153383842029,0.00957722374904117,0.004399917090071489,0.00666071029366048,0.012691284169567588,0.004877767696909405,0.0,0.0037783615240791655,0.0,0.0,0.007752747231968118,0.034790434850668406,2.2219120988410265e-05,0.0,0.051741389306698624,0.00276509829563962,0.01820579225987866,0.021514167360878748,0.0,0.03227367679364027,0.016888099583196182,0.027558237144756154,0.0015638304473909249,0.00262222087610809,0.0315553525826626,0.006449962266119424,0.0022322145806422016,0.0,0.0,0.0,0.006062552860207446,4.436041668439177e-05,0.0010879538757100444,0.01223966286295588,0.0027413895224561377,0.0116628487725819,0.0297416664000045,0.04484245013736034
AML2,0.0,0.005864118822085527,0.044451630239024376,0.0032146890415642917,0.0009020907668123923,0.047740041557842054,0.0,0.0,0.0002447656787113738,0.0017960449830863054,2.4173142817001857e-05,0.0,0.007026144220404409,0.0013417924102932138,0.00602980934800517,0.0016966021007541157,0.004829683006456931,0.0,0.028035151549636372,0.0,0.0,0.00206136872086581,0.056282655995370086,4.8431103678975654e-05,0.0,0.012796405675179348,0.0038354243551175266,0.01974714573285294,0.009476033017586177,0.0,0.07330689392976086,0.026716461737504,0.03977206382397939,0.002678254271907066,0.004416650190956006,0.06482499957268589,0.032750236375158434,0.016442652897768684,0.0,0.0,0.0,0.036577223369354865,2.4173142817001857e-05,0.0,0.010845570840944405,0.0044291470418313834,0.02234127352033014,0.021721075565811928,0.037348220958495694
AML3,0.0,0.0017664905370746993,0.025002854819780238,0.029718779671105273,0.008465207997701937,0.019462229920574687,0.0,0.0,0.01224768653759801,0.006377088110834016,6.415899427818996e-05,0.0007472365336909147,0.016813851889945,0.005692257544147459,0.009978749365497294,0.010150888546809542,0.007730229985966647,0.0,0.004423311166262203,0.0,0.0,0.004529245938524782,0.03409094789603901,0.0,0.0,0.04375821629201106,0.0035479634831360315,0.01888266303533183,0.033352160439651835,0.0,0.06064872894699495,0.017467161114361255,0.049665541590880435,0.0017463453089786116,0.0035538571101844303,0.044228897292592495,0.005409998415854222,0.001804520265872184,0.0,0.0,0.0,0.004440284365595402,6.415899427818996e-05,0.0007464135859665854,0.015478820597853302,0.0028287619363337145,0.01415620339565448,0.034845553078328054,0.056631405795611364
AML4,0.0,0.028152388115391922,0.04196139534632247,0.053341092569191284,0.004761137060869872,0.008499492546297015,0.0,0.0,0.00016874282292138394,0.016321138789632313,0.009076710467810473,0.01959162706739246,0.013371738662392796,0.0014724220506732492,0.0099924916897716,0.002177647763495706,0.022265082412734418,0.0,0.009713000341941166,0.0,0.0,0.033573487869836896,0.00936416941754452,0.0,0.0,0.0331054308807621,0.002695784996798497,0.014779136838710051,0.05071117763165027,0.0,0.017480090655051406,0.020495372457663155,0.0008366146984443138,0.0012543000257730112,0.001675575637107838,0.039914790008628735,0.056040564243018086,0.07287475383711867,0.0,0.0,4.748296173628752e-06,0.03945919625032786,0.009076710467810473,0.019570050385600175,0.016066662263392328,0.011022126745071395,0.03259820223503324,0.03323133130108242,0.04249161258274476
AML5,0.0,0.00363182897403816,0.0399609424574373,0.02416764379135809,0.005966940817635084,0.021420269744285653,0.0,0.0,0.0008571278396100574,0.014743770160627931,0.0006738427910580758,0.0045297678550505795,0.01693973299743132,0.005767955964623123,0.014954414296999865,0.011955950764507839,0.02501513494862063,0.0,0.006861991135750862,0.0,0.0,0.007234931813334562,0.030620312328864467,0.0,0.0,0.046628207981580565,0.004599148750781962,0.0189511373101817,0.05755842183502226,0.0,0.04518079338266792,0.05484247043793181,0.04449067279424064,0.001331127197484054,0.003814657509630616,0.09455849980371052,0.01520757392780577,0.006158926755960009,0.0,1.895763863294055e-05,0.0,0.006243476589476607,0.0006738427910580758,0.004524779123932682,0.018190786793616055,0.009014429160996013,0.01937256180971924,0.06640031096716952,0.10710409157448143
AML6,0.018406332941812226,0.009882809520451744,0.08044680677451686,0.060003134610586234,0.012235851192071875,0.01054871871163011,0.0,0.0,0.00031359885749262834,0.010185616440391185,0.0033739571183473665,0.005743100576111201,0.00957805809131362,0.003945261801870509,0.008708745652016717,0.009842275253607926,0.009532479447603381,0.0,0.015740823245423863,0.0,0.0,0.03999450461973641,0.015496345657883239,0.0,0.0,0.07145236190606273,0.007244384417976772,0.024773845574540494,0.029441843121787846,0.0,0.04948616159092333,0.045862047545324934,0.05354670933950583,0.0021181584317751154,0.0042462486226767585,0.06763488444925195,0.019729318189851526,0.008863809204229882,0.0,0.0,0.0,0.02225038329513689,0.0033739571183473665,0.005736775575476718,0.020822191151362128,0.007513977688271447,0.03081728460803417,0.048017384479611604,0.06570906005670737
AML7,0.0,0.010781676033091523,0.0860290174328547,0.057058500629361124,0.010399359466883248,0.012763123862542913,0.0,0.0,0.00020902164620791125,0.011982213529547514,0.0018689161172309586,0.003159790317460073,0.012442971726624357,0.003356720183220812,0.012161804197398646,0.003844539999670628,0.014350556299830867,0.0,0.016672617911029523,0.0,0.0,0.01589209094751455,0.026697308040174873,0.0,1.5024336865499988e-05,0.033532932219349366,0.006994008578434384,0.02436789041796299,0.03344059187348914,0.0,0.06596584217429186,0.04900578279970392,0.06588281144746946,0.0022018288723248225,0.005222313021665613,0.07583827722633069,0.020973872164719295,0.006582286425613093,0.0,0.0,0.0,0.02660473071889085,0.0018689161172309586,0.003156310372176527,0.01955053916885164,0.008910130678177957,0.033490076725213425,0.04394730019738293,0.06246046708856451
AML8,0.013735073803427497,0.003575972009638378,0.04412464901723464,0.02222789645616242,0.005717137319193171,0.008822025640482728,0.0,0.0,0.0014183994852635488,0.011085279799804148,0.0010022674987906813,0.0026046666970116093,0.016616704907668794,0.006044891711096775,0.01242758798425108,0.030026397453603998,0.008411372703437434,0.0,0.004995497212950297,0.0,0.0,0.01669547811335662,0.019648877805287675,0.0,0.0,0.09215852992271976,0.0033873661744939438,0.01865647873011189,0.039076272083648764,0.0,0.019799493546960985,0.05260436361253293,0.048497039816523674,0.0017882708437893843,0.00315535479267929,0.06814674024820071,0.01737526325368246,0.004834111005664218,0.0,3.992754012624272e-05,3.4940406521056974e-06,0.004741783603953659,0.0010022674987906813,0.0026017981213541075,0.02069774681155146,0.0055114156907424255,0.014377471464612303,0.06344966635582265,0.08903690865610311
AML9,0.0,0.004075076993435421,0.04165232560771626,0.020386227739064875,0.008282912489494218,0.007292521902503573,0.0,0.0,0.009725497818686658,0.016393513836743975,0.0064744742340228974,0.012148984050345805,0.015417568027035523,0.0056974133513262,0.012938115813466664,0.04173369551923975,0.007323100052777397,0.0,0.0064215891372793,0.0,0.0,0.045150982088977554,0.02741890359869288,0.0,0.0,0.27143818463848873,0.004679569926373401,0.020915626800405616,0.056649815348318776,8.56239369197239e-06,0.047695604048618424,0.047079264531536484,0.06802665165443779,0.0019980189248413457,0.005255871641444017,0.05310789420328035,0.027351254091791906,0.01058717949637862,0.0,0.0,0.0,0.012344305364392669,0.0064744742340228974,0.012135604111964366,0.026565647687344613,0.0107287361980248,0.01954862035065835,0.0666812720284062,0.08785811433208769
