In [8]:
from Bio import SeqIO
import numpy as np
from utils import *


In [4]:
# Annotation table: keep seven of its columns, every field read as a string.
# Later cells index the kept columns positionally as row[0] .. row[6].
transcript_file = np.genfromtxt(
    './data/GENCODE_v34_hg38_comprehensive',
    dtype='str',
    skip_header=1,
    usecols=(1, 2, 3, 4, 5, 9, 10),
)

# Transcript identifiers of interest: only the second column of the list file.
DE_tr = np.genfromtxt(
    './lists/limma_DE_AML_RAN',
    dtype='str',
    skip_header=1,
    usecols=(1,),
)

print(DE_tr)

['ENST00000257818.2' 'ENST00000241453.11' 'ENST00000216336.2'
 'ENST00000284509.10' 'ENST00000592205.5' 'ENST00000633060.1'
 'ENST00000427103.5' 'ENST00000360121.4' 'ENST00000378962.3'
 'ENST00000612677.4' 'ENST00000309017.7' 'ENST00000304625.2'
 'ENST00000380987.2' 'ENST00000598473.1' 'ENST00000233997.3'
 'ENST00000620695.2' 'ENST00000367279.8' 'ENST00000376581.9'
 'ENST00000635923.1' 'ENST00000448387.6' 'ENST00000537784.5'
 'ENST00000563039.2' 'ENST00000381297.9' 'ENST00000611771.1'
 'ENST00000430686.2' 'ENST00000304639.3' 'ENST00000393118.6'
 'ENST00000554578.5' 'ENST00000400007.8' 'ENST00000245479.2'
 'ENST00000561385.5' 'ENST00000215855.6' 'ENST00000293373.10'
 'ENST00000468385.1' 'ENST00000477988.1' 'ENST00000282026.1'
 'ENST00000346128.10' 'ENST00000261233.8' 'ENST00000359135.7'
 'ENST00000367814.8' 'ENST00000515859.5' 'ENST00000507316.1'
 'ENST00000355530.6' 'ENST00000531348.5' 'ENST00000262262.4'
 'ENST00000264824.4' 'ENST00000527615.5' 'ENST00000381501.7'
 'ENST00000373304.3'

In [6]:
# Parse the FASTA inside a context manager so the file handle is closed as soon
# as the records have been consumed (the bare open() used before was never closed).
with open('./data/hg38.fa') as fasta_handle:
    fasta_seq = SeqIO.parse(fasta_handle, 'fasta')

    # Each iteration overwrites `name` and `sequence`, so after the loop they
    # describe only the LAST record in the file.
    # NOTE(review): if this FASTA holds more than one record, every coordinate
    # used by later cells is sliced out of that last record only -- confirm that
    # this is intended.
    for fasta in fasta_seq:
        name, sequence = fasta.id, str(fasta.seq)

In [24]:
transcripts = []
labels = []

# flanking ends on each side are of this length to include some context
context = 1000


def _encode_transcript(row, genome_seq, flank, block=5000):
    """Build the padded sequence and the labels for one annotation row.

    Parameters
    ----------
    row : sequence of str
        One row of ``transcript_file``. Fields used: ``row[2]`` strand,
        ``row[3]`` / ``row[4]`` start / end coordinates, ``row[5]`` / ``row[6]``
        comma-separated exon starts / ends.
    genome_seq : str
        Sequence the coordinates are sliced from.
    flank : int
        Number of extra bases taken on each side of the transcript.
    block : int, optional
        The padded sequence is extended with 'O' characters according to
        ``block - (transcript_length % block)``; defaults to 5000.

    Returns
    -------
    tuple or None
        ``(padded_sequence, labels)``; ``None`` when the strand is neither '+'
        nor '-' or when the extracted sequence contains an 'N' (in which case
        'contains N' is printed).
    """
    # NOTE(review): if int(row[3]) < flank the slice start becomes negative and
    # Python counts it from the end of the string -- confirm this cannot happen
    # for the transcripts of interest.
    s = genome_seq[int(row[3]) - flank: int(row[4]) + flank].upper()

    strand = row[2]
    if strand not in ('+', '-'):
        return None
    if 'N' in s:
        print('contains N')
        return None

    # Amount of 'O' padding, computed from the transcript length without flanks.
    # When that length is already a multiple of `block`, a full extra block is added.
    pad = block - (len(s) - flank * 2) % block

    # [:-1] drops the last item of each split (presumably the empty string left
    # by a trailing comma -- verify against the annotation file).
    es, ee = row[5].split(',')[:-1], row[6].split(',')[:-1]

    # Labels are computed from the sequence as sliced, before any complementing.
    y = make_labels(s, flank, es, ee)

    if strand == '-':
        # Base-wise complement; the sequence is NOT reversed, so its positions
        # stay in the same order as the sequence passed to make_labels above.
        s = ''.join(complementary(base) for base in s)

    # padding sequence with Os, split as evenly as possible between both ends
    left = pad // 2
    padded = left * 'O' + s + (pad - left) * 'O'
    return padded, y


# Set membership gives the same answer as `row[0] in DE_tr` without re-scanning
# the whole array for every annotation row.
de_ids = set(np.atleast_1d(DE_tr).tolist())

for row in transcript_file:
    # explicitly checking transcript_name (exact match, version suffix included)
    if row[0] not in de_ids:
        continue
    example = _encode_transcript(row, sequence, context)
    if example is None:
        continue
    padded_seq, y = example
    labels.append(y)
    transcripts.append(padded_seq)


In [27]:
# Start from empty lists: this cell re-collects every transcript by itself.
# Without the reset it appends to whatever `transcripts` / `labels` already
# hold, so transcripts also matched by the exact-ID cell end up duplicated,
# and re-running this cell duplicates them again.
transcripts = []
labels = []

for t in DE_tr:
    found = 0
    # Compare only the first 15 characters of the identifiers, i.e. ignore
    # anything after them such as the '.N' version suffix.
    wanted_prefix = t[:15]
    for row in transcript_file:
        if row[0][:15] != wanted_prefix:
            continue
        found = 1

        # transcript plus `context` bases on each side
        # NOTE(review): a start coordinate smaller than `context` gives a
        # negative slice start, which Python counts from the end of the string.
        s = sequence[int(row[3]) - context: int(row[4]) + context].upper()

        strand = row[2]
        if strand not in ('+', '-'):
            continue
        if 'N' in s:
            print('contains N')
            continue

        # Amount of 'O' padding, computed from the transcript length without flanks.
        pad = 5000 - (len(s) - context * 2) % 5000
        # [:-1] drops the last item of each split (presumably the empty string
        # left by a trailing comma -- verify against the annotation file).
        es, ee = row[5].split(',')[:-1], row[6].split(',')[:-1]

        # Labels are computed from the sequence as sliced, before complementing.
        y = make_labels(s, context, es, ee)
        labels.append(y)

        if strand == '-':
            # Base-wise complement only; the sequence is not reversed.
            s = ''.join(complementary(x) for x in s)

        # padding sequence with Os
        s = (pad // 2) * 'O' + s + (pad - pad // 2) * 'O'
        transcripts.append(s)
    if found == 0:
        print(t, 'not found')


In [25]:
# Number of collected sequences, then number of requested transcript IDs.
for collection in (transcripts, DE_tr):
    print(len(collection))

# First identifier of the annotation table, cut to its first 15 characters.
first_row = transcript_file[0]
print(first_row[0][:15])

# Spot check: is this particular versioned ID among the requested ones?
probe_present = 'ENST00000281938.6' in DE_tr
if probe_present:
    print('yeth')

29
50
ENST00000371007
yeth
