In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_dna
import numpy as np
from utils import *

import h5py

In [2]:
path_to_file = './data/hg38.fa'

# Maps each FASTA record id to its sequence (a Bio.Seq object, as yielded by
# SeqIO.parse). For hg38.fa the ids are presumably chromosome/contig names --
# confirm against the file.
chr_dict = {}

with open(path_to_file, mode='r') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        # Only the id and the sequence are kept; the record description is
        # not used anywhere in this cell.
        chr_dict[record.id] = record.seq

['ENST00000257818.2' 'ENST00000241453.11' 'ENST00000216336.2'
 'ENST00000284509.10' 'ENST00000592205.5' 'ENST00000633060.1'
 'ENST00000427103.5' 'ENST00000360121.4' 'ENST00000378962.3'
 'ENST00000612677.4' 'ENST00000309017.7' 'ENST00000304625.2'
 'ENST00000380987.2' 'ENST00000598473.1' 'ENST00000233997.3'
 'ENST00000620695.2' 'ENST00000367279.8' 'ENST00000376581.9'
 'ENST00000635923.1' 'ENST00000448387.6' 'ENST00000537784.5'
 'ENST00000563039.2' 'ENST00000381297.9' 'ENST00000611771.1'
 'ENST00000430686.2' 'ENST00000304639.3' 'ENST00000393118.6'
 'ENST00000554578.5' 'ENST00000400007.8' 'ENST00000245479.2'
 'ENST00000561385.5' 'ENST00000215855.6' 'ENST00000293373.10'
 'ENST00000468385.1' 'ENST00000477988.1' 'ENST00000282026.1'
 'ENST00000346128.10' 'ENST00000261233.8' 'ENST00000359135.7'
 'ENST00000367814.8' 'ENST00000515859.5' 'ENST00000507316.1'
 'ENST00000355530.6' 'ENST00000531348.5' 'ENST00000262262.4'
 'ENST00000264824.4' 'ENST00000527615.5' 'ENST00000381501.7'
 'ENST00000373304.3'

In [None]:
# Column layout of each row kept from the annotation table:
# 0 - tr. name, 1 - chr, 2 - strand, 3,4 - tr. start/end, 5,6 - exon starts/ends, 7 - gene name
transcript_file = np.genfromtxt('./data/GENCODE_v34_hg38_comprehensive', usecols=(1, 2, 3, 4, 5, 9, 10, 12), skip_header=1, dtype='str')

# gene_dict[gene name] holds:
#   'chr', 'strand'               - taken from the first transcript seen for the gene
#   'global_start', 'global_end'  - smallest start / largest end over all its transcripts
#                                   (kept as strings, exactly as read from the table)
#   'labels'                      - zero array of length global_end - global_start
#   <transcript name>             - {'starts': ..., 'ends': ...} with the raw exon columns
# NOTE(review): transcripts are grouped by gene name only, so rows that share a
# gene name but sit on different chromosomes end up in one entry -- confirm
# that this is acceptable for this annotation.
gene_dict = {}

for row in transcript_file:
    tr_name, chrom, strand, tx_start, tx_end, exon_starts, exon_ends, gene_name = row

    gene_entry = gene_dict.get(gene_name)
    if gene_entry is None:
        # First transcript of this gene: it defines the initial bounds.
        gene_entry = {
            'chr': chrom,
            'strand': strand,
            'global_start': tx_start,
            'global_end': tx_end,
        }
        gene_dict[gene_name] = gene_entry
    else:
        # Coordinates are read as strings, so they must be compared as
        # integers; a plain string comparison orders '9' after '10'.
        if int(tx_start) < int(gene_entry['global_start']):
            gene_entry['global_start'] = tx_start
        if int(tx_end) > int(gene_entry['global_end']):
            gene_entry['global_end'] = tx_end

    gene_entry[tr_name] = {'starts': exon_starts, 'ends': exon_ends}

# Allocate the label arrays once, after every gene's bounds are final,
# instead of re-creating them for each transcript row.
for gene_entry in gene_dict.values():
    gene_entry['labels'] = np.zeros(
        int(gene_entry['global_end']) - int(gene_entry['global_start']))


In [None]:
# Number of distinct gene names collected in gene_dict.
print(len(gene_dict))

In [None]:
# a dictionary with a respective structure:
# library -> samples: 'AML1', 'AML2', ...
# each sample -> 'AMLi': 'gene1', 'gene2', ...
# each gene -> 'genei': {'alabels': array, 'dlabels': array, 'norm_factor': float}
#   'alabels'     - counts accumulated at exon start positions
#   'dlabels'     - counts accumulated at exon end positions
#   'norm_factor' - sum of the counts of all transcripts added for that gene
library_AML = {}


def _parse_positions(raw):
    """Return exon coordinates as a list of ints.

    `raw` is either a comma-separated string such as '100,200,' (empty
    fields, e.g. from a trailing comma, are ignored) or any iterable of
    values that int() accepts.
    """
    if isinstance(raw, bytes):
        raw = raw.decode()
    if isinstance(raw, str):
        return [int(field) for field in raw.split(',') if field.strip()]
    return [int(value) for value in raw]


# Index transcript name -> genes that contain it, built once so that each
# transcript does not trigger a scan over every gene. Transcript entries are
# the dict-valued items of a gene; the remaining items of a gene entry
# ('chr', 'strand', 'global_start', ...) are not dicts.
genes_by_transcript = {}
for gene, gene_info in gene_dict.items():
    for key, value in gene_info.items():
        if isinstance(value, dict):
            genes_by_transcript.setdefault(key, []).append(gene)

for i in range(len(samples)):
    # sample number
    sN = 'AML' + str(i + 1)
    library_AML[sN] = {}

    for j in range(len(samples[i])):
        for gene in genes_by_transcript.get(data_tr[j], ()):
            # samples[i][j] may be stored as a string, hence the conversion.
            count = float(samples[i][j])
            transcript = gene_dict[gene][data_tr[j]]
            gs = int(gene_dict[gene]['global_start'])
            ge = int(gene_dict[gene]['global_end'])

            gene_labels = library_AML[sN].get(gene)
            if gene_labels is None:
                # First transcript of this gene in this sample: create the
                # zeroed label arrays and the running count total.
                length = ge - gs
                gene_labels = {
                    'alabels': np.zeros(length),
                    'dlabels': np.zeros(length),
                    'norm_factor': 0.0,
                }
                library_AML[sN][gene] = gene_labels

            for start in _parse_positions(transcript['starts']):
                # start - gs is the array index of the exon start
                gene_labels['alabels'][start - gs] += count
            for end in _parse_positions(transcript['ends']):
                # The index is taken relative to global_end; for an end below
                # global_end it is negative and numpy wraps it to end - gs.
                # NOTE(review): an end equal to global_end gives index 0, i.e.
                # the first array element -- confirm this is intended.
                gene_labels['dlabels'][end - ge] += count
            # running total of the counts, kept to normalise afterwards
            gene_labels['norm_factor'] += count
                
                

In [5]:
# Build one SeqRecord per transcript; DE_tr_[i] is passed as the record id.
records = []

for i in range(len(transcripts)):
    try:
        records.append(SeqRecord(Seq(str(transcripts[i]), generic_dna), DE_tr_[i]))
    except TypeError:
        # Report the offending index and value, then carry on with the rest.
        print(i)
        print(transcripts[i])

# The context manager closes (and therefore flushes) the output file; the
# bare open() used previously left the handle open.
with open('./data/DE_AML_transcripts', "w") as handle:
    n_written = SeqIO.write(records, handle, "fasta")

# Last expression of the cell: the number of records written.
n_written

50