In [38]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_dna
import numpy as np
from utils import *

import numpy as np
import random
import pandas as pd

import time
import os

def labels_to_str(labels):
    """Serialise an iterable of labels as text, one space after every value.

    Each element is converted with ``str`` and followed by a single space, so
    a non-empty result always ends with a trailing space and an empty input
    yields an empty string.
    """
    pieces = []
    for value in labels:
        pieces.append(str(value))
        pieces.append(' ')
    return ''.join(pieces)

Import count files for AML/random samples; here rows are transcripts, columns are samples

In [2]:
# Read the tab-separated count table (first line is the header, first column
# is the row index) and transpose it in the same step, so that the transcripts
# end up in the columns.
data = pd.read_csv(
    './data/counts_AML.tsv',
    sep='\t',
    header=0,
    index_col=0,
    low_memory=False,
).T

Import the GENCODE v34 comprehensive transcript info file 

In [5]:
# Columns pulled from the annotation table.  After loading, each row is
# addressed by position:
# 0 - tr. name, 1 - chr, 2 - strand, 3,4 - tr. start/end, 5,6 - exon starts/ends, 7 - gene name
annotation_columns = (1, 2, 3, 4, 5, 9, 10, 12)

# Everything is read as strings; numeric fields are converted where they are used.
transcript_file = np.genfromtxt(
    './data/GENCODE_v34_hg38_comprehensive',
    dtype='str',
    skip_header=1,
    usecols=annotation_columns,
)
print(len(transcript_file))

Construct the gene dictionary containing a summary of the transcript info used for the splicing labels: for each gene we find all the transcripts that belong to it and determine a "global start" and a "global end" as the leftmost start and the rightmost end among all of its transcripts, respectively. We need these because the splicing labels are constructed for the whole gene, so we must take a stretch of the sequence that covers all the transcripts of the gene. 

In [46]:
gene_dict = {}

start_time = time.time()

# Build one entry per gene, keyed by gene name (row[7]).  Each entry holds:
#   'chr', 'strand'               - taken from the first transcript seen for the gene
#   'global_start', 'global_end'  - smallest transcript start / largest transcript end
#   <transcript name>             - {'starts': [...], 'ends': [...]} exon boundaries
#   'labels'                      - zero array of length global_end - global_start
for row in transcript_file:

    # only for chr22 for now
    if row[1] != 'chr22':
        continue

    tr_name, gene_name = row[0], row[7]
    tr_start, tr_end = int(row[3]), int(row[4])

    gene_entry = gene_dict.get(gene_name)
    if gene_entry is None:
        # first transcript of this gene: it defines the initial span
        gene_entry = {
            'chr': row[1],
            'strand': row[2],
            'global_start': tr_start,
            'global_end': tr_end,
        }
        gene_dict[gene_name] = gene_entry
    else:
        # widen the gene span so that it covers this transcript as well
        gene_entry['global_start'] = min(gene_entry['global_start'], tr_start)
        gene_entry['global_end'] = max(gene_entry['global_end'], tr_end)

    # Exon coordinates are comma-separated lists; empty fields (e.g. the one
    # produced by a trailing comma) are skipped rather than assuming that the
    # last field is always empty.
    gene_entry[tr_name] = {
        'starts': [int(x) for x in row[5].split(',') if x],
        'ends': [int(x) for x in row[6].split(',') if x],
    }

# Allocate the per-gene label array once, after the final span of every gene
# is known, instead of re-allocating it for every transcript row.
for gene_name, gene_entry in gene_dict.items():
    try:
        gene_entry['labels'] = np.zeros(
            gene_entry['global_end'] - gene_entry['global_start'])
    except ValueError:
        # np.zeros rejects a negative length, i.e. global_end < global_start
        print(gene_name)
        print(gene_entry['global_start'], gene_entry['global_end'])

print("Took {} seconds to construct the gene dict".format(time.time() - start_time))
print("Number of genes in the region of interest:", len(gene_dict.keys()))

Took 6.940023899078369 seconds


Now constructing the library of the samples and respective labels: the set of genes and transcripts is exactly the same for each sample, but the RNA-seq counts will differ. 

For each individual sample ('AML1', 'AML2', ...), and using the gene dict constructed previously, we add the number of counts of each transcript to the acceptor/donor label arrays at the positions of all the exon starts/ends of that transcript. This gives us the non-normalised number of counts covering each exon start/end. We then divide these numbers by the total number of counts of all the transcripts belonging to the gene, so if an exon belongs to all the transcripts, the resulting normalised value for the start/end positions of this exon will be 1. The start and end (acceptor / donor) labels are stored in separate arrays. For a particular exon with unique start/end positions, however, the normalised number of counts (the label) of the start and the end positions will be the same.

In [1]:
# a dictionary with a respective structure:
# library -> samples: 'AML1', 'AML2', ...
# each sample -> 'AMLi': 'gene1', 'gene2', ...
# each gene -> 'genei': 'alabels' (acceptor array), 'dlabels' (donor array),
#              'norm_factor' (sum of the counts of the gene's transcripts)
library_AML = {}

start_time = time.time()

# data.index are pandas dataframe rows which are sample names in this case
for sample in data.index[0:5]:
    print(sample)
    sample_genes = {}
    library_AML[sample] = sample_genes

    for gene, gene_info in gene_dict.items():
        gs = int(gene_info['global_start'])
        ge = int(gene_info['global_end'])

        for tr, tr_info in gene_info.items():
            # gene_info mixes metadata ('chr', 'strand', ...) with transcript
            # entries; only transcript entries are dicts, and only transcripts
            # that have a column in the count table contribute.
            if not isinstance(tr_info, dict) or tr not in data.columns:
                continue

            counts = float(data.at[sample, tr])

            gene_labels = sample_genes.get(gene)
            if gene_labels is None:
                # first counted transcript of this gene: create empty label arrays
                length = ge - gs
                gene_labels = {
                    'alabels': np.zeros(length),
                    'dlabels': np.zeros(length),
                    'norm_factor': 0,
                }
                sample_genes[gene] = gene_labels

            # acceptor labels: offset of the exon start from the gene start
            for s in tr_info['starts']:
                gene_labels['alabels'][int(s) - gs] += counts

            # donor labels: offset of the last exon base from the gene start.
            # The index used to be int(s) - global_end, which relies on negative
            # wrap-around and becomes 0 (the first position of the gene) whenever
            # an exon ends exactly at global_end.
            # NOTE(review): this assumes exon ends are exclusive (half-open
            # intervals, as in UCSC genePred tables), so end - 1 is the last base
            # of the exon - confirm against the annotation file.  Relative to the
            # previous negative-index behaviour, interior donor labels move one
            # position to the left.
            for e in tr_info['ends']:
                gene_labels['dlabels'][int(e) - gs - 1] += counts

            # summing up the counts to normalise afterwards
            gene_labels['norm_factor'] += counts

print("All samples calc in {} seconds".format(time.time() - start_time))

NameError: name 'time' is not defined

In [None]:
# normalisation: divide the accumulated counts at every position by the total
# counts of the gene's transcripts in this sample
for sample_genes in library_AML.values():
    for gene_labels in sample_genes.values():
        # Re-running this cell must not divide a second time, so genes that
        # were already normalised are skipped.
        if gene_labels.get('normalised'):
            continue

        c = gene_labels['norm_factor']
        if c > 0:
            # vectorised division; the labels stay numpy arrays
            gene_labels['alabels'] = np.asarray(gene_labels['alabels']) / c
            gene_labels['dlabels'] = np.asarray(gene_labels['dlabels']) / c
        # else: no counts for this gene in this sample - the labels are left
        # as they are (all zeros) instead of becoming 0/0 = NaN

        gene_labels['normalised'] = True

Loading human genome (version hg38) to extract the respective "extended" gene sequences (covering all the transcripts of the gene + flanking ends of 1000nt for context).

In [45]:
path_to_file = './data/hg38.fa'

# Map every FASTA record id to its sequence.  All records of the file are
# kept in memory; if an id occurs more than once, the last record wins.
with open(path_to_file, mode='r') as handle:
    hg38 = {record.id: record.seq for record in SeqIO.parse(handle, 'fasta')}

Saving the sample files with gene names, sequences and respective labels (acceptor then donor) into files, separate for each sample. The files are heavy.

In [53]:
start_time = time.time()

context = 1000

out_dir = './data/AML_library/'
# make sure the target directory exists before any file is opened
os.makedirs(out_dir, exist_ok=True)

# save the transcript seq + labels in a separate file for each sample
for sample, sample_genes in library_AML.items():
    # NOTE(review): the files are opened in append mode, so running this cell
    # again adds the same records a second time - remove old files first if a
    # fresh export is wanted.
    with open(out_dir + sample + '.txt', 'a') as f1:
        for gene, gene_labels in sample_genes.items():
            gene_info = gene_dict[gene]
            gs = int(gene_info['global_start'])
            ge = int(gene_info['global_end'])
            chrom_seq = hg38[gene_info['chr']]

            # Window = gene span + `context` flanks on each side.  Parts of
            # the window that fall outside the chromosome are padded with 'N',
            # so the sequence always has length (ge - gs) + 2 * context; a
            # negative slice start would otherwise wrap around to the
            # chromosome end.
            left = gs - context
            right = ge + context
            seq = str(chrom_seq[max(left, 0):min(right, len(chrom_seq))])
            seq = 'N' * max(-left, 0) + seq + 'N' * max(right - len(chrom_seq), 0)

            # Lines end with '\n': the file is opened in text mode, which
            # already translates '\n' to the platform line ending; writing
            # os.linesep would double the carriage return on Windows.

            # header: gene name
            f1.write('@' + gene + '\n')
            # next line: sequence + context flanks on each side
            f1.write(seq + '\n')
            # next two lines: labels, acceptor then donor, length = seq length - flanks
            f1.write(labels_to_str(gene_labels['alabels']) + '\n')
            f1.write(labels_to_str(gene_labels['dlabels']) + '\n')

print("All samples saved in {} seconds".format(time.time() - start_time))

All samples saved: 69.96654510498047 seconds
