In [2]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_dna
from utils import *

import numpy as np
import random
import pandas as pd

import time
import os

# Import GENCODE v34 & construct gene dict

Then construct the gene dictionary containing a summary of the transcript info used for the splicing labels: for each gene we find all the transcripts that belong to it and determine a "global start" and a "global end", i.e. the leftmost start and the rightmost end among all of its transcripts, respectively. These are needed because the splicing labels are constructed for the whole gene, so we must take a stretch of the sequence that covers all the transcripts of the gene.

In [15]:
# Load the transcript annotation table as an array of strings, keeping 8 of its columns.
# After `usecols`, the columns of each row are indexed as follows:
# 0 - tr. name, 1 - chr, 2 - strand, 3,4 - tr. start/end, 5,6 - exon starts/ends, 7 - gene name
# (exon starts/ends are comma-separated lists stored as a single string per transcript)
transcript_file = np.genfromtxt('./data/GENCODE_v34_hg38_comprehensive', usecols=(1, 2, 3, 4, 5, 9, 10, 12), skip_header=1, dtype='str')
print(len(transcript_file))

# chromosomes whose transcripts are kept when building the gene dict
region = ['chr22']

# gene name -> summary dict; filled by the loop below with 'chr', 'strand',
# 'global_start', 'global_end', 'labels' and one sub-dict per transcript id
gene_dict = {}

def mRNA_len(starts, ends):
    """Return the summed length of the intervals given by paired starts and ends.

    ``starts`` and ``ends`` are walked in parallel (extra items of the longer
    one are ignored) and ``end - start`` is added up for every pair.
    """
    return sum(end - start for start, end in zip(starts, ends))

start_time = time.time()

# Build gene_dict: each key is a gene name, each value a dict with
#   'chr', 'strand'               - taken from the first transcript seen for the gene
#   <transcript id>               - {'starts', 'ends', 'length'} for every transcript
#   'global_start', 'global_end'  - min transcript start / max transcript end
#   'labels'                      - zero array of length global_end - global_start
for row in transcript_file:

    # only for chr22 for now
    if row[1] not in region:
        continue

    gene_name = row[7]
    # first 15 characters of the transcript name
    # (presumably the ENST id without its version suffix - TODO confirm)
    tr_id = row[0][:15]
    tr_start, tr_end = int(row[3]), int(row[4])
    # [:-1] drops the last field of the split, which is empty when the
    # comma-separated list ends with a trailing comma
    exon_starts = [int(x) for x in row[5].split(',')[:-1]]
    exon_ends = [int(x) for x in row[6].split(',')[:-1]]

    if gene_name not in gene_dict:
        gene_dict[gene_name] = {'chr': row[1], 'strand': row[2]}
    gene = gene_dict[gene_name]

    gene[tr_id] = {
        'starts': exon_starts,
        'ends': exon_ends,
        'length': mRNA_len(exon_starts, exon_ends),
    }

    # widen the gene extent so that it covers this transcript as well
    gene['global_start'] = min(gene.get('global_start', tr_start), tr_start)
    gene['global_end'] = max(gene.get('global_end', tr_end), tr_end)

# Allocate the label array once per gene, after its final extent is known
# (instead of re-allocating it for every transcript row).
for gene_name, gene in gene_dict.items():
    try:
        gene['labels'] = np.zeros(gene['global_end'] - gene['global_start'])
    except ValueError:
        # np.zeros rejects a negative length, i.e. global_end < global_start
        print(gene_name)
        print(gene['global_start'], gene['global_end'])

print("Took {} seconds to construct the gene dict".format(time.time() - start_time))
print("Number of genes in the region of interest:", len(gene_dict.keys()))

227517
Took 5.8211989402771 seconds to construct the gene dict
Number of genes in the region of interest: 1044


# Import hg38

Load the human genome (version hg38) to extract the respective "extended" gene sequences (covering all the transcripts of the gene, plus a 1000-nt flank on each side for context).

In [22]:
# need to download and unpack the genome file into the ./data directory:
# wget --timestamping 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz' -O hg38.fa.gz
# gunzip hg38.fa.gz
path_to_file = './data/hg38.fa'

# record id (e.g. 'chr22') -> sequence, for every record in the FASTA file
hg38 = {}

with open(path_to_file, mode='r') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        # only the id and the sequence are needed; the description is not kept
        hg38[record.id] = record.seq

# Import count files by tissue type + AML

## construct the library separately for each tissue type and count file

In [20]:
def construct_library(gene_dict, counts):
    """Accumulate per-sample, per-gene label arrays weighted by transcript abundance.

    For every sample (row label of ``counts``) and every gene of ``gene_dict``
    that has at least one transcript id among the columns of ``counts``, the
    result holds a dict with

    * ``'alabels'`` - array of length ``global_end - global_start + 1``; at the
      offset ``exon_start - global_start`` of every exon start it contains the
      sum of ``count / transcript_length`` over the transcripts having an exon
      start there,
    * ``'dlabels'`` - the same, but for exon ends,
    * ``'norm_factor'`` - the sum of ``count / transcript_length`` over all
      quantified transcripts of the gene.

    NOTE(review): exon starts always go to ``'alabels'`` and exon ends to
    ``'dlabels'`` irrespective of ``gene_dict[gene]['strand']`` - confirm that
    this is intended for minus-strand genes.

    Parameters
    ----------
    gene_dict : dict
        gene name -> dict with ``'global_start'``, ``'global_end'`` and one
        entry per transcript id holding ``'starts'``, ``'ends'``, ``'length'``.
    counts : pandas.DataFrame
        Rows are samples, columns are transcript ids.

    Returns
    -------
    dict
        ``library[sample][gene]`` -> ``{'alabels', 'dlabels', 'norm_factor'}``.
        Genes without any transcript in ``counts`` are absent.
    """
    library = {}
    start_time = time.time()

    # Which transcripts of a gene are quantified does not depend on the sample,
    # so resolve it once instead of once per sample.
    quantified = []
    for gene, info in gene_dict.items():
        transcripts = [tr for tr in info.keys() if tr in counts.columns]
        if transcripts:
            quantified.append((gene, transcripts))

    for sample in counts.index:
        library[sample] = {}
        for gene, transcripts in quantified:
            gs = int(gene_dict[gene]['global_start'])
            length = int(gene_dict[gene]['global_end']) - gs + 1
            entry = {
                'alabels': np.zeros(length),
                'dlabels': np.zeros(length),
                'norm_factor': 0,
            }
            for tr in transcripts:
                transcript = gene_dict[gene][tr]
                # abundance of the transcript normalised by its length
                weight = float(counts.at[sample, tr] / transcript['length'])
                for s in transcript['starts']:
                    entry['alabels'][int(s) - gs] += weight
                for e in transcript['ends']:
                    entry['dlabels'][int(e) - gs] += weight
                entry['norm_factor'] += weight
            library[sample][gene] = entry

    print("All samples calc in {} seconds".format(time.time() - start_time))
    return library


def labels_squeeze(labels, c):
    """Serialise the non-zero entries of ``labels`` as a compact string.

    Every non-zero entry is divided by ``c`` (a ``c`` of 0 is treated as 1),
    rounded to 4 decimals and written as ``"<index> <value>,"``; the pieces
    are concatenated in index order, so a non-empty result ends with a comma.
    """
    divisor = 1 if c == 0 else c
    pieces = []
    for position in np.nonzero(labels)[0]:
        value = round(labels[position] / divisor, 4)
        pieces.append(str(position) + ' ' + str(value) + ',')
    return ''.join(pieces)


def exons(gene):
    """Serialise the exon coordinates of all transcripts of one gene.

    Every key of ``gene`` containing ``'ENST'`` is treated as a transcript.
    Its exons are written as ``"<start> <end>"`` pairs, relative to the gene's
    ``'global_start'``, joined by commas and terminated by ``';'``.
    Transcripts without any exon pair contribute nothing.
    """
    offset = int(gene['global_start'])
    chunks = []
    for name, transcript in gene.items():
        if 'ENST' not in name:
            continue
        pairs = [
            str(begin - offset) + ' ' + str(end - offset)
            for begin, end in zip(transcript['starts'], transcript['ends'])
        ]
        if pairs:
            chunks.append(','.join(pairs) + ';')
    return ''.join(chunks)


def save_labels_jsonl(counts, gene_dict, hg38):
    """Write one JSON-lines file per sample with gene sequences and labels.

    ``counts`` is first scaled per sample (row) to counts per million, then
    turned into label arrays by ``construct_library``. For every sample a file
    ``./data/main_inputs/<sample>.jsonl`` receives one JSON object per gene:

    * ``<gene name>`` - the gene sequence from ``global_start`` to
      ``global_end`` plus ``context`` (1000) positions on each side,
    * ``'alabels'`` / ``'dlabels'`` - the label arrays serialised by
      ``labels_squeeze`` and normalised by the gene's ``'norm_factor'``,
    * ``'exons'`` - the exon coordinates serialised by ``exons``.

    Where a flank would extend past either end of the chromosome, the missing
    positions are filled with ``'N'`` so the sequence always has length
    ``global_end - global_start + 1 + 2 * context`` and stays aligned with
    the label offsets.

    Files are opened in append mode: calling this again for the same sample
    adds further lines to the existing file rather than replacing it.

    Parameters
    ----------
    counts : pandas.DataFrame
        Rows are samples, columns are transcript ids.
    gene_dict : dict
        Gene summary as consumed by ``construct_library`` and ``exons``;
        must also provide ``'chr'`` for every gene.
    hg38 : dict
        Chromosome name -> sequence (anything sliceable that ``str()`` turns
        into the nucleotide string).

    Returns
    -------
    None
    """
    # Local import: in this notebook json is only imported by a later cell, so
    # do not rely on that cell having been executed first.
    import json

    out_dir = './data/main_inputs/'
    os.makedirs(out_dir, exist_ok=True)

    # normalise to cpm first
    F = counts.sum(axis=1) / 10**6
    counts = counts.divide(F, axis='index')

    context = 1000

    start_time = time.time()
    library = construct_library(gene_dict, counts)
    for sample in library.keys():
        print(sample)
        with open(out_dir + sample + '.jsonl', 'a') as f1:
            for gene in library[sample].keys():
                gs = int(gene_dict[gene]['global_start'])
                ge = int(gene_dict[gene]['global_end'])
                chromosome = hg38[gene_dict[gene]['chr']]

                # sequence + context flanks on each side; a negative slice
                # start would silently index from the chromosome end, so clamp
                # it and pad with 'N' instead
                left = gs - context
                right = ge + context + 1
                seq = 'N' * max(-left, 0) + str(chromosome[max(left, 0):right])
                # pad on the right if the flank runs past the chromosome end
                seq += 'N' * (right - left - len(seq))

                # labels, acceptor then donor, length = seq length - flanks
                c = library[sample][gene]['norm_factor']
                jsonl_dict = {
                    gene: seq,
                    'alabels': labels_squeeze(library[sample][gene]['alabels'], c),
                    'dlabels': labels_squeeze(library[sample][gene]['dlabels'], c),
                    'exons': exons(gene_dict[gene]),
                }

                json.dump(jsonl_dict, f1)
                f1.write('\n')

    print("All samples saved in {} seconds".format(time.time() - start_time))
    return

In [25]:
import json

# Directory holding one count table (csv) per tissue type
path = './data/counts_by_tissue/'
# sorted: os.listdir returns entries in arbitrary order
dirs = sorted(os.listdir(path))
print(dirs)

for file in dirs:
    # match on the extension so that entries such as '.DS_Store', or names
    # that merely contain 'csv', are skipped
    if file.endswith('.csv'):
        # first column holds the sample names, header row the transcript ids
        counts = pd.read_csv(os.path.join(path, file), sep=',', header=0, index_col=0, low_memory=False)
        print(file, '; shape:', np.shape(counts))
        save_labels_jsonl(counts, gene_dict, hg38)

['.DS_Store', 'AML.csv', 'adipose.csv', 'blood.csv', 'brain.csv', 'breast.csv', 'colon.csv', 'heart.csv', 'kidney.csv', 'liver.csv', 'lung.csv', 'lymph.csv', 'prostate.csv', 'skeletal.csv', 'testis.csv', 'thyroid.csv']
AML.csv ; shape: (1874, 178136)
All samples calc in 1.5012640953063965 seconds
AML1
AML2
AML3
All samples saved in 3.7682077884674072 seconds
adipose.csv ; shape: (200, 178136)
All samples calc in 1.005049705505371 seconds
ADP1
ADP2
ADP3
All samples saved in 2.6345481872558594 seconds
