In [38]:
from IPython.display import HTML, display

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_dna
import numpy as np
from utils import *

import h5py
import numpy as np
import random

import tabulate
import time
import os

def labels_to_str(labels):
    """Render an iterable of labels as a single space-terminated string.

    Each element is converted with ``str`` and followed by exactly one space,
    so a non-empty input always yields a string ending in a trailing space and
    an empty input yields ``''``.
    """
    pieces = []
    for label in labels:
        pieces.append(str(label))
        pieces.append(' ')
    return ''.join(pieces)

In [2]:
import pandas as pd

# Read the tab-separated AML count table: the first row provides the column
# labels and the first column provides the row index.
data = pd.read_csv(
    './data/counts_AML.tsv',
    sep='\t',
    header=0,
    index_col=0,
    low_memory=False,
)

In [3]:
# Swap rows and columns of the count table. This rebinds `data`, so running
# the cell a second time transposes the table back again.
data = data.transpose()

In [4]:
# Spot-check the table: the first column label, then a single cell looked up
# by row label 'AML1' and column label 'ENST00000448914.1'.
first_column = data.columns[0]
print(first_column)
print(data.at['AML1', 'ENST00000448914.1'])

ENST00000448914.1
0


In [5]:
# Column layout of the loaded array (index -> meaning):
#   0 - transcript name     1 - chromosome      2 - strand
#   3, 4 - transcript start / end
#   5, 6 - exon starts / exon ends
#   7 - gene name
transcript_file = np.genfromtxt(
    './data/GENCODE_v34_hg38_comprehensive',
    dtype='str',
    skip_header=1,
    usecols=(1, 2, 3, 4, 5, 9, 10, 12),
)


In [46]:
# Build `gene_dict` for a single chromosome. Resulting layout per gene name:
#   'chr', 'strand'                -> taken from the first transcript seen
#   <transcript name>              -> {'starts': [...], 'ends': [...]} (ints)
#   'global_start' / 'global_end'  -> min start / max end over all transcripts
#   'labels'                       -> zero array of length global_end - global_start
# Only rows whose chromosome column equals TARGET_CHROMOSOME are used.
TARGET_CHROMOSOME = 'chr22'

gene_dict = {}

start_time = time.time()

for row in transcript_file:
    if row[1] != TARGET_CHROMOSOME:
        continue

    gene_name = row[7]
    tx_start = int(row[3])
    tx_end = int(row[4])
    # The last comma-separated field of each exon list is dropped
    # (presumably empty because of a trailing comma -- confirm against the file).
    exons = {
        'starts': [int(x) for x in row[5].split(',')[:-1]],
        'ends': [int(x) for x in row[6].split(',')[:-1]],
    }

    entry = gene_dict.get(gene_name)
    if entry is None:
        # First transcript of this gene. The key order matches the order in
        # which the keys were historically inserted, so code iterating over a
        # gene entry sees the same sequence of keys; 'labels' is filled in
        # once the final extent is known.
        gene_dict[gene_name] = {
            'chr': row[1],
            'strand': row[2],
            row[0]: exons,
            'global_start': tx_start,
            'global_end': tx_end,
            'labels': None,
        }
    else:
        # Further transcript of a known gene: record it and widen the extent.
        entry[row[0]] = exons
        entry['global_start'] = min(entry['global_start'], tx_start)
        entry['global_end'] = max(entry['global_end'], tx_end)

# Allocate each label array once, after the gene extent is final, instead of
# re-allocating it for every transcript row.
for gene_name, entry in gene_dict.items():
    span = entry['global_end'] - entry['global_start']
    if span < 0:
        # np.zeros rejects negative sizes; report the offending gene and leave
        # its 'labels' entry as None.
        print(gene_name)
        print(entry['global_start'], entry['global_end'])
        continue
    entry['labels'] = np.zeros(span)

print("Took {} seconds".format(time.time() - start_time))

Took 6.940023899078369 seconds


In [7]:
# Two-column array pairing column 0 (transcript name) with column 7 (gene
# name) of the transcript table, one pair per row.
tg = transcript_file[:, [0, 7]]
print(tg[0], tg.shape)

['ENST00000371007.6' 'C1orf141'] (227517, 2)


In [None]:
#print(np.shape(np.concatenate((tg[:100],tg[100+1:]),axis=0)))

In [47]:
# Number of genes collected in gene_dict.
print(len(gene_dict))

# Check that one specific transcript ID is among the count-table columns.
transcript_present = 'ENST00000421768.1' in data.columns
if transcript_present:
    print('yes')

1044
yes


In [48]:
# Nested dictionary of per-sample labels:
#   library_AML[sample][gene] -> {
#       'alabels':     array indexed by position relative to the gene's
#                      global_start; counts accumulated at exon starts,
#       'dlabels':     same indexing; counts accumulated at exon ends,
#       'norm_factor': sum of the counts of all transcripts added for the gene
#   }
# A gene only gets an entry for a sample if at least one of its transcripts is
# a column of the count table. Only the first five samples are processed.
library_AML = {}

start_time = time.time()

for sample in data.index[0:5]:
    print(sample)
    library_AML[sample] = {}
    for gene in gene_dict.keys():
        gs = int(gene_dict[gene]['global_start'])
        ge = int(gene_dict[gene]['global_end'])
        for tr in gene_dict[gene].keys():
            # A gene entry mixes metadata keys with transcript names; only keys
            # that are also count-table columns are transcripts with counts.
            if tr not in data.columns:
                continue

            if gene not in library_AML[sample]:
                # First transcript found for this gene in this sample.
                length = ge - gs
                library_AML[sample][gene] = {
                    'alabels': np.zeros(length),
                    'dlabels': np.zeros(length),
                    'norm_factor': 0,
                }
            gene_labels = library_AML[sample][gene]

            count = float(data.at[sample, tr])

            # Exon start -> offset of the exon's first base within the gene.
            for s in gene_dict[gene][tr]['starts']:
                gene_labels['alabels'][int(s) - gs] += count

            # Exon end -> offset of the exon's last base within the gene.
            # The previous index `int(s) - ge` only worked through negative
            # indexing and wrapped to position 0 whenever an exon end equalled
            # global_end. Assumes exon ends are exclusive (half-open
            # coordinates, consistent with arrays of length ge - gs);
            # TODO confirm against the annotation source.
            for s in gene_dict[gene][tr]['ends']:
                gene_labels['dlabels'][int(s) - gs - 1] += count

            # Running total of counts, kept for later normalisation.
            gene_labels['norm_factor'] += count

print("All samples calc: {} seconds".format(time.time() - start_time))

AML1
AML2
AML3
AML4
AML5
All samples calc: 1.477175235748291 seconds


In [44]:
# Show which entries are stored for gene 'ACTL8' in sample 'AML2'.
# NOTE(review): gene_dict is built from 'chr22' rows only, so this lookup
# succeeds only if 'ACTL8' is among those genes -- confirm, otherwise this
# raises KeyError on a fresh run.
print(library_AML['AML2']['ACTL8'].keys())

dict_keys(['alabels', 'dlabels', 'norm_factor'])


In [45]:
path_to_file = './data/hg38.fa'

# Map every FASTA record id to its sequence object. The whole file is held in
# memory; values are the parser's sequence objects, not plain strings.
hg38 = {}

with open(path_to_file, mode='r') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        hg38[record.id] = record.seq

In [52]:
# Export one text file per sample. For every gene four lines are written:
#   1. '@' + gene name
#   2. genomic sequence from global_start - context to global_end + context
#   3. 'alabels' values, space separated
#   4. 'dlabels' values, space separated
start_time = time.time()

# Number of flanking bases added on each side of the gene sequence.
context = 1000

output_dir = './data/AML_library/'
# Create the target directory if needed so that open() does not fail.
os.makedirs(output_dir, exist_ok=True)

for sample in library_AML.keys():
    # NOTE(review): append mode means re-running this cell (e.g. after an
    # interrupted run) adds duplicate records to an existing file -- confirm
    # whether appending across runs is intended.
    with open(output_dir + sample + '.txt', 'a') as f1:
        # genes
        for gene in library_AML[sample].keys():
            # '\n' is used as the terminator: in text mode Python translates it
            # to the platform line ending, whereas writing os.linesep would be
            # translated a second time on Windows.
            f1.write('@' + gene + '\n')

            gs = int(gene_dict[gene]['global_start'])
            ge = int(gene_dict[gene]['global_end'])
            chrom_seq = hg38[gene_dict[gene]['chr']]

            left = gs - context
            right = ge + context
            # Clamp the window at the chromosome start (a negative slice start
            # would index from the end) and pad with 'N' on either side so the
            # sequence always has length ge - gs + 2 * context.
            seq = 'N' * max(0, -left) + str(chrom_seq[max(0, left):right])
            seq = seq.ljust(right - left, 'N')
            f1.write(seq + '\n')

            f1.write(labels_to_str(library_AML[sample][gene]['alabels']) + '\n')
            f1.write(labels_to_str(library_AML[sample][gene]['dlabels']) + '\n')

print("All samples saved: {} seconds".format(time.time() - start_time))

KeyboardInterrupt: 

In [None]:
# Number of genes that received labels for sample 'AML2'.
print(len(library_AML['AML2']))

In [None]:
# Dump the 'alabels' array of gene 'NOXA1' in sample 'AML1' to a text file
# named 'alabels_NOXA1.txt' (relative path).
# NOTE(review): gene_dict is built from 'chr22' rows only, so this lookup
# succeeds only if 'NOXA1' is among those genes -- confirm.
np.savetxt('alabels_NOXA1.txt', library_AML['AML1']['NOXA1']['alabels'], delimiter=' ')

In [None]:
# Print the first 15 characters of `tr`.
# NOTE(review): `tr` is not assigned in this cell; the only visible assignment
# is as the loop variable of the library_AML cell, so this prints whatever key
# that loop visited last -- looks like leftover debugging, confirm.
print(tr[:15])