In [29]:
import os
import time
import json
import numpy as np
from functools import reduce
import re
from collections import Counter
from intervaltree import Interval, IntervalTree


In [2]:

nts = 'AGNCT'
pairs = {nts[i]:nts[4-i] for i in range(5)}
def revComp(seq):
    return ''.join([pairs[nt] for nt in seq][::-1])



In [3]:
def getTranscriptGeneName(name):
    match = re.search(r'(:|=)((\w+[A-Z]\w+\.t?\d+)|([A-Z]\w+\.t?\d+))((\.\d+)|([a-z](\.\d+)*))*(;|$)', name)
    if match:
        return match.group(2)
    return '-'


def getAlias(name):
    match = re.search(r'(Alias=)([\w\-.]+)', name)
    if match:
        return match.group(2)
    return '-'


def getBiotype(name):
    match = re.search(r'(biotype=)([\w\-]+);?', name)
    if match:
        return match.group(2)
    return '-'

def getWBGene(name):
    match = re.search(r'(WBGene\d+);?', name)
    if match:
        return match.group(1)
    return '-'


def parseWormbaseExon(data):
    avoid = {'intron', 'gene', 'mRNA'}
    gene2ivs = {}
    gene2info = {}
    seen = {}
    for curr in data:
        metadata = curr[-1]
        gene = getTranscriptGeneName(metadata)
            
        if gene not in gene2info:
            gene2info[gene] = {'chrom':curr[0], 'strand':curr[6], 'element':set([curr[2]])}
            
        else:
            gene2info[gene]['element'].add(curr[2])
        
        if 'Alias=' in metadata:
            alias = getAlias(metadata)
            if 'alias' not in gene2info[gene]:
                gene2info[gene]['alias'] = set([alias])
            else:
                gene2info[gene]['alias'].add(alias)
        if 'biotype=' in metadata:
            biotype = getBiotype(metadata)
            gene2info[gene]['biotype'] = biotype
        if 'WBGene' in metadata:
            wbgene = getWBGene(metadata)
            if 'wbgene' in gene2info[gene]:
                gene2info[gene]['wbgene'].add(wbgene)                    
            else:
                gene2info[gene]['wbgene'] = set([wbgene])
                
                
        if gene in gene2ivs:
            if curr[2] not in avoid:
                gene2ivs[gene].append((int(curr[3])-1, int(curr[4])))
        else:
            if curr[2] not in avoid:
                gene2ivs[gene] = [(int(curr[3])-1, int(curr[4]))]
                   
            
    sub_elements = {'CDS', 'exon', 'five_prime_UTR', 'three_prime_UTR', 'nc_primary_transcript', 'pre_miRNA', 'miRNA_primary_transcript'}
    for curr in gene2info:
        if 'biotype' not in gene2info[curr]:
            diff = gene2info[curr]['element'] - sub_elements
            if isinstance(diff, str):
                gene2info[curr]['biotype'] = diff
            else:
                gene2info[curr]['biotype'] = ','.join(list(diff))
        
        if 'alias' not in gene2info[curr]:
            gene2info[curr]['alias'] = curr
        else:
            gene2info[curr]['alias'] = ','.join(list(gene2info[curr]['alias']))
            
        if 'wbgene' in gene2info[curr]:
            gene2info[curr]['wbgene'] = ','.join(list(gene2info[curr]['wbgene']))
        
            
    return gene2ivs, gene2info

In [4]:
def getWormbaseData(file):
    with open(file) as f:
        data = [row.strip().split("\t") for row in f.readlines()]
    return data

def getMergedSequence(interval, info, id2seq):
    merged_seq = ''
    for curr in interval:
        start, end, _ = curr
        merged_seq += id2seq[info['chrom']][start:end]
    return merged_seq if info['strand'] == '+' else revComp(merged_seq)

    
def getMetaGenome(file): 
    data = getWormbaseData(file)
    gene2ivs, gene2info = parseWormbaseExon(data)

    gene2intervals = {}
    for gene in gene2ivs:
        curr = IntervalTree.from_tuples(gene2ivs[gene])
        curr.merge_overlaps()
        gene2info[gene]['interval'] = curr
        #gene2intervals[gene] = curr

        gene2info[gene]['start'] = curr.begin()
        gene2info[gene]['end'] = curr.end()
        
    
    return gene2info
    

In [5]:
file = 'ws268.wormbase'


In [6]:
gene2info= getMetaGenome(file)

In [7]:
[curr for curr in gene2info if 'wbgene' not in gene2info[curr]]

[]

In [8]:
gene2info['Y105E8A.7']

{'alias': 'lev-10,eat-18',
 'biotype': 'protein_coding',
 'chrom': 'I',
 'element': {'CDS',
  'exon',
  'five_prime_UTR',
  'gene',
  'mRNA',
  'three_prime_UTR'},
 'end': 14400222,
 'interval': IntervalTree([Interval(14390332, 14390458), Interval(14391959, 14392203), Interval(14393324, 14393449), Interval(14393546, 14393629), Interval(14393757, 14393969), Interval(14395101, 14395309), Interval(14395359, 14395521), Interval(14396345, 14396496), Interval(14396539, 14396664), Interval(14396716, 14397098), Interval(14397611, 14397942), Interval(14398317, 14398857), Interval(14399417, 14399536), Interval(14399622, 14399839), Interval(14400057, 14400222)]),
 'start': 14390332,
 'strand': '+',
 'wbgene': 'WBGene00001147,WBGene00002977'}

In [9]:
len(gene2info)

46910

In [None]:
[(curr, gene2info[curr]) for curr in gene2info if 'alias' not in gene2info[curr]]

In [None]:
Counter([gene2info[curr]['biotype'] for curr in gene2info ])

In [None]:
len(gene2intervals)

In [None]:
[(curr, gene2info[curr]['wbgene']) for curr in gene2info if gene2info[curr]['alias'] == 'lev-10']

In [None]:
[(curr, gene2info[curr]['wbgene']) for curr in gene2info if ',' in gene2info[curr]['wbgene']]

In [None]:
gene2info['Y105E8A.7']

In [None]:
[(curr,gene2info[curr]) for curr in gene2info if gene2info[curr]['biotype']=='mRNA']

In [None]:
len(gene2info)

In [10]:
gene2info

{'Y74C9A.6': {'alias': 'Y74C9A.6',
  'biotype': 'snoRNA',
  'chrom': 'I',
  'element': {'exon', 'gene', 'snoRNA'},
  'end': 3909,
  'interval': IntervalTree([Interval(3746, 3909)]),
  'start': 3746,
  'strand': '-',
  'wbgene': 'WBGene00023193'},
 'Y74C9A.3': {'alias': 'homt-1',
  'biotype': 'protein_coding',
  'chrom': 'I',
  'element': {'CDS',
   'exon',
   'five_prime_UTR',
   'gene',
   'mRNA',
   'three_prime_UTR'},
  'end': 10230,
  'interval': IntervalTree([Interval(4115, 4358), Interval(5194, 5296), Interval(6036, 6327), Interval(9726, 9846), Interval(10094, 10230)]),
  'start': 4115,
  'strand': '-',
  'wbgene': 'WBGene00022277'},
 'Y74C9A.2': {'alias': 'nlp-40',
  'biotype': 'protein_coding',
  'chrom': 'I',
  'element': {'CDS',
   'exon',
   'five_prime_UTR',
   'gene',
   'mRNA',
   'three_prime_UTR'},
  'end': 16837,
  'interval': IntervalTree([Interval(11494, 11561), Interval(11617, 11689), Interval(14949, 15160), Interval(16472, 16837)]),
  'start': 11494,
  'strand': '+

In [98]:
def getRangesFromTree(tree, k=10000, m=0):
    ranges = {chrom:{'+':[], '-':[]} for chrom in chroms}
    for chrom in trees:
        for strand in ['+', '-']:
            tree = trees[chrom][strand]
            for i in range(0, tree.end()+1, k):
                ranges[chrom][strand].append([curr[-1] for curr in tree2json(tree.overlap(i-m, i+k+m))])
    return ranges


def getBlocksFromTree(tree):
    #diameter of a semi-circle with given pixels: pixels*pi/2
    pixels = {'low':800, 'mid':1280, 'high':2880, 'ultra':5120}
    resolutions = {curr:pixels[curr]*3/2 for curr in pixels}
    arc_blocks = {curr:{} for curr in resolutions}

    for curr in arc_blocks:
        ranges_arc = {chrom:{'+':{}, '-':{}} for chrom in chroms}
        countd = {chrom:{'+':[], '-':[]} for chrom in chroms}
        count_tree = {chrom:{'+':None, '-':None} for chrom in chroms}
        for chrom in tree:
            chrom_len = max(tree[chrom]['+'].end(), tree[chrom]['-'].end())
            block_size =  chrom_len//resolutions[curr]
            arc_blocks[curr][chrom] = getArcChromBlock(tree[chrom], chrom_len, block_size)    
    return arc_blocks

                
def getArcChromBlock(tree, chrom_len, block_size):
    m = 0
    arcBlockInterval = {strand:None for strand in ['+', '-']}
    for strand in ['+', '-']:
        blocks = []
        curr_tree = tree[strand]
        for i in range(0, int(chrom_len//block_size+1)):
            overlap = curr_tree.overlap(i*block_size, (i+1)*block_size)
            if len(overlap) > 0: 
                blocks.append([i, i+1.01])                
        block_tree = IntervalTree.from_tuples(blocks)
        block_tree.merge_overlaps()
        arcBlockInterval[strand] = tree2json(block_tree, False)
    return arcBlockInterval


def tree2json(tree, add_data=True):
    output = []
    for branch in tree:
        start, end, data = branch
        if add_data:
            output.append([start, end, data])
        else:
            output.append([start, end])
    return sorted(output)


def getTrackData(gene2info):
    
    gene2intervals = {}
    interval2genes = {}
    interval2blocks = {}

    fields = ['chrom', 'start', 'end', 'strand', 'alias', 'biotype']
    gene2intervals = {curr:{**{'intervals':tree2json(gene2info[curr]['interval'], False)}, **{field:gene2info[curr][field] for field in fields}} for curr in gene2info}

    chroms = set([gene2info[gene]['chrom'] for gene in gene2info])
    chrom2intervals = {chrom:{'+':[], '-':[]} for chrom in chroms}

    for gene in gene2info:
        curr = gene2info[gene]
        chrom2intervals[gene2info[gene]['chrom']][gene2info[gene]['strand']].append([curr['start'], curr['end'], gene])

    chrom2trees = {}
    for chrom in chroms:
        chrom2trees[chrom] = {}
        for strand in ['+', '-']:
            chrom2trees[chrom][strand] = IntervalTree.from_tuples(chrom2intervals[chrom][strand]) 

    interval2genes = getRangesFromTree(chrom2trees)
    interval2blocks = getBlocksFromTree(chrom2trees)
    
    return gene2intervals, interval2genes, interval2blocks
    

def generateTrackData():
    tracks = []
    protein_coding = {gene:gene2info[gene] for gene in gene2info if gene2info[gene]['biotype']=='protein_coding'}
    gene2intervals, interval2genes, interval2blocks = getTrackData(protein_coding)
    tracks.append({'track':'protein coding', 'gene2intervals':gene2intervals, 'interval2genes':interval2genes, 'interval2blocks':interval2blocks})
    
    others = {gene:gene2info[gene] for gene in gene2info if gene2info[gene]['biotype']!='protein_coding'}
    gene2intervals, interval2genes, interval2blocks = getTrackData(others)
    tracks.append({'track':'others', 'gene2intervals':gene2intervals, 'interval2genes':interval2genes, 'interval2blocks':interval2blocks})
    
    return tracks
    