In [1]:
## Import of used libraries
import numpy
import pandas as pd
import networkx as nx

In [2]:
## Genome-wide index of the first base of each chromosome when the chromosomes
## are laid end to end in the order 1...22 X Y.
## Only chromosomes 1-22 have an entry; X and Y are not defined here.
## NOTE(review): the values look like cumulative chromosome lengths of one
## reference assembly -- confirm which build they were taken from.
offset = {
    1: 0,
    2: 248956422,
    3: 491149951,
    4: 689445510,
    5: 879660065,
    6: 1061198324,
    7: 1232004303,
    8: 1391350276,
    9: 1536488912,
    10: 1674883629,
    11: 1808681051,
    12: 1943767673,
    13: 2077042982,
    14: 2191407310,
    15: 2298451028,
    16: 2400442217,
    17: 2490780562,
    18: 2574038003,
    19: 2654411288,
    20: 2713028904,
    21: 2777473071,
    22: 2824183054,
}

In [3]:
# Read the tab-separated .bed file; it has no header row, so the columns
# are labelled by pandas with the integers 0, 1, 2, ...
d = pd.read_csv(
    '../all.bed',
    sep='\t',
    header=None,
)

In [4]:
def _parse_block_field(field):
    """Turn a comma-separated block field such as '10,20,' into a list of numpy.int64."""
    # The trailing comma is optional: strip it when present instead of blindly
    # cutting off the last character (which would corrupt '10,20' into '10,2').
    # str() also covers a column that pandas parsed as plain integers.
    return [numpy.int64(token) for token in str(field).rstrip(',').split(',')]


def _prepare_strand(d, strand, offset):
    """Return the rows of `d` lying on `strand`, reduced to the five columns used downstream."""
    ## Drop unnecessary columns and name the required
    table = d[d[5] == strand].drop([2, 4, 5, 6, 7, 8, 9], axis=1)
    table.columns = ['chromosome', 'transcript_start', 'transcript_id', 'exon_sizes', 'exon_starts']

    ## Compute absolute genome positions for transcripts
    table['transcript_start'] = [
        start + offset[chromosome]
        for start, chromosome in zip(table['transcript_start'], table['chromosome'])
    ]

    ## Convert exon_sizes and exon_starts into lists of numbers
    table['exon_sizes'] = [_parse_block_field(field) for field in table['exon_sizes']]
    table['exon_starts'] = [_parse_block_field(field) for field in table['exon_starts']]
    return table


def preprocess_bed_format(d, offset):
    """Split a raw BED table by strand and normalise the columns needed later.

    Parameters
    ----------
    d : pandas.DataFrame
        Table with integer column labels 0..11. Column 5 holds the strand
        ('+' or '-'); columns 0, 1, 3, 10 and 11 are kept and renamed to
        chromosome, transcript_start, transcript_id, exon_sizes, exon_starts.
    offset : mapping
        Maps each value found in the chromosome column to the number added
        to the transcript start. A chromosome without an entry raises KeyError.

    Returns
    -------
    (d_pos, d_neg) : tuple of pandas.DataFrame
        Rows of the '+' strand and of the '-' strand. In both,
        transcript_start is shifted by the chromosome offset and
        exon_sizes / exon_starts are lists of numpy.int64.

    Notes
    -----
    Both strands are processed identically. The original author left the open
    question "Maybe something different for negative strand?" -- still a TODO.
    """
    ## Separate data into positive and negative strand
    d_pos = _prepare_strand(d, '+', offset)
    d_neg = _prepare_strand(d, '-', offset)

    return d_pos, d_neg

# Split the loaded table by strand. The negative-strand table is bound to
# `d_neg`, matching `d_pos` and the names used inside preprocess_bed_format.
d_pos, d_neg = preprocess_bed_format(d, offset)
dneg = d_neg  # alias kept so code using the earlier spelling still works

In [5]:
## Flattens the processed transcript table into a single list of exon endpoints
def get_exon_endpoint_positions(d):
    """List the start and end point of every exon of every transcript in `d`.

    `d` must provide the columns transcript_start, exon_starts and exon_sizes,
    where exon_starts holds offsets relative to transcript_start.

    Returns a flat list of dicts, two per exon (start point first, then end
    point), in row order and exon order. Each dict has the keys
    'position', 'transcript_index' (the row label), 'exon_index' and
    'start_point'. The end position is inclusive: start + size - 1.
    """
    endpoints = []
    for row_label, row in d.iterrows():
        origin = row.transcript_start
        for exon_number, relative_start in enumerate(row.exon_starts):
            first_base = origin + relative_start
            last_base = first_base + row.exon_sizes[exon_number] - 1
            endpoints.append({
                'position': first_base,
                'transcript_index': row_label,
                'exon_index': exon_number,
                'start_point': True,
            })
            endpoints.append({
                'position': last_base,
                'transcript_index': row_label,
                'exon_index': exon_number,
                'start_point': False,
            })
    return endpoints

In [6]:
## Collects every exon endpoint of the positive strand, ordered by genome position;
## when two endpoints share a position, start points are placed before end points
exon_endpoint_pos_list = sorted(
    get_exon_endpoint_positions(d_pos),
    key=lambda endpoint: (endpoint['position'], not endpoint['start_point']),
)

In [7]:
# Sweep over the sorted endpoints and record, on every exon that is currently
# open, each position at which a new pseudo-exon begins.
# The boundaries are stored in 'starting_points' on the exon's start-point dict.
active_exons = dict()
for exon_endpoint in exon_endpoint_pos_list:
    exon_key = (exon_endpoint['transcript_index'], exon_endpoint['exon_index'])
    opening = exon_endpoint['start_point']

    if opening:
        # The exon becomes active before the boundary is recorded,
        # so it also receives its own start position.
        exon_endpoint['starting_points'] = list()
        active_exons[exon_key] = exon_endpoint
        boundary = exon_endpoint['position']
    else:
        # An exon ending at p makes the next pseudo-exon begin at p + 1.
        boundary = exon_endpoint['position'] + 1

    for open_exon in active_exons.values():
        open_exon['starting_points'].append(boundary)

    if not opening:
        # Closed only after recording, so the ending exon gets its final boundary too.
        del active_exons[exon_key]

In [8]:
# Compute pseudo-exons: cut every exon at its recorded boundaries.
# Consecutive distinct boundaries (a, b) yield the inclusive interval (a, b - 1);
# repeated boundaries are skipped.
for exon_endpoint in exon_endpoint_pos_list:
    if not exon_endpoint['start_point']:
        continue
    boundaries = exon_endpoint['starting_points']
    pieces = list()
    left = boundaries[0]
    for right in boundaries[1:]:
        if right != left:
            pieces.append((left, right - 1))
            left = right
    exon_endpoint['pseudo_exons'] = pieces

In [9]:
# Group pseudo_exons by transcript (assumption: exons of a transcript do not overlap).
# Endpoints are visited in sorted order, so each transcript's pseudo-exons
# are collected in the order its exon start points appear in that list.
transcripts = dict()
for exon_endpoint in exon_endpoint_pos_list:
    if not exon_endpoint['start_point']:
        continue
    record = transcripts.setdefault(exon_endpoint['transcript_index'], {'pseudo_exons': list()})
    record['pseudo_exons'].extend(exon_endpoint['pseudo_exons'])

In [10]:
# Computes the pseudo_exons column
# Each transcript record is tagged with its row label ('id') so the records
# can be ordered by it.
for transcript_index, record in transcripts.items():
    record.update({'id': transcript_index})
pseudo_exons_list = sorted(transcripts.values(), key=lambda record: record['id'])

# NOTE(review): the column is assigned by position, which assumes the rows of
# d_pos are in ascending index order and each has a record -- confirm.
pseudo_exons_column = [record['pseudo_exons'] for record in pseudo_exons_list]
d_pos['pseudo_exons'] = pseudo_exons_column

In [11]:
# Build vertex set: every distinct pseudo-exon gets a consecutive integer id,
# assigned in order of first appearance
vertices = dict()      # Given a pseudo_exon (x, y) returns its id
vertices_inv = dict()  # Given an id returns the corresponding pseudo_exon (x, y)
next_id = 0
for pseudo_exons in pseudo_exons_column:
    for pseudo_exon in pseudo_exons:
        if pseudo_exon in vertices:
            continue
        vertices[pseudo_exon] = next_id
        vertices_inv[next_id] = pseudo_exon
        next_id += 1

In [12]:
# Build edge set, source and target vertices.
# It also builds the transcript paths starting at every source
# (several transcripts may share an edge; each edge is stored once).

## These dicts are indexed by a pseudo-exon, i.e. the pair (exon_start, exon_end):
##   sources / targets : pseudo-exon -> set of transcript row labels that begin / end there
##   transcript_paths  : first pseudo-exon -> list of vertex-id paths starting there
transcript_paths = dict()
sources = dict()
targets = dict()

## The keys are the edges (pairs of vertex ids) and the value is the corresponding edge id
edges = dict()
next_id = 0
for index, row in d_pos.iterrows():
    pseudo_exons = row['pseudo_exons']
    first_pe, last_pe = pseudo_exons[0], pseudo_exons[-1]

    sources.setdefault(first_pe, set()).add(index)
    targets.setdefault(last_pe, set()).add(index)

    ## The path of a transcript is the sequence of vertex ids of its pseudo-exons;
    ## consecutive pseudo-exons are linked by an edge
    transcript_path = [vertices[pseudo_exon] for pseudo_exon in pseudo_exons]
    for edge in zip(transcript_path, transcript_path[1:]):
        if edge not in edges:
            edges[edge] = next_id
            next_id += 1

    transcript_paths.setdefault(first_pe, list()).append(transcript_path)

In [13]:
# Build graph to find weakly connected components, and also
# compute len, sources, targets and transcript paths for each of them
G = nx.DiGraph()
G.add_nodes_from(range(len(vertices)))
G.add_edges_from(edges.keys())

components = list()
for component_v in nx.weakly_connected_components(G):
    subgraph = G.subgraph(component_v)
    source_vertices = set()
    target_vertices = set()
    paths = list()

    for vertex in component_v:
        interval = vertices_inv[vertex]
        # A vertex is a source (target) when some transcript starts (ends) at its pseudo-exon
        if interval in sources:
            source_vertices.add(vertex)
            paths.extend(transcript_paths[interval])
        if interval in targets:
            target_vertices.add(vertex)

    components.append({
        'graph': subgraph,
        'len': len(subgraph),
        'sources': source_vertices,
        'targets': target_vertices,
        'transcript_paths': paths,
        # every vertex of the component is listed as a vertex constraint
        'vertex_constrains': set(component_v),
    })

In [26]:
from json import dump
def _dump_json_file(payload, path):
    """Write `payload` as JSON to `path`, closing the file even if serialisation fails."""
    with open(path, 'w') as f:
        dump(payload, f)


## It stores the networkx graph, the transcript paths, the sources, the targets
## and the vertex constraints of one component
def store_components_to_files(component, i):
    """Write one component to the ../gene_graphs/ directory tree.

    Parameters
    ----------
    component : dict
        Must provide the keys 'graph', 'transcript_paths', 'sources',
        'targets' and 'vertex_constrains'.
    i : int
        Zero-based component number; files are named component_{i+1}.

    The graph is written as an edge list with ':' as delimiter; every other
    entry goes to its own JSON file. The target directories are expected to
    exist already (nothing here creates them).
    """
    nx.write_edgelist(component['graph'], path=f'../gene_graphs/graphs/component_{i+1}.edgelist', delimiter=':')

    _dump_json_file(component['transcript_paths'], f'../gene_graphs/transcript_paths/component_{i+1}.json')

    # Sets are converted to lists because JSON has no set type
    _dump_json_file(list(component['sources']), f'../gene_graphs/sources/component_{i+1}.json')
    _dump_json_file(list(component['targets']), f'../gene_graphs/targets/component_{i+1}.json')
    _dump_json_file(list(component['vertex_constrains']), f'../gene_graphs/vertex_constrains/component_{i+1}.json')
    

In [28]:
## Store components/gene_graphs to files
for i, component in enumerate(components):
    store_components_to_files(component, i)


## Store vertices_inv: id --> (genome_pos, genome_pos)
# Coordinates are cast to plain int first: they derive from numpy.int64 values,
# which json cannot serialise.
vertices_inv = {key: (int(x), int(y)) for key, (x, y) in vertices_inv.items()}
# `with` closes the file even if dump raises
with open('../gene_graphs/vertices_inv.json', 'w') as f:
    dump(vertices_inv, f)

In [17]:
## It stores the corresponding component as lemon graph format file in filename
## Vertex mappings original_id (networkx's id), is_source (if it is a source), is_target (if it is a target)
## and is_vertex_constrain (if it is a vertex constrain) are included in the file
def store_to_file_in_lemon_format_with_mappings(component, filename):
    """Write `component` to `filename` in LEMON graph format (LGF).

    Parameters
    ----------
    component : dict
        'graph' must expose iterable `.nodes` and `.edges` (edges as pairs);
        'sources', 'targets' and 'vertex_constrains' are containers tested
        with `in` to produce the 0/1 flags of each node.
    filename : str
        Path of the file to create or overwrite.

    The @nodes section has one tab-separated row per node: label,
    original_id (both the node itself) and the three 0/1 flags. The @arcs
    section has one row per edge: source, target and a running label that
    starts at 0. Every row ends with a tab before the newline.
    """
    G = component['graph']
    sources = component['sources']
    targets = component['targets']
    constrains = component['vertex_constrains']

    # `with` guarantees the file is flushed and closed even if a write fails midway
    with open(filename, 'w') as file:
        file.write("@nodes\n")
        file.write("label\toriginal_id\tis_source\tis_target\tis_vertex_constrain\t\n")
        for vertex in G.nodes:
            fields = [
                str(vertex),
                str(vertex),
                str(1 if vertex in sources else 0),
                str(1 if vertex in targets else 0),
                str(1 if vertex in constrains else 0),
            ]
            file.write("\t".join(fields) + "\t\n")
        file.write("@arcs\n")
        file.write("\t\tlabel\t\n")
        for i, edge in enumerate(G.edges):
            file.write("\t".join([str(edge[0]), str(edge[1]), str(i)]) + "\t\n")


In [18]:
## Store the components with more than two vertices in lemon graph format,
## one file per component, numbered from 1 in the order of `components`
## Can be skipped if already computed
for number, component in enumerate(components, start=1):
    if component['len'] <= 2:
        continue
    lgf_path = '../lgf/component_' + str(number) + '.lgf'
    store_to_file_in_lemon_format_with_mappings(component, lgf_path)