In [1]:
## Import of used libraries
import numpy
import pandas as pd
import networkx as nx

In [2]:
# Read the exon annotation: a tab-separated .bed file without a header row
d = pd.read_csv(f'../exons.bed', sep='\t', header=None)
# Force every chromosome name (column 0) to be a string
d[0] = [str(chr_name) for chr_name in d[0]]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
## Computes an artificial offset for every chromosome in the bed file, so that
## positions of different chromosomes can share a single coordinate axis
import itertools

d_pos = d[d[5] == '+']

# Columns 10 and 11 hold comma-separated integer lists. Empty tokens are skipped
# instead of blindly discarding the last token, so every value is taken into
# account whether or not the lists end with a trailing comma.
starts_exons = [
    int(token)
    for field in itertools.chain(d_pos[10], d_pos[11])
    for token in str(field).split(',')
    if token.strip()
]

# Largest value found in columns 10/11 of the positive strand (0 if there is none)
relative_max_away = max(starts_exons, default=0)

offset = dict()
chr_names = sorted(set(d[0]))
previous_chr_name = chr_names[0]
offset[previous_chr_name] = 0
for chr_name in chr_names[1:]:
    # Chromosome names come from both strands, so the previous chromosome may have
    # no positive-strand row at all; it then contributes a maximum start of 0
    # instead of making max() fail on an empty selection.
    previous_max_start = max(d_pos[d_pos[0] == previous_chr_name][1], default=0)
    # Next chromosome starts after the largest start of the previous one plus a
    # safety margin of twice the largest relative value.
    offset[chr_name] = previous_max_start + offset[previous_chr_name] + 2*relative_max_away
    previous_chr_name = chr_name

In [4]:
def preprocess_bed_format(d, offset):
    """Keep the positive-strand rows of a 12-column bed table and parse them.

    Parameters
    ----------
    d : pandas.DataFrame
        Bed table read without header (integer column labels 0..11).
    offset : dict
        Maps a chromosome name (column 0) to the amount added to the
        transcript start (column 1) of its rows.

    Returns
    -------
    pandas.DataFrame
        New frame (``d`` is not modified) with the columns ``chromosome``,
        ``transcript_start`` (already shifted by the chromosome offset),
        ``transcript_id``, ``exon_sizes`` and ``exon_starts``; the last two
        hold lists of ``numpy.int64``. The row index of ``d`` is preserved.

    Raises
    ------
    KeyError
        If a chromosome of a positive-strand row is missing from ``offset``.
    ValueError
        If a list field contains a token that is not an integer.
    """
    def _parse_int_list(field):
        # Skip empty tokens so that lists ending with a comma ("10,20,") are
        # accepted; str() tolerates fields that pandas already parsed as numbers.
        return [numpy.int64(token) for token in str(field).split(',') if token.strip()]

    ## Get positive strand, drop unnecessary columns and name the required ones
    plus_strand = d[d[5] == '+'].drop([2, 4, 5, 6, 7, 8, 9], axis=1)
    plus_strand.columns = ['chromosome', 'transcript_start', 'transcript_id', 'exon_sizes', 'exon_starts']

    ## Compute absolute genome positions for transcripts
    plus_strand['transcript_start'] = [
        start + offset[chromosome]
        for start, chromosome in zip(plus_strand['transcript_start'], plus_strand['chromosome'])
    ]

    ## Maybe something different for negative strand?

    ## Convert exon_sizes and exon_starts into lists of numbers
    plus_strand['exon_sizes'] = [_parse_int_list(field) for field in plus_strand['exon_sizes']]
    plus_strand['exon_starts'] = [_parse_int_list(field) for field in plus_strand['exon_starts']]

    return plus_strand

# Positive-strand exon table with transcript starts shifted by the chromosome offsets
d_pos = preprocess_bed_format(d, offset)

In [5]:
# Read the CDS annotation and preprocess it exactly like the exon table
s = pd.read_csv(f'../CDS.bed', sep='\t', header=None)
s[0] = [str(chr_name) for chr_name in s[0]]

s_pos = preprocess_bed_format(s, offset)

# transcript id -> list of (first position, last position) pairs, one per CDS block
s_c = {}
for cds_transcript_id, cds_base, cds_rel_starts, cds_sizes in zip(
        s_pos['transcript_id'], s_pos['transcript_start'], s_pos['exon_starts'], s_pos['exon_sizes']):
    first_CDS_start = int(cds_base)
    s_c[cds_transcript_id] = [
        (rel_start + first_CDS_start, rel_start + first_CDS_start + size - 1)
        for rel_start, size in zip(cds_rel_starts, cds_sizes)
    ]

# One entry per exon-table row: the CDS pairs of its transcript, or an empty list
CDSs_column = [
    [(CDS[0], CDS[1]) for CDS in s_c[exon_transcript_id]] if exon_transcript_id in s_c else []
    for exon_transcript_id in d_pos['transcript_id']
]
d_pos['CDSs'] = CDSs_column

In [6]:
## Flattens the processed table into a single list with both endpoints of every exon
def get_exon_endpoint_positions(d):
    """Return one dict per exon endpoint, row by row and exon by exon.

    For every exon two dicts are emitted, the start endpoint first and then the
    end endpoint. Each dict carries the row label (``index``), the absolute
    ``position`` (the end is ``start + size - 1``, i.e. inclusive), the
    ``transcript_id``, the ``exon_index`` within the row and the ``start_point``
    flag (True for the start endpoint, False for the end endpoint).
    """
    endpoints = []
    for row_label, row in d.iterrows():
        first_base = row.transcript_start
        owner = row.transcript_id
        for exon_no, relative_start in enumerate(row.exon_starts):
            left = first_base + relative_start
            right = left + row.exon_sizes[exon_no] - 1
            for position, is_start in ((left, True), (right, False)):
                endpoints.append({
                    'index': row_label,
                    'position': position,
                    'transcript_id': owner,
                    'exon_index': exon_no,
                    'start_point': is_start,
                })
    return endpoints

In [7]:
## Collects every exon endpoint and orders them by position; when two endpoints
## share a position, the start points come before the end points
exon_endpoint_pos_list = sorted(
    get_exon_endpoint_positions(d_pos),
    key=lambda endpoint: (endpoint['position'], not endpoint['start_point']),
)

In [8]:
# Sweep over the sorted endpoints and record, on every exon that is currently open,
# each boundary met while it is open (start positions, and end positions + 1)
active_exons = {}
for exon_endpoint in exon_endpoint_pos_list:
    exon_key = (exon_endpoint['index'], exon_endpoint['exon_index'])
    opening = exon_endpoint['start_point']

    if opening:
        # The exon opens here: register it so that it also records its own start
        exon_endpoint['starting_points'] = []
        active_exons[exon_key] = exon_endpoint
        boundary = exon_endpoint['position']
    else:
        # The position right after an end is where the next piece would begin
        boundary = exon_endpoint['position'] + 1

    for open_exon in active_exons.values():
        open_exon['starting_points'].append(boundary)

    if not opening:
        # The closing exon received its final boundary above; it is no longer open
        del active_exons[exon_key]

In [9]:
# Compute pseudo-exons: cut every exon at the distinct boundaries recorded for it
for exon_endpoint in exon_endpoint_pos_list:
    if not exon_endpoint['start_point']:
        continue
    exon_endpoint['pseudo_exons'] = []
    boundaries = exon_endpoint['starting_points']
    piece_start = boundaries[0]
    for boundary in boundaries[1:]:
        # Repeated boundaries do not open a new piece
        if boundary == piece_start:
            continue
        exon_endpoint['pseudo_exons'].append((piece_start, boundary - 1))
        piece_start = boundary

In [10]:
# Group pseudo_exons by transcript row (assumption: exons of a transcript do not overlap)
transcripts = {}
for exon_endpoint in exon_endpoint_pos_list:
    if not exon_endpoint['start_point']:
        continue
    row_label = exon_endpoint['index']
    # Create the entry of the row the first time one of its exons is seen
    if row_label not in transcripts:
        transcripts[row_label] = {'pseudo_exons': [], 'index': row_label}
    transcripts[row_label]['pseudo_exons'].extend(exon_endpoint['pseudo_exons'])

In [11]:
# def intersect(i1, i2):
#     mac = min(i1[0], i2[0])
#     Mac = max(i1[0], i2[0])
#     mbd = min(i1[1], i2[1])
#     return mac <= Mac and Mac <= mbd
    
# def get_CDS_path(pseudo_exon_path, CDS):
#     CDS_path = list()
#     for pseudo_exon in pseudo_exon_path:
#         if intersect(pseudo_exon, CDS):
#             CDS_path.append(pseudo_exon)
#     return CDS_path

# #Compute CDSs in pseudo_exons terms per each transcript
# for transcript_id in transcripts:
#     if transcript_id in s_c:
#         CDSs = s_c[transcript_id]
#         transcripts[transcript_id]['CDSs'] = list(map(lambda CDS:
#                                                              get_CDS_path(transcripts[transcript_id]['pseudo_exons'],
#                                                                             CDS)
#                                                               ,CDSs))
#     else:
#         transcripts[transcript_id]['CDSs'] = list()

In [12]:
# Build the pseudo_exons column: one list of pseudo-exons per row, ordered by row index
pseudo_exons_list = sorted(transcripts.values(), key=lambda entry: entry['index'])

pseudo_exons_column = [entry['pseudo_exons'] for entry in pseudo_exons_list]

d_pos['pseudo_exons'] = pseudo_exons_column

In [13]:
# Build vertex set: every distinct pseudo-exon gets the next free integer id
vertices = {}      # Given a pseudo_exon (x,y) returns its id
vertices_inv = {}  # Given an id returns the corresponding pseudo_exon (x,y)
next_id = 0
for pseudo_exons in pseudo_exons_column:
    for pseudo_exon in pseudo_exons:
        if pseudo_exon in vertices:
            continue
        vertices[pseudo_exon] = next_id
        vertices_inv[next_id] = pseudo_exon
        next_id += 1

In [14]:
def intersect(i1, i2):
    """Tell whether two closed intervals share at least one position.

    Parameters
    ----------
    i1, i2 : sequence
        Intervals given as ``(first, last)`` with both ends included.

    Returns
    -------
    bool
        True when the later of the two starts is not past the earlier of the
        two ends; intervals that merely touch at one position do intersect.
    """
    latest_start = max(i1[0], i2[0])
    earliest_end = min(i1[1], i2[1])
    return latest_start <= earliest_end

def get_CDS(CDSs, path, vertices_inv):
    """Locate the coding part of a transcript path.

    ``CDSs`` is the list of CDS intervals of the transcript, ``path`` the list
    of vertex ids of the transcript and ``vertices_inv`` maps a vertex id to
    its interval. Returns an empty dict when ``CDSs`` is empty. Otherwise the
    dict holds ``subpath`` (the slice of ``path`` from the first vertex that
    intersects the first CDS interval to the last vertex that intersects the
    last CDS interval), ``start`` (start of the first CDS interval) and ``end``
    (end of the last CDS interval).
    """
    if not CDSs:
        return dict()

    leading_cds = CDSs[0]
    trailing_cds = CDSs[-1]

    # Walk forward to the first vertex touching the first CDS interval
    lo = 0
    while not intersect(vertices_inv[path[lo]], leading_cds):
        lo += 1

    # Walk backward to the last vertex touching the last CDS interval
    hi = len(path) - 1
    while not intersect(vertices_inv[path[hi]], trailing_cds):
        hi -= 1

    return {
        'subpath': path[lo:hi + 1],
        'start': leading_cds[0],
        'end': trailing_cds[1],
    }

# Build the edge set and the source and target pseudo-exons
# It also builds the transcript paths, grouped by the pseudo-exon they start at
# (This could generate multiedges in the graph)

## These dicts are indexed by the pair (exon_start, exon_end)
transcript_paths = {}
sources = {}
targets = {}

## The keys are the edges (pairs of vertex ids) and the values the corresponding ids
edges = {}
next_id = 0
for index, row in d_pos.iterrows():
    pseudo_exons = row['pseudo_exons']
    first_pe = pseudo_exons[0]
    last_pe = pseudo_exons[-1]

    # Remember which rows start / end at these pseudo-exons
    sources.setdefault(first_pe, set()).add(index)
    targets.setdefault(last_pe, set()).add(index)

    CDSs = row['CDSs']

    # Vertex ids of the pseudo-exons of the row, in the order they are stored
    transcript_path = [vertices[pseudo_exon] for pseudo_exon in pseudo_exons]

    ## Consecutive pseudo exons are linked by an edge; unseen edges get the next id
    for edge in zip(transcript_path, transcript_path[1:]):
        if edge not in edges:
            edges[edge] = next_id
            next_id += 1

    transcript_paths.setdefault(first_pe, []).append({
        'transcript_path': transcript_path,
        'CDS': get_CDS([(int(CDS[0]), int(CDS[1])) for CDS in CDSs], transcript_path, vertices_inv),
    })
    
#     transcript_paths[pseudo_exons[0]].append({
#         'transcript_path':transcript_path,
#         'CDSs': list(map(lambda CDS: {
#             'CDS': (int(CDS[0][0]), int(CDS[0][1])),
#             'path': list(map(lambda interval: vertices[interval], CDS[1]))
#         }, zip(CDSs, CDSs_path)))
#     })

In [15]:
# Build the graph to find its weakly connected components and, for each of them,
# compute its size, sources, targets and transcript paths
G = nx.DiGraph()
G.add_nodes_from(range(len(vertices)))
G.add_edges_from(edges.keys())

components = []
for component_v in nx.weakly_connected_components(G):
    sources_component = set()
    targets_component = set()
    transcript_paths_component = []

    for vertex in component_v:
        interval = vertices_inv[vertex]
        if interval in sources:
            # A source vertex brings along every transcript path that starts at it
            sources_component.add(vertex)
            transcript_paths_component.extend(transcript_paths[interval])
        if interval in targets:
            targets_component.add(vertex)

    component_graph = G.subgraph(component_v)
    component_dict = {
        'graph': component_graph,
        'len': len(component_graph),
        'sources': sources_component,
        'targets': targets_component,
        'transcript_paths': transcript_paths_component,
        'vertex_constrains': set(component_v),
    }
    components.append(component_dict)

In [16]:
from json import dump
## It stores the networkx graph, the transcript paths, the sources, the targets
## and the vertex constrains of a component
def store_components_to_files(component, i):
    """Write one component to the ``../gene_graphs`` directory tree.

    Parameters
    ----------
    component : dict
        Needs the keys ``graph`` (networkx graph), ``transcript_paths``,
        ``sources``, ``targets`` and ``vertex_constrains``.
    i : int
        Zero-based component number; the files are named ``component_{i+1}``.

    The graph is written as an edge list with ``:`` as delimiter; every other
    item is written as JSON (sets are converted to lists first).
    """
    def _dump_json(payload, kind):
        # The context manager closes the file even if serialisation fails
        with open(f'../gene_graphs/{kind}/component_{i+1}.json', 'w') as f:
            dump(payload, f)

    gene_graph = component['graph']
    nx.write_edgelist(gene_graph, path=f'../gene_graphs/graphs/component_{i+1}.edgelist', delimiter=':')

    _dump_json(component['transcript_paths'], 'transcript_paths')
    _dump_json(list(component['sources']), 'sources')
    _dump_json(list(component['targets']), 'targets')
    _dump_json(list(component['vertex_constrains']), 'vertex_constrains')
    

In [17]:
## Store components/gene_graphs to files
for i, component in enumerate(components):
    store_components_to_files(component, i)


## Store vertices_inv: id --> (genome_pos, genome_pos)
# Positions are cast to plain Python ints before being serialised
vertices_inv = {key: (int(x), int(y)) for key, (x, y) in vertices_inv.items()}
# The context manager closes the file even if dump() raises
with open('../gene_graphs/vertices_inv.json', 'w') as f:
    dump(vertices_inv, f)

In [18]:
## It stores the corresponding component as lemon graph format file in filename
## Vertex mappings original_id (networkx's id), is_source (if it is a source), is_target (if it is a target)
## and is_vertex_constrain (if it is a vertex constrain) are included in the file
def store_to_file_in_lemon_format_with_mappings(component, filename):
    """Write ``component`` to ``filename`` in lemon graph format (LGF).

    Parameters
    ----------
    component : dict
        Needs the keys ``graph`` (object exposing ``nodes`` and ``edges``),
        ``sources``, ``targets`` and ``vertex_constrains`` (containers of
        vertex ids).
    filename : str
        Path of the file to (over)write.

    The ``@nodes`` section has one tab-separated row per vertex: label,
    original id (both the vertex id) and three 0/1 flags. The ``@arcs`` section
    has one row per edge: tail, head and a label that numbers the edges from 0
    in iteration order. Every row ends with a tab.
    """
    G = component['graph']
    sources = component['sources']
    targets = component['targets']
    constrains = component['vertex_constrains']

    # The context manager closes the file even if a write fails midway
    with open(filename, 'w') as lgf:
        lgf.write("@nodes\n")
        lgf.write("label\toriginal_id\tis_source\tis_target\tis_vertex_constrain\t\n")
        for vertex in G.nodes:
            fields = (
                vertex,
                vertex,
                1 if vertex in sources else 0,
                1 if vertex in targets else 0,
                1 if vertex in constrains else 0,
            )
            lgf.write("\t".join(str(field) for field in fields) + "\t\n")

        lgf.write("@arcs\n")
        lgf.write("\t\tlabel\t\n")
        for arc_id, edge in enumerate(G.edges):
            lgf.write(str(edge[0]) + "\t" + str(edge[1]) + "\t" + str(arc_id) + "\t\n")


In [19]:
## Store components with more than two vertices and more than one transcript path
## to the corresponding lemon graph format file
## Can be skipped if already computed
for i, component in enumerate(components):
    if component['len'] <= 2 or len(component['transcript_paths']) <= 1:
        continue
    lgf_filename = '../lgf/component_' + str(i+1) + '.lgf'
    store_to_file_in_lemon_format_with_mappings(component, lgf_filename)

In [20]:
# Record the number of components; the context manager closes the file even if the write fails
with open('../gene_graphs/info', 'w') as info:
    info.write(str(len(components)))