In [1]:
import re
from os.path import join, exists 

def _convert_edgeStr (_edgeStr):
    assert isinstance(_edgeStr, str) and 'EDGE_' in _edgeStr, "{} is not legal component from spades FASTG".format(edgeString)
    _edgeId = _edgeStr.split('_')[1]
    _edgeDirection = '-' if "'" in _edgeStr else '+'
    return f'{_edgeId}{_edgeDirection}'

def getContigsAdjacency(spadesOutDir=None, graphFilePath=None, pathFilePath=None):
    """Collect directed adjacencies between contigs from assembly output files.

    Edge-to-edge links are read from the header lines of a FASTG graph file,
    and the first/last edge of every path is read from a contig paths file.
    Contig A is reported as adjacent to contig B when some link goes from the
    last edge of a path of A to the first edge of a path of B.

    Parameters
    ----------
    spadesOutDir : str, optional
        Directory used to derive default file locations
        ('assembly_graph.fastg' and 'contigs.paths') for whichever of the
        two file arguments is not given.
    graphFilePath : str, optional
        Path to the FASTG file; overrides the default derived from
        ``spadesOutDir``.
    pathFilePath : str, optional
        Path to the contig paths file; overrides the default derived from
        ``spadesOutDir``.

    Returns
    -------
    set of (str, str)
        Pairs ``(leftCtg, rightCtg)`` of contig names, where a name is the
        stripped 'NODE_...' header line from the paths file.

    Raises
    ------
    AssertionError
        If either file cannot be located, if an edge path appears before any
        'NODE_' header, or if a FASTG header holds a token without 'EDGE_'.
    """
    if graphFilePath is None and spadesOutDir is not None:
        graphFilePath = join(spadesOutDir, 'assembly_graph.fastg')
    if pathFilePath is None and spadesOutDir is not None:
        pathFilePath = join(spadesOutDir, 'contigs.paths')

    # Check for None first: exists(None) raises TypeError rather than
    # returning False, which would hide the intended message.
    assert graphFilePath is not None and exists(graphFilePath), "FASTG file not found"
    assert pathFilePath is not None and exists(pathFilePath), "Contig paths file not found"

    # Links between graph edges, resulting contig links, and the lookup tables
    # mapping a path's first / last edge to the contigs that own such a path.
    componentLinks = set()
    contigLinks = set()
    startDict = {}
    endDict = {}

    # Read links between components from the FASTG header lines. A header has
    # the form "<root>:<neighbour>,<neighbour>,..."; headers without any
    # neighbour yield a single token and contribute nothing.
    with open(graphFilePath, 'r') as graph_file:
        for _line in graph_file:
            if not _line.startswith('>'):
                continue
            _edges_list = re.split(':|,', _line.strip())
            if len(_edges_list) < 2:
                continue
            root = _convert_edgeStr(_edges_list[0])
            for _edge in _edges_list[1:]:
                componentLinks.add((root, _convert_edgeStr(_edge)))

    # Read the endpoints of each path (contig) made of the above components.
    # NOTE(review): every non-header line is treated as an independent path.
    # If the paths file can split one contig over several lines (e.g. lines
    # terminated by ';'), the endpoints recorded here are per line, not per
    # contig - confirm this is the intended behaviour.
    ctg = None
    with open(pathFilePath, 'r') as path_file:
        for _line in path_file:
            _stripped = _line.strip()
            if not _stripped:
                # A blank line carries no edges; skip it instead of recording
                # an empty edge name.
                continue
            if _line.startswith('NODE_'):
                ctg = _stripped
                continue
            assert ctg is not None, "Contig paths file has an edge path before any NODE_ header"
            _edges_from_path = _stripped.split(',')
            startDict.setdefault(_edges_from_path[0], set()).add(ctg)
            endDict.setdefault(_edges_from_path[-1], set()).add(ctg)

    # Output all possible links between paths (contigs)
    # <------leftCtg---------/-start-/-> <-/-end-/---rightCtg----->
    for start, end in componentLinks:
        for leftCtg in endDict.get(start, ()):
            for rightCtg in startDict.get(end, ()):
                contigLinks.add((leftCtg, rightCtg))
    return contigLinks



In [2]:
# Print each contig adjacency found for the SPAdes output directory, one per line.
for left_ctg, right_ctg in getContigsAdjacency('../data/spades'):
    print("{} --> {}".format(left_ctg, right_ctg))

NODE_772_length_84_cov_9.655172 --> NODE_665_length_112_cov_57.789474
NODE_764_length_85_cov_136.600000' --> NODE_785_length_72_cov_52.823529
NODE_579_length_160_cov_36.171429 --> NODE_104_length_15415_cov_40.773242
NODE_734_length_108_cov_24.490566' --> NODE_817_length_58_cov_15.333333
NODE_814_length_58_cov_60.666667 --> NODE_522_length_230_cov_18.748571
NODE_668_length_111_cov_53.196429 --> NODE_539_length_207_cov_90.118421
NODE_658_length_114_cov_78.101695 --> NODE_538_length_208_cov_41.751634
NODE_749_length_99_cov_127.750000' --> NODE_389_length_829_cov_158.896641
NODE_379_length_898_cov_33.663108' --> NODE_698_length_111_cov_18.232143'
NODE_9_length_50252_cov_20.549316 --> NODE_339_length_1437_cov_73.812590
NODE_814_length_58_cov_60.666667 --> NODE_521_length_230_cov_29.697143
NODE_815_length_58_cov_23.333333 --> NODE_724_length_110_cov_18.490909'
NODE_822_length_57_cov_34.500000 --> NODE_780_length_76_cov_83.190476'
NODE_769_length_84_cov_269.000000 --> NODE_588_length_154_cov_