In [1]:

import gfapy
import re
from gfapy.sequence import rc

gfaFile = input("Enter the address of the gfa file: ")
#Loading the graph
try:
    print('Loading the graph ...')
    myGraph = gfapy.Gfa.from_file(gfaFile)
except Exception as e:
    print("Graph not loaded successfully: " + str(e))
    import pdb
    pdb.set_trace()

Enter the address of the gfa file: CAMI_H_1_graph.gfa
Loading the graph ...


In [2]:
import numpy as np
import pandas as pd
# Save the data in a dataframe
df= pd.read_csv('CAMIH1Sequences.tsv', sep='\t')

In [3]:
df

Unnamed: 0,Query,Path_GraphAligner,Start_GraphAligner,End_GraphAligner
0,gb|AB023477|+|0-861|ARO:3001082|SHV-24,1612535,17,877
1,gb|AB049569|+|0-861|ARO:3000958|TEM-91,3478183,47,907
2,gb|AB200915.1|-|1831-2305|ARO:3005084|dfrA31,54829473,8,481
3,gb|AB302939|+|8-869|ARO:3001115|SHV-60,1612535,17,877
4,gb|AB372881|+|8-869|ARO:3001160|SHV-111,1612535,17,877
...,...,...,...,...
702,gb|AY130285|+|0-785|ARO:3000981|TEM-118 Partial,0,0,0
703,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,0,0,0
704,gb|AF527798.1|+|0-785|ARO:3000879|TEM-7 Partial,0,0,0
705,gb|AY130284|+|0-785|ARO:3000941|TEM-75 Partial,0,0,0


In [4]:
# Replace any spaces in PathGraphAligner with a comma
df['Node'] = df['Path_GraphAligner'].str.replace(' ', ',')
# Replace commas with '<' and add '<' at the start of each row
df['Node'] = '<' + df['Path_GraphAligner'].str.replace(',', '<')


In [5]:
df

Unnamed: 0,Query,Path_GraphAligner,Start_GraphAligner,End_GraphAligner,Node
0,gb|AB023477|+|0-861|ARO:3001082|SHV-24,1612535,17,877,<1612535
1,gb|AB049569|+|0-861|ARO:3000958|TEM-91,3478183,47,907,<3478183
2,gb|AB200915.1|-|1831-2305|ARO:3005084|dfrA31,54829473,8,481,<54829473
3,gb|AB302939|+|8-869|ARO:3001115|SHV-60,1612535,17,877,<1612535
4,gb|AB372881|+|8-869|ARO:3001160|SHV-111,1612535,17,877,<1612535
...,...,...,...,...,...
702,gb|AY130285|+|0-785|ARO:3000981|TEM-118 Partial,0,0,0,<0
703,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,0,0,0,<0
704,gb|AF527798.1|+|0-785|ARO:3000879|TEM-7 Partial,0,0,0,<0
705,gb|AY130284|+|0-785|ARO:3000941|TEM-75 Partial,0,0,0,<0


In [6]:
df.dtypes

Query                 object
Path_GraphAligner     object
Start_GraphAligner     int64
End_GraphAligner       int64
Node                  object
dtype: object

In [7]:
df['Node'] = df['Node'].str.replace(r'\s+', '', regex=True)

In [8]:
df

Unnamed: 0,Query,Path_GraphAligner,Start_GraphAligner,End_GraphAligner,Node
0,gb|AB023477|+|0-861|ARO:3001082|SHV-24,1612535,17,877,<1612535
1,gb|AB049569|+|0-861|ARO:3000958|TEM-91,3478183,47,907,<3478183
2,gb|AB200915.1|-|1831-2305|ARO:3005084|dfrA31,54829473,8,481,<54829473
3,gb|AB302939|+|8-869|ARO:3001115|SHV-60,1612535,17,877,<1612535
4,gb|AB372881|+|8-869|ARO:3001160|SHV-111,1612535,17,877,<1612535
...,...,...,...,...,...
702,gb|AY130285|+|0-785|ARO:3000981|TEM-118 Partial,0,0,0,<0
703,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,0,0,0,<0
704,gb|AF527798.1|+|0-785|ARO:3000879|TEM-7 Partial,0,0,0,<0
705,gb|AY130284|+|0-785|ARO:3000941|TEM-75 Partial,0,0,0,<0


In [9]:
#Convert the start and end position from double to int
df['End_GraphAligner'] =df['End_GraphAligner'].astype(int)
df['Start_GraphAligner'] =df['Start_GraphAligner'].astype(int)

In [10]:

# Process paths from the DataFrame'
for index, row in df.iterrows():
    path = row['Node']
    start = row['Start_GraphAligner']
    end = row['End_GraphAligner']

    # Extract nodes and orientations
    nodeOrients = path
    orientList = ['+' if ch == '>' else '-' for ch in nodeOrients]
    nodeList = re.split('>|<', nodeOrients)[1:]

    print('Generating the sequence ...')
    sequence = ''
    counter = 1
    for node, orient in zip(nodeList, orientList):
        try:
            seq = myGraph.segment(node).sequence
            if orient == '-':
                seq = rc(seq)
            if counter == 1:
                sequence = seq[start:]
            elif counter == len(nodeList):
                sequence += seq[:end]
            else:
                sequence += seq
        except AttributeError:
            print(f"Node {node} not found in the graph.")
            break

        counter += 1

    print(sequence)
    df.at[ index, 'Sequence_GraphAligner'] = sequence


Generating the sequence ...
GCGGTAGACTTTGCCGGCCACGGCGTCCAGCACCGGCGCGAGTCGGGCAAAGGCGGCATCGCTCCCGGAGGCCATCACCGTCATGTCGCCGGCGGCCGCTTTCACGGCGCCGCCCGATACCGGCGCGTCGAGCATCAATAGCTGGTACTCCGCCAGCGCCTCGGCAATGGCCTGAGCATCGGCGGAGGCGATGGTGGACGACACCATCACGACGGTGCCCGGCTTCAGATGGGCGGCGAGGCCGCTCTCGCCGAACAGGATCCCCCGCACCTGGGCGGCATTGACCACCAGCAGCACAACTGCATCCAGTTCCGCGGCGAACGGCACCGCGCTGGGGCCCGCGCCTTTGGCGCCCGCCGCCAGCAGTGCGCGACAGTTGTCGGGATTGATGTCAACGCCCCAGGTGTTCAGGCCCGCCTGCAGGCAGGCGCGGGCGGCGCCCATGCCCATTGAACCCAGTCCAATCACGCAGACGTTAGTGTGTGCAGCCATGCTGGTCTCCTTGTGAACATTAGTTAATTAATGTGATTTAATGATAGAATAAAGCGTTATCATGTGAATTTATGTGAGAGTTATCACGAATAACCCACATGAATATAAAAAATTCACAGGCGGAAAGAGGCTGCACAGCGGCGATTCTTGCCCTAAAATAGCGTAAAAAACAGCGGCAAAGCGCGATAACAACACCGGGTGTCGCCAGGGCCGTAGCCTGGGTGGTGACGCCTGGTGTGAAAAAATGGTAAGGGGAGAGCGGTGATTCCAATAGAACGACATCAGCGTATTTTAGCCCTGGTGGAGCAGCGTGGGGCGGTAAGCATTAACGAGCTGACGGAGATCCTCGGCGTGTCCCATATGACCATCCGTCGGGACGTCAGTAAACTGGAGGAGCAGGGGCTGCTGGTCAGCGTCTCGGGCGGCGTACGCGCCGTCAGCCGGCTGGCCGCGGAACCCAGTCATCTGGTGAAAAGCACG

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
df

In [11]:
df.to_csv('CAMIH1Sequencestest.tsv', sep='\t', index=False)