## This notebook extracts the unique sequences from GraphAligner and measures the edit distance between the sequences from Bandage and GraphAligner

In [1]:
# import the libraries
import numpy as np
import pandas as pd
import re 
from Bio import SeqIO
from collections import defaultdict
import gfapy

In [2]:
# Open the GFA file
# The path is relative, so the graph is expected in the notebook's working directory.
file_path = "CAMI_M_2_graph.gfa"
gfa = gfapy.Gfa.from_file(file_path)

# Number of segment records held by the parsed graph
num_segments = len(gfa.segments)

In [3]:
# List that will hold one {"Name", "Sequence"} record per graph segment
data_graph = []

In [4]:
# Store the name and sequence for each node (segment) of the graph.
# The list is rebuilt from scratch in this cell instead of being appended to,
# so running the cell more than once cannot duplicate rows.
data_graph = [
    {"Name": segment.name, "Sequence": segment.sequence}
    for segment in gfa.segments
]

# Convert the list to a DataFrame with the columns "Name" and "Sequence"
df_graph = pd.DataFrame(data_graph)

# Print the DataFrame
print(df_graph)

            Name                                           Sequence
0          30121  AGAATCGTTCTCTTTGAGCTAAGGCGAGGCAACGCTGTACTGGTTT...
1          30131  GAAAAATTAAAAGAAGAGGGGGGACAAGTCCCTCCTCTTTGTTTAC...
2          13081  ATTTGCTCCCATTCCACATCCGTTGCTTGAGGATAGAGCTTTAGAA...
3          30139  TCGACGACCGCACCATCGACAGCCACATCAAGCGGCTGCGCAAGAA...
4       28881274  GTCCGGCTTGATGAGCTGCCGGGGGAAGAGTTTGAACAGGGCGGTG...
...          ...                                                ...
396314  28353608  CGCACTTCCCCCCGGGTTACTGGGGCAGCTGGCTGTTGCTGGGCCT...
396315  28500301  TTTAGCTCTTTTCACGAGTAAAGTTCGATTAATCCGTGGGCCATTA...
396316  29318044  AGCTCTCCGTGGTGGCCTCCATGCCGCCCGGCGTGCCGAGGCAGAG...
396317  21284397  GCGATGTGGACATGGCGCGCAACCTGACGCCGGACCAGATCGGCGG...
396318    670891  CGCTCCGTCTCGGACAGGCCTGGATAGACGGAGCCGAGCGGCAGGT...

[396319 rows x 2 columns]


In [5]:
# Preview the node sequence column
df_graph['Sequence']

0         AGAATCGTTCTCTTTGAGCTAAGGCGAGGCAACGCTGTACTGGTTT...
1         GAAAAATTAAAAGAAGAGGGGGGACAAGTCCCTCCTCTTTGTTTAC...
2         ATTTGCTCCCATTCCACATCCGTTGCTTGAGGATAGAGCTTTAGAA...
3         TCGACGACCGCACCATCGACAGCCACATCAAGCGGCTGCGCAAGAA...
4         GTCCGGCTTGATGAGCTGCCGGGGGAAGAGTTTGAACAGGGCGGTG...
                                ...                        
396314    CGCACTTCCCCCCGGGTTACTGGGGCAGCTGGCTGTTGCTGGGCCT...
396315    TTTAGCTCTTTTCACGAGTAAAGTTCGATTAATCCGTGGGCCATTA...
396316    AGCTCTCCGTGGTGGCCTCCATGCCGCCCGGCGTGCCGAGGCAGAG...
396317    GCGATGTGGACATGGCGCGCAACCTGACGCCGGACCAGATCGGCGG...
396318    CGCTCCGTCTCGGACAGGCCTGGATAGACGGAGCCGAGCGGCAGGT...
Name: Sequence, Length: 396319, dtype: object

In [6]:
# Keep a single copy of every row; rows count as duplicates only when
# all of their columns (Name and Sequence) are identical
df_graph=df_graph.drop_duplicates()

In [7]:
# Inspect the node table after removing duplicate rows
df_graph

Unnamed: 0,Name,Sequence
0,30121,AGAATCGTTCTCTTTGAGCTAAGGCGAGGCAACGCTGTACTGGTTT...
1,30131,GAAAAATTAAAAGAAGAGGGGGGACAAGTCCCTCCTCTTTGTTTAC...
2,13081,ATTTGCTCCCATTCCACATCCGTTGCTTGAGGATAGAGCTTTAGAA...
3,30139,TCGACGACCGCACCATCGACAGCCACATCAAGCGGCTGCGCAAGAA...
4,28881274,GTCCGGCTTGATGAGCTGCCGGGGGAAGAGTTTGAACAGGGCGGTG...
...,...,...
396314,28353608,CGCACTTCCCCCCGGGTTACTGGGGCAGCTGGCTGTTGCTGGGCCT...
396315,28500301,TTTAGCTCTTTTCACGAGTAAAGTTCGATTAATCCGTGGGCCATTA...
396316,29318044,AGCTCTCCGTGGTGGCCTCCATGCCGCCCGGCGTGCCGAGGCAGAG...
396317,21284397,GCGATGTGGACATGGCGCGCAACCTGACGCCGGACCAGATCGGCGG...


In [8]:
# Load the tab-separated results table (query, Bandage, SPAligner and GraphAligner columns)
df_GraphAligner_Paths=pd.read_csv('CAMIM2_Results/SequencesBandageGraphAlignerSPAligner.tsv', sep='\t')

In [9]:
# Keep the rows whose FinalResultBandageVSGraphAligner label is anything other than 'Full'
selected_rows_BandageVSGraphAligner = df_GraphAligner_Paths[df_GraphAligner_Paths['FinalResultBandageVSGraphAligner'] != 'Full']
# List the columns available in the filtered table
selected_rows_BandageVSGraphAligner.columns

Index(['Query', 'Sequence', 'Sequence_Bandage', 'Sequence_SPAligner',
       'Start_GraphAligner_x', 'End_GraphAligner_x', 'Path_GraphAligner_x',
       'FinalResultBandageVSGraphAligner', 'Unnamed: 8'],
      dtype='object')

In [10]:
# Keep the eight named columns and leave out 'Unnamed: 8'
# (presumably an artefact of a trailing tab in the TSV - TODO confirm).
# .copy() makes the result independent of the filtered frame it was taken from.
df_BandageVSGraphAligner=selected_rows_BandageVSGraphAligner[['Query', 'Sequence', 'Sequence_Bandage', 'Sequence_SPAligner',
       'Start_GraphAligner_x', 'End_GraphAligner_x', 'Path_GraphAligner_x',
       'FinalResultBandageVSGraphAligner']].copy()


In [11]:
# Display the table with a fresh index; the result is not assigned,
# so df_BandageVSGraphAligner itself keeps its current index
df_BandageVSGraphAligner.reset_index()

Unnamed: 0,index,Query,Sequence,Sequence_Bandage,Sequence_SPAligner,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner
0,0,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,24880,26021,2079546,SinglePathEndMatch
1,1,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,3688,4123,118828,SinglePathEndMatch
2,2,gb|AE016877.1|+|1972251-1972668|ARO:3000172|FosB,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,312,727,15445,SinglePathEndMatch
3,3,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,2130,2965,2007548,SinglePathEndMatch
4,4,gb|AF118110.1|-|71-1037|ARO:3003002|CfxA2,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,113,1077,821183,SinglePathEndMatch
5,5,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,3688,4123,118828,SinglePathEndMatch
6,6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,1327,2129,2007548,SinglePathEndMatch
7,7,gb|AF472622|+|52-1018|ARO:3003003|CfxA3,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,113,1077,821183,SinglePathEndMatch
8,8,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...,555,1312,28874510,MatchOnPath
9,9,gb|AJ222769.3|+|3686-5606|ARO:3000194|tetW,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,566,2484,136860,SinglePathEndMatch


In [12]:
# Write the filtered table to a TSV file. The index is written as well
# (pandas default), so it shows up as an 'Unnamed: 0' column when the file is read back.
df_BandageVSGraphAligner.to_csv('CAMIM2_Results/BandageVSGraphAlignerSequences.tsv',sep='\t')

In [13]:
# Read back the TSV file written in the previous step
df_GraphAligner_Path= pd.read_csv('CAMIM2_Results/BandageVSGraphAlignerSequences.tsv', sep='\t')

In [14]:
# Inspect the reloaded table
df_GraphAligner_Path

Unnamed: 0.1,Unnamed: 0,Query,Sequence,Sequence_Bandage,Sequence_SPAligner,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner
0,0,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,24880,26021,2079546,SinglePathEndMatch
1,1,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,3688,4123,118828,SinglePathEndMatch
2,2,gb|AE016877.1|+|1972251-1972668|ARO:3000172|FosB,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,312,727,15445,SinglePathEndMatch
3,3,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,2130,2965,2007548,SinglePathEndMatch
4,4,gb|AF118110.1|-|71-1037|ARO:3003002|CfxA2,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,113,1077,821183,SinglePathEndMatch
5,5,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,3688,4123,118828,SinglePathEndMatch
6,6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,1327,2129,2007548,SinglePathEndMatch
7,7,gb|AF472622|+|52-1018|ARO:3003003|CfxA3,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,113,1077,821183,SinglePathEndMatch
8,8,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...,555,1312,28874510,MatchOnPath
9,9,gb|AJ222769.3|+|3686-5606|ARO:3000194|tetW,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,566,2484,136860,SinglePathEndMatch


In [15]:
# Split the comma-separated GraphAligner path into one column per node:
# Path1, Path2, ...
_path_nodes = df_GraphAligner_Path['Path_GraphAligner_x'].str.split(',', expand=True)

# Do not hard-code the number of nodes: a fixed list of five target columns
# raises a ValueError as soon as the longest path has more or fewer than five
# nodes. Path1..Path5 are still always created (padded with missing values)
# so that code relying on those columns keeps working.
_n_path_cols = max(_path_nodes.shape[1], 5)
_path_nodes = _path_nodes.reindex(columns=range(_n_path_cols))
_path_cols = ['Path{}'.format(i + 1) for i in range(_n_path_cols)]

df_GraphAligner_Path[_path_cols] = _path_nodes

In [16]:
# Inspect the table with the new Path1..Path5 columns
df_GraphAligner_Path

Unnamed: 0.1,Unnamed: 0,Query,Sequence,Sequence_Bandage,Sequence_SPAligner,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,Path5
0,0,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,24880,26021,2079546,SinglePathEndMatch,2079546,,,,
1,1,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,3688,4123,118828,SinglePathEndMatch,118828,,,,
2,2,gb|AE016877.1|+|1972251-1972668|ARO:3000172|FosB,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,312,727,15445,SinglePathEndMatch,15445,,,,
3,3,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,2130,2965,2007548,SinglePathEndMatch,2007548,,,,
4,4,gb|AF118110.1|-|71-1037|ARO:3003002|CfxA2,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,113,1077,821183,SinglePathEndMatch,821183,,,,
5,5,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,3688,4123,118828,SinglePathEndMatch,118828,,,,
6,6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,1327,2129,2007548,SinglePathEndMatch,2007548,,,,
7,7,gb|AF472622|+|52-1018|ARO:3003003|CfxA3,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,113,1077,821183,SinglePathEndMatch,821183,,,,
8,8,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...,555,1312,28874510,MatchOnPath,28874510,,,,
9,9,gb|AJ222769.3|+|3686-5606|ARO:3000194|tetW,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,566,2484,136860,SinglePathEndMatch,136860,,,,


## Get the rows whose GraphAligner path consists of a single node (only Path1 is set)

In [17]:
# Keep the rows whose path has no second node, i.e. Path2 is missing
Path1_Only = df_GraphAligner_Path[df_GraphAligner_Path['Path2'].isna()]

# Display the resulting dataframe
Path1_Only 

Unnamed: 0.1,Unnamed: 0,Query,Sequence,Sequence_Bandage,Sequence_SPAligner,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,Path5
0,0,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,24880,26021,2079546,SinglePathEndMatch,2079546,,,,
1,1,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,3688,4123,118828,SinglePathEndMatch,118828,,,,
2,2,gb|AE016877.1|+|1972251-1972668|ARO:3000172|FosB,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,312,727,15445,SinglePathEndMatch,15445,,,,
3,3,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,2130,2965,2007548,SinglePathEndMatch,2007548,,,,
4,4,gb|AF118110.1|-|71-1037|ARO:3003002|CfxA2,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,113,1077,821183,SinglePathEndMatch,821183,,,,
5,5,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,3688,4123,118828,SinglePathEndMatch,118828,,,,
6,6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,1327,2129,2007548,SinglePathEndMatch,2007548,,,,
7,7,gb|AF472622|+|52-1018|ARO:3003003|CfxA3,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,113,1077,821183,SinglePathEndMatch,821183,,,,
8,8,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...,555,1312,28874510,MatchOnPath,28874510,,,,
9,9,gb|AJ222769.3|+|3686-5606|ARO:3000194|tetW,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,566,2484,136860,SinglePathEndMatch,136860,,,,


In [18]:
# Drop the rows whose GraphAligner start and end coordinates are both 0
_zero_span = (Path1_Only['Start_GraphAligner_x'] == 0) & (Path1_Only['End_GraphAligner_x'] == 0)
Path1_Only = Path1_Only.drop(index=Path1_Only.loc[_zero_span].index)


In [19]:
# Attach the graph node sequence to every single-node row by matching
# Path1 against the segment Name.
# pd.merge defaults to an inner join, so rows whose Path1 has no matching
# segment are dropped. Both inputs have a 'Sequence' column, which the merge
# renames to Sequence_x (results table) and Sequence_y (graph node).
merged_df_Path1= pd.merge(Path1_Only , df_graph, left_on='Path1', right_on='Name')
merged_df_Path1

Unnamed: 0.1,Unnamed: 0,Query,Sequence_x,Sequence_Bandage,Sequence_SPAligner,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,Path5,Name,Sequence_y
0,0,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,24880,26021,2079546,SinglePathEndMatch,2079546,,,,,2079546,TCGCGACCCCGACCTTGGCAAGGTCGTGCTCTACCAACTGAGCTAT...
1,36,gb|KF730243.1|+|0-1143|ARO:3004647|AQU-2,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,24880,26021,2079546,SinglePathEndMatch,2079546,,,,,2079546,TCGCGACCCCGACCTTGGCAAGGTCGTGCTCTACCAACTGAGCTAT...
2,37,gb|KF730244.1|+|0-1149|ARO:3004648|AQU-3,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,24880,26021,2079546,SinglePathEndMatch,2079546,,,,,2079546,TCGCGACCCCGACCTTGGCAAGGTCGTGCTCTACCAACTGAGCTAT...
3,1,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,3688,4123,118828,SinglePathEndMatch,118828,,,,,118828,GATCGATGAGTACATGGCGCCTTTTCTGGTTGGCAAAGATCCGACC...
4,5,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,3688,4123,118828,SinglePathEndMatch,118828,,,,,118828,GATCGATGAGTACATGGCGCCTTTTCTGGTTGGCAAAGATCCGACC...
5,2,gb|AE016877.1|+|1972251-1972668|ARO:3000172|FosB,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,312,727,15445,SinglePathEndMatch,15445,,,,,15445,AAATGGATTATTCTAATCCGAAGTATGATGATTTAGTAGCGAAATC...
6,3,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,2130,2965,2007548,SinglePathEndMatch,2007548,,,,,2007548,ACAAGGTGGCTTAACTAGAGCGGAATATCACTTATAAATAAGCTTT...
7,6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,1327,2129,2007548,SinglePathEndMatch,2007548,,,,,2007548,ACAAGGTGGCTTAACTAGAGCGGAATATCACTTATAAATAAGCTTT...
8,4,gb|AF118110.1|-|71-1037|ARO:3003002|CfxA2,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,113,1077,821183,SinglePathEndMatch,821183,,,,,821183,ATTCGACAAACTGTTATTTTTCTATCTATTTATTTGGGTGGGAAAC...
9,7,gb|AF472622|+|52-1018|ARO:3003003|CfxA3,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,113,1077,821183,SinglePathEndMatch,821183,,,,,821183,ATTCGACAAACTGTTATTTTTCTATCTATTTATTTGGGTGGGAAAC...


In [20]:
# Check the column types of the merged table
merged_df_Path1.dtypes

Unnamed: 0                           int64
Query                               object
Sequence_x                          object
Sequence_Bandage                    object
Sequence_SPAligner                  object
Start_GraphAligner_x                 int64
End_GraphAligner_x                   int64
Path_GraphAligner_x                 object
FinalResultBandageVSGraphAligner    object
Path1                               object
Path2                               object
Path3                               object
Path4                               object
Path5                               object
Name                                object
Sequence_y                          object
dtype: object

In [21]:
# Make sure both alignment coordinates are stored as integers
for _coord_col in ('Start_GraphAligner_x', 'End_GraphAligner_x'):
    merged_df_Path1[_coord_col] = merged_df_Path1[_coord_col].astype(int)

In [22]:
# Re-check the column types after the integer cast
merged_df_Path1.dtypes

Unnamed: 0                           int64
Query                               object
Sequence_x                          object
Sequence_Bandage                    object
Sequence_SPAligner                  object
Start_GraphAligner_x                 int64
End_GraphAligner_x                   int64
Path_GraphAligner_x                 object
FinalResultBandageVSGraphAligner    object
Path1                               object
Path2                               object
Path3                               object
Path4                               object
Path5                               object
Name                                object
Sequence_y                          object
dtype: object

In [23]:
# Inspect the merged table
merged_df_Path1

Unnamed: 0.1,Unnamed: 0,Query,Sequence_x,Sequence_Bandage,Sequence_SPAligner,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,Path5,Name,Sequence_y
0,0,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,24880,26021,2079546,SinglePathEndMatch,2079546,,,,,2079546,TCGCGACCCCGACCTTGGCAAGGTCGTGCTCTACCAACTGAGCTAT...
1,36,gb|KF730243.1|+|0-1143|ARO:3004647|AQU-2,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,24880,26021,2079546,SinglePathEndMatch,2079546,,,,,2079546,TCGCGACCCCGACCTTGGCAAGGTCGTGCTCTACCAACTGAGCTAT...
2,37,gb|KF730244.1|+|0-1149|ARO:3004648|AQU-3,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,24880,26021,2079546,SinglePathEndMatch,2079546,,,,,2079546,TCGCGACCCCGACCTTGGCAAGGTCGTGCTCTACCAACTGAGCTAT...
3,1,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,3688,4123,118828,SinglePathEndMatch,118828,,,,,118828,GATCGATGAGTACATGGCGCCTTTTCTGGTTGGCAAAGATCCGACC...
4,5,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,3688,4123,118828,SinglePathEndMatch,118828,,,,,118828,GATCGATGAGTACATGGCGCCTTTTCTGGTTGGCAAAGATCCGACC...
5,2,gb|AE016877.1|+|1972251-1972668|ARO:3000172|FosB,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,312,727,15445,SinglePathEndMatch,15445,,,,,15445,AAATGGATTATTCTAATCCGAAGTATGATGATTTAGTAGCGAAATC...
6,3,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,2130,2965,2007548,SinglePathEndMatch,2007548,,,,,2007548,ACAAGGTGGCTTAACTAGAGCGGAATATCACTTATAAATAAGCTTT...
7,6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,1327,2129,2007548,SinglePathEndMatch,2007548,,,,,2007548,ACAAGGTGGCTTAACTAGAGCGGAATATCACTTATAAATAAGCTTT...
8,4,gb|AF118110.1|-|71-1037|ARO:3003002|CfxA2,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,113,1077,821183,SinglePathEndMatch,821183,,,,,821183,ATTCGACAAACTGTTATTTTTCTATCTATTTATTTGGGTGGGAAAC...
9,7,gb|AF472622|+|52-1018|ARO:3003003|CfxA3,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,113,1077,821183,SinglePathEndMatch,821183,,,,,821183,ATTCGACAAACTGTTATTTTTCTATCTATTTATTTGGGTGGGAAAC...


In [24]:
def extract_sequences_Path1(dataframe):
    """Slice the aligned region out of each row's node sequence.

    For every row, the characters of ``Sequence_y`` from position
    ``Start_GraphAligner_x - 1`` up to (but not including) position
    ``End_GraphAligner_x`` are stored in a new ``Result_node1`` column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain integer ``Start_GraphAligner_x`` and
        ``End_GraphAligner_x`` columns and a string ``Sequence_y`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the column is added in place.

    Notes
    -----
    NOTE(review): the ``- 1`` treats the start coordinate as 1-based. Confirm
    that this matches the convention of the table the coordinates come from.
    Path orientation is not taken into account here.
    """
    starts = dataframe['Start_GraphAligner_x']
    ends = dataframe['End_GraphAligner_x']
    node_sequences = dataframe['Sequence_y']

    # The slice start is clamped at 0: a start coordinate of 0 would otherwise
    # become -1, which Python reads as "last character" and which produces a
    # wrong (usually empty) slice.
    # Assigning the whole column at once also creates Result_node1 when the
    # frame has no rows.
    dataframe['Result_node1'] = [
        sequence[max(start_pos - 1, 0):end_pos]
        for sequence, start_pos, end_pos in zip(node_sequences, starts, ends)
    ]

    return dataframe

In [25]:
# Extract the aligned part of every node sequence (adds the Result_node1 column)
extracted_sequences_Path1_df = extract_sequences_Path1(merged_df_Path1)
# Save the dataframe to a tab-separated (TSV) file
extracted_sequences_Path1_df.to_csv("CAMIM2_Results/Path1_TestSequences.tsv", sep="\t")

In [26]:
# Inspect the table with the extracted Result_node1 sequences
extracted_sequences_Path1_df

Unnamed: 0.1,Unnamed: 0,Query,Sequence_x,Sequence_Bandage,Sequence_SPAligner,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,Path5,Name,Sequence_y,Result_node1
0,0,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,24880,26021,2079546,SinglePathEndMatch,2079546,,,,,2079546,TCGCGACCCCGACCTTGGCAAGGTCGTGCTCTACCAACTGAGCTAT...,CCATGATGGAGCCGGTGCGGATGCTGTTGGCGAACAGGGTGGTCGC...
1,36,gb|KF730243.1|+|0-1143|ARO:3004647|AQU-2,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,24880,26021,2079546,SinglePathEndMatch,2079546,,,,,2079546,TCGCGACCCCGACCTTGGCAAGGTCGTGCTCTACCAACTGAGCTAT...,CCATGATGGAGCCGGTGCGGATGCTGTTGGCGAACAGGGTGGTCGC...
2,37,gb|KF730244.1|+|0-1149|ARO:3004648|AQU-3,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,24880,26021,2079546,SinglePathEndMatch,2079546,,,,,2079546,TCGCGACCCCGACCTTGGCAAGGTCGTGCTCTACCAACTGAGCTAT...,CCATGATGGAGCCGGTGCGGATGCTGTTGGCGAACAGGGTGGTCGC...
3,1,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,3688,4123,118828,SinglePathEndMatch,118828,,,,,118828,GATCGATGAGTACATGGCGCCTTTTCTGGTTGGCAAAGATCCGACC...,TTATCCCCATTTTATTGTCGGATATGGCGTACTTTCTCCATAAACT...
4,5,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,3688,4123,118828,SinglePathEndMatch,118828,,,,,118828,GATCGATGAGTACATGGCGCCTTTTCTGGTTGGCAAAGATCCGACC...,TTATCCCCATTTTATTGTCGGATATGGCGTACTTTCTCCATAAACT...
5,2,gb|AE016877.1|+|1972251-1972668|ARO:3000172|FosB,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,312,727,15445,SinglePathEndMatch,15445,,,,,15445,AAATGGATTATTCTAATCCGAAGTATGATGATTTAGTAGCGAAATC...,TGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAGA...
6,3,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,2130,2965,2007548,SinglePathEndMatch,2007548,,,,,2007548,ACAAGGTGGCTTAACTAGAGCGGAATATCACTTATAAATAAGCTTT...,TGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAACC...
7,6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,1327,2129,2007548,SinglePathEndMatch,2007548,,,,,2007548,ACAAGGTGGCTTAACTAGAGCGGAATATCACTTATAAATAAGCTTT...,TGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGTT...
8,4,gb|AF118110.1|-|71-1037|ARO:3003002|CfxA2,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,113,1077,821183,SinglePathEndMatch,821183,,,,,821183,ATTCGACAAACTGTTATTTTTCTATCTATTTATTTGGGTGGGAAAC...,TGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTTT...
9,7,gb|AF472622|+|52-1018|ARO:3003003|CfxA3,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,113,1077,821183,SinglePathEndMatch,821183,,,,,821183,ATTCGACAAACTGTTATTTTTCTATCTATTTATTTGGGTGGGAAAC...,TGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTTT...


In [30]:
import Levenshtein
# Row-wise helper that computes the Levenshtein (edit) distance
def calculate_edit_distance_sequence_GraphAligner(row):
    """Return the edit distance between a row's query and extracted sequences.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row)
        Must provide the keys 'Sequence_x' and 'Result_node1'.

    Returns
    -------
    The value returned by ``Levenshtein.distance`` for the two sequences.
    """
    query_sequence = row['Sequence_x']
    node_subsequence = row['Result_node1']
    return Levenshtein.distance(query_sequence, node_subsequence)

In [31]:
# Score every row with the edit-distance helper and keep the result as a new column
extracted_sequences_Path1_df['edit_distance_query_sequence_GraphAligner'] = (
    extracted_sequences_Path1_df.apply(
        calculate_edit_distance_sequence_GraphAligner,
        axis='columns',
    )
)

In [32]:
extracted_sequences_Path1_df

Unnamed: 0.1,Unnamed: 0,Query,Sequence_x,Sequence_Bandage,Sequence_SPAligner,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,Path5,Name,Sequence_y,Result_node1,edit_distance_query_sequence_GraphAligner
0,0,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,24880,26021,2079546,SinglePathEndMatch,2079546,,,,,2079546,TCGCGACCCCGACCTTGGCAAGGTCGTGCTCTACCAACTGAGCTAT...,CCATGATGGAGCCGGTGCGGATGCTGTTGGCGAACAGGGTGGTCGC...,582
1,36,gb|KF730243.1|+|0-1143|ARO:3004647|AQU-2,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,24880,26021,2079546,SinglePathEndMatch,2079546,,,,,2079546,TCGCGACCCCGACCTTGGCAAGGTCGTGCTCTACCAACTGAGCTAT...,CCATGATGGAGCCGGTGCGGATGCTGTTGGCGAACAGGGTGGTCGC...,584
2,37,gb|KF730244.1|+|0-1149|ARO:3004648|AQU-3,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...,24880,26021,2079546,SinglePathEndMatch,2079546,,,,,2079546,TCGCGACCCCGACCTTGGCAAGGTCGTGCTCTACCAACTGAGCTAT...,CCATGATGGAGCCGGTGCGGATGCTGTTGGCGAACAGGGTGGTCGC...,583
3,1,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,3688,4123,118828,SinglePathEndMatch,118828,,,,,118828,GATCGATGAGTACATGGCGCCTTTTCTGGTTGGCAAAGATCCGACC...,TTATCCCCATTTTATTGTCGGATATGGCGTACTTTCTCCATAAACT...,239
4,5,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,3688,4123,118828,SinglePathEndMatch,118828,,,,,118828,GATCGATGAGTACATGGCGCCTTTTCTGGTTGGCAAAGATCCGACC...,TTATCCCCATTTTATTGTCGGATATGGCGTACTTTCTCCATAAACT...,240
5,2,gb|AE016877.1|+|1972251-1972668|ARO:3000172|FosB,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,312,727,15445,SinglePathEndMatch,15445,,,,,15445,AAATGGATTATTCTAATCCGAAGTATGATGATTTAGTAGCGAAATC...,TGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAGA...,4
6,3,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,2130,2965,2007548,SinglePathEndMatch,2007548,,,,,2007548,ACAAGGTGGCTTAACTAGAGCGGAATATCACTTATAAATAAGCTTT...,TGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAACC...,3
7,6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,1327,2129,2007548,SinglePathEndMatch,2007548,,,,,2007548,ACAAGGTGGCTTAACTAGAGCGGAATATCACTTATAAATAAGCTTT...,TGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGTT...,2
8,4,gb|AF118110.1|-|71-1037|ARO:3003002|CfxA2,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,113,1077,821183,SinglePathEndMatch,821183,,,,,821183,ATTCGACAAACTGTTATTTTTCTATCTATTTATTTGGGTGGGAAAC...,TGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTTT...,1
9,7,gb|AF472622|+|52-1018|ARO:3003003|CfxA3,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,113,1077,821183,SinglePathEndMatch,821183,,,,,821183,ATTCGACAAACTGTTATTTTTCTATCTATTTATTTGGGTGGGAAAC...,TGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTTT...,2


In [34]:
# Write the table, including the edit-distance column, as a tab-separated file
extracted_sequences_Path1_df.to_csv(
    path_or_buf='CAMIM2_Results/GraphAlignerSequencesEditDistance.tsv',
    sep='\t',
)

In [None]:
# Keep the rows whose 'Path2' is set while 'Path3' is missing
df_multiple = df_GraphAligner_Path.loc[
    df_GraphAligner_Path['Path2'].notna()
    & df_GraphAligner_Path['Path3'].isna()
]

# Show the filtered dataframe
df_multiple

In [None]:
# Attach the graph row whose 'Name' equals each row's 'Path1' value (inner join)
merged_df_multiple_Path1 = df_multiple.merge(
    df_graph,
    left_on='Path1',
    right_on='Name',
)
merged_df_multiple_Path1


In [None]:
# Strip surrounding whitespace from the 'Path2' values; they are used next as the merge key
# against df_graph['Name'].
merged_df_multiple_Path1['Path2'] = merged_df_multiple_Path1['Path2'].str.strip()


In [None]:
# Attach the graph row whose 'Name' equals each row's 'Path2' value (inner join)
merged_df_multiple_Path2 = merged_df_multiple_Path1.merge(
    df_graph,
    left_on='Path2',
    right_on='Name',
)
merged_df_multiple_Path2

In [None]:
# Rename the merge-suffixed sequence columns:
#   'Sequence_x' -> 'Sequence_Path1', 'Sequence_y' -> 'Sequence_Path2'
# NOTE(review): this presumes 'Sequence_x'/'Sequence_y' are the two graph-node sequences added by
# the merges above. If df_multiple already carried its own 'Sequence' column, the suffixes would
# refer to different columns - confirm against the merged frame.
merged_df_multiple_Path2.rename(
    columns={
        'Sequence_x': 'Sequence_Path1',
        'Sequence_y': 'Sequence_Path2',
    },
    inplace=True,
)
merged_df_multiple_Path2

In [None]:
def extract_sequences_multiple_sequences(dataframe):
    """Add a 'Sequencefirst' column holding the tail of each row's 'Sequence_Path1'.

    Every 'Sequence_Path1' string is sliced from index ``start - 1`` to its end,
    where ``start`` is the row's 'Start_GraphAligner_x' value (i.e. the start is
    treated as 1-based - TODO confirm against the aligner's coordinate convention).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'Start_GraphAligner_x' (integers) and
        'Sequence_Path1' (strings). It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 'Sequencefirst' added.
    """
    start_positions = dataframe['Start_GraphAligner_x']
    first_sequences = dataframe['Sequence_Path1']

    # Clamp the slice start at 0: a start of 0 would otherwise turn into index -1,
    # and sequence[-1:] keeps only the last character instead of the whole sequence.
    dataframe['Sequencefirst'] = [
        sequence[max(start - 1, 0):]
        for sequence, start in zip(first_sequences, start_positions)
    ]

    return dataframe

In [None]:
# Cast both alignment coordinates to int; they are used as slice bounds further down
for coordinate_column in ('Start_GraphAligner_x', 'End_GraphAligner_x'):
    merged_df_multiple_Path2[coordinate_column] = merged_df_multiple_Path2[coordinate_column].astype(int)

In [None]:
# Add the 'Sequencefirst' column, then write the intermediate table to 'TestSeque.tsv'
extracted_sequences_2 = extract_sequences_multiple_sequences(merged_df_multiple_Path2)
extracted_sequences_2.to_csv(
    path_or_buf='TestSeque.tsv',
    sep='\t',
)

extracted_sequences_2

In [None]:
def extract_sequences_multiple_sequences_path2(dataframe):
    """Add a 'SequencefSecond' column holding the head of each row's 'Sequence_Path2'.

    Each 'Sequence_Path2' string is cut to its first 'End_GraphAligner_x'
    characters. The frame is modified in place and also returned.
    """
    row_labels = dataframe.index
    second_sequences = dataframe['Sequence_Path2']
    end_positions = dataframe['End_GraphAligner_x']

    for label, node_sequence, stop in zip(row_labels, second_sequences, end_positions):
        # Keep everything before the end position
        dataframe.loc[label, 'SequencefSecond'] = node_sequence[:stop]

    return dataframe

In [None]:
# Add the 'SequencefSecond' column, then write the intermediate table to 'TestSeque3.tsv'
extracted_sequences_3 = extract_sequences_multiple_sequences_path2(extracted_sequences_2)
extracted_sequences_3.to_csv(
    path_or_buf='TestSeque3.tsv',
    sep='\t',
)

extracted_sequences_3

In [None]:
def extract_sequences_multiple_sequences_overlap(dataframe, overlap=55):
    """Trim a fixed number of trailing characters from each 'Sequencefirst' value.

    Three columns are added to ``dataframe`` (which is modified in place):

    * 'length'         - length of 'Sequencefirst'
    * 'length_end'     - length left after removing ``overlap`` trailing
                         characters, never below 0
    * 'FinalSequence1' - 'Sequencefirst' without its last ``overlap`` characters

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'Sequencefirst' column of strings.
    overlap : int, default 55
        Number of trailing characters to drop. The default keeps the value that
        was previously hard-coded (presumably the k-mer overlap between adjacent
        graph nodes - confirm against the graph construction settings).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    first_sequences = list(dataframe['Sequencefirst'])
    full_lengths = [len(sequence) for sequence in first_sequences]
    # Clamp at 0 so a sequence shorter than the overlap reports length 0, not a negative number
    kept_lengths = [max(full_length - overlap, 0) for full_length in full_lengths]

    dataframe['length'] = full_lengths
    dataframe['length_end'] = kept_lengths
    dataframe['FinalSequence1'] = [
        sequence[:kept_length]
        for sequence, kept_length in zip(first_sequences, kept_lengths)
    ]

    return dataframe
    

In [None]:
# Add 'length', 'length_end' and 'FinalSequence1' ('Sequencefirst' without its last 55 characters)
extracted_sequences_multiple = extract_sequences_multiple_sequences_overlap(extracted_sequences_3)

In [None]:
extracted_sequences_multiple

In [None]:
def extract_sequences_multiple_sequences_final(dataframe):
    """Add a 'FinalSequence' column: 'FinalSequence1' followed by 'SequencefSecond'.

    The frame is modified in place and also returned.
    """
    row_labels = dataframe.index
    first_parts = dataframe['FinalSequence1']
    second_parts = dataframe['SequencefSecond']

    for label, first_part, second_part in zip(row_labels, first_parts, second_parts):
        # Join the two per-node pieces into one sequence
        dataframe.loc[label, 'FinalSequence'] = first_part + second_part

    return dataframe

In [None]:
# Add 'FinalSequence' = 'FinalSequence1' + 'SequencefSecond' for every row
extracted_sequences_multiple_final =extract_sequences_multiple_sequences_final(extracted_sequences_multiple)

In [None]:
extracted_sequences_multiple_final

In [None]:
# Remove helper columns that are not needed in the final table
# (drop() raises KeyError if any of these labels is missing)
extracted_sequences_multiple_final = extracted_sequences_multiple_final.drop(
    columns=['Path_Bandage', 'Path_GraphAligner', 'Equal', 'Name_x', 'Name_y'],
)

In [None]:
# Give the 'Sequence_Path1' column a name parallel to 'Sequence_FromGraph_Path2'.
# The previous mapping renamed 'Sequence_Path1' to itself, which had no effect.
extracted_sequences_multiple_final.rename(columns = {'Sequence_Path1':'Sequence_FromGraph_Path1'}, inplace = True)

In [None]:
# 'Sequence_Path2' -> 'Sequence_FromGraph_Path2'
extracted_sequences_multiple_final.rename(
    {'Sequence_Path2': 'Sequence_FromGraph_Path2'},
    axis='columns',
    inplace=True,
)

In [None]:
# 'Sequencefirst' (the 'Sequence_Path1' slice taken from the start position onwards)
# -> 'Sequence_FromGraph_Path1_No_Overlap'
extracted_sequences_multiple_final.rename(
    {'Sequencefirst': 'Sequence_FromGraph_Path1_No_Overlap'},
    axis='columns',
    inplace=True,
)

In [None]:
# 'length' is the length of 'Sequencefirst' before its trailing characters are removed.
# NOTE(review): confirm that 'No_Overlap' is the intended label for this untrimmed length.
extracted_sequences_multiple_final.rename(
    {'length': 'Length_Sequence_Path1_No_Overlap'},
    axis='columns',
    inplace=True,
)


In [None]:
# 'length_end' is the length of 'Sequencefirst' minus the removed trailing characters.
# NOTE(review): confirm that 'With_Overlap' is the intended label for this trimmed length.
extracted_sequences_multiple_final.rename(
    {'length_end': 'Length_Sequence_Path1_With_Overlap'},
    axis='columns',
    inplace=True,
)

In [None]:
# 'FinalSequence1' (the trimmed first-node piece) -> 'Sequence_Path1_With_Overlap'
extracted_sequences_multiple_final.rename(
    {'FinalSequence1': 'Sequence_Path1_With_Overlap'},
    axis='columns',
    inplace=True,
)

In [None]:
# 'FinalSequence' (both pieces joined) -> 'Concatenated_Sequence'
extracted_sequences_multiple_final.rename(
    {'FinalSequence': 'Concatenated_Sequence'},
    axis='columns',
    inplace=True,
)

In [None]:
# Write the two-node table as a tab-separated file.
# NOTE(review): this targets 'CAMIH1_Results/' while the single-path table above is written to
# 'CAMIM2_Results/' - confirm the output directory for this dataset.
extracted_sequences_multiple_final.to_csv(
    path_or_buf="CAMIH1_Results/BandageVsGraphAlignerFinalConcatenatedSequences.tsv",
    sep='\t',
)

## This section measures the edit distance between the Bandage and GraphAligner sequences

In [None]:
import Levenshtein

In [None]:
# Load the tab-separated table from 'Real_Results/TestDistance.tsv'
bandage_df = pd.read_csv(
    filepath_or_buffer='Real_Results/TestDistance.tsv',
    sep='\t',
)
bandage_df

In [None]:
# Pair each single-path GraphAligner row with the Bandage row for the same 'Query' (inner join)
distance_df = extracted_sequences_Path1_df.merge(bandage_df, on='Query')

In [None]:
distance_df

In [None]:
# Label the GraphAligner sequence. 'Sequence_y' is relabelled as the Bandage sequence only
# when no 'Sequence_Bandage' column exists yet: renaming onto an existing label would leave
# two columns with the same name, and row['Sequence_Bandage'] would then return a Series
# instead of a single string.
distance_rename_map = {'Result_node1': 'Sequence_GraphAligner'}
if 'Sequence_Bandage' not in distance_df.columns:
    distance_rename_map['Sequence_y'] = 'Sequence_Bandage'
distance_df.rename(columns=distance_rename_map, inplace=True)

In [None]:
# Discard the left-hand 'Unnamed: 0' column left over from the merge
distance_df = distance_df.drop(columns='Unnamed: 0_x')

In [None]:
# Discard the right-hand 'Unnamed: 0' column left over from the merge
distance_df = distance_df.drop(columns='Unnamed: 0_y')

In [None]:
distance_df

In [None]:
# Row-level helper: Levenshtein (edit) distance between the Bandage and GraphAligner columns
def calculate_edit_distance_Bandage_GraphAligner(row):
    """Return the edit distance between a row's 'Sequence_Bandage' and 'Sequence_GraphAligner'.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide string values under both keys.

    Returns
    -------
    The value returned by ``Levenshtein.distance`` for the two strings.
    """
    bandage_sequence = row['Sequence_Bandage']
    graphaligner_sequence = row['Sequence_GraphAligner']
    return Levenshtein.distance(bandage_sequence, graphaligner_sequence)

In [None]:
# Score every row with the Bandage-vs-GraphAligner helper and keep the result as a new column
distance_df['edit_distance_query_Bandage_GraphAligner'] = distance_df.apply(
    calculate_edit_distance_Bandage_GraphAligner,
    axis='columns',
)

In [None]:
distance_df.dtypes

In [None]:
# Count how many rows share each edit-distance value
type_counts_bandage_GraphAligner = (
    distance_df['edit_distance_query_Bandage_GraphAligner']
    .value_counts()
    .reset_index()
)

In [None]:
# Rename the columns: 'Type' holds each distinct edit-distance value, 'Count' the number of rows with it
type_counts_bandage_GraphAligner.columns = ['Type', 'Count']

# Display the result
print(type_counts_bandage_GraphAligner)

In [None]:
# Write the edit-distance frequency table as a tab-separated file
type_counts_bandage_GraphAligner.to_csv(
    path_or_buf="Real_Results/type_counts_bandage_GraphAligner.tsv",
    sep='\t',
)

In [None]:
# Pair each two-node GraphAligner row with the Bandage row for the same 'Query' (inner join)
distance_df_multiple = extracted_sequences_multiple_final.merge(bandage_df, on='Query')
distance_df_multiple

In [None]:
# Use the column names expected by calculate_edit_distance_Bandage_GraphAligner
distance_df_multiple.rename(
    columns={
        'Concatenated_Sequence': 'Sequence_GraphAligner',
        'Sequence': 'Sequence_Bandage',
    },
    inplace=True,
)

In [None]:
# Score every two-node row with the Bandage-vs-GraphAligner helper
distance_df_multiple['edit_distance_query_Bandage_GraphAligner'] = distance_df_multiple.apply(
    calculate_edit_distance_Bandage_GraphAligner,
    axis='columns',
)

In [None]:
distance_df_multiple

In [None]:
# Count how many two-node rows share each edit-distance value
type_counts_bandage_GraphAligner_multiple = (
    distance_df_multiple['edit_distance_query_Bandage_GraphAligner']
    .value_counts()
    .reset_index()
)

In [None]:
# Rename the columns: 'Type' holds each distinct edit-distance value, 'Count' the number of rows with it
type_counts_bandage_GraphAligner_multiple.columns = ['Type', 'Count']

# Display the result
print(type_counts_bandage_GraphAligner_multiple)

In [None]:
# Write the two-node edit-distance frequency table as a tab-separated file
type_counts_bandage_GraphAligner_multiple.to_csv(
    path_or_buf="Real_Results/type_counts_bandage_GraphAligner_multiple.tsv",
    sep='\t',
)

In [None]:
# Load every FASTA record as a (Query, Sequence) pair
fasta_file = "111_Graph/combined_1.fasta"

# Read all records once so that ids and sequences stay in the same order
fasta_records = list(SeqIO.parse(fasta_file, "fasta"))
queries = [fasta_record.id for fasta_record in fasta_records]
sequences = [str(fasta_record.seq) for fasta_record in fasta_records]

# One row per FASTA record
fasta_df = pd.DataFrame({'Query': queries, 'Sequence': sequences})

# Display the DataFrame
print(fasta_df)

In [None]:
# Pair each single-path GraphAligner row with the FASTA record of the same 'Query' (inner join)
distance_df_Sequence = extracted_sequences_Path1_df.merge(fasta_df, on='Query')

In [None]:
distance_df_Sequence

In [None]:
# Label the GraphAligner sequence. 'Sequence_y' is relabelled as 'Sequence' only when the
# merge did not already deliver an unsuffixed 'Sequence' column from fasta_df: renaming onto
# an existing label would leave two 'Sequence' columns, and row['Sequence'] would then
# return a Series instead of a single string.
sequence_rename_map = {'Result_node1': 'Sequence_GraphAligner'}
if 'Sequence' not in distance_df_Sequence.columns:
    sequence_rename_map['Sequence_y'] = 'Sequence'
distance_df_Sequence.rename(columns=sequence_rename_map, inplace=True)

In [None]:
distance_df_Sequence

In [None]:
# Row-level helper: Levenshtein (edit) distance between the FASTA and GraphAligner columns.
# NOTE: this redefines the helper of the same name declared earlier in the notebook, which
# read 'Sequence_x' and 'Result_node1' instead.
def calculate_edit_distance_sequence_GraphAligner(row):
    """Return the edit distance between a row's 'Sequence' and 'Sequence_GraphAligner' values.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide string values under both keys.

    Returns
    -------
    The value returned by ``Levenshtein.distance`` for the two strings.
    """
    fasta_sequence = row['Sequence']
    graphaligner_sequence = row['Sequence_GraphAligner']
    return Levenshtein.distance(fasta_sequence, graphaligner_sequence)

In [None]:
# Score every row with the FASTA-vs-GraphAligner helper and keep the result as a column
distance_df_Sequence['edit_distance_query_sequence_GraphAligner'] = distance_df_Sequence.apply(
    calculate_edit_distance_sequence_GraphAligner,
    axis='columns',
)

In [None]:
distance_df_Sequence