## This Script is used to get the unique sequences from GraphAligner and measure the edit distance between the sequences from Bandage and GraphAligner

In [None]:
# import the libraries
import numpy as np
import pandas as pd
import re 
from Bio import SeqIO
from collections import defaultdict
import gfapy

In [2]:
# Open the GFA file
file_path = "Real.gfa"
gfa = gfapy.Gfa.from_file(file_path)

num_segments = len(gfa.segments)

In [3]:
data_graph = []

In [4]:
# Store the name and sequence for each node from the graph
for segment in gfa.segments:
    data_graph.append({"Name": segment.name, "Sequence": segment.sequence})

# Convert the list to a DataFrame
df_graph = pd.DataFrame(data_graph)

# Print the DataFrame
print(df_graph)

             Name                                           Sequence
0               7  AACTCAAGTTTCGCCTAGTTCACAATACGAAGGCGCAGTAACATAC...
1               9  GCTAACACTCCCCACCCATGATTCGACTTGAAACCATCCTGACCCC...
2              11  GCCTCAACGTCGTCCACCGCCTGCACGAAGGCGAACTGGCCAAGGC...
3              13  TACTGAAGATATATCTGATATTTCTACATAGAAATAAGGTTTTAAT...
4              19  AAGTGCGCGCAGACACAAATCTCGCGATTGAAAAATTTTATTTTTC...
...           ...                                                ...
3852221  87719752  GGGAATGATGATTTCGCTGCCCCGCGGCAGATCCAGCGCCAGCAGG...
3852222  87719758  GGCGCCGTGCTCGAGCGAGTGCACCGCGTTCTCGTTCTGCTGCGGC...
3852223  87719770  CAGGGCGCTCTTGCCGCGGCTGGAGTTGCCGACGCCGTAGAACATC...
3852224  87719778  CGGTCAAAGAACCCGCGCAGGAATTCGGCAACGTGTTCGAATTCGA...
3852225  87719794  CGTCACCCTGATCTGCGAACAGCCCAAGGTGAAGCCGCACCGTCAG...

[3852226 rows x 2 columns]


In [5]:
df_graph['Sequence']

0          AACTCAAGTTTCGCCTAGTTCACAATACGAAGGCGCAGTAACATAC...
1          GCTAACACTCCCCACCCATGATTCGACTTGAAACCATCCTGACCCC...
2          GCCTCAACGTCGTCCACCGCCTGCACGAAGGCGAACTGGCCAAGGC...
3          TACTGAAGATATATCTGATATTTCTACATAGAAATAAGGTTTTAAT...
4          AAGTGCGCGCAGACACAAATCTCGCGATTGAAAAATTTTATTTTTC...
                                 ...                        
3852221    GGGAATGATGATTTCGCTGCCCCGCGGCAGATCCAGCGCCAGCAGG...
3852222    GGCGCCGTGCTCGAGCGAGTGCACCGCGTTCTCGTTCTGCTGCGGC...
3852223    CAGGGCGCTCTTGCCGCGGCTGGAGTTGCCGACGCCGTAGAACATC...
3852224    CGGTCAAAGAACCCGCGCAGGAATTCGGCAACGTGTTCGAATTCGA...
3852225    CGTCACCCTGATCTGCGAACAGCCCAAGGTGAAGCCGCACCGTCAG...
Name: Sequence, Length: 3852226, dtype: object

In [6]:
#Save only one copy of each row to the dataframe
df_graph=df_graph.drop_duplicates()

In [7]:
df_graph

Unnamed: 0,Name,Sequence
0,7,AACTCAAGTTTCGCCTAGTTCACAATACGAAGGCGCAGTAACATAC...
1,9,GCTAACACTCCCCACCCATGATTCGACTTGAAACCATCCTGACCCC...
2,11,GCCTCAACGTCGTCCACCGCCTGCACGAAGGCGAACTGGCCAAGGC...
3,13,TACTGAAGATATATCTGATATTTCTACATAGAAATAAGGTTTTAAT...
4,19,AAGTGCGCGCAGACACAAATCTCGCGATTGAAAAATTTTATTTTTC...
...,...,...
3852221,87719752,GGGAATGATGATTTCGCTGCCCCGCGGCAGATCCAGCGCCAGCAGG...
3852222,87719758,GGCGCCGTGCTCGAGCGAGTGCACCGCGTTCTCGTTCTGCTGCGGC...
3852223,87719770,CAGGGCGCTCTTGCCGCGGCTGGAGTTGCCGACGCCGTAGAACATC...
3852224,87719778,CGGTCAAAGAACCCGCGCAGGAATTCGGCAACGTGTTCGAATTCGA...


In [8]:
df_GraphAligner_Paths=pd.read_csv('Real_Results/FinalCategories.tsv', sep='\t')

In [9]:
selected_rows_BandageVSGraphAligner = df_GraphAligner_Paths[df_GraphAligner_Paths['FinalResultBandageVSGraphAligner'] != 'Full']
selected_rows_BandageVSGraphAligner.columns

Index(['Unnamed: 0', 'Query', 'Path_Bandage_x', 'Start_Bandage_x',
       'End_Bandage_x', 'Extracted_Path_x', 'Start_SPAligner_x',
       'End_SPAligner_x', 'Path_SPAligner_x', 'FinalResultBandageVSSPAligner',
       'Path_Bandage_y', 'Start_Bandage_y', 'End_Bandage_y',
       'Extracted_Path_y', 'Start_GraphAligner_x', 'End_GraphAligner_x',
       'Path_GraphAligner_x', 'FinalResultBandageVSGraphAligner',
       'Start_SPAligner_y', 'End_SPAligner_y', 'Path_SPAligner_y',
       'Start_GraphAligner_y', 'End_GraphAligner_y', 'Path_GraphAligner_y',
       'SPAlignerVSGraphAligner'],
      dtype='object')

In [10]:
df_BandageVSGraphAligner=selected_rows_BandageVSGraphAligner[['Query','Start_GraphAligner_x', 'End_GraphAligner_x',
       'Path_GraphAligner_x','FinalResultBandageVSGraphAligner']].copy()


In [11]:
df_BandageVSGraphAligner.reset_index()

Unnamed: 0,index,Query,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner
0,0,gb|AB113580|+|1329-2193|ARO:3002332|GES-3,93,956,32422725,Different
1,1,gb|AB114632|+|655-1453|ARO:3002607|aadA7,61,858,26862905,Different
2,2,gb|AB116260|+|1329-2193|ARO:3002333|GES-4,93,956,32422725,Different
3,4,gb|AB571865.1|-|143423-144308|ARO:3003742|mphG,353,1237,"50056, 2638631, 6283287, 5689755",Different
4,5,gb|AB901141|+|0-864|ARO:3002353|GES-24,93,956,32422725,Different
...,...,...,...,...,...,...
103,135,gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-IIa,3,967,"31214326, 15299651, 77161691, 15936737, 316896...",Different
104,136,gb|X07753|+|102-882|ARO:3002652|APH(3')-VIa,180,959,"6903442, 506744",Different
105,138,gb|X75562|+|134-935|ARO:3001402|OXA-7,72,872,"32592258, 758212",Different
106,139,gb|Z21523|+|0-1974|ARO:3000191|tetQ,142,2115,"4764923, 2707722, 11583809, 43494081, 43494077...",Different


In [12]:
df_BandageVSGraphAligner.to_csv('Real_Results/BandageVSGraphAlignerSequences.tsv',sep='\t')

In [13]:
df_GraphAligner_Path= pd.read_csv('Real_Results/BandageVSGraphAlignerSequences.tsv', sep='\t')

In [14]:
df_GraphAligner_Path

Unnamed: 0.1,Unnamed: 0,Query,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner
0,0,gb|AB113580|+|1329-2193|ARO:3002332|GES-3,93,956,32422725,Different
1,1,gb|AB114632|+|655-1453|ARO:3002607|aadA7,61,858,26862905,Different
2,2,gb|AB116260|+|1329-2193|ARO:3002333|GES-4,93,956,32422725,Different
3,4,gb|AB571865.1|-|143423-144308|ARO:3003742|mphG,353,1237,"50056, 2638631, 6283287, 5689755",Different
4,5,gb|AB901141|+|0-864|ARO:3002353|GES-24,93,956,32422725,Different
...,...,...,...,...,...,...
103,135,gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-IIa,3,967,"31214326, 15299651, 77161691, 15936737, 316896...",Different
104,136,gb|X07753|+|102-882|ARO:3002652|APH(3')-VIa,180,959,"6903442, 506744",Different
105,138,gb|X75562|+|134-935|ARO:3001402|OXA-7,72,872,"32592258, 758212",Different
106,139,gb|Z21523|+|0-1974|ARO:3000191|tetQ,142,2115,"4764923, 2707722, 11583809, 43494081, 43494077...",Different


In [16]:
df_GraphAligner_Path[['Path1', 'Path2','Path3','Path4','Path5','Path6','Path7','Path8','Path9','Path10']] = df_GraphAligner_Path['Path_GraphAligner_x'].str.split(',', expand=True)

In [17]:
df_GraphAligner_Path

Unnamed: 0.1,Unnamed: 0,Query,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,Path5,Path6,Path7,Path8,Path9,Path10
0,0,gb|AB113580|+|1329-2193|ARO:3002332|GES-3,93,956,32422725,Different,32422725,,,,,,,,,
1,1,gb|AB114632|+|655-1453|ARO:3002607|aadA7,61,858,26862905,Different,26862905,,,,,,,,,
2,2,gb|AB116260|+|1329-2193|ARO:3002333|GES-4,93,956,32422725,Different,32422725,,,,,,,,,
3,4,gb|AB571865.1|-|143423-144308|ARO:3003742|mphG,353,1237,"50056, 2638631, 6283287, 5689755",Different,50056,2638631,6283287,5689755,,,,,,
4,5,gb|AB901141|+|0-864|ARO:3002353|GES-24,93,956,32422725,Different,32422725,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,135,gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-IIa,3,967,"31214326, 15299651, 77161691, 15936737, 316896...",Different,31214326,15299651,77161691,15936737,31689609,31674241,385047,,,
104,136,gb|X07753|+|102-882|ARO:3002652|APH(3')-VIa,180,959,"6903442, 506744",Different,6903442,506744,,,,,,,,
105,138,gb|X75562|+|134-935|ARO:3001402|OXA-7,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,
106,139,gb|Z21523|+|0-1974|ARO:3000191|tetQ,142,2115,"4764923, 2707722, 11583809, 43494081, 43494077...",Different,4764923,2707722,11583809,43494081,43494077,3922650,,,,


## Get the rows that have a unique node, Path1

In [18]:
# Get the rows where Path 2 is  None
Path1_Only = df_GraphAligner_Path[df_GraphAligner_Path['Path2'].isna()]

# Print the resulting dataframe
Path1_Only 

Unnamed: 0.1,Unnamed: 0,Query,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,Path5,Path6,Path7,Path8,Path9,Path10
0,0,gb|AB113580|+|1329-2193|ARO:3002332|GES-3,93,956,32422725,Different,32422725,,,,,,,,,
1,1,gb|AB114632|+|655-1453|ARO:3002607|aadA7,61,858,26862905,Different,26862905,,,,,,,,,
2,2,gb|AB116260|+|1329-2193|ARO:3002333|GES-4,93,956,32422725,Different,32422725,,,,,,,,,
4,5,gb|AB901141|+|0-864|ARO:3002353|GES-24,93,956,32422725,Different,32422725,,,,,,,,,
8,12,gb|AF099140|+|136-1363|ARO:3002826|EreA2,128,1345,31482731,Different,31482731,,,,,,,,,
9,13,gb|AF118110.1|-|71-1037|ARO:3003002|CfxA2,142,1107,5661688,Different,5661688,,,,,,,,,
11,15,gb|AF137361|+|63-852|ARO:3002605|aadA5,57,845,1370533,Different,1370533,,,,,,,,,
12,16,gb|AF156486|+|1331-2195|ARO:3002330|GES-1,93,956,32422725,Different,32422725,,,,,,,,,
19,26,gb|AF326355|+|0-864|ARO:3002331|GES-2,93,956,32422725,Different,32422725,,,,,,,,,
20,27,gb|AF329699|+|372-1236|ARO:3002337|GES-8,93,956,32422725,Different,32422725,,,,,,,,,


In [19]:
#Drop the 0 parts
Path1_Only = Path1_Only.drop(Path1_Only[(Path1_Only['Start_GraphAligner_x'] == 0) & (Path1_Only['End_GraphAligner_x'] == 0)].index)


In [20]:
# Get the sequence where we have only Path1
merged_df_Path1= pd.merge(Path1_Only , df_graph, left_on='Path1', right_on='Name')
merged_df_Path1

Unnamed: 0.1,Unnamed: 0,Query,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,Path5,Path6,Path7,Path8,Path9,Path10,Name,Sequence
0,0,gb|AB113580|+|1329-2193|ARO:3002332|GES-3,93,956,32422725,Different,32422725,,,,,,,,,,32422725,ACGAATTGTTAGACGGGCGTCAACTATTTGTCCGTGCTCAGGATGA...
1,2,gb|AB116260|+|1329-2193|ARO:3002333|GES-4,93,956,32422725,Different,32422725,,,,,,,,,,32422725,ACGAATTGTTAGACGGGCGTCAACTATTTGTCCGTGCTCAGGATGA...
2,5,gb|AB901141|+|0-864|ARO:3002353|GES-24,93,956,32422725,Different,32422725,,,,,,,,,,32422725,ACGAATTGTTAGACGGGCGTCAACTATTTGTCCGTGCTCAGGATGA...
3,16,gb|AF156486|+|1331-2195|ARO:3002330|GES-1,93,956,32422725,Different,32422725,,,,,,,,,,32422725,ACGAATTGTTAGACGGGCGTCAACTATTTGTCCGTGCTCAGGATGA...
4,26,gb|AF326355|+|0-864|ARO:3002331|GES-2,93,956,32422725,Different,32422725,,,,,,,,,,32422725,ACGAATTGTTAGACGGGCGTCAACTATTTGTCCGTGCTCAGGATGA...
5,27,gb|AF329699|+|372-1236|ARO:3002337|GES-8,93,956,32422725,Different,32422725,,,,,,,,,,32422725,ACGAATTGTTAGACGGGCGTCAACTATTTGTCCGTGCTCAGGATGA...
6,51,gb|AY260546|+|4477-5341|ARO:3002336|GES-7,93,956,32422725,Different,32422725,,,,,,,,,,32422725,ACGAATTGTTAGACGGGCGTCAACTATTTGTCCGTGCTCAGGATGA...
7,53,gb|AY494717|+|0-864|ARO:3002334|GES-5,93,956,32422725,Different,32422725,,,,,,,,,,32422725,ACGAATTGTTAGACGGGCGTCAACTATTTGTCCGTGCTCAGGATGA...
8,54,gb|AY494718|+|0-864|ARO:3002335|GES-6,93,956,32422725,Different,32422725,,,,,,,,,,32422725,ACGAATTGTTAGACGGGCGTCAACTATTTGTCCGTGCTCAGGATGA...
9,58,gb|AY920928|+|2689-3553|ARO:3002338|GES-9,93,956,32422725,Different,32422725,,,,,,,,,,32422725,ACGAATTGTTAGACGGGCGTCAACTATTTGTCCGTGCTCAGGATGA...


In [23]:
merged_df_Path1.dtypes

Unnamed: 0                           int64
Query                               object
Start_GraphAligner_x                object
End_GraphAligner_x                  object
Path_GraphAligner_x                 object
FinalResultBandageVSGraphAligner    object
Path1                               object
Path2                               object
Path3                               object
Path4                               object
Path5                               object
Path6                               object
Path7                               object
Path8                               object
Path9                               object
Path10                              object
Name                                object
Sequence                            object
dtype: object

In [26]:
merged_df_Path1['Start_GraphAligner_x']=merged_df_Path1['Start_GraphAligner_x'].astype(int)
merged_df_Path1['End_GraphAligner_x']=merged_df_Path1['Start_GraphAligner_x'].astype(int)

In [27]:
def extract_sequences_Path1(dataframe):
    for index, row in dataframe.iterrows():
        start_pos = row['Start_GraphAligner_x']
        end_pos = row['End_GraphAligner_x']
        sequence = row['Sequence']
        
        extracted_sequence = sequence[start_pos-1:end_pos]
        dataframe.loc[index, 'Result_node1'] = extracted_sequence
    
    return dataframe

In [28]:
extracted_sequences_Path1_df = extract_sequences_Path1(merged_df_Path1)
extracted_sequences_Path1_df
#Save the dataframe to a pdf
extracted_sequences_Path1_df.to_csv("Real_Results/Path1_TestSequences.tsv", sep="\t")

In [29]:
# Get the rows where Path 2 is not None
df_multiple = df_GraphAligner_Path[df_GraphAligner_Path['Path2'].notna() & df_GraphAligner_Path['Path3'].isna() ]

# Print the resulting dataframe
df_multiple

Unnamed: 0.1,Unnamed: 0,Query,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,Path5,Path6,Path7,Path8,Path9,Path10
6,9,gb|AF043381|+|943-1744|ARO:3001414|OXA-19,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,
10,14,gb|AF135373|+|11-908|ARO:3002244|CARB-5,146,1042,"267426, 63777837",Different,267426,63777837,,,,,,,,
14,19,gb|AF205943|+|7511-8312|ARO:3001405|OXA-10,8,808,"26012357, 758212",Different,26012357,758212,,,,,,,,
15,20,gb|AF231133|+|1345-2146|ARO:3001423|OXA-28,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,
16,22,gb|AF300985|+|0-739|ARO:3001430|OXA-36,104,841,"32571183, 5674312",Different,32571183,5674312,,,,,,,,
17,24,gb|AF315351|+|672-1500|ARO:3001426|OXA-32,71,898,"32571183, 5674312",Different,32571183,5674312,,,,,,,,
18,25,gb|AF315786|+|1313-2114|ARO:3001429|OXA-35,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,
21,28,gb|AF350424|+|0-771|ARO:3001428|OXA-34,71,841,"32571183, 5674312",Different,32571183,5674312,,,,,,,,
25,33,gb|AJ627643.4|+|4933-5713|ARO:3002653|APH(3')-VIb,180,959,"6903442, 86408359",Different,6903442,86408359,,,,,,,,
27,35,gb|AJ854182|+|0-801|ARO:3001798|OXA-74,8,808,"26012357, 758212",Different,26012357,758212,,,,,,,,


In [30]:
merged_df_multiple_Path1= pd.merge(df_multiple, df_graph, left_on='Path1', right_on='Name')
merged_df_multiple_Path1


Unnamed: 0.1,Unnamed: 0,Query,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,Path5,Path6,Path7,Path8,Path9,Path10,Name,Sequence
0,9,gb|AF043381|+|943-1744|ARO:3001414|OXA-19,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...
1,20,gb|AF231133|+|1345-2146|ARO:3001423|OXA-28,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...
2,25,gb|AF315786|+|1313-2114|ARO:3001429|OXA-35,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...
3,40,gb|AM412777|+|117-918|ARO:3001439|OXA-101,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...
4,52,gb|AY445080|+|0-801|ARO:3001795|OXA-56,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...
5,75,gb|FJ790516|+|1287-2085|ARO:3001804|OXA-145,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...
6,77,gb|FJ848783|+|1270-2071|ARO:3001801|OXA-147,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...
7,90,gb|HQ111474|+|1056-1857|ARO:3001475|OXA-183,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...
8,133,gb|U59183|+|939-1740|ARO:3001408|OXA-13,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...
9,138,gb|X75562|+|134-935|ARO:3001402|OXA-7,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...


In [31]:
merged_df_multiple_Path1['Path2'] = merged_df_multiple_Path1['Path2'].str.strip()


In [32]:
merged_df_multiple_Path2=pd.merge(merged_df_multiple_Path1, df_graph,left_on='Path2', right_on='Name')
merged_df_multiple_Path2

Unnamed: 0.1,Unnamed: 0,Query,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,Path5,Path6,Path7,Path8,Path9,Path10,Name_x,Sequence_x,Name_y,Sequence_y
0,9,gb|AF043381|+|943-1744|ARO:3001414|OXA-19,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
1,20,gb|AF231133|+|1345-2146|ARO:3001423|OXA-28,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
2,25,gb|AF315786|+|1313-2114|ARO:3001429|OXA-35,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
3,40,gb|AM412777|+|117-918|ARO:3001439|OXA-101,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
4,52,gb|AY445080|+|0-801|ARO:3001795|OXA-56,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
5,75,gb|FJ790516|+|1287-2085|ARO:3001804|OXA-145,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
6,77,gb|FJ848783|+|1270-2071|ARO:3001801|OXA-147,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
7,90,gb|HQ111474|+|1056-1857|ARO:3001475|OXA-183,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
8,133,gb|U59183|+|939-1740|ARO:3001408|OXA-13,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
9,138,gb|X75562|+|134-935|ARO:3001402|OXA-7,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...


In [33]:
#Rename Name_X, Sequence_X, Name_y, Sequence_y
merged_df_multiple_Path2.rename(columns = {'Sequence_x':'Sequence_Path1'}, inplace = True)
merged_df_multiple_Path2.rename(columns = {'Sequence_y':'Sequence_Path2'}, inplace = True)
merged_df_multiple_Path2

Unnamed: 0.1,Unnamed: 0,Query,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,Path5,Path6,Path7,Path8,Path9,Path10,Name_x,Sequence_Path1,Name_y,Sequence_Path2
0,9,gb|AF043381|+|943-1744|ARO:3001414|OXA-19,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
1,20,gb|AF231133|+|1345-2146|ARO:3001423|OXA-28,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
2,25,gb|AF315786|+|1313-2114|ARO:3001429|OXA-35,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
3,40,gb|AM412777|+|117-918|ARO:3001439|OXA-101,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
4,52,gb|AY445080|+|0-801|ARO:3001795|OXA-56,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
5,75,gb|FJ790516|+|1287-2085|ARO:3001804|OXA-145,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
6,77,gb|FJ848783|+|1270-2071|ARO:3001801|OXA-147,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
7,90,gb|HQ111474|+|1056-1857|ARO:3001475|OXA-183,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
8,133,gb|U59183|+|939-1740|ARO:3001408|OXA-13,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
9,138,gb|X75562|+|134-935|ARO:3001402|OXA-7,72,872,"32592258, 758212",Different,32592258,758212,,,,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...


In [34]:
def extract_sequences_multiple_sequences(dataframe):
    for index, row in dataframe.iterrows():
        start_pos = row['Start_GraphAligner_x']
        sequence_1 = row['Sequence_Path1']
       
        #Get the first sequence, and remove 55 kmer from it
        extracted_sequence_first = sequence_1[start_pos-1:]
        
        dataframe.loc[index, 'Sequencefirst'] = extracted_sequence_first
    
    return dataframe

In [39]:
merged_df_multiple_Path2['Start_GraphAligner_x']=merged_df_multiple_Path2['Start_GraphAligner_x'].astype(int)
merged_df_multiple_Path2['End_GraphAligner_x']=merged_df_multiple_Path2['Start_GraphAligner_x'].astype(int)

In [40]:
extracted_sequences_2 = extract_sequences_multiple_sequences(merged_df_multiple_Path2)
extracted_sequences_2.to_csv('TestSeque.tsv',sep='\t')

extracted_sequences_2

Unnamed: 0.1,Unnamed: 0,Query,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,...,Path6,Path7,Path8,Path9,Path10,Name_x,Sequence_Path1,Name_y,Sequence_Path2,Sequencefirst
0,9,gb|AF043381|+|943-1744|ARO:3001414|OXA-19,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
1,20,gb|AF231133|+|1345-2146|ARO:3001423|OXA-28,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
2,25,gb|AF315786|+|1313-2114|ARO:3001429|OXA-35,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
3,40,gb|AM412777|+|117-918|ARO:3001439|OXA-101,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
4,52,gb|AY445080|+|0-801|ARO:3001795|OXA-56,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
5,75,gb|FJ790516|+|1287-2085|ARO:3001804|OXA-145,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
6,77,gb|FJ848783|+|1270-2071|ARO:3001801|OXA-147,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
7,90,gb|HQ111474|+|1056-1857|ARO:3001475|OXA-183,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
8,133,gb|U59183|+|939-1740|ARO:3001408|OXA-13,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
9,138,gb|X75562|+|134-935|ARO:3001402|OXA-7,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...


In [41]:
def extract_sequences_multiple_sequences_path2(dataframe):
    for index, row in dataframe.iterrows():
        end_pos = row['End_GraphAligner_x']
        sequence_2 = row['Sequence_Path2']
       
        #Get the first sequence, and remove 55 kmer from it
        extracted_sequence_second = sequence_2[:end_pos]
        
        dataframe.loc[index, 'SequencefSecond'] = extracted_sequence_second
    
    return dataframe

In [42]:
extracted_sequences_3 = extract_sequences_multiple_sequences_path2(extracted_sequences_2)
extracted_sequences_3.to_csv('TestSeque3.tsv',sep='\t')

extracted_sequences_3

Unnamed: 0.1,Unnamed: 0,Query,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,...,Path7,Path8,Path9,Path10,Name_x,Sequence_Path1,Name_y,Sequence_Path2,Sequencefirst,SequencefSecond
0,9,gb|AF043381|+|943-1744|ARO:3001414|OXA-19,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
1,20,gb|AF231133|+|1345-2146|ARO:3001423|OXA-28,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
2,25,gb|AF315786|+|1313-2114|ARO:3001429|OXA-35,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
3,40,gb|AM412777|+|117-918|ARO:3001439|OXA-101,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
4,52,gb|AY445080|+|0-801|ARO:3001795|OXA-56,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
5,75,gb|FJ790516|+|1287-2085|ARO:3001804|OXA-145,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
6,77,gb|FJ848783|+|1270-2071|ARO:3001801|OXA-147,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
7,90,gb|HQ111474|+|1056-1857|ARO:3001475|OXA-183,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
8,133,gb|U59183|+|939-1740|ARO:3001408|OXA-13,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...
9,138,gb|X75562|+|134-935|ARO:3001402|OXA-7,72,72,"32592258, 758212",Different,32592258,758212,,,...,,,,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...


In [43]:
def extract_sequences_multiple_sequences_overlap(dataframe):
   
    for index, row in dataframe.iterrows():
        sequence_first = row['Sequencefirst']
        length_end=len(sequence_first)-55
       
        #Get the first sequence, and the last 55 characters
        extracted_sequence_first = sequence_first[:len(sequence_first)-55]
        
        dataframe.loc[index, 'length'] = len(sequence_first)
        dataframe.loc[index, 'length_end'] = length_end
        dataframe.loc[index, 'FinalSequence1'] = sequence_first[:length_end]
    
    return dataframe
    

In [44]:
extracted_sequences_multiple = extract_sequences_multiple_sequences_overlap(extracted_sequences_3)

In [45]:
extracted_sequences_multiple

Unnamed: 0.1,Unnamed: 0,Query,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,...,Path10,Name_x,Sequence_Path1,Name_y,Sequence_Path2,Sequencefirst,SequencefSecond,length,length_end,FinalSequence1
0,9,gb|AF043381|+|943-1744|ARO:3001414|OXA-19,72,72,"32592258, 758212",Different,32592258,758212,,,...,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
1,20,gb|AF231133|+|1345-2146|ARO:3001423|OXA-28,72,72,"32592258, 758212",Different,32592258,758212,,,...,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
2,25,gb|AF315786|+|1313-2114|ARO:3001429|OXA-35,72,72,"32592258, 758212",Different,32592258,758212,,,...,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
3,40,gb|AM412777|+|117-918|ARO:3001439|OXA-101,72,72,"32592258, 758212",Different,32592258,758212,,,...,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
4,52,gb|AY445080|+|0-801|ARO:3001795|OXA-56,72,72,"32592258, 758212",Different,32592258,758212,,,...,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
5,75,gb|FJ790516|+|1287-2085|ARO:3001804|OXA-145,72,72,"32592258, 758212",Different,32592258,758212,,,...,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
6,77,gb|FJ848783|+|1270-2071|ARO:3001801|OXA-147,72,72,"32592258, 758212",Different,32592258,758212,,,...,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
7,90,gb|HQ111474|+|1056-1857|ARO:3001475|OXA-183,72,72,"32592258, 758212",Different,32592258,758212,,,...,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
8,133,gb|U59183|+|939-1740|ARO:3001408|OXA-13,72,72,"32592258, 758212",Different,32592258,758212,,,...,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
9,138,gb|X75562|+|134-935|ARO:3001402|OXA-7,72,72,"32592258, 758212",Different,32592258,758212,,,...,,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...


In [46]:
def extract_sequences_multiple_sequences_final(dataframe):
   
    for index, row in dataframe.iterrows():
        sequence_first = row['FinalSequence1']
        sequence_second = row['SequencefSecond']
       
        #Get the first sequence, and the last 55 characters
        extracted_sequence_final = sequence_first+sequence_second
        
      
        dataframe.loc[index, 'FinalSequence'] = sequence_first + sequence_second
    
    return dataframe

In [47]:
extracted_sequences_multiple_final =extract_sequences_multiple_sequences_final(extracted_sequences_multiple)

In [48]:
extracted_sequences_multiple_final

Unnamed: 0.1,Unnamed: 0,Query,Start_GraphAligner_x,End_GraphAligner_x,Path_GraphAligner_x,FinalResultBandageVSGraphAligner,Path1,Path2,Path3,Path4,...,Name_x,Sequence_Path1,Name_y,Sequence_Path2,Sequencefirst,SequencefSecond,length,length_end,FinalSequence1,FinalSequence
0,9,gb|AF043381|+|943-1744|ARO:3001414|OXA-19,72,72,"32592258, 758212",Different,32592258,758212,,,...,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
1,20,gb|AF231133|+|1345-2146|ARO:3001423|OXA-28,72,72,"32592258, 758212",Different,32592258,758212,,,...,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
2,25,gb|AF315786|+|1313-2114|ARO:3001429|OXA-35,72,72,"32592258, 758212",Different,32592258,758212,,,...,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
3,40,gb|AM412777|+|117-918|ARO:3001439|OXA-101,72,72,"32592258, 758212",Different,32592258,758212,,,...,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
4,52,gb|AY445080|+|0-801|ARO:3001795|OXA-56,72,72,"32592258, 758212",Different,32592258,758212,,,...,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
5,75,gb|FJ790516|+|1287-2085|ARO:3001804|OXA-145,72,72,"32592258, 758212",Different,32592258,758212,,,...,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
6,77,gb|FJ848783|+|1270-2071|ARO:3001801|OXA-147,72,72,"32592258, 758212",Different,32592258,758212,,,...,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
7,90,gb|HQ111474|+|1056-1857|ARO:3001475|OXA-183,72,72,"32592258, 758212",Different,32592258,758212,,,...,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
8,133,gb|U59183|+|939-1740|ARO:3001408|OXA-13,72,72,"32592258, 758212",Different,32592258,758212,,,...,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...
9,138,gb|X75562|+|134-935|ARO:3001402|OXA-7,72,72,"32592258, 758212",Different,32592258,758212,,,...,32592258,CACGAGCTAAGTCATTGGTAGCGCAGGATTTACTGCTACTTTTACA...,758212,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TTAGCCACCAATGATGCCCTCACTTTCCATGATTTTGGTGGGAATG...,175.0,120.0,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...,TCGGCAGAGAACTCTTTGTTCCAAAACGTATTTTCTGTAATTGAAC...


In [49]:
#Drop some columns
extracted_sequences_multiple_final=extracted_sequences_multiple_final.drop(['Path_Bandage', 'Path_GraphAligner','Equal','Name_x','Name_y'], axis=1)

KeyError: "['Path_Bandage', 'Path_GraphAligner', 'Equal'] not found in axis"

In [50]:
extracted_sequences_multiple_final.rename(columns = {'Sequence_Path1':'Sequence_Path1'}, inplace = True)

In [51]:
extracted_sequences_multiple_final.rename(columns = {'Sequence_Path2':'Sequence_FromGraph_Path2'}, inplace = True)

In [52]:
extracted_sequences_multiple_final.rename(columns = {'Sequencefirst':'Sequence_FromGraph_Path1_No_Overlap'}, inplace = True)

In [53]:
extracted_sequences_multiple_final.rename(columns = {'length':'Length_Sequence_Path1_No_Overlap'}, inplace = True)


In [54]:
extracted_sequences_multiple_final.rename(columns = {'length_end':'Length_Sequence_Path1_With_Overlap'}, inplace = True)

In [55]:
extracted_sequences_multiple_final.rename(columns = {'FinalSequence1':'Sequence_Path1_With_Overlap'}, inplace = True)

In [56]:
extracted_sequences_multiple_final.rename(columns = {'FinalSequence':'Concatenated_Sequence'}, inplace = True)

In [57]:
extracted_sequences_multiple_final.to_csv("CAMIH1_Results/BandageVsGraphAlignerFinalConcatenatedSequences.tsv",sep='\t')

## This section measures the edit distance between the sequences 

In [None]:
import Levenshtein

In [None]:
bandage_df= pd.read_csv('Real_Results/TestDistance.tsv', sep='\t')
bandage_df

In [None]:
#Get the sequences from bandage where the Path2 in GraphAligner=none
distance_df=pd.merge(extracted_sequences_Path1_df,bandage_df, on='Query')

In [None]:
distance_df

In [None]:
distance_df.rename(columns = {'Result_node1':'Sequence_GraphAligner'}, inplace = True)
distance_df.rename(columns = {'Sequence_y':'Sequence_Bandage'}, inplace = True)

In [None]:
distance_df=distance_df.drop('Unnamed: 0_x', axis=1)

In [None]:
distance_df=distance_df.drop('Unnamed: 0_y', axis=1)

In [None]:
distance_df

In [None]:
#Create a method to calculate the levenstein distance
def calculate_edit_distance_Bandage_GraphAligner(row):
    return Levenshtein.distance(row['Sequence_Bandage'], row['Sequence_GraphAligner'])

In [None]:
distance_df['edit_distance_query_Bandage_GraphAligner'] = distance_df.apply(calculate_edit_distance_Bandage_GraphAligner, axis=1)

In [None]:
distance_df.dtypes

In [None]:
type_counts_bandage_GraphAligner=distance_df['edit_distance_query_Bandage_GraphAligner'].value_counts().reset_index()

In [None]:
# Rename the columns
type_counts_bandage_GraphAligner.columns = ['Type', 'Count']

# Display the result
print(type_counts_bandage_GraphAligner)

In [None]:
type_counts_bandage_GraphAligner.to_csv("Real_Results/type_counts_bandage_GraphAligner.tsv",sep='\t')

In [None]:
#Get the sequences from bandage where the Path2 in GraphAligner is not none
distance_df_multiple=pd.merge(extracted_sequences_multiple_final,bandage_df, on='Query')
distance_df_multiple

In [None]:
distance_df_multiple.rename(columns = {'Concatenated_Sequence':'Sequence_GraphAligner'}, inplace = True)
distance_df_multiple.rename(columns = {'Sequence':'Sequence_Bandage'}, inplace = True)

In [None]:
distance_df_multiple['edit_distance_query_Bandage_GraphAligner'] = distance_df_multiple.apply(calculate_edit_distance_Bandage_GraphAligner, axis=1)

In [None]:
distance_df_multiple

In [None]:
type_counts_bandage_GraphAligner_multiple=distance_df_multiple['edit_distance_query_Bandage_GraphAligner'].value_counts().reset_index()

In [None]:
# Rename the columns
type_counts_bandage_GraphAligner_multiple.columns = ['Type', 'Count']

# Display the result
print(type_counts_bandage_GraphAligner_multiple)

In [None]:
type_counts_bandage_GraphAligner_multiple.to_csv("Real_Results/type_counts_bandage_GraphAligner_multiple.tsv",sep='\t')

In [None]:
#Read the query and sequences from the .fasta file
fasta_file = "111_Graph/combined_1.fasta"  

# Create empty lists to store the data
queries = []
sequences = []

# Read through the FASTA file and extract query and sequence information
for record in SeqIO.parse(fasta_file, "fasta"):
    queries.append(record.id)
    sequences.append(str(record.seq))

# Create DataFrame from the lists
fasta_df = pd.DataFrame({'Query': queries, 'Sequence': sequences})

# Display the DataFrame
print(fasta_df)

In [None]:
distance_df_Sequence=pd.merge(extracted_sequences_Path1_df,fasta_df, on='Query')

In [None]:
distance_df_Sequence

In [None]:
distance_df_Sequence.rename(columns = {'Result_node1':'Sequence_GraphAligner'}, inplace = True)
distance_df_Sequence.rename(columns = {'Sequence_y':'Sequence'}, inplace = True)

In [None]:
distance_df_Sequence

In [None]:
#Create a method to calculate the levenstein distance
def calculate_edit_distance_sequence_GraphAligner(row):
    return Levenshtein.distance(row['Sequence'], row['Sequence_GraphAligner'])

In [None]:
distance_df_Sequence['edit_distance_query_sequence_GraphAligner'] = distance_df_Sequence.apply(calculate_edit_distance_sequence_GraphAligner, axis=1)

In [None]:
distance_df_Sequence