## This script is used to compare the results from all 3 software.
## Written by Yusreen Shah
## Date: May 10th 2023

In [1]:
# import the libraries
import numpy as np
import pandas as pd
import re 
from Bio import SeqIO
from collections import defaultdict


## This section saves the list of all the queries in a dataframe

In [2]:
# Save the data
data = defaultdict(list)

# Read the values from the .fasta file, and save them to data
for seq_record in SeqIO.parse("CAMIM2_graph/combined_2.fasta", "fasta"):
    query=seq_record.id
    sequence= repr(seq_record.seq)
    length=len(seq_record)
    data['Query'].append(query)
    data['Sequence'].append(sequence)
    data['Length'].append(length)
    
# Add the data to a dataframe
df = pd.DataFrame.from_dict(data)


In [3]:
# Remove Seq(' and ') from the sequences
df['Sequence'] = df['Sequence'].str.replace('Seq(''', '')
df['Sequence'] = df['Sequence'].str.replace(')', '')
df['Sequence'] = df['Sequence'].str.strip(" \' ")
df['Sequence']=df['Sequence'].str.rstrip()

In [4]:
df=df.sort_values("Query")
df=df.reset_index(drop=True)
df

Unnamed: 0,Query,Sequence,Length
0,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,1143
1,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,438
2,gb|AE016877.1|+|1972251-1972668|ARO:3000172|FosB,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,417
3,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,837
4,gb|AF118110.1|-|71-1037|ARO:3003002|CfxA2,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,966
5,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,438
6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,804
7,gb|AF472622|+|52-1018|ARO:3003003|CfxA3,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,966
8,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...,792
9,gb|AJ222769.3|+|3686-5606|ARO:3000194|tetW,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,1920


## This section is used to compare the results from Bandage and the actual list of queries.

In [5]:
# Create a dataframe for Bandage Combined1
Bandage_Combined1= pd.read_csv('outputCAMIM2Bandage.tsv', sep='\t')

In [6]:
Bandage_Combined1.head()

Unnamed: 0,Query,Path,Length,Query covered by path,Query covered by hits,Mean hit identity,Total hit mismatches,Total hit gap opens,Relative length,Length discrepancy,E-value product,Sequence
0,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,(3687) 118828- (4123),437,100%,100%,96.58%,14,1,99.7717%,-1,0.0,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
1,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,(3687) 118828- (4123),437,100%,100%,96.12%,16,1,99.7717%,-1,0.0,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
2,gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-IIa,(378) 28874510+,935,96.6049%,96.6049%,99.47%,1,2,99.574%,-4,0.0,GTGGTAACGGCGCAGTGGCGGTTTTCATGGCTTGTTATGACTGTTT...
3,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,(1326) 2007548+ (2129),804,100%,100%,99.88%,1,0,100%,0,0.0,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...
4,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,(2129) 2007548+ (2965),837,100%,100%,99.76%,2,0,100%,0,0.0,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...


In [7]:
Bandage_Combined_Query_Path =Bandage_Combined1[['Path','Query','Sequence']]

## Extract the start and end position from the path in Bandage

In [8]:
# Extract the start position
pattern_path_start = r'\((.*?)\)' 

In [9]:
#Extract the node position
pattern_path_node = r'\b(\d+)\s*[+-]'

In [10]:
Bandage_Combined_Query_Path['Extracted_Path'] = Bandage_Combined_Query_Path['Path'].apply(lambda x: re.findall(pattern_path_node, x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bandage_Combined_Query_Path['Extracted_Path'] = Bandage_Combined_Query_Path['Path'].apply(lambda x: re.findall(pattern_path_node, x))


In [11]:
# Remove the braces and convert the values to integers
Bandage_Combined_Query_Path['Extracted_Path'] = Bandage_Combined_Query_Path['Extracted_Path'].apply(lambda x: [int(value) for value in x])

Bandage_Combined_Query_Path['Extracted_Path'] = Bandage_Combined_Query_Path['Extracted_Path'].astype(str)
Bandage_Combined_Query_Path['Extracted_Path'] = Bandage_Combined_Query_Path['Extracted_Path'].str.replace('[', '').str.replace(']', '')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bandage_Combined_Query_Path['Extracted_Path'] = Bandage_Combined_Query_Path['Extracted_Path'].apply(lambda x: [int(value) for value in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bandage_Combined_Query_Path['Extracted_Path'] = Bandage_Combined_Query_Path['Extracted_Path'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexin

In [12]:

Bandage_Combined_Query_Path.to_csv("test4.tsv",sep="\t")

In [13]:
Bandage_Combined_Query_Path['Start']=Bandage_Combined_Query_Path['Path'].str.extract(pattern_path_start, expand=False)
Bandage_Combined_Query_Path

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bandage_Combined_Query_Path['Start']=Bandage_Combined_Query_Path['Path'].str.extract(pattern_path_start, expand=False)


Unnamed: 0,Path,Query,Sequence,Extracted_Path,Start
0,(3687) 118828- (4123),gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,118828,3687
1,(3687) 118828- (4123),gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,118828,3687
2,(378) 28874510+,gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-IIa,GTGGTAACGGCGCAGTGGCGGTTTTCATGGCTTGTTATGACTGTTT...,28874510,378
3,(1326) 2007548+ (2129),gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,2007548,1326
4,(2129) 2007548+ (2965),gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,2007548,2129
...,...,...,...,...,...
57,(565) 136860+ (2484),gb|AJ222769.3|+|3686-5606|ARO:3000194|tetW,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,136860,565
58,(565) 136860+ (2477),gb|KU736867.1|+|19653-21573|ARO:3004442|tet(W/...,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,136860,565
59,(46387) 2066834- (46980),gb|AL009126.3|-|339155-339749|ARO:3003059|tmrB,ATGATCATTTGGATAAACGGGGCATTCGGTTCCGGAAAAACACAAA...,2066834,46387
60,"(523) 827680+, 120721+ (732)",gb|U00096|-|2098446-2099613|ARO:3003577|ugd,ATGAAAATCACCATTTCCGGTACTGGCTATGTCGGCTTGTCAAACG...,"827680, 120721",523


In [14]:
# Extract the end position
Bandage_Combined_Query_Path['End']= Bandage_Combined_Query_Path["Path"].str.split().str[-1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bandage_Combined_Query_Path['End']= Bandage_Combined_Query_Path["Path"].str.split().str[-1]


In [15]:
Bandage_Combined_Query_Path['End']=Bandage_Combined_Query_Path['End'].str.strip('()')
Bandage_Combined_Query_Path['End']=Bandage_Combined_Query_Path['End'].str.strip('+')
Bandage_Combined_Query_Path['End']=Bandage_Combined_Query_Path['End'].str.strip('-')
Bandage_Combined_Query_Path['End']=Bandage_Combined_Query_Path['End'].astype(int)
Bandage_Combined_Query_Path

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bandage_Combined_Query_Path['End']=Bandage_Combined_Query_Path['End'].str.strip('()')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bandage_Combined_Query_Path['End']=Bandage_Combined_Query_Path['End'].str.strip('+')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bandage_Combined_Query_Path['End']

Unnamed: 0,Path,Query,Sequence,Extracted_Path,Start,End
0,(3687) 118828- (4123),gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,118828,3687,4123
1,(3687) 118828- (4123),gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,118828,3687,4123
2,(378) 28874510+,gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-IIa,GTGGTAACGGCGCAGTGGCGGTTTTCATGGCTTGTTATGACTGTTT...,28874510,378,28874510
3,(1326) 2007548+ (2129),gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,2007548,1326,2129
4,(2129) 2007548+ (2965),gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,2007548,2129,2965
...,...,...,...,...,...,...
57,(565) 136860+ (2484),gb|AJ222769.3|+|3686-5606|ARO:3000194|tetW,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,136860,565,2484
58,(565) 136860+ (2477),gb|KU736867.1|+|19653-21573|ARO:3004442|tet(W/...,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,136860,565,2477
59,(46387) 2066834- (46980),gb|AL009126.3|-|339155-339749|ARO:3003059|tmrB,ATGATCATTTGGATAAACGGGGCATTCGGTTCCGGAAAAACACAAA...,2066834,46387,46980
60,"(523) 827680+, 120721+ (732)",gb|U00096|-|2098446-2099613|ARO:3003577|ugd,ATGAAAATCACCATTTCCGGTACTGGCTATGTCGGCTTGTCAAACG...,"827680, 120721",523,732


In [16]:
# Reorder the columns in Bandage_Combined_Query_Path
Bandage_Combined_Query_Path=Bandage_Combined_Query_Path[['Query','Path','Start','Extracted_Path','End','Sequence']]

# Convert integer columns to strings
Bandage_Combined_Query_Path['Path'] = Bandage_Combined_Query_Path['Path'].astype(str)
Bandage_Combined_Query_Path['End'] = Bandage_Combined_Query_Path['End'].astype(str)
Bandage_Combined_Query_Path['Start'] = Bandage_Combined_Query_Path['Start'].astype(str)
Bandage_Combined_Query_Path['Extracted_Path'] = Bandage_Combined_Query_Path['Extracted_Path'].astype(str)
Bandage_Combined_Query_Path['Sequence'] = Bandage_Combined_Query_Path['Sequence'].astype(str)

# Bandage_Combined1['Query'].value_counts() 
# Merge the rows that have the same query
# group the dataframe by the 'Name' column and aggregate the data for each group
merge_Bandage_df = Bandage_Combined_Query_Path.groupby('Query').agg({'Path': ' , '.join, 'Start': ', '.join, 'End': ', '.join, 'Extracted_Path': ','.join, 'Sequence': '+ '.join}).reset_index()
merge_Bandage_df


Unnamed: 0,Query,Path,Start,End,Extracted_Path,Sequence
0,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,(24879) 2079546- (26021),24879,26021,2079546,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...
1,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,(3687) 118828- (4123),3687,4123,118828,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
2,gb|AE016877.1|+|1972251-1972668|ARO:3000172|FosB,"(311) 15445+ (727) , (310) 2021892+ (722)","311, 310","727, 722",154452021892,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...
3,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,(2129) 2007548+ (2965),2129,2965,2007548,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...
4,gb|AF118110.1|-|71-1037|ARO:3003002|CfxA2,(112) 821183+ (1077),112,1077,821183,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...
5,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,(3687) 118828- (4123),3687,4123,118828,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,(1326) 2007548+ (2129),1326,2129,2007548,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...
7,gb|AF472622|+|52-1018|ARO:3003003|CfxA3,(112) 821183+ (1077),112,1077,821183,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...
8,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,(554) 28874510+,554,28874510,28874510,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...
9,gb|AJ222769.3|+|3686-5606|ARO:3000194|tetW,(565) 136860+ (2484),565,2484,136860,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...


In [17]:
# Add Braces to the values that now have ','
def add_braces(value):
    if ',' in value:
        return '{' + value + '}'
    else:
        return value
#merge_Bandage_df['Path'] = merge_Bandage_df['Path'].apply(add_braces)
#merge_Bandage_df['Start'] = merge_Bandage_df['Start'].apply(add_braces)
#merge_Bandage_df['End'] = merge_Bandage_df['End'].apply(add_braces)
#merge_Bandage_df['Sequence'] = merge_Bandage_df['Sequence'].apply(add_braces)

In [18]:
# Store the dataframe containing the Bandage results to a .tsv file
merge_Bandage_df.to_csv("Test1.tsv", sep="\t")

In [19]:
#Use merge operation so that we have the sequences for both softwares for successful queries
df_QueryAndResultsBandage=pd.merge(df,merge_Bandage_df, on='Query',how="outer")

## Compile the Queries and the results from Bandage in one dataframe

In [20]:
df_QueryAndResultsBandage

Unnamed: 0,Query,Sequence_x,Length,Path,Start,End,Extracted_Path,Sequence_y
0,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,1143,(24879) 2079546- (26021),24879,26021,2079546,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...
1,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,438,(3687) 118828- (4123),3687,4123,118828,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
2,gb|AE016877.1|+|1972251-1972668|ARO:3000172|FosB,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,417,"(311) 15445+ (727) , (310) 2021892+ (722)","311, 310","727, 722",154452021892,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...
3,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,837,(2129) 2007548+ (2965),2129,2965,2007548,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...
4,gb|AF118110.1|-|71-1037|ARO:3003002|CfxA2,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,966,(112) 821183+ (1077),112,1077,821183,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...
5,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,438,(3687) 118828- (4123),3687,4123,118828,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,804,(1326) 2007548+ (2129),1326,2129,2007548,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...
7,gb|AF472622|+|52-1018|ARO:3003003|CfxA3,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,966,(112) 821183+ (1077),112,1077,821183,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...
8,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...,792,(554) 28874510+,554,28874510,28874510,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...
9,gb|AJ222769.3|+|3686-5606|ARO:3000194|tetW,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,1920,(565) 136860+ (2484),565,2484,136860,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...


In [21]:
#Print the columns from the dataframe
df_QueryAndResultsBandage.columns

Index(['Query', 'Sequence_x', 'Length', 'Path', 'Start', 'End',
       'Extracted_Path', 'Sequence_y'],
      dtype='object')

In [22]:
#Rename the columns from the merging of Bandage and Query list
df_QueryAndResultsBandage.rename(columns = {'Sequence_x':'Sequence'}, inplace = True)
df_QueryAndResultsBandage.rename(columns = {'Sequence_y':'Sequence_Bandage'}, inplace = True)
df_QueryAndResultsBandage.rename(columns = {'Length':'Length_Bandage'}, inplace = True)
df_QueryAndResultsBandage.rename(columns = {'Path':'Path_Bandage'}, inplace = True)
df_QueryAndResultsBandage.rename(columns = {'Start':'Start_Bandage'}, inplace = True)
df_QueryAndResultsBandage.rename(columns = {'End':'End_Bandage'}, inplace = True)

## Save the Queries and Results from Bandage in one .tsv file

In [23]:
df_QueryAndResultsBandage.to_csv("CompiledQueriesBandage.tsv",sep="\t")

## This section is used to compare the results from SPAligner and the actual list of queries.

In [24]:
#Create a dataframe for SPAligner Combined1
SPAligner_Combined1=pd.read_csv('outputCAMIM2SPAligner.tsv', sep='\t',names=["Query","Start position of alignment on sequence","End position of alignment on sequence","Start position of alignment on the first edge of the path",
"End position of alignment on the last edge of the path","Path of alignment","Length of the alignment on each edge of the path","Sequence Length",
"Sequence"])
SPAligner_Combined1

Unnamed: 0,Query,Start position of alignment on sequence,End position of alignment on sequence,Start position of alignment on the first edge of the path,End position of alignment on the last edge of the path,Path of alignment,Length of the alignment on each edge of the path,Sequence Length,Sequence
0,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,0,383,3686,4068,438,118828-,382,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
1,gb|L12710|+|0-549|ARO:3002556|AAC(6')-Ii [Ente...,220,475,0,255,549,28611963-,255,ATCCATTAGTTGTAGAAAGCTCCCGACGAAAGAACCAAATAGGTAC...
2,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy...,0,383,3686,4068,438,118828-,382,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
3,gb|KU736867.1|+|19653-21573|ARO:3004442|tet(W/...,0,1858,564,2422,1920,136860+,1858,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...
4,gb|AL009126.3|-|339155-339749|ARO:3003059|tmrB...,0,539,46386,46925,594,2066834-,539,ATGATCATTTGGATAAACGGGGCATTCGGTTCCGGAAAAACACAAA...
5,gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-II...,0,884,377,1257,972,28874510+,880,GTGGTAACGGCGCAGTGGCGGTTTTCATGGCTTGTTATGACTGTTT...
6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')...,0,749,1325,2074,804,2007548+,749,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...
7,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id ...,0,782,2128,2910,837,2007548+,782,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...
8,gb|U00096|-|2098446-2099613|ARO:3003577|ugd [E...,0,1112,522,677,1167,"827680+,120721+",435677,ATGAAAATCACCATTTCCGGTACTGGCTATGTCGGCTTGTCAAACG...
9,gb|AB765395|+|0-1143|ARO:3002993|AQU-1 [Aeromo...,0,1088,24878,25966,1143,2079546-,1088,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...


In [25]:
#get the columns from SPAligner
SPAligner_Combined1.columns

Index(['Query', 'Start position of alignment on sequence',
       'End position of alignment on sequence',
       'Start position of alignment on the first edge of the path',
       'End position of alignment on the last edge of the path',
       'Path of alignment', 'Length of the alignment on each edge of the path',
       'Sequence Length', 'Sequence'],
      dtype='object')

In [26]:
#format the Query column from  SPAligner_Combined1 so that the Query column is the same as the one from Bandage
pattern = r'\[.*?\]'
def format_query(x):
    return re.sub(pattern,"", x).rstrip()
    


SPAligner_Combined1['Query'] = SPAligner_Combined1['Query'].map(format_query)


In [27]:
#Get the query and sequence column from SPAligner_Combined1
SPAligner_Combined_Query_Path=SPAligner_Combined1[['Query','Sequence']]


In [28]:
#Get the query and sequence column from SPAligner_Combined1
SPAligner_Combined_Query_Path=SPAligner_Combined1[['Query','Start position of alignment on the first edge of the path', 'End position of alignment on the last edge of the path', 'Sequence Length',
'Path of alignment','Sequence']]
SPAligner_Combined_Query_Path

Unnamed: 0,Query,Start position of alignment on the first edge of the path,End position of alignment on the last edge of the path,Sequence Length,Path of alignment,Sequence
0,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,3686,4068,382,438,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
1,gb|L12710|+|0-549|ARO:3002556|AAC(6')-Ii,0,255,255,549,ATCCATTAGTTGTAGAAAGCTCCCGACGAAAGAACCAAATAGGTAC...
2,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,3686,4068,382,438,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
3,gb|KU736867.1|+|19653-21573|ARO:3004442|tet(W/...,564,2422,1858,1920,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...
4,gb|AL009126.3|-|339155-339749|ARO:3003059|tmrB,46386,46925,539,594,ATGATCATTTGGATAAACGGGGCATTCGGTTCCGGAAAAACACAAA...
5,gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-IIa,377,1257,880,972,GTGGTAACGGCGCAGTGGCGGTTTTCATGGCTTGTTATGACTGTTT...
6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,1325,2074,749,804,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...
7,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,2128,2910,782,837,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...
8,gb|U00096|-|2098446-2099613|ARO:3003577|ugd,522,677,435677,1167,ATGAAAATCACCATTTCCGGTACTGGCTATGTCGGCTTGTCAAACG...
9,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,24878,25966,1088,1143,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...


In [29]:
#Rename the columns in SPAligner
SPAligner_Combined_Query_Path.rename(columns = {'Start position of alignment on the first edge of the path':'Start_SPAligner'}, inplace = True)
SPAligner_Combined_Query_Path.rename(columns = {'End position of alignment on the last edge of the path':'End_SPAligner'}, inplace = True)
SPAligner_Combined_Query_Path.rename(columns = {'Sequence Length':"Length_SPAligner"}, inplace = True)
SPAligner_Combined_Query_Path.rename(columns = {'Path of alignment':"Path_SPAligner"}, inplace = True)
SPAligner_Combined_Query_Path.rename(columns = {'Sequence':"Sequence_SPAligner"}, inplace = True)

SPAligner_Combined_Query_Path

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SPAligner_Combined_Query_Path.rename(columns = {'Start position of alignment on the first edge of the path':'Start_SPAligner'}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SPAligner_Combined_Query_Path.rename(columns = {'End position of alignment on the last edge of the path':'End_SPAligner'}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SPAligner_Combined_Query_Path.rename(columns = {'Sequence Length':"Length_SPAligner"}, inplac

Unnamed: 0,Query,Start_SPAligner,End_SPAligner,Length_SPAligner,Path_SPAligner,Sequence_SPAligner
0,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,3686,4068,382,438,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
1,gb|L12710|+|0-549|ARO:3002556|AAC(6')-Ii,0,255,255,549,ATCCATTAGTTGTAGAAAGCTCCCGACGAAAGAACCAAATAGGTAC...
2,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,3686,4068,382,438,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
3,gb|KU736867.1|+|19653-21573|ARO:3004442|tet(W/...,564,2422,1858,1920,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...
4,gb|AL009126.3|-|339155-339749|ARO:3003059|tmrB,46386,46925,539,594,ATGATCATTTGGATAAACGGGGCATTCGGTTCCGGAAAAACACAAA...
5,gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-IIa,377,1257,880,972,GTGGTAACGGCGCAGTGGCGGTTTTCATGGCTTGTTATGACTGTTT...
6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,1325,2074,749,804,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...
7,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,2128,2910,782,837,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...
8,gb|U00096|-|2098446-2099613|ARO:3003577|ugd,522,677,435677,1167,ATGAAAATCACCATTTCCGGTACTGGCTATGTCGGCTTGTCAAACG...
9,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,24878,25966,1088,1143,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...


In [30]:
SPAligner_Combined_Query_Path

Unnamed: 0,Query,Start_SPAligner,End_SPAligner,Length_SPAligner,Path_SPAligner,Sequence_SPAligner
0,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,3686,4068,382,438,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
1,gb|L12710|+|0-549|ARO:3002556|AAC(6')-Ii,0,255,255,549,ATCCATTAGTTGTAGAAAGCTCCCGACGAAAGAACCAAATAGGTAC...
2,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,3686,4068,382,438,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
3,gb|KU736867.1|+|19653-21573|ARO:3004442|tet(W/...,564,2422,1858,1920,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...
4,gb|AL009126.3|-|339155-339749|ARO:3003059|tmrB,46386,46925,539,594,ATGATCATTTGGATAAACGGGGCATTCGGTTCCGGAAAAACACAAA...
5,gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-IIa,377,1257,880,972,GTGGTAACGGCGCAGTGGCGGTTTTCATGGCTTGTTATGACTGTTT...
6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,1325,2074,749,804,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...
7,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,2128,2910,782,837,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...
8,gb|U00096|-|2098446-2099613|ARO:3003577|ugd,522,677,435677,1167,ATGAAAATCACCATTTCCGGTACTGGCTATGTCGGCTTGTCAAACG...
9,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,24878,25966,1088,1143,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...


## Compile the Queries and the results from SPAligner in one dataframe

In [31]:
#Use merge operation so that we have the sequences for both softwares for successful queries
df_QueryAndResultsSPAligner=pd.merge(df,SPAligner_Combined_Query_Path, on='Query',how='right')

df_QueryAndResultsSPAligner

Unnamed: 0,Query,Sequence,Length,Start_SPAligner,End_SPAligner,Length_SPAligner,Path_SPAligner,Sequence_SPAligner
0,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,438,3686,4068,382,438,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
1,gb|L12710|+|0-549|ARO:3002556|AAC(6')-Ii,ATGATAATCAGTGAATTTGACCGTAATAATCCAGTATTGAAAGATC...,549,0,255,255,549,ATCCATTAGTTGTAGAAAGCTCCCGACGAAAGAACCAAATAGGTAC...
2,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,438,3686,4068,382,438,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
3,gb|KU736867.1|+|19653-21573|ARO:3004442|tet(W/...,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,1920,564,2422,1858,1920,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...
4,gb|AL009126.3|-|339155-339749|ARO:3003059|tmrB,ATGATCATTTGGATAAACGGGGCATTCGGTTCGGGAAAAACACAAA...,594,46386,46925,539,594,ATGATCATTTGGATAAACGGGGCATTCGGTTCCGGAAAAACACAAA...
5,gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-IIa,GTGGTAACGGCGCAGTGGCGGTTTTCATGGCTTCTTGTTATGACAT...,972,377,1257,880,972,GTGGTAACGGCGCAGTGGCGGTTTTCATGGCTTGTTATGACTGTTT...
6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,804,1325,2074,749,804,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...
7,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,837,2128,2910,782,837,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...
8,gb|U00096|-|2098446-2099613|ARO:3003577|ugd,ATGAAAATCACCATTTCCGGTACTGGCTATGTAGGCTTGTCAAACG...,1167,522,677,435677,1167,ATGAAAATCACCATTTCCGGTACTGGCTATGTCGGCTTGTCAAACG...
9,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,1143,24878,25966,1088,1143,ATGAAGCAAACCTCACCCTTGTCGTTGCTGGCGCTGAGCGCCCTGC...


In [32]:
#Rename the columns from SPAligner and Results from the Query list
df_QueryAndResultsSPAligner.rename(columns = {'Sequence_x':'Sequence'}, inplace = True)
df_QueryAndResultsSPAligner.rename(columns = {'Sequence_y':'Sequence_SPAligner'}, inplace = True)

In [33]:
df_QueryAndResultsSPAligner['Start_SPAligner']=df_QueryAndResultsSPAligner['Start_SPAligner']+1

df_QueryAndResultsSPAligner

TypeError: can only concatenate str (not "int") to str

## This section is used to compare the results from GraphAligner and the actual list of queries.

In [36]:
#Create a dataframe for GraphAligner Combined1
GraphAligner_Combined1=pd.read_csv('outputCAMIM2GraphAligner.tsv', sep='\t', names=["Query", "Query Length", "Query Start", 
                                        "Query End","Strand Relative Length","Path Matching","Path Length",
                                         "Start Position on Path","End Position on Path","Number of residues Matches",
                                         "Alignment Back Length","Mapping Quality","Column 1"])
GraphAligner_Combined1

Unnamed: 0,Query,Query Length,Query Start,Query End,Strand Relative Length,Path Matching,Path Length,Start Position on Path,End Position on Path,Number of residues Matches,Alignment Back Length,Mapping Quality,Column 1
0,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,438,0,438,+,<118828,5365,3686,4123,423,438,60,NM:i:15
1,gb|L12710|+|0-549|ARO:3002556|AAC(6')-Ii [Ente...,549,220,530,+,<28611963,310,0,310,307,310,60,NM:i:3
2,gb|L12710|+|0-549|ARO:3002556|AAC(6')-Ii [Ente...,549,0,140,+,<28832378,273,133,273,140,140,60,NM:i:0
3,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy...,438,0,438,+,<118828,5365,3686,4123,421,438,60,NM:i:17
4,gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-II...,972,0,939,+,>28874510,1312,377,1312,934,939,60,NM:i:5
5,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')...,804,0,804,+,>2007548,5588,1325,2129,803,804,60,NM:i:1
6,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id ...,837,0,837,+,>2007548,5588,2128,2965,835,837,60,NM:i:2
7,gb|AB765395|+|0-1143|ARO:3002993|AQU-1 [Aeromo...,1143,0,1143,+,<2079546,80395,24878,26021,1112,1143,60,NM:i:31
8,gb|KF730243.1|+|0-1143|ARO:3004647|AQU-2 [Aero...,1143,0,1143,+,<2079546,80395,24878,26021,1114,1143,60,NM:i:29
9,gb|KF730244.1|+|0-1149|ARO:3004648|AQU-3 [Aero...,1149,0,1149,+,<2079546,80395,24878,26021,1106,1150,60,NM:i:44


In [37]:
#Format the Query column from  GraphAligner_Combined1 so that the Query column is the same as the one from Bandage
pattern = r'\[.*?\]'
def format_query(x):
    return re.sub(pattern,"", x).rstrip()
    


GraphAligner_Combined1['Query'] = GraphAligner_Combined1['Query'].map(format_query)

In [38]:
# Remove < and > from the Path
GraphAligner_Combined1['Path Matching'] = GraphAligner_Combined1['Path Matching'].str.replace('>', "")
GraphAligner_Combined1['Path Matching'] = GraphAligner_Combined1['Path Matching'].str.replace('<', "")

## This section drops the rows with coverages < 50 from GraphAligner_Combined1

In [39]:
# Calculate the coverage for GraphAligner
GraphAligner_Combined1["Coverage"]=GraphAligner_Combined1["Alignment Back Length"]/GraphAligner_Combined1["Query Length"]*100

In [40]:
GraphAligner_Combined1

Unnamed: 0,Query,Query Length,Query Start,Query End,Strand Relative Length,Path Matching,Path Length,Start Position on Path,End Position on Path,Number of residues Matches,Alignment Back Length,Mapping Quality,Column 1,Coverage
0,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,438,0,438,+,118828,5365,3686,4123,423,438,60,NM:i:15,100.0
1,gb|L12710|+|0-549|ARO:3002556|AAC(6')-Ii,549,220,530,+,28611963,310,0,310,307,310,60,NM:i:3,56.466302
2,gb|L12710|+|0-549|ARO:3002556|AAC(6')-Ii,549,0,140,+,28832378,273,133,273,140,140,60,NM:i:0,25.500911
3,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,438,0,438,+,118828,5365,3686,4123,421,438,60,NM:i:17,100.0
4,gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-IIa,972,0,939,+,28874510,1312,377,1312,934,939,60,NM:i:5,96.604938
5,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,804,0,804,+,2007548,5588,1325,2129,803,804,60,NM:i:1,100.0
6,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,837,0,837,+,2007548,5588,2128,2965,835,837,60,NM:i:2,100.0
7,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,1143,0,1143,+,2079546,80395,24878,26021,1112,1143,60,NM:i:31,100.0
8,gb|KF730243.1|+|0-1143|ARO:3004647|AQU-2,1143,0,1143,+,2079546,80395,24878,26021,1114,1143,60,NM:i:29,100.0
9,gb|KF730244.1|+|0-1149|ARO:3004648|AQU-3,1149,0,1149,+,2079546,80395,24878,26021,1106,1150,60,NM:i:44,100.087032


In [41]:
#Drop the coverages that are < 50 from GraphAligner 
GraphAligner_Combined1.drop(GraphAligner_Combined1[GraphAligner_Combined1['Coverage'] < 50].index, inplace = True)


In [42]:
GraphAligner_Combined1['Start Position on Path']=GraphAligner_Combined1['Start Position on Path']+1
GraphAligner_Combined1['End Position on Path']=GraphAligner_Combined1['End Position on Path']
GraphAligner_Combined1

Unnamed: 0,Query,Query Length,Query Start,Query End,Strand Relative Length,Path Matching,Path Length,Start Position on Path,End Position on Path,Number of residues Matches,Alignment Back Length,Mapping Quality,Column 1,Coverage
0,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,438,0,438,+,118828,5365,3687,4123,423,438,60,NM:i:15,100.0
1,gb|L12710|+|0-549|ARO:3002556|AAC(6')-Ii,549,220,530,+,28611963,310,1,310,307,310,60,NM:i:3,56.466302
3,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,438,0,438,+,118828,5365,3687,4123,421,438,60,NM:i:17,100.0
4,gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-IIa,972,0,939,+,28874510,1312,378,1312,934,939,60,NM:i:5,96.604938
5,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,804,0,804,+,2007548,5588,1326,2129,803,804,60,NM:i:1,100.0
6,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,837,0,837,+,2007548,5588,2129,2965,835,837,60,NM:i:2,100.0
7,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,1143,0,1143,+,2079546,80395,24879,26021,1112,1143,60,NM:i:31,100.0
8,gb|KF730243.1|+|0-1143|ARO:3004647|AQU-2,1143,0,1143,+,2079546,80395,24879,26021,1114,1143,60,NM:i:29,100.0
9,gb|KF730244.1|+|0-1149|ARO:3004648|AQU-3,1149,0,1149,+,2079546,80395,24879,26021,1106,1150,60,NM:i:44,100.087032
10,gb|AL009126|+|916777-919348|ARO:3003324|Bacill...,2571,0,2571,+,1355660,157027,127270,129840,2527,2571,60,NM:i:44,100.0


In [43]:
#Get the specific columns from GraphAligner
GraphAligner_Combined_Query_Path=GraphAligner_Combined1[['Query','Path Matching', 'Start Position on Path', 
'End Position on Path']]
# Convert the integer to string
GraphAligner_Combined_Query_Path['Query']=GraphAligner_Combined_Query_Path['Query'].astype(str)
GraphAligner_Combined_Query_Path['Path Matching']=GraphAligner_Combined_Query_Path['Path Matching'].astype(str)
GraphAligner_Combined_Query_Path['Start Position on Path']=GraphAligner_Combined_Query_Path['Start Position on Path'].astype(str)
GraphAligner_Combined_Query_Path['End Position on Path']=GraphAligner_Combined_Query_Path['End Position on Path'].astype(str)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GraphAligner_Combined_Query_Path['Query']=GraphAligner_Combined_Query_Path['Query'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GraphAligner_Combined_Query_Path['Path Matching']=GraphAligner_Combined_Query_Path['Path Matching'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [44]:
#Use merge operation so that we have the sequences for both softwares for successful queries
merged_df_Graph = GraphAligner_Combined_Query_Path.groupby('Query').agg({'Path Matching':' '.join, 'Start Position on Path': ', '.join,'End Position on Path': ', '.join}).reset_index()
merged_df_Graph

Unnamed: 0,Query,Path Matching,Start Position on Path,End Position on Path
0,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,2079546,24879,26021
1,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,118828,3687,4123
2,gb|AE016877.1|+|1972251-1972668|ARO:3000172|FosB,15445,311,727
3,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,2007548,2129,2965
4,gb|AF118110.1|-|71-1037|ARO:3003002|CfxA2,821183,112,1077
5,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,118828,3687,4123
6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,2007548,1326,2129
7,gb|AF472622|+|52-1018|ARO:3003003|CfxA3,821183,112,1077
8,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,28874510,554,1312
9,gb|AJ222769.3|+|3686-5606|ARO:3000194|tetW,136860,565,2484


In [45]:
merged_df_Graph.rename(columns = {'Path Matching':'Path_GraphAligner'}, inplace = True)
merged_df_Graph.rename(columns = {'Start Position on Path':'Start_GraphAligner'}, inplace = True)
merged_df_Graph.rename(columns = {'End Position on Path':'End_GraphAligner'}, inplace = True)

In [46]:
merged_df_Graph

Unnamed: 0,Query,Path_GraphAligner,Start_GraphAligner,End_GraphAligner
0,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,2079546,24879,26021
1,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,118828,3687,4123
2,gb|AE016877.1|+|1972251-1972668|ARO:3000172|FosB,15445,311,727
3,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,2007548,2129,2965
4,gb|AF118110.1|-|71-1037|ARO:3003002|CfxA2,821183,112,1077
5,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,118828,3687,4123
6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,2007548,1326,2129
7,gb|AF472622|+|52-1018|ARO:3003003|CfxA3,821183,112,1077
8,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,28874510,554,1312
9,gb|AJ222769.3|+|3686-5606|ARO:3000194|tetW,136860,565,2484


## Compile the Queries and the results from GraphAligner in one dataframe

In [47]:
#Use merge operation so that we have the sequences for both softwares for successful queries
df_QueryAndResultsGraphAligner=pd.merge(df,merged_df_Graph, on='Query',how="left")
df_QueryAndResultsGraphAligner

Unnamed: 0,Query,Sequence,Length,Path_GraphAligner,Start_GraphAligner,End_GraphAligner
0,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...,1143,2079546,24879.0,26021.0
1,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,438,118828,3687.0,4123.0
2,gb|AE016877.1|+|1972251-1972668|ARO:3000172|FosB,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...,417,15445,311.0,727.0
3,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,837,2007548,2129.0,2965.0
4,gb|AF118110.1|-|71-1037|ARO:3003002|CfxA2,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,966,821183,112.0,1077.0
5,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,438,118828,3687.0,4123.0
6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,804,2007548,1326.0,2129.0
7,gb|AF472622|+|52-1018|ARO:3003003|CfxA3,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...,966,821183,112.0,1077.0
8,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...,792,28874510,554.0,1312.0
9,gb|AJ222769.3|+|3686-5606|ARO:3000194|tetW,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,1920,136860,565.0,2484.0


## Compile all the results in one master Dataframe

In [48]:
merged_df_1 = pd.merge(df_QueryAndResultsBandage, df_QueryAndResultsSPAligner, on='Query', how='inner')
merged_df_master = pd.merge(merged_df_1, df_QueryAndResultsGraphAligner, on='Query', how='inner')
#merged_df_master['Start_SPAligner']=merged_df_master['Start_SPAligner'].astype(float).astype(int)

In [49]:
#Drop some columns

merged_df_master.drop(['Sequence_y', 'Length_x','Length_y','Sequence'], axis=1)

#Rename Sequence_x
merged_df_master.rename(columns = {'Sequence_x':'Sequence_Query'}, inplace = True)



In [50]:
def compare_paths(Start_Bandage, End_Bandage,Start_GraphAligner,End_GraphAligner):
    bandage_path_start = Start_Bandage.split(',')
    bandage_path_end = End_Bandage.split(',')
    #str(Start_GraphAligner) in str(bandage_path_start) and  str(End_GraphAligner) in  str(bandage_path_end):
    
    
    if set(str(bandage_path_start)) == set([str(Start_GraphAligner)]) and set(str(bandage_path_end) )== set([str(End_GraphAligner)]):
        return 'Similar'
    elif str(Start_GraphAligner) in str(bandage_path_start) and  str(End_GraphAligner) in  str(bandage_path_end):
        return 'Full'
    else:
        return 'Different'

In [51]:
merged_df_master['BandageVSGraphAligner'] = merged_df_master.apply(lambda row: compare_paths(row['Start_Bandage'], row['End_Bandage'],row['Start_GraphAligner'],row['End_GraphAligner']), axis=1)

merged_df_master

AttributeError: 'float' object has no attribute 'split'

In [52]:
#Compare the Start, and end path from Bandage and SPAligner
merged_df_master['Start_SPAligner']=merged_df_master['Start_SPAligner'].astype(str)
merged_df_master['End_SPAligner']=merged_df_master['End_SPAligner'].astype(str)
merged_df_master['BandageVSSPAligner']=(merged_df_master['Start_Bandage'] == merged_df_master['Start_SPAligner']) & (merged_df_master['End_Bandage'] == merged_df_master['End_SPAligner'])



merged_df_master.dtypes

Query                 object
Sequence_Query        object
Length_Bandage         int64
Path_Bandage          object
Start_Bandage         object
End_Bandage           object
Extracted_Path        object
Sequence_Bandage      object
Sequence_y            object
Length_x               int64
Start_SPAligner       object
End_SPAligner         object
Length_SPAligner      object
Path_SPAligner         int64
Sequence_SPAligner    object
Sequence              object
Length_y               int64
Path_GraphAligner     object
Start_GraphAligner    object
End_GraphAligner      object
BandageVSSPAligner      bool
dtype: object

In [53]:
#Compare the Start, and end path from GraphAligner and SPAligner
merged_df_master['SPAlignerVSGraphAligner']=(merged_df_master['Start_SPAligner'] == merged_df_master['Start_GraphAligner']) & (merged_df_master['End_SPAligner'] == merged_df_master['End_GraphAligner'])

In [54]:
#Drop some columns

merged_df_master=merged_df_master.drop(['Sequence_y', 'Length_x','Length_y','Sequence'], axis=1)

## Save the Queries and  all the Results  in one .tsv file


In [55]:
merged_df_master.to_csv("QueriesAndResultsMasterList.tsv",sep='\t')

## This section saves the Nodes and Sequences from the gfa file

In [None]:
import gfapy

# Open the GFA file
file_path = "graph1.gfa"
gfa = gfapy.Gfa.from_file(file_path)

num_segments = len(gfa.segments)

In [None]:
data_graph = []

In [None]:
# Store the name and sequence for each node from the graph
for segment in gfa.segments:
    data_graph.append({"Name": segment.name, "Sequence": segment.sequence})

# Convert the list to a DataFrame
df_graph = pd.DataFrame(data_graph)

# Print the DataFrame
print(df_graph)

In [None]:
#Count the number of duplicates
#len(df_graph['Name'])-len(df_graph['Name'].drop_duplicates())

In [None]:
num_segments

In [None]:
#Save only one copy of each row to the dataframe
df_graph=df_graph.drop_duplicates()

In [None]:
df_graph

In [None]:
df_graph['Name']=df_graph['Name'].astype(str)


## Get the sequences unique to GraphAligner
## Get the dataframe containing the path from GraphAligner


In [None]:
df_GraphAligner_Paths= pd.read_csv('GraphAlignerNotEqual.tsv', sep='\t')
#GraphAlignerNotEqual.tsv is from another script:GraphAlignerBandageScript.ipynb

In [None]:
df_GraphAligner_Paths

In [None]:
df_GraphAligner_Paths
df_GraphAligner_Paths.sort_index(inplace=True)
df_GraphAligner_Paths['Path Matching']=df_GraphAligner_Paths['Path Matching'].astype(str)

In [None]:
# Function to process the Path values
def process_path(path):
    path = path.replace('<', ' ').replace('>', ' ')
    path = ' '.join(path.split('<'))
    return path

In [None]:
df_GraphAligner_Paths['Path Matching']=df_GraphAligner_Paths['Path Matching'].apply(process_path).str.strip()


In [None]:
df_GraphAligner_Paths
# Create a new column 'Combined_Sequence' that concatenates sequences if Path has two values


In [None]:
df_GraphAligner_Paths[['Path1', 'Path2']] = df_GraphAligner_Paths['Path Matching'].str.split(' ', expand=True)

In [None]:
df_GraphAligner_Paths

In [None]:
# Get the rows where Path 2 is not None
rows_not_none = df_GraphAligner_Paths[df_GraphAligner_Paths['Path2'].notna()]

# Print the resulting dataframe
rows_not_none

In [None]:
#Get the sequence for Path 1 and Path 2
# Merge df1 and df2 based on matching values in Column1 and Name

merged_df= pd.merge(df_GraphAligner_Paths, df_graph, left_on='Path1', right_on='Name')
merged_df = pd.merge(merged_df, df_graph, left_on='Path2', right_on='Name')

# Concatenate the sequences
merged_df