## This notebook compares the results from all three software tools (Bandage, SPAligner and GraphAligner)
## Written by Yusreen Shah
## Date: May 10th 2023

In [8]:
# import the libraries
import numpy as np
import pandas as pd
import re 
from Bio import SeqIO
from collections import defaultdict


## This section saves the list of all the queries in a dataframe

In [10]:
# Collect one record per FASTA entry: its id, its sequence and its length
data = defaultdict(list)

# Read the values from the .fasta file, and save them to data
for seq_record in SeqIO.parse("combined_1.fasta", "fasta"):
    query=seq_record.id
    # str() yields the complete sequence text. repr() yields a display form
    # ("Seq('...')") that abbreviates long sequences, so it is not usable as data.
    sequence= str(seq_record.seq)
    length=len(seq_record)
    data['Query'].append(query)
    data['Sequence'].append(sequence)
    data['Length'].append(length)

# Add the data to a dataframe
df = pd.DataFrame.from_dict(data)


In [11]:
# Remove any Seq(' ... ') wrapper from the sequences.
# regex=False makes the match literal on every pandas version: read as a
# regular expression, "Seq(" is an unterminated group and is rejected.
df['Sequence'] = df['Sequence'].str.replace("Seq(", '', regex=False)
df['Sequence'] = df['Sequence'].str.replace(')', '', regex=False)
# Strip surrounding single quotes and spaces, then any trailing whitespace
df['Sequence'] = df['Sequence'].str.strip(" \' ")
df['Sequence']=df['Sequence'].str.rstrip()

In [4]:
# Preview the first rows of the query dataframe (Query, Sequence, Length)
df.head()

Unnamed: 0,Query,Sequence,Length
0,gb|FJ503047|+|0-519|ARO:3002592|AAC(6')-Ib-Han...,ATGACTGAGCATGACCTTGTGATGCTCTATGAGTGGCTAAATCGAT...,519
1,gb|EU085533|+|0-519|ARO:3002591|AAC(6')-Ib-Suzhou,ATGACTGAGCATGACCTTGCGATGCTCTATGAGTGGCTAAATCGAT...,519
2,gb|DQ303918|+|0-600|ARO:3002547|AAC(6')-Ib-cr,ATGAGCAACGCAAAAACAAAGTTAGGCATCACAAAGTACAGCATCG...,600
3,gb|JQ808129|+|633-1188|ARO:3002546|AAC(6')-Ib,GTGACCAACAGCAACGATTCCGTAACACTGCGCCTCATGACTGAGC...,555
4,gb|U59183|+|247-859|ARO:3002581|AAC(6')-Ib10,ATGTTACGCAGCAGCAGTCGCCCTAAAACAAAGTTAGGCATCACAA...,612


## This section is used to compare the results from Bandage and the actual list of queries.

In [5]:
# Create a dataframe for Bandage Combined1 from the tab-separated Bandage output file
Bandage_Combined1= pd.read_csv('Bandageoutputcombined1.tsv', sep='\t')

In [6]:
# Preview the first rows of the Bandage results
Bandage_Combined1.head()

Unnamed: 0,Query,Path,Length,Query covered by path,Query covered by hits,Mean hit identity,Total hit mismatches,Total hit gap opens,Relative length,Length discrepancy,E-value product,Sequence
0,gb|U59183|+|247-859|ARO:3002581|AAC(6')-Ib10,(56) 7593+ (642),587,95.915%,95.915%,99.83%,1,0,100%,0,0,AAACAAAGTTAGGCATCACAAAGTACAGCATCGTGACCAACAGCAA...
1,gb|AY136758|+|377-947|ARO:3002582|AAC(6')-Ib11,(93) 7593+ (642),550,96.4912%,96.4912%,99.455%,3,0,100%,0,0,CAACAGCAACGATTCCGTCACACTGCGCCTCATGACTGAGCATGAC...
2,gb|FJ854362|+|1702-2257|ARO:3002576|AAC(6')-Ib3,(88) 7593+ (642),555,100%,100%,99.64%,2,0,100%,0,0,GTGACCAACAGCAACGATTCCGTCACACTGCGCCTCATGACTGAGC...
3,gb|AF445082|+|2788-3343|ARO:3002577|AAC(6')-Ib4,(88) 7593+ (642),555,100%,100%,99.64%,2,0,100%,0,0,GTGACCAACAGCAACGATTCCGTCACACTGCGCCTCATGACTGAGC...
4,gb|AF043381|+|251-863|ARO:3002580|AAC(6')-Ib9,(56) 7593+ (642),587,95.915%,95.915%,99.659%,2,0,100%,0,0,AAACAAAGTTAGGCATCACAAAGTACAGCATCGTGACCAACAGCAA...


In [7]:
# Keep only the Query and Sequence columns of the Bandage results;
# Query is the key used for the merge with the list of queries.
Bandage_Combined_Query_Path =Bandage_Combined1[['Query','Sequence']]

In [19]:
# Join the query list with the Bandage results on Query. The join is an inner
# join, so only queries present in both tables are kept; the two Sequence
# columns come out as Sequence_x (queries) and Sequence_y (Bandage).
df_QueryAndResultsBandage = pd.merge(
    df,
    Bandage_Combined_Query_Path,
    on='Query',
    how='inner',
)

In [20]:
# List the column names of the merged dataframe, one per line
for col in df_QueryAndResultsBandage.columns:
    print(col)

Query
Sequence_x
Length
Sequence_y


In [21]:
# Replace the merge suffixes with descriptive names:
# Sequence_x is the query sequence, Sequence_y is the sequence reported by Bandage.
df_QueryAndResultsBandage.rename(
    columns={
        'Sequence_x': 'Sequence',
        'Sequence_y': 'Sequence_Bandage',
    },
    inplace=True,
)

In [22]:
# List the column names again to check the result of the rename
for col in df_QueryAndResultsBandage.columns:
    print(col)

Query
Sequence
Length
Sequence_Bandage


## This section is used to compare the results from SPAligner and the actual list of queries.

In [23]:
# Create a dataframe for SPAligner Combined1 from the tab-separated SPAligner output file
SPAligner_Combined1=pd.read_csv('SPAligneroutputcombined1.tsv', sep='\t')
# Format the Query column from SPAligner_Combined1 so that the Query column is the same as the one from Bandage
pattern = r'\[.*?\]'
def format_query(x):
    """Return the query name ``x`` without its bracketed annotations.

    Every ``[...]`` span is removed (the match is non-greedy, so each
    bracketed group is removed on its own), then trailing whitespace is
    stripped.

    Parameters
    ----------
    x : str
        Query name, possibly containing ``[...]`` spans.

    Returns
    -------
    str
        The cleaned query name.
    """
    return re.sub(pattern,"", x).rstrip()
    


# Apply format_query to every value of the Query column
SPAligner_Combined1['Query'] = SPAligner_Combined1['Query'].map(format_query)


In [24]:
# Get the Query and Sequence columns from SPAligner_Combined1;
# Query is the key used for the merge with the list of queries.
SPAligner_Combined_Query_Path=SPAligner_Combined1[['Query','Sequence']]

In [25]:
# Join the query list with the SPAligner results on Query. The join is an inner
# join, so only queries present in both tables are kept; the two Sequence
# columns come out as Sequence_x (queries) and Sequence_y (SPAligner).
df_QueryAndResultsSPAligner = pd.merge(
    df,
    SPAligner_Combined_Query_Path,
    on='Query',
    how='inner',
)

In [26]:
# Replace the merge suffixes with descriptive names:
# Sequence_x is the query sequence, Sequence_y is the sequence reported by SPAligner.
df_QueryAndResultsSPAligner.rename(
    columns={
        'Sequence_x': 'Sequence',
        'Sequence_y': 'Sequence_SPAligner',
    },
    inplace=True,
)

In [27]:
#Create a dataframe for GraphAligner Combined1
# Column labels are supplied explicitly through names=.
# NOTE(review): the labels look like they follow the GAF column order, in which
# case "Strand Relative Length" would be the strand field - confirm before relying on it.
graphaligner_columns = [
    "Query",
    "Query Length",
    "Query Start",
    "Query End",
    "Strand Relative Length",
    "Path Matching",
    "Path Length",
    "Start Position on Path",
    "End Position on Path",
    "Number of residues Matches",
    "Alignment Back Length",
    "Mapping Quality",
    "Column 1",
]
GraphAligner_Combined1 = pd.read_csv(
    'GraphAligneroutputcombined1.tsv',
    sep='\t',
    names=graphaligner_columns,
)

In [28]:
#Format the Query column from  GraphAligner_Combined1 so that the Query column is the same as the one from Bandage
pattern = r'\[.*?\]'
def format_query(x):
    """Strip bracketed annotations and trailing whitespace from a query name.

    Each ``[...]`` span in ``x`` is deleted (non-greedy match), and the
    result is right-stripped before being returned.
    """
    without_annotations = re.sub(pattern, "", x)
    return without_annotations.rstrip()
    


# Apply format_query to every value of the Query column
GraphAligner_Combined1['Query'] = GraphAligner_Combined1['Query'].map(format_query)

In [29]:
# Remove < and > from the Path, leaving the rest of each value unchanged
GraphAligner_Combined1['Path Matching'] = (
    GraphAligner_Combined1['Path Matching']
    .str.replace('>', "")
    .str.replace('<', "")
)

## This section saves the Nodes and Sequences from the gfa file

In [30]:
import gfapy

# Open the GFA file and parse it into a gfapy graph object
file_path = "graph1.gfa"
gfa = gfapy.Gfa.from_file(file_path)

# Number of segments in the graph; displayed in a later cell
num_segments = len(gfa.segments)

In [31]:
# Name/Sequence records for the graph segments, populated in the next cell
data_graph = []

In [32]:
# Store the name and sequence for each node from the graph.
# The list is rebuilt here rather than appended to, so re-running this cell
# cannot add every segment a second time.
data_graph = [
    {"Name": segment.name, "Sequence": segment.sequence}
    for segment in gfa.segments
]

# Convert the list to a DataFrame
df_graph = pd.DataFrame(data_graph)

# Print the DataFrame
print(df_graph)

       Name                                           Sequence
0      1321  CGTTCCACCGGTTCTTACAGCCTGGTTACTCAGCAGCCGCTGGGTG...
1      1323  GTGCCTTACCACCCAGCGGCTGCTGAGTAACCAGGCTGTAAGAACC...
2      1325  GTGCCTTACCACCCAGCGGCTGCTGAGTAACCAGGCTGTAAGAACC...
3     32989  CTTAATATGAACCATCCAACTTTATGGGGTCAGTCCAGCAGCGCCG...
4       565  GGTTCGGCGGAGCTTACCGCGTCTTTTCGCGGTTAGCGGAGTGTGG...
...     ...                                                ...
2524  37173  GAACAAGGATCTAAGCTGTTTTAAGTTATGGGCAACGCAATGCACT...
2525  24893  TCTTAAGAGAGTGCATTGCGTTGCCCATAACTTAAAACAGCTTAGA...
2526  36779  TTTTCTCTGCAACCGAACCGGCTGTTTGTGTGAAGTGATTCACATC...
2527   6673  CTCGGCCCGACCCGAAGCCTGCAGGGATAAGTCGAAGGGACCGCGC...
2528  37823  CTCGGCCCGACCCGAAGCCTGCAGGGATAAGTCGAAGGGACCGCGC...

[2529 rows x 2 columns]


In [12]:
#Count the number of duplicates
#len(df_graph['Name'])-len(df_graph['Name'].drop_duplicates())

In [34]:
# Display the number of segments read from the GFA file
num_segments

2529

In [35]:
# Save only one copy of each row to the dataframe.
# No subset is given, so a row is only dropped when it repeats an earlier
# row in every column (both Name and Sequence).
df_graph=df_graph.drop_duplicates()

In [36]:
# Display the de-duplicated node table
df_graph

Unnamed: 0,Name,Sequence
0,1321,CGTTCCACCGGTTCTTACAGCCTGGTTACTCAGCAGCCGCTGGGTG...
1,1323,GTGCCTTACCACCCAGCGGCTGCTGAGTAACCAGGCTGTAAGAACC...
2,1325,GTGCCTTACCACCCAGCGGCTGCTGAGTAACCAGGCTGTAAGAACC...
3,32989,CTTAATATGAACCATCCAACTTTATGGGGTCAGTCCAGCAGCGCCG...
4,565,GGTTCGGCGGAGCTTACCGCGTCTTTTCGCGGTTAGCGGAGTGTGG...
...,...,...
2524,37173,GAACAAGGATCTAAGCTGTTTTAAGTTATGGGCAACGCAATGCACT...
2525,24893,TCTTAAGAGAGTGCATTGCGTTGCCCATAACTTAAAACAGCTTAGA...
2526,36779,TTTTCTCTGCAACCGAACCGGCTGTTTGTGTGAAGTGATTCACATC...
2527,6673,CTCGGCCCGACCCGAAGCCTGCAGGGATAAGTCGAAGGGACCGCGC...


In [37]:
# Look up node 7593, the node that appears in the Bandage paths shown earlier.
# The comparison is against the string '7593'.
df_graph.loc[df_graph['Name'] == '7593']

Unnamed: 0,Name,Sequence
801,7593,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...
