## This script compares the results from all three software tools (Bandage, SPAligner and GraphAligner).
## Written by Yusreen Shah
## Date: May 10th 2023

In [2]:
# import the libraries
import numpy as np
import pandas as pd
import re 
from Bio import SeqIO
from collections import defaultdict


## This section saves the list of all the queries in a dataframe

In [None]:
# Collect one entry per FASTA record: its id, its full sequence and its length
data = defaultdict(list)

# Read the records from the .fasta file and save them to data.
# str() is used rather than repr(): Biopython's repr() of a Seq abbreviates
# long sequences (e.g. Seq('ACGT...TTT')), which would silently store a
# truncated sequence for every long query.
for seq_record in SeqIO.parse("combined_1.fasta", "fasta"):
    data['Query'].append(seq_record.id)
    data['Sequence'].append(str(seq_record.seq))
    data['Length'].append(len(seq_record))

# Add the data to a dataframe (columns: Query, Sequence, Length)
df = pd.DataFrame.from_dict(data)


In [None]:
# Remove any Seq(' prefix and ') suffix wrapping the sequence text.
# regex=False makes both patterns literal: pandas versions before 2.0 treat the
# pattern as a regular expression by default, and the unbalanced '(' in 'Seq('
# is not a valid regular expression.
df['Sequence'] = df['Sequence'].str.replace('Seq(', '', regex=False)
df['Sequence'] = df['Sequence'].str.replace(')', '', regex=False)
# Strip the remaining quote characters and spaces, then any trailing whitespace
df['Sequence'] = df['Sequence'].str.strip(" \' ")
df['Sequence'] = df['Sequence'].str.rstrip()

In [None]:
# Order the queries by name and renumber the rows from 0
df = df.sort_values("Query").reset_index(drop=True)
df

## This section is used to compare the results from Bandage and the actual list of queries.

In [None]:
# Load the Bandage results for Combined1 from the tab-separated output file
bandage_tsv = 'Bandageoutputcombined1.tsv'
Bandage_Combined1 = pd.read_csv(bandage_tsv, sep='\t')

In [None]:
# Preview the first rows of the Bandage results
Bandage_Combined1.head()

In [None]:
# Keep only the columns needed for the comparison. The explicit copy makes this
# an independent dataframe, so adding the 'Start' and 'End' columns to it later
# does not trigger pandas' SettingWithCopyWarning.
Bandage_Combined_Query_Path = Bandage_Combined1[['Path', 'Query', 'Sequence']].copy()

## Extract the start and end position from the path in Bandage

In [None]:
# Regex used to extract the start position: it captures, non-greedily, the
# text inside the first pair of parentheses of a Path value
pattern_path_start = r'\((.*?)\)' 

In [None]:
# Pull the first parenthesised value out of each Path and store it as 'Start'
start_values = Bandage_Combined_Query_Path['Path'].str.extract(pattern_path_start, expand=False)
Bandage_Combined_Query_Path['Start'] = start_values
Bandage_Combined_Query_Path

In [None]:
# Extract the end position: take the last whitespace-separated token of Path,
# remove its surrounding parentheses and convert it to an integer
Bandage_Combined_Query_Path['End'] = Bandage_Combined_Query_Path["Path"].str.split().str.get(-1)
end_without_parentheses = Bandage_Combined_Query_Path['End'].str.strip('()')
Bandage_Combined_Query_Path['End'] = end_without_parentheses.astype(int)
Bandage_Combined_Query_Path

In [None]:
# Reorder the columns in Bandage_Combined_Query_Path. The copy makes the result
# an independent dataframe, so the column assignments below do not raise
# SettingWithCopyWarning.
Bandage_Combined_Query_Path = Bandage_Combined_Query_Path[['Query', 'Path', 'Start', 'End', 'Sequence']].copy()

# Convert the columns to strings so their values can be joined with str.join
for column in ['Path', 'End', 'Start', 'Sequence']:
    Bandage_Combined_Query_Path[column] = Bandage_Combined_Query_Path[column].astype(str)

# Bandage_Combined1['Query'].value_counts() 
# Merge the rows that have the same query: group the dataframe by the 'Query'
# column and join the values of every other column into one string per query
merge_Bandage_df = Bandage_Combined_Query_Path.groupby('Query').agg({'Path': ' , '.join, 'End': ', '.join, 'Start': ', '.join, 'Sequence': ', '.join}).reset_index()
merge_Bandage_df

In [None]:
# Add braces to the values that now contain ','
def add_braces(value):
    """Return value wrapped in '{' and '}' if it contains a comma, else value unchanged."""
    return '{' + value + '}' if ',' in value else value
# Apply add_braces to each of the columns that were joined during the merge
for joined_column in ('Path', 'Start', 'End', 'Sequence'):
    merge_Bandage_df[joined_column] = merge_Bandage_df[joined_column].apply(add_braces)

In [None]:
# Save the merged Bandage table as a tab-separated file. index=False is not
# passed, so the row index is written as the first (unnamed) column.
merge_Bandage_df.to_csv("Test1.tsv", sep="\t")

In [None]:
# Combine the FASTA queries with the Bandage results on 'Query'. The outer join
# keeps every query from both tables, including queries that appear in only one.
df_QueryAndResultsBandage = df.merge(merge_Bandage_df, how="outer", on='Query')

In [None]:
# Display the merged query / Bandage table
df_QueryAndResultsBandage

In [None]:
# Print each column name of the merged table on its own line
for col in df_QueryAndResultsBandage.columns:
    print(col)

In [None]:
# Show the column index of the merged table
df_QueryAndResultsBandage.columns

In [None]:
# Replace the merge suffixes with descriptive names: 'Sequence_x' comes from the
# left table (the FASTA queries) and 'Sequence_y' from the Bandage results
bandage_column_names = {'Sequence_x': 'Sequence', 'Sequence_y': 'Sequence_Bandage'}
df_QueryAndResultsBandage.rename(columns=bandage_column_names, inplace=True)

In [None]:
# Save the combined query / Bandage table as a tab-separated file. index=False
# is not passed, so the row index is written as the first (unnamed) column.
df_QueryAndResultsBandage.to_csv("CompiledQueriesBandage.tsv",sep="\t")

## This section is used to compare the results from SPAligner and the actual list of queries.

In [None]:
# Load the SPAligner results for Combined1 from the tab-separated output file
spaligner_tsv = 'SPAligneroutputcombined1.tsv'
SPAligner_Combined1 = pd.read_csv(spaligner_tsv, sep='\t')
SPAligner_Combined1

In [None]:
# List the column names available in the SPAligner results
SPAligner_Combined1.columns

In [None]:
# Format the Query column from SPAligner_Combined1 so that it matches the Query column from Bandage
pattern = r'\[.*?\]'
def format_query(x):
    """Remove every [...] group from x and strip trailing whitespace from the result."""
    without_brackets = re.sub(pattern, "", x)
    return without_brackets.rstrip()
    


# Apply format_query to every value of the SPAligner Query column
spaligner_queries = SPAligner_Combined1['Query']
SPAligner_Combined1['Query'] = spaligner_queries.map(format_query)


In [None]:
# Keep only the query and sequence columns from SPAligner_Combined1
SPAligner_Combined_Query_Path = SPAligner_Combined1.loc[:, ['Query', 'Sequence']]


In [None]:
# Select the query, alignment start/end positions, sequence length, alignment
# path and sequence columns from SPAligner_Combined1
spaligner_columns = [
    'Query',
    'Start position of alignment on the first edge of the path',
    'End position on the last edge of the path',
    'Sequence length',
    'Path of alignment',
    'Sequence',
]
SPAligner_Combined_Query_Path = SPAligner_Combined1[spaligner_columns]
SPAligner_Combined_Query_Path

In [None]:
# Attach the SPAligner results to the FASTA queries on 'Query'. The left join
# keeps every FASTA query, whether or not SPAligner reported a result for it.
df_QueryAndResultsSPAligner = df.merge(SPAligner_Combined_Query_Path, how='left', on='Query')
df_QueryAndResultsSPAligner

In [None]:
# Replace the merge suffixes with descriptive names: 'Sequence_x' comes from the
# left table (the FASTA queries) and 'Sequence_y' from the SPAligner results
spaligner_column_names = {'Sequence_x': 'Sequence', 'Sequence_y': 'Sequence_SPAligner'}
df_QueryAndResultsSPAligner.rename(columns=spaligner_column_names, inplace=True)

## This section is used to compare the results from GraphAligner and the actual list of queries.

In [None]:
# Load the GraphAligner results for Combined1. The column names are supplied
# through names=, so the first line of the file is read as data.
graphaligner_columns = [
    "Query", "Query Length", "Query Start",
    "Query End", "Strand Relative Length", "Path Matching", "Path Length",
    "Start Position on Path", "End Position on Path", "Number of residues Matches",
    "Alignment Back Length", "Mapping Quality", "Column 1",
]
GraphAligner_Combined1 = pd.read_csv('GraphAligneroutputcombined1.tsv', sep='\t', names=graphaligner_columns)
GraphAligner_Combined1

In [None]:
# Format the Query column from GraphAligner_Combined1 so that it matches the Query column from Bandage
pattern = r'\[.*?\]'
def format_query(x):
    """Strip all bracketed [...] parts from x, then drop trailing whitespace."""
    cleaned = re.sub(pattern, "", x)
    return cleaned.rstrip()
    


# Apply format_query to every value of the GraphAligner Query column
graphaligner_queries = GraphAligner_Combined1['Query']
GraphAligner_Combined1['Query'] = graphaligner_queries.map(format_query)

In [None]:
# Remove the '>' and '<' characters from the Path Matching column.
# NOTE(review): the characters are removed without inserting a separator, so a
# multi-node path such as '<2645<22251' becomes '264522251' — confirm that
# fusing the node ids is intended.
path_matching = GraphAligner_Combined1['Path Matching']
GraphAligner_Combined1['Path Matching'] = path_matching.str.replace('>', "").str.replace('<', "")

In [None]:
# Get the specific columns from GraphAligner. The copy makes this an independent
# dataframe, so the column assignments below do not raise SettingWithCopyWarning.
GraphAligner_Combined_Query_Path = GraphAligner_Combined1[['Query', 'Path Matching', 'Start Position on Path',
                                                           'End Position on Path']].copy()
# Convert every selected column to strings so the values can be joined with str.join
for column in ['Query', 'Path Matching', 'Start Position on Path', 'End Position on Path']:
    GraphAligner_Combined_Query_Path[column] = GraphAligner_Combined_Query_Path[column].astype(str)


In [None]:
# Collapse the rows that share the same query: group by 'Query' and join the
# path, start and end values of each group into single strings
graph_joiners = {
    'Path Matching': ' '.join,
    'Start Position on Path': ', '.join,
    'End Position on Path': ', '.join,
}
merged_df_Graph = GraphAligner_Combined_Query_Path.groupby('Query').agg(graph_joiners).reset_index()
merged_df_Graph

In [None]:
# Attach the GraphAligner results to the FASTA queries on 'Query'. The left join
# keeps every FASTA query, whether or not GraphAligner reported a result for it.
df_QueryAndResultsGraphAligner = df.merge(merged_df_Graph, how="left", on='Query')
df_QueryAndResultsGraphAligner

In [None]:
# Combine the three per-tool tables on 'Query'; the inner joins keep only the
# queries that are present in all three tables
merged_df_1 = df_QueryAndResultsBandage.merge(df_QueryAndResultsSPAligner, how='inner', on='Query')
merged_df_2 = merged_df_1.merge(df_QueryAndResultsGraphAligner, how='inner', on='Query')
merged_df_2

## This section saves the Nodes and Sequences from the gfa file

In [3]:
import gfapy

# Open the GFA file and parse it into a gfapy Gfa object
file_path = "graph1.gfa"
gfa = gfapy.Gfa.from_file(file_path)

# Number of segments read from the graph
num_segments = len(gfa.segments)

In [4]:
# Start with an empty list of node records (one dict per graph segment)
data_graph = []

In [5]:
# Store the name and sequence for each node from the graph. The list is rebuilt
# from scratch here, so re-executing this cell cannot append the same segments
# a second time and produce duplicate rows.
data_graph = [
    {"Name": segment.name, "Sequence": segment.sequence}
    for segment in gfa.segments
]

# Convert the list to a DataFrame
df_graph = pd.DataFrame(data_graph)

# Print the DataFrame
print(df_graph)

       Name                                           Sequence
0      1321  CGTTCCACCGGTTCTTACAGCCTGGTTACTCAGCAGCCGCTGGGTG...
1      1323  GTGCCTTACCACCCAGCGGCTGCTGAGTAACCAGGCTGTAAGAACC...
2      1325  GTGCCTTACCACCCAGCGGCTGCTGAGTAACCAGGCTGTAAGAACC...
3     32989  CTTAATATGAACCATCCAACTTTATGGGGTCAGTCCAGCAGCGCCG...
4       565  GGTTCGGCGGAGCTTACCGCGTCTTTTCGCGGTTAGCGGAGTGTGG...
...     ...                                                ...
2524  37173  GAACAAGGATCTAAGCTGTTTTAAGTTATGGGCAACGCAATGCACT...
2525  24893  TCTTAAGAGAGTGCATTGCGTTGCCCATAACTTAAAACAGCTTAGA...
2526  36779  TTTTCTCTGCAACCGAACCGGCTGTTTGTGTGAAGTGATTCACATC...
2527   6673  CTCGGCCCGACCCGAAGCCTGCAGGGATAAGTCGAAGGGACCGCGC...
2528  37823  CTCGGCCCGACCCGAAGCCTGCAGGGATAAGTCGAAGGGACCGCGC...

[2529 rows x 2 columns]


In [6]:
#Count the number of duplicates
#len(df_graph['Name'])-len(df_graph['Name'].drop_duplicates())

In [7]:
# Display the number of segments read from the GFA file
num_segments

2529

In [8]:
# Keep only the first occurrence of each fully identical (Name, Sequence) row
df_graph = df_graph.drop_duplicates(keep='first')

In [9]:
# Display the node table
df_graph

Unnamed: 0,Name,Sequence
0,1321,CGTTCCACCGGTTCTTACAGCCTGGTTACTCAGCAGCCGCTGGGTG...
1,1323,GTGCCTTACCACCCAGCGGCTGCTGAGTAACCAGGCTGTAAGAACC...
2,1325,GTGCCTTACCACCCAGCGGCTGCTGAGTAACCAGGCTGTAAGAACC...
3,32989,CTTAATATGAACCATCCAACTTTATGGGGTCAGTCCAGCAGCGCCG...
4,565,GGTTCGGCGGAGCTTACCGCGTCTTTTCGCGGTTAGCGGAGTGTGG...
...,...,...
2524,37173,GAACAAGGATCTAAGCTGTTTTAAGTTATGGGCAACGCAATGCACT...
2525,24893,TCTTAAGAGAGTGCATTGCGTTGCCCATAACTTAAAACAGCTTAGA...
2526,36779,TTTTCTCTGCAACCGAACCGGCTGTTTGTGTGAAGTGATTCACATC...
2527,6673,CTCGGCCCGACCCGAAGCCTGCAGGGATAAGTCGAAGGGACCGCGC...


In [10]:
# Store the node names as strings so they can be matched against the string
# path columns when the tables are merged
df_graph['Name']=df_graph['Name'].astype(str)


## Get the dataframe containing the path from GraphAligner

In [11]:
# Load the tab-separated table of queries from 'GraphAlignerNotEqual.tsv'
df_GraphAligner_Paths= pd.read_csv('GraphAlignerNotEqual.tsv', sep='\t')

In [12]:
# Display the GraphAligner path table
df_GraphAligner_Paths

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Path Matching,Path_GraphAligner,Equal
0,0,gb|AF047479|+|1295-2087|ARO:3002603|aadA3,7593.0,>7591,7591,no
1,1,gb|AF156486|+|5012-5792|ARO:3002602|aadA2,7593.0,>7591,7591,no
2,3,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,7591.0,>7593,7593,no
3,4,gb|AJ809407|+|118-898|ARO:3002620|aadA23,7591.0,>7593,7593,no
4,5,gb|AM040708.1|+|1173-1965|ARO:3004704|aadA8b,7593.0,>7591,7591,no
5,6,gb|AM261837|+|73-865|ARO:3002619|aadA22,7591.0,>7593,7593,no
6,10,gb|AY139603|+|106-898|ARO:3002608|aadA8,7593.0,>7591,7591,no
7,11,gb|AY171244|+|46-838|ARO:3002618|aadA21,7591.0,>7593,7593,no
8,13,gb|BX664015.1|-|103017-103833|ARO:3002641|APH(...,264523479.0,<2645<22251,264522251,no
9,14,gb|CP003022|+|336788-337580|ARO:3003197|aadA25,7593.0,>7591,7591,no


In [13]:
# Sort the rows by index in place and make sure 'Path Matching' holds strings
df_GraphAligner_Paths.sort_index(inplace=True)
path_matching_as_text = df_GraphAligner_Paths['Path Matching'].astype(str)
df_GraphAligner_Paths['Path Matching'] = path_matching_as_text

In [14]:
# Function to process the Path values
def process_path(path):
    """Return path with every '<' and '>' character replaced by a single space."""
    return path.translate(str.maketrans({'<': ' ', '>': ' '}))

In [15]:
# Replace the '<' / '>' characters with spaces, then trim the leading and trailing whitespace
spaced_paths = df_GraphAligner_Paths['Path Matching'].apply(process_path)
df_GraphAligner_Paths['Path Matching'] = spaced_paths.str.strip()


In [16]:
# Display the table with the processed 'Path Matching' values
df_GraphAligner_Paths
# TODO: create a new column 'Combined_Sequence' that concatenates sequences if Path has two values


Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Path Matching,Path_GraphAligner,Equal
0,0,gb|AF047479|+|1295-2087|ARO:3002603|aadA3,7593.0,7591,7591,no
1,1,gb|AF156486|+|5012-5792|ARO:3002602|aadA2,7593.0,7591,7591,no
2,3,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,7591.0,7593,7593,no
3,4,gb|AJ809407|+|118-898|ARO:3002620|aadA23,7591.0,7593,7593,no
4,5,gb|AM040708.1|+|1173-1965|ARO:3004704|aadA8b,7593.0,7591,7591,no
5,6,gb|AM261837|+|73-865|ARO:3002619|aadA22,7591.0,7593,7593,no
6,10,gb|AY139603|+|106-898|ARO:3002608|aadA8,7593.0,7591,7591,no
7,11,gb|AY171244|+|46-838|ARO:3002618|aadA21,7591.0,7593,7593,no
8,13,gb|BX664015.1|-|103017-103833|ARO:3002641|APH(...,264523479.0,2645 22251,264522251,no
9,14,gb|CP003022|+|336788-337580|ARO:3003197|aadA25,7593.0,7591,7591,no


In [17]:
# Split 'Path Matching' at its first space into Path1 (first node) and Path2
# (the remainder, missing when the path has a single node). n=1 caps the split
# at two parts and reindex guarantees that both columns exist, so the assignment
# no longer fails when the paths do not split into exactly two columns.
path_parts = df_GraphAligner_Paths['Path Matching'].str.split(' ', n=1, expand=True).reindex(columns=[0, 1])
df_GraphAligner_Paths['Path1'] = path_parts[0]
df_GraphAligner_Paths['Path2'] = path_parts[1]

In [18]:
# Display the table including the Path1 and Path2 columns
df_GraphAligner_Paths

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Path Matching,Path_GraphAligner,Equal,Path1,Path2
0,0,gb|AF047479|+|1295-2087|ARO:3002603|aadA3,7593.0,7591,7591,no,7591,
1,1,gb|AF156486|+|5012-5792|ARO:3002602|aadA2,7593.0,7591,7591,no,7591,
2,3,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,7591.0,7593,7593,no,7593,
3,4,gb|AJ809407|+|118-898|ARO:3002620|aadA23,7591.0,7593,7593,no,7593,
4,5,gb|AM040708.1|+|1173-1965|ARO:3004704|aadA8b,7593.0,7591,7591,no,7591,
5,6,gb|AM261837|+|73-865|ARO:3002619|aadA22,7591.0,7593,7593,no,7593,
6,10,gb|AY139603|+|106-898|ARO:3002608|aadA8,7593.0,7591,7591,no,7591,
7,11,gb|AY171244|+|46-838|ARO:3002618|aadA21,7591.0,7593,7593,no,7593,
8,13,gb|BX664015.1|-|103017-103833|ARO:3002641|APH(...,264523479.0,2645 22251,264522251,no,2645,22251.0
9,14,gb|CP003022|+|336788-337580|ARO:3003197|aadA25,7593.0,7591,7591,no,7591,


In [20]:
# Keep only the rows whose Path2 value is present
has_second_part = df_GraphAligner_Paths['Path2'].notna()
rows_not_none = df_GraphAligner_Paths[has_second_part]

# Display the resulting dataframe
rows_not_none

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Path Matching,Path_GraphAligner,Equal,Path1,Path2
8,13,gb|BX664015.1|-|103017-103833|ARO:3002641|APH(...,264523479.0,2645 22251,264522251,no,2645,22251
21,28,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,,5965 5967,59655967,no,5965,5967


In [26]:
# Look up the node sequence for Path1 and for Path2 by matching each of them
# against df_graph['Name']. Both merges use the default inner join, so only the
# rows whose Path1 and Path2 both match a node name are kept.
merged_df = df_GraphAligner_Paths.merge(df_graph, left_on='Path1', right_on='Name')
merged_df = merged_df.merge(df_graph, left_on='Path2', right_on='Name')

# Display the merged table (the sequences are not concatenated here)
merged_df

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Path Matching,Path_GraphAligner,Equal,Path1,Path2,Name_x,Sequence_x,Name_y,Sequence_y
0,13,gb|BX664015.1|-|103017-103833|ARO:3002641|APH(...,264523479.0,2645 22251,264522251,no,2645,22251,2645,AGAAAAACTCATCGAGCATCAAATGAAACTGCAATTTATTCATATC...,22251,ATCAGCAAAAGGGGATGATAAGTTTATCACCACCGACTATTTGCAA...
1,28,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,,5965 5967,59655967,no,5965,5967,5965,GACGGGATCAGTACCGACGGTGATATGGGGCAAATGGTGGTCACCA...,5967,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...


In [None]:
# NOTE(review): this line cannot run as written — len() is called without an
# argument, which raises TypeError. The intended selection is unclear (possibly
# the 'Sequence_x' column, or that sequence without its last character) —
# TODO confirm the intent and rewrite.
merged_df = merged_df[['Sequence_x'][:len()-1]]