# This script is used to read and compare the results from Bandage and GraphAligner
## Written by Yusreen Shah
## Date: May 8th 2023

In [1]:
#Import the libraries
import numpy as np
import pandas as pd
import re 

In [2]:
#Create a dataframe for Bandage Combined1
Bandage_Combined1= pd.read_csv('Bandageoutputcombined1.tsv', sep='\t')

In [3]:
#Create a dataframe for GraphAligner Combined1
GraphAligner_Combined1=pd.read_csv('GraphAligneroutputcombined1.tsv', sep='\t', names=["Query", "Query Length", "Query Start", 
                                          "Query End","Strand Relative Length","Path Matching","Path Length",
                                         "Start Position on Path","End Position on Path","Number of residues Matches",
                                         "Alignment Back Length","Mapping Quality","Column 1"])

## This section cleans the data from GraphAligner so that the columns can be compared easily.

In [4]:
#Format the Query column from  GraphAligner_Combined1 so that the Query column is the same as the one from Bandage
pattern = r'\[.*?\]'
def format_query(x):
    return re.sub(pattern,"", x).rstrip()
    


GraphAligner_Combined1['Query'] = GraphAligner_Combined1['Query'].map(format_query)

In [5]:
GraphAligner_Combined1['Path Matching'] = GraphAligner_Combined1['Path Matching'].str.replace('>', "")
GraphAligner_Combined1['Path Matching'] = GraphAligner_Combined1['Path Matching'].str.replace('<', "")

## This section drops the  rows with coverages < 50 from GraphAligner_Combined1

In [6]:
# Calculate the coverage for GraphAligner
GraphAligner_Combined1["Coverage"]=GraphAligner_Combined1["Alignment Back Length"]/GraphAligner_Combined1["Query Length"]*100

In [7]:
#Drop the coverages that are < 50 from GraphAligner 
GraphAligner_Combined1.drop(GraphAligner_Combined1[GraphAligner_Combined1['Coverage'] < 50].index, inplace = True)

## This section cleans the data from Bandage so that the  formatting of the path column is the same as the one in Graphaligner.

In [8]:
pattern1 = r'\(.*?\)'
def format_path(x):
    return re.sub(pattern1,"", x).rstrip()

In [9]:
Bandage_Combined1['Path']=Bandage_Combined1['Path'].map(format_path)

In [10]:
Bandage_Combined1['Path']=Bandage_Combined1['Path'].str.replace('+', "")
Bandage_Combined1['Path']=Bandage_Combined1['Path'].str.replace('-', "")

## This section creates 2 dataframes so that it is easier to compare. The Query and Resulting path are kept.
### GraphAligner_Combined_Query_Path
### Bandage_Combined_Query_Path

In [11]:
GraphAligner_Combined_Query_Path=GraphAligner_Combined1[['Query','Path Matching']]

In [12]:
Bandage_Combined_Query_Path=Bandage_Combined1[['Query','Path']]

In [13]:
#Sort the values alphabetically.
Bandage_Combined_Query_Path=Bandage_Combined_Query_Path.sort_values(by=['Query']).reset_index()

In [14]:
GraphAligner_Combined_Query_Path=GraphAligner_Combined_Query_Path.sort_values(by=['Query']).reset_index()

## This section creates df_union. The dataframe stores the Query and path from both software.

In [15]:
#Use merge operation so that we have the sequences for both softwares for successful queries
df=pd.merge(Bandage_Combined_Query_Path,GraphAligner_Combined_Query_Path, on='Query')
#union
#df_union=pd.concat([Bandage_Combined_Query_Path,GraphAligner_Combined_Query_Path], axis=1)
df_union=pd.merge(Bandage_Combined_Query_Path,GraphAligner_Combined_Query_Path, on='Query',how="outer")

In [16]:
df_union

Unnamed: 0,index_x,Query,Path,index_y,Path Matching
0,145.0,gb|AB023477|+|0-861|ARO:3001082|SHV-24,5151,132.0,5151
1,385.0,gb|AB049569|+|0-861|ARO:3000958|TEM-91,5967,372.0,5967
2,182.0,gb|AB302939|+|8-869|ARO:3001115|SHV-60,5151,169.0,5151
3,77.0,gb|AB372881|+|8-869|ARO:3001160|SHV-111,5151,64.0,5151
4,91.0,gb|AB551737|+|14-875|ARO:3001177|SHV-133,5151,78.0,5151
...,...,...,...,...,...
395,,gb|AY130284|+|0-785|ARO:3000941|TEM-75 Partial,,355.0,5967
396,,gb|AY130285|+|0-785|ARO:3000981|TEM-118 Partial,,228.0,5967
397,,gb|AY590467|+|0-729|ARO:3001110|SHV-53 Partial,,163.0,5151
398,,gb|JX050178|+|0-853|ARO:3001058|TEM-199 Partial,,303.0,5967


In [17]:
df_union.rename(columns = {'Path':'Path_Bandage'}, inplace = True)
df_union.rename(columns = {'Path Matching':'Path_GraphAligner'}, inplace = True)

In [18]:
df_union['Path_Bandage'] = df_union['Path_Bandage'].str.replace(',',"").str.strip()
df_union['Path_GraphAligner'] = df_union['Path_GraphAligner'].str.strip()


In [19]:
df_union['Path_Bandage'] = df_union['Path_Bandage'].str.findall(r'[^\s,]+').str.join('')

## This section compares the  resulting paths for each query

In [20]:

#Check for similar path for the queries
df_union['Equal'] = np.where(df_union['Path_Bandage']==df_union['Path_GraphAligner'], 
                                           'yes', 'no')

In [21]:
df_union

Unnamed: 0,index_x,Query,Path_Bandage,index_y,Path_GraphAligner,Equal
0,145.0,gb|AB023477|+|0-861|ARO:3001082|SHV-24,5151,132.0,5151,yes
1,385.0,gb|AB049569|+|0-861|ARO:3000958|TEM-91,5967,372.0,5967,yes
2,182.0,gb|AB302939|+|8-869|ARO:3001115|SHV-60,5151,169.0,5151,yes
3,77.0,gb|AB372881|+|8-869|ARO:3001160|SHV-111,5151,64.0,5151,yes
4,91.0,gb|AB551737|+|14-875|ARO:3001177|SHV-133,5151,78.0,5151,yes
...,...,...,...,...,...,...
395,,gb|AY130284|+|0-785|ARO:3000941|TEM-75 Partial,,355.0,5967,no
396,,gb|AY130285|+|0-785|ARO:3000981|TEM-118 Partial,,228.0,5967,no
397,,gb|AY590467|+|0-729|ARO:3001110|SHV-53 Partial,,163.0,5151,no
398,,gb|JX050178|+|0-853|ARO:3001058|TEM-199 Partial,,303.0,5967,no


### Print the yes if path is identical, and no if path is different

In [23]:
df_union['Equal'].value_counts()

Equal
yes    371
no      29
Name: count, dtype: int64

## This section checks the queries that do not have any path in Bandage.

In [24]:
#Get the queries that are found by Bandage only
Bandage_Combined_Query_Path_Compare= Bandage_Combined_Query_Path.drop(['index'], axis=1)
Bandage_Combined_Query_Path_Compare


Unnamed: 0,Query,Path
0,gb|AB023477|+|0-861|ARO:3001082|SHV-24,5151
1,gb|AB049569|+|0-861|ARO:3000958|TEM-91,5967
2,gb|AB302939|+|8-869|ARO:3001115|SHV-60,5151
3,gb|AB372881|+|8-869|ARO:3001160|SHV-111,5151
4,gb|AB551737|+|14-875|ARO:3001177|SHV-133,5151
...,...,...
388,gb|Y14574|+|0-861|ARO:3000888|TEM-17,5967
389,gb|Y17581|+|78-936|ARO:3000891|TEM-20,5967
390,gb|Y17582|+|0-858|ARO:3000892|TEM-21,5967
391,gb|Y17583|+|213-1071|ARO:3000893|TEM-22,5967


In [26]:

#Check for similar path for the queries
df_union['Query_Not_Bandage'] = np.where(df_union['Path_Bandage'].isna(), 
                                           'yes', 'no')

In [27]:
df_union

Unnamed: 0,index_x,Query,Path_Bandage,index_y,Path_GraphAligner,Equal,Query_Not_Bandage
0,145.0,gb|AB023477|+|0-861|ARO:3001082|SHV-24,5151,132.0,5151,yes,no
1,385.0,gb|AB049569|+|0-861|ARO:3000958|TEM-91,5967,372.0,5967,yes,no
2,182.0,gb|AB302939|+|8-869|ARO:3001115|SHV-60,5151,169.0,5151,yes,no
3,77.0,gb|AB372881|+|8-869|ARO:3001160|SHV-111,5151,64.0,5151,yes,no
4,91.0,gb|AB551737|+|14-875|ARO:3001177|SHV-133,5151,78.0,5151,yes,no
...,...,...,...,...,...,...,...
395,,gb|AY130284|+|0-785|ARO:3000941|TEM-75 Partial,,355.0,5967,no,yes
396,,gb|AY130285|+|0-785|ARO:3000981|TEM-118 Partial,,228.0,5967,no,yes
397,,gb|AY590467|+|0-729|ARO:3001110|SHV-53 Partial,,163.0,5151,no,yes
398,,gb|JX050178|+|0-853|ARO:3001058|TEM-199 Partial,,303.0,5967,no,yes


In [28]:
df_union['Query_Not_Bandage'].value_counts()

Query_Not_Bandage
no     393
yes      7
Name: count, dtype: int64

## This section checks the queries that do not have any path in GraphAligner.

In [29]:
#Check for similar path for the queries
df_union['Query_Not_GraphAligner'] = np.where(df_union['Path_GraphAligner'].isna(), 
                                           'yes', 'no')

In [30]:
df_union['Query_Not_GraphAligner'].value_counts()

Query_Not_GraphAligner
no     393
yes      7
Name: count, dtype: int64

In [31]:
rslt_df_Bandage = df_union[df_union['Query_Not_Bandage']=='yes']

### Print the Queries that are missed by Bandage

In [32]:
rslt_df_Bandage

Unnamed: 0,index_x,Query,Path_Bandage,index_y,Path_GraphAligner,Equal,Query_Not_Bandage,Query_Not_GraphAligner
393,,gb|AF527798.1|+|0-785|ARO:3000879|TEM-7 Partial,,359.0,5967,no,yes,no
394,,gb|AY130282|+|0-764|ARO:3000980|TEM-117 Partial,,227.0,5967,no,yes,no
395,,gb|AY130284|+|0-785|ARO:3000941|TEM-75 Partial,,355.0,5967,no,yes,no
396,,gb|AY130285|+|0-785|ARO:3000981|TEM-118 Partial,,228.0,5967,no,yes,no
397,,gb|AY590467|+|0-729|ARO:3001110|SHV-53 Partial,,163.0,5151,no,yes,no
398,,gb|JX050178|+|0-853|ARO:3001058|TEM-199 Partial,,303.0,5967,no,yes,no
399,,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,,285.0,59655967,no,yes,no


### Print the Queries that are missed by GraphAligner

In [33]:
rslt_df_GraphAligner = df_union[df_union['Query_Not_GraphAligner']=='yes']

In [34]:
rslt_df_GraphAligner 

Unnamed: 0,index_x,Query,Path_Bandage,index_y,Path_GraphAligner,Equal,Query_Not_Bandage,Query_Not_GraphAligner
59,372.0,gb|AF527798.1|+|0-785|ARO:3000879|TEM-7,5967,,,no,no,yes
132,240.0,gb|AY130282|+|0-764|ARO:3000980|TEM-117,5967,,,no,no,yes
133,368.0,gb|AY130284|+|0-785|ARO:3000941|TEM-75,5967,,,no,no,yes
134,241.0,gb|AY130285|+|0-785|ARO:3000981|TEM-118,5967,,,no,no,yes
162,176.0,gb|AY590467|+|0-729|ARO:3001110|SHV-53,5151,,,no,no,yes
295,316.0,gb|JX050178|+|0-853|ARO:3001058|TEM-199,5967,,,no,no,yes
353,298.0,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181,59655967,,,no,no,yes


## This section gets the sequences from GraphAligner that differs from Bandage

In [None]:
# Get the query
df_GraphAlignerPaths =np.where()