# This script is used to read and compare the results from Bandage and GraphAligner
## Written by Yusreen Shah
## Date: May 8th 2023

In [2]:
#Import the libraries
import numpy as np
import pandas as pd
import re 

In [3]:
#Create a dataframe for Bandage Combined1
Bandage_Combined1= pd.read_csv('Bandageoutputcombined1.tsv', sep='\t')

In [4]:
#Create a dataframe for GraphAligner Combined1
GraphAligner_Combined1=pd.read_csv('GraphAligneroutputcombined1.tsv', sep='\t', names=["Query", "Query Length", "Query Start", 
                                          "Query End","Strand Relative Length","Path Matching","Path Length",
                                         "Start Position on Path","End Position on Path","Number of residues Matches",
                                         "Alignment Back Length","Mapping Quality","Column 1"])

## This section cleans the data from GraphAligner so that the columns can be compared easily.

In [5]:
#Format the Query column from  GraphAligner_Combined1 so that the Query column is the same as the one from Bandage
pattern = r'\[.*?\]'
def format_query(x):
    return re.sub(pattern,"", x).rstrip()
    


GraphAligner_Combined1['Query'] = GraphAligner_Combined1['Query'].map(format_query)

In [None]:
# Add Braces to the values that now have ','
def remove_angle(value):
    if '>' or '<' in value:
        return value
    else:
        return value

In [6]:
GraphAligner_Combined1['Path Matching_1'] = GraphAligner_Combined1['Path Matching'].str.replace('>', "")
#GraphAligner_Combined1['Path Matching_1'] = GraphAligner_Combined1['Path Matching'].str.replace('<', "")

In [7]:
GraphAligner_Combined1['Path Matching_1'] = GraphAligner_Combined1['Path Matching'].str.replace('<', "")
GraphAligner_Combined1

Unnamed: 0,Query,Query Length,Query Start,Query End,Strand Relative Length,Path Matching,Path Length,Start Position on Path,End Position on Path,Number of residues Matches,Alignment Back Length,Mapping Quality,Column 1,Path Matching_1
0,gb|U59183|+|247-859|ARO:3002581|AAC(6')-Ib10,612,25,612,+,>7593,2524,55,642,586,587,255,NM:i:1,>7593
1,gb|U59183|+|247-859|ARO:3002581|AAC(6')-Ib10,612,2,18,+,<1851,79238,37281,37296,15,16,255,NM:i:1,1851
2,gb|AY136758|+|377-947|ARO:3002582|AAC(6')-Ib11,570,20,570,+,>7593,2524,92,642,547,550,255,NM:i:3,>7593
3,gb|FJ854362|+|1702-2257|ARO:3002576|AAC(6')-Ib3,555,0,555,+,>7593,2524,87,642,553,555,255,NM:i:2,>7593
4,gb|AF445082|+|2788-3343|ARO:3002577|AAC(6')-Ib4,555,0,555,+,>7593,2524,87,642,553,555,255,NM:i:2,>7593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,gb|AJ318094|+|0-861|ARO:3000961|TEM-94,861,0,861,+,>5967,1044,36,897,852,861,255,NM:i:9,>5967
376,gb|AJ308558|+|181-1042|ARO:3000962|TEM-95,861,0,861,+,>5967,1044,36,897,859,861,255,NM:i:2,>5967
377,gb|AY092401|+|0-861|ARO:3000963|TEM-96,861,0,861,+,>5967,1044,36,897,858,861,255,NM:i:3,>5967
378,gb|AM990992.1|-|1001760-1003680|ARO:3000186|tetM,1920,0,1920,+,<1107,47514,33707,35627,1874,1920,255,NM:i:46,1107


In [None]:
GraphAligner_Combined1.to_csv("test_Grap_2.tsv",sep='\t')

## This section drops the  rows with coverages < 50 from GraphAligner_Combined1

In [None]:
# Calculate the coverage for GraphAligner
GraphAligner_Combined1["Coverage"]=GraphAligner_Combined1["Alignment Back Length"]/GraphAligner_Combined1["Query Length"]*100

In [None]:
#Drop the coverages that are < 50 from GraphAligner 
GraphAligner_Combined1.drop(GraphAligner_Combined1[GraphAligner_Combined1['Coverage'] < 50].index, inplace = True)
GraphAligner_Combined1
GraphAligner_Combined1.to_csv("Test_com.tsv", sep='\t')

## This section cleans the data from Bandage so that the  formatting of the path column is the same as the one in Graphaligner.

In [None]:
pattern1 = r'\(.*?\)'
def format_path(x):
    return re.sub(pattern1,"", x).rstrip()

In [None]:
Bandage_Combined1['Path']=Bandage_Combined1['Path'].map(format_path)

In [None]:
Bandage_Combined1['Path']=Bandage_Combined1['Path'].str.replace('+', "")
Bandage_Combined1['Path']=Bandage_Combined1['Path'].str.replace('-', "")

## This section creates 2 dataframes so that it is easier to compare. The Query and Resulting path are kept.
### GraphAligner_Combined_Query_Path
### Bandage_Combined_Query_Path

In [None]:
GraphAligner_Combined_Query_Path=GraphAligner_Combined1[['Query','Path Matching']]

In [None]:
Bandage_Combined_Query_Path=Bandage_Combined1[['Query','Path']]

In [None]:
#Sort the values alphabetically.
Bandage_Combined_Query_Path=Bandage_Combined_Query_Path.sort_values(by=['Query']).reset_index()

In [None]:
GraphAligner_Combined_Query_Path=GraphAligner_Combined_Query_Path.sort_values(by=['Query']).reset_index()

## This section creates df_union. The dataframe stores the Query and path from both software.

In [None]:
#Use merge operation so that we have the sequences for both softwares for successful queries
df=pd.merge(Bandage_Combined_Query_Path,GraphAligner_Combined_Query_Path, on='Query')
#union
#df_union=pd.concat([Bandage_Combined_Query_Path,GraphAligner_Combined_Query_Path], axis=1)
df_union=pd.merge(Bandage_Combined_Query_Path,GraphAligner_Combined_Query_Path, on='Query',how="outer")

In [None]:
df_union

In [None]:
df_union.rename(columns = {'Path':'Path_Bandage'}, inplace = True)
df_union.rename(columns = {'Path Matching':'Path_GraphAligner'}, inplace = True)

In [None]:
df_union['Path_Bandage'] = df_union['Path_Bandage'].str.replace(',',"").str.strip()
df_union['Path_GraphAligner'] = df_union['Path_GraphAligner'].str.strip()


In [None]:
df_union['Path_Bandage'] = df_union['Path_Bandage'].str.findall(r'[^\s,]+').str.join('')

## This section compares the  resulting paths for each query

In [None]:

#Check for similar path for the queries
df_union['Equal'] = np.where(df_union['Path_Bandage']==df_union['Path_GraphAligner'], 
                                           'yes', 'no')

In [None]:
df_union

### Print the yes if path is identical, and no if path is different

In [None]:
df_union['Equal'].value_counts()

In [None]:
# Create a new dataframe to store the extracted rows
df_path_notEqual = df_union[df_union['Equal'] == 'no'].copy().reset_index()

# Print the new dataframe
df_path_notEqual

In [None]:
#Drop the rows where Path_GraphAligner= NaN
df_path_notEqual.dropna(subset=['Path_GraphAligner'], inplace=True)
df_path_notEqual.reset_index()
df_path_notEqual.drop(['index','index_x','index_y'],axis=1,inplace=True)
df_path_notEqual.to_csv("GraphAlignerNotEqual.tsv",sep='\t')

In [None]:
df_path_notEqual

## This section checks the queries that do not have any path in Bandage.

In [None]:
#Get the queries that are found by Bandage only
Bandage_Combined_Query_Path_Compare= Bandage_Combined_Query_Path.drop(['index'], axis=1)
Bandage_Combined_Query_Path_Compare


In [None]:

#Check for similar path for the queries
df_union['Query_Not_Bandage'] = np.where(df_union['Path_Bandage'].isna(), 
                                           'yes', 'no')

In [None]:
df_union

In [None]:
df_union['Query_Not_Bandage'].value_counts()

## This section checks the queries that do not have any path in GraphAligner.

In [None]:
#Check for similar path for the queries
df_union['Query_Not_GraphAligner'] = np.where(df_union['Path_GraphAligner'].isna(), 
                                           'yes', 'no')

In [None]:
df_union['Query_Not_GraphAligner'].value_counts()

In [None]:
rslt_df_Bandage = df_union[df_union['Query_Not_Bandage']=='yes']

### Print the Queries that are missed by Bandage

In [None]:
rslt_df_Bandage

### Print the Queries that are missed by GraphAligner

In [None]:
rslt_df_GraphAligner = df_union[df_union['Query_Not_GraphAligner']=='yes']

In [None]:
rslt_df_GraphAligner 


In [None]:
#Save the queries missed by GraphAligner in a .tsv file
rslt_df_GraphAligner.to_csv("GraphAlignerMissedQueries.tsv",sep='\t')

## This section gets the sequences from GraphAligner that differs from Bandage

In [None]:
# Get the query
df_GraphAlignerPaths =np.where()