In [1]:
# import the libraries
import numpy as np
import pandas as pd
import re 
from Bio import SeqIO
from collections import defaultdict

In [2]:
# Create a dataframe for the output that have all the paths
master_df= pd.read_csv('Real_Results/CategorizeResults.tsv', sep='\t')

In [3]:
master_df.columns

Index(['Unnamed: 0', 'Query', 'Path_Bandage', 'Start_Bandage', 'End_Bandage',
       'Extracted_Path', 'Start_SPAligner', 'End_SPAligner', 'Path_SPAligner',
       'Path_GraphAligner', 'Start_GraphAligner', 'End_GraphAligner'],
      dtype='object')

In [4]:
master_df.dtypes

Unnamed: 0              int64
Query                  object
Path_Bandage           object
Start_Bandage          object
End_Bandage            object
Extracted_Path         object
Start_SPAligner       float64
End_SPAligner         float64
Path_SPAligner         object
Path_GraphAligner      object
Start_GraphAligner     object
End_GraphAligner       object
dtype: object

In [5]:
#Fill the empty column with 0
master_df= master_df.fillna(000)

In [6]:
 #List of column names that are float to convert to integers
#columns_to_convert = ['Start_SPAligner', 'End_SPAligner', 'Start_GraphAligner','End_GraphAligner']
columns_to_convert = ['Start_GraphAligner','End_GraphAligner']
# Convert the selected columns from float to integer data type
master_df[columns_to_convert] = master_df[columns_to_convert].astype(int)

ValueError: invalid literal for int() with base 10: '13, 13'

In [None]:
master_df.dtypes

In [7]:
master_df[columns_to_convert] = master_df[columns_to_convert].astype(str)

In [8]:
master_df.dtypes

Unnamed: 0              int64
Query                  object
Path_Bandage           object
Start_Bandage          object
End_Bandage            object
Extracted_Path         object
Start_SPAligner       float64
End_SPAligner         float64
Path_SPAligner         object
Path_GraphAligner      object
Start_GraphAligner     object
End_GraphAligner       object
dtype: object

In [9]:
# Remove the signs "-" and "+" from the values in the Path_SPAligner
master_df['Path_SPAligner'] = master_df['Path_SPAligner'].str.replace(r'[+-]', '', regex=True)


In [10]:
master_df

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start_Bandage,End_Bandage,Extracted_Path,Start_SPAligner,End_SPAligner,Path_SPAligner,Path_GraphAligner,Start_GraphAligner,End_GraphAligner
0,0,gb|AB113580|+|1329-2193|ARO:3002332|GES-3,"(93) 32422725-, 5613425+ (32) , (93) 32422725-...","93, 93","32, 32","32422725, 5613425,32422725, 32146774",92.0,901.0,32422725,32422725,93,956
1,1,gb|AB114632|+|655-1453|ARO:3002607|aadA7,"(61) 26862905- (858) , (22) 3906318-","61, 22","858, 3906318",268629053906318,60.0,803.0,26862905,26862905,61,858
2,2,gb|AB116260|+|1329-2193|ARO:3002333|GES-4,"(93) 32422725-, 5613425+ (32) , (93) 32422725-...","93, 93","32, 32","32422725, 5613425,32422725, 32146774",92.0,901.0,32422725,32422725,93,956
3,3,gb|AB211124|+|0-525|ARO:3002895|SAT-1,(64) 81232484- (588),64,588,81232484,63.0,533.0,81232484,81232484,64,588
4,4,gb|AB571865.1|-|143423-144308|ARO:3003742|mphG,"(353) 50056+, 2638631+, 6283287+, 5689755- (17...","353, 353, 171, 171, 171, 171","176, 176, 176, 176, 176, 176","50056, 2638631, 6283287, 5689755,50056, 263863...",352.0,884.0,50056,"50056, 2638631, 6283287, 5689755",353,1237
...,...,...,...,...,...,...,...,...,...,...,...,...
144,144,gb|Z22590|+|154-955|ARO:3001406|OXA-11,"(72) 32592252-, 26012357-, 758212-, 74766869-,...","72, 23, 72, 23, 72, 23, 72, 72, 72","52, 52, 52, 52, 55, 55, 52, 52, 55","32592252, 26012357, 758212, 74766869, 32267955...",0.0,626.0,758212,"26012357, 758212",8,808
145,145,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,0,0,0,0,131.0,1129.0,32381806,0,0,0
146,146,gb|AF043100|+|0-775|ARO:3001411|OXA-16 Partial,0,0,0,0,0.0,626.0,758212,0,0,0
147,147,gb|DQ902344.1|+|2716-3517|ARO:3001412|OXA-17 ...,0,0,0,0,0.0,626.0,758212,0,0,0


## This section compares the output between Bandage and SPAligner

In [11]:
def compare_values_Bandage_SPAligner(row):
    if row['Start_Bandage'] == row['Start_SPAligner'] and row['End_Bandage'] == row['End_SPAligner'] and row['Extracted_Path'] == row['Path_SPAligner']:
        return 'Full'
    elif row['Extracted_Path'] == row['Path_SPAligner']and row['Start_Bandage'] != row['Start_SPAligner'] and row['End_Bandage'] != row['End_SPAligner']:
        return 'MatchOnPath'
    elif row['Extracted_Path'] == row['Path_SPAligner'] and row['Start_Bandage'] == row['Start_SPAligner'] and row['End_Bandage'] != row['End_SPAligner'] :
        return 'MatchOnPathAndStart'
    elif row['Extracted_Path'] == row['Path_SPAligner'] and row['Start_Bandage'] != row['Start_SPAligner'] and row['End_Bandage'] == row['End_SPAligner'] :
        return 'MatchOnPathAndEnd'
    else:
        return 'Different'

In [12]:
master_df['BandageVSSPAligner'] = master_df.apply(compare_values_Bandage_SPAligner, axis=1)

In [13]:
master_df.to_csv("CAMIH1_Results/CategorizeResults.tsv", sep='\t', index=False)

In [14]:
output_list = []

for index, row in master_df.iterrows():
    path_values = str(row['Extracted_Path']).split(',')
    start_values = str(row['Start_Bandage']).split(',')
    end_values = str(row['End_Bandage']).split(',')
    
    path_value = row['Path_SPAligner']
    start_value= str(row['Start_SPAligner']).split(',')
    end_value= str(row['End_SPAligner']).split(',')
    
    if path_value in path_values  and start_value in start_values and end_value in end_values:
        output_list.append('SinglePathFull')
    elif path_value in path_values and start_value in start_values and end_value not in end_values:
        output_list.append('SinglePathStartMatch')
    elif path_value in path_values and start_value not in start_values and end_value in end_values:
        output_list.append('SinglePathEndMatch')
    elif path_value in path_values and start_value not in start_values and end_value not in end_values:
        output_list.append('SinglePathMatch')
    elif path_value not in path_values and start_value in start_values and end_value not in end_values:
        output_list.append('SingleStartMatch')
    else:
         output_list.append('Different')

In [15]:

master_df['ResultsBandageVSSPAligner'] = output_list

In [16]:
master_df

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start_Bandage,End_Bandage,Extracted_Path,Start_SPAligner,End_SPAligner,Path_SPAligner,Path_GraphAligner,Start_GraphAligner,End_GraphAligner,BandageVSSPAligner,ResultsBandageVSSPAligner
0,0,gb|AB113580|+|1329-2193|ARO:3002332|GES-3,"(93) 32422725-, 5613425+ (32) , (93) 32422725-...","93, 93","32, 32","32422725, 5613425,32422725, 32146774",92.0,901.0,32422725,32422725,93,956,Different,SinglePathMatch
1,1,gb|AB114632|+|655-1453|ARO:3002607|aadA7,"(61) 26862905- (858) , (22) 3906318-","61, 22","858, 3906318",268629053906318,60.0,803.0,26862905,26862905,61,858,Different,SinglePathMatch
2,2,gb|AB116260|+|1329-2193|ARO:3002333|GES-4,"(93) 32422725-, 5613425+ (32) , (93) 32422725-...","93, 93","32, 32","32422725, 5613425,32422725, 32146774",92.0,901.0,32422725,32422725,93,956,Different,SinglePathMatch
3,3,gb|AB211124|+|0-525|ARO:3002895|SAT-1,(64) 81232484- (588),64,588,81232484,63.0,533.0,81232484,81232484,64,588,MatchOnPath,SinglePathMatch
4,4,gb|AB571865.1|-|143423-144308|ARO:3003742|mphG,"(353) 50056+, 2638631+, 6283287+, 5689755- (17...","353, 353, 171, 171, 171, 171","176, 176, 176, 176, 176, 176","50056, 2638631, 6283287, 5689755,50056, 263863...",352.0,884.0,50056,"50056, 2638631, 6283287, 5689755",353,1237,Different,SinglePathMatch
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,144,gb|Z22590|+|154-955|ARO:3001406|OXA-11,"(72) 32592252-, 26012357-, 758212-, 74766869-,...","72, 23, 72, 23, 72, 23, 72, 72, 72","52, 52, 52, 52, 55, 55, 52, 52, 55","32592252, 26012357, 758212, 74766869, 32267955...",0.0,626.0,758212,"26012357, 758212",8,808,Different,Different
145,145,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,0,0,0,0,131.0,1129.0,32381806,0,0,0,Different,Different
146,146,gb|AF043100|+|0-775|ARO:3001411|OXA-16 Partial,0,0,0,0,0.0,626.0,758212,0,0,0,Different,Different
147,147,gb|DQ902344.1|+|2716-3517|ARO:3001412|OXA-17 ...,0,0,0,0,0.0,626.0,758212,0,0,0,Different,Different


In [17]:
master_df.to_csv("CAMIH1_Results/CategorizeResults.tsv", sep='\t', index=False)

## Compare the results between SPAligner and GraphAligner

In [18]:
def compare_values_GraphAligner_SPAligner(row):
    if row['Start_SPAligner'] == row['Start_GraphAligner'] and row['End_SPAligner'] == row['End_GraphAligner'] and row['Path_SPAligner'] == row['Path_GraphAligner']:
        return 'Full'
    elif row['Path_SPAligner'] == row['Path_GraphAligner']and row['Start_SPAligner'] != row['Start_GraphAligner']and row['End_SPAligner'] == row['End_GraphAligner'] :
        return 'MatchOnPathAndEnd'
    elif row['Path_SPAligner'] == row['Path_GraphAligner'] and row['Start_SPAligner'] == row['Start_GraphAligner'] and row['End_SPAligner'] != row['End_GraphAligner'] :
        return 'MatchOnPathAndStart'
    elif row['Path_SPAligner'] == row['Path_GraphAligner'] and row['Start_SPAligner'] != row['Start_GraphAligner'] and row['End_SPAligner'] != row['End_GraphAligner'] :
        return 'MatchOnPath'
    else:
        return 'Different'

In [19]:
master_df['SPAlignerVSGraphAligner'] = master_df.apply(compare_values_GraphAligner_SPAligner, axis=1)

In [20]:
master_df.to_csv("CAMIH1_Results/CategorizeResults.tsv", sep='\t', index=False)

In [21]:
output_list2 = []

for index, row in master_df.iterrows():
    path_values = str(row['Path_GraphAligner']).split(',')
    start_values = str(row['Start_GraphAligner']).split(',')
    end_values = str(row['End_GraphAligner']).split(',')
    
    path_value = row['Path_SPAligner']
    start_value= str(row['Start_SPAligner']).split(',')
    end_value= str(row['End_SPAligner']).split(',')
    
    if path_value in path_values  and start_value in start_values and end_value in end_values:
        output_list2.append('SinglePathFull')
    elif path_value in path_values and start_value in start_values and end_value not in end_values:
        output_list2.append('SinglePathStartMatch')
    elif path_value in path_values and start_value not in start_values and end_value in end_values:
        output_list2.append('SinglePathEndMatch')
    elif path_value in path_values and start_value not in start_values and end_value not in end_values:
        output_list2.append('SinglePathMatch')
    elif path_value not in path_values and start_value in start_values and end_value not in end_values:
        output_list2.append('SingleStartMatch')
    else:
         output_list2.append('Different')



In [22]:
master_df['ResultsSPAlignerVSGraphAligner'] = output_list2

In [23]:
master_df

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start_Bandage,End_Bandage,Extracted_Path,Start_SPAligner,End_SPAligner,Path_SPAligner,Path_GraphAligner,Start_GraphAligner,End_GraphAligner,BandageVSSPAligner,ResultsBandageVSSPAligner,SPAlignerVSGraphAligner,ResultsSPAlignerVSGraphAligner
0,0,gb|AB113580|+|1329-2193|ARO:3002332|GES-3,"(93) 32422725-, 5613425+ (32) , (93) 32422725-...","93, 93","32, 32","32422725, 5613425,32422725, 32146774",92.0,901.0,32422725,32422725,93,956,Different,SinglePathMatch,MatchOnPath,SinglePathMatch
1,1,gb|AB114632|+|655-1453|ARO:3002607|aadA7,"(61) 26862905- (858) , (22) 3906318-","61, 22","858, 3906318",268629053906318,60.0,803.0,26862905,26862905,61,858,Different,SinglePathMatch,MatchOnPath,SinglePathMatch
2,2,gb|AB116260|+|1329-2193|ARO:3002333|GES-4,"(93) 32422725-, 5613425+ (32) , (93) 32422725-...","93, 93","32, 32","32422725, 5613425,32422725, 32146774",92.0,901.0,32422725,32422725,93,956,Different,SinglePathMatch,MatchOnPath,SinglePathMatch
3,3,gb|AB211124|+|0-525|ARO:3002895|SAT-1,(64) 81232484- (588),64,588,81232484,63.0,533.0,81232484,81232484,64,588,MatchOnPath,SinglePathMatch,MatchOnPath,SinglePathMatch
4,4,gb|AB571865.1|-|143423-144308|ARO:3003742|mphG,"(353) 50056+, 2638631+, 6283287+, 5689755- (17...","353, 353, 171, 171, 171, 171","176, 176, 176, 176, 176, 176","50056, 2638631, 6283287, 5689755,50056, 263863...",352.0,884.0,50056,"50056, 2638631, 6283287, 5689755",353,1237,Different,SinglePathMatch,Different,SinglePathMatch
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,144,gb|Z22590|+|154-955|ARO:3001406|OXA-11,"(72) 32592252-, 26012357-, 758212-, 74766869-,...","72, 23, 72, 23, 72, 23, 72, 72, 72","52, 52, 52, 52, 55, 55, 52, 52, 55","32592252, 26012357, 758212, 74766869, 32267955...",0.0,626.0,758212,"26012357, 758212",8,808,Different,Different,Different,Different
145,145,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,0,0,0,0,131.0,1129.0,32381806,0,0,0,Different,Different,Different,Different
146,146,gb|AF043100|+|0-775|ARO:3001411|OXA-16 Partial,0,0,0,0,0.0,626.0,758212,0,0,0,Different,Different,Different,Different
147,147,gb|DQ902344.1|+|2716-3517|ARO:3001412|OXA-17 ...,0,0,0,0,0.0,626.0,758212,0,0,0,Different,Different,Different,Different


In [24]:
master_df.to_csv("CAMIH1_Results/CategorizeResults.tsv", sep='\t', index=False)

In [25]:
def compare_values_Bandage_GraphAligner(row):
    if row['Start_Bandage'] == row['Start_GraphAligner'] and row['End_Bandage'] == row['End_GraphAligner'] and row['Extracted_Path'] == row['Path_GraphAligner']:
        return 'Full'
    elif row['Extracted_Path'] == row['Path_GraphAligner']  and row['Start_Bandage'] != row['Start_GraphAligner'] and row['End_Bandage'] != row['End_GraphAligner'] :
        return 'MatchOnPath'
    elif row['Extracted_Path'] == row['Path_GraphAligner'] and row['Start_Bandage'] == row['Start_GraphAligner'] and row['End_Bandage'] != row['End_GraphAligner'] :
        return 'MatchOnPathAndStart'
    elif row['Extracted_Path'] == row['Path_GraphAligner'] and row['Start_Bandage'] != row['Start_GraphAligner'] and row['End_Bandage'] == row['End_GraphAligner'] :
        return 'MatchOnPathAndEnd'
    else:
        return 'Different'

In [26]:
master_df['BandageVSGraphAligner'] = master_df.apply(compare_values_Bandage_GraphAligner, axis=1)

In [27]:
master_df.to_csv("CAMIH1_Results/CategorizeResults.tsv", sep='\t', index=False)

In [28]:
output_list3 = []

for index, row in master_df.iterrows():
    path_values = str(row['Extracted_Path']).split(',')
    start_values = str(row['Start_Bandage']).split(',')
    end_values = str(row['End_Bandage']).split(',')
    
    path_value = row['Path_GraphAligner']
    start_value= str(row['Start_GraphAligner']).split(',')
    end_value= str(row['End_GraphAligner']).split(',')
    
    if path_value in path_values  and start_value in start_values and end_value in end_values:
        output_list3.append('SinglePathFull')
    elif path_value in path_values and start_value in start_values and end_value not in end_values:
        output_list3.append('SinglePathStartMatch')
    elif path_value in path_values and start_value not in start_values and end_value in end_values:
        output_list3.append('SinglePathEndMatch')
    elif path_value in path_values and start_value not in start_values and end_value not in end_values:
        output_list3.append('SinglePathMatch')
    elif path_value not in path_values and start_value in start_values and end_value not in end_values:
        output_list3.append('SingleStartMatch')
    else:
         output_list3.append('Different')

In [29]:
master_df['ResultsBandageVSGraphAligner'] = output_list3

In [30]:
master_df.to_csv("CAMIH1_Results/CategorizeResults.tsv", sep='\t', index=False)

In [31]:
def determine_final_result_1(row):
    if row['BandageVSGraphAligner'] == 'Full':
        return 'Full'
    else:
        return row['ResultsBandageVSGraphAligner']

In [32]:
master_df['FinalResultBandageVSGraphAligner'] = master_df.apply(determine_final_result_1, axis=1)

In [33]:
master_df.to_csv("Real_Results/CategorizeResults.tsv", sep='\t', index=False)