## This notebook compares the results from all three tools: Bandage, SPAligner and GraphAligner.
## Written by Yusreen Shah
## Date: May 10th 2023

In [1]:
# import the libraries
import numpy as np
import pandas as pd
import re 
import Levenshtein
from Bio import SeqIO
from collections import defaultdict


## This section saves the list of all the queries in a dataframe

In [2]:
# Parse the query FASTA file, collecting each record's identifier and its
# sequence as a plain string.
fasta_file = "CAMIM2_graph/combined_2.fasta"

# (id, sequence) pairs in file order
fasta_records = [(rec.id, str(rec.seq)) for rec in SeqIO.parse(fasta_file, "fasta")]

# Keep the two parallel lists available under their usual names
queries = [query_id for query_id, _ in fasta_records]
sequences = [sequence for _, sequence in fasta_records]

# One row per FASTA record
df = pd.DataFrame({'Query': queries, 'Sequence': sequences})

# Display the DataFrame
print(df)


                                                Query   
0   gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...  \
1            gb|L12710|+|0-549|ARO:3002556|AAC(6')-Ii   
2      gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy   
3     gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-IIa   
4   gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib   
5       gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id   
6              gb|AB765395|+|0-1143|ARO:3002993|AQU-1   
7            gb|KF730243.1|+|0-1143|ARO:3004647|AQU-2   
8            gb|KF730244.1|+|0-1149|ARO:3004648|AQU-3   
9   gb|AL009126|+|916777-919348|ARO:3003324|Bacill...   
10               gb|X06599|+|272-1193|ARO:3002877|BcI   
11           gb|KF526113|+|0-1146|ARO:3002112|CMY-100   
12           gb|KF526114|+|0-1146|ARO:3002113|CMY-101   
13         gb|JX440350|+|1026-2172|ARO:3002083|CMY-70   
14         gb|JX440349|+|1026-2172|ARO:3002087|CMY-74   
15            gb|KJ207203|+|5-1151|ARO:3002095|CMY-82   
16         gb|JX440351|+|1026-2

In [3]:
# Defensive clean-up of the sequence strings: remove a literal "Seq(" and
# any ")" characters, then trim surrounding spaces and single quotes.
# regex=False makes both patterns literal text on every pandas version:
# "Seq(" is not a valid regular expression (unbalanced parenthesis), so it
# raises re.error whenever pandas interprets the pattern as a regex.
df['Sequence'] = (
    df['Sequence']
    .str.replace('Seq(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.strip(" \' ")
    .str.rstrip()
)

In [4]:
# Order the queries by identifier and renumber the rows from 0
df = df.sort_values("Query").reset_index(drop=True)
df

Unnamed: 0,Query,Sequence
0,gb|AB765395|+|0-1143|ARO:3002993|AQU-1,ATGAAGCAAACCTCACCCTTGTCGTCGCTGGCGCTGAGCGCCCTGC...
1,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
2,gb|AE016877.1|+|1972251-1972668|ARO:3000172|FosB,TTGTTAAGGGGAATCAATCATATTTGTTTTTCGGTATCTAATTTAG...
3,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...
4,gb|AF118110.1|-|71-1037|ARO:3003002|CfxA2,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...
5,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...
6,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...
7,gb|AF472622|+|52-1018|ARO:3003003|CfxA3,ATGGAAAAAAACAGAAAAAAACAAATCGTAGTTTTGAGTATAGCTT...
8,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,ATGAGGGAAGCGGTGATCGCCGAAGTATCGACTCAACTATCAGAGG...
9,gb|AJ222769.3|+|3686-5606|ARO:3000194|tetW,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...


In [5]:
df.to_csv("testSequences.tsv",sep="\t")

## This section is used to compare the results from Bandage and the actual list of queries.

In [6]:
# Create a dataframe for Bandage Combined1 from the tab-separated Bandage results
# NOTE(review): "Resullts" in the directory name looks like a typo, but it must match the folder on disk - confirm before renaming
Bandage_Combined1= pd.read_csv('CAMIM2_Resullts/outputCAMIM2Bandage.tsv', sep='\t')
Bandage_Combined1

Unnamed: 0,Query,Path,Length,Query covered by path,Query covered by hits,Mean hit identity,Total hit mismatches,Total hit gap opens,Relative length,Length discrepancy,E-value product,Sequence
0,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,(3687) 118828- (4123),437,100%,100%,96.58%,14,1,99.7717%,-1,0.0,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
1,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,(3687) 118828- (4123),437,100%,100%,96.12%,16,1,99.7717%,-1,0.0,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
2,gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-IIa,(378) 28874510+,935,96.6049%,96.6049%,99.47%,1,2,99.574%,-4,0.0,GTGGTAACGGCGCAGTGGCGGTTTTCATGGCTTGTTATGACTGTTT...
3,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,(1326) 2007548+ (2129),804,100%,100%,99.88%,1,0,100%,0,0.0,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...
4,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,(2129) 2007548+ (2965),837,100%,100%,99.76%,2,0,100%,0,0.0,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...
...,...,...,...,...,...,...,...,...,...,...,...,...
57,gb|AJ222769.3|+|3686-5606|ARO:3000194|tetW,(565) 136860+ (2484),1920,100%,100%,98.8%,23,0,100%,0,0.0,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...
58,gb|KU736867.1|+|19653-21573|ARO:3004442|tet(W/...,(565) 136860+ (2477),1913,99.6354%,99.6354%,96.45%,66,2,100%,0,0.0,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...
59,gb|AL009126.3|-|339155-339749|ARO:3003059|tmrB,(46387) 2066834- (46980),594,100%,100%,98.82%,7,0,100%,0,0.0,ATGATCATTTGGATAAACGGGGCATTCGGTTCCGGAAAAACACAAA...
60,gb|U00096|-|2098446-2099613|ARO:3003577|ugd,"(523) 827680+, 120721+ (732)",1167,100%,100%,97.8718%,26,0,100%,0,0.0,ATGAAAATCACCATTTCCGGTACTGGCTATGTCGGCTTGTCAAACG...


In [7]:
# Perform a right merge on the 'Query' column: every Bandage row is kept and the
# reference sequence from df is attached where the query name matches.
# Both frames have a 'Sequence' column, so the result carries 'Sequence_x' (from df)
# and 'Sequence_y' (from Bandage).
merged_df_test = pd.merge(df,Bandage_Combined1, on='Query', how='right')

merged_df_test

Unnamed: 0,Query,Sequence_x,Path,Length,Query covered by path,Query covered by hits,Mean hit identity,Total hit mismatches,Total hit gap opens,Relative length,Length discrepancy,E-value product,Sequence_y
0,gb|AE006468.2|+|1707351-1707789|ARO:3002571|AA...,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...,(3687) 118828- (4123),437,100%,100%,96.58%,14,1,99.7717%,-1,0.0,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
1,gb|AF144880|+|3541-3979|ARO:3002569|AAC(6')-Iy,ATGGACATCAGGCAAATGAACAAAACCCATCTGGAGCACTGGCGCG...,(3687) 118828- (4123),437,100%,100%,96.12%,16,1,99.7717%,-1,0.0,ATGGACATCAGGCAAATGAACAGAACCCATCTGGATCACTGGCGCG...
2,gb|X02340.1|+|222-1194|ARO:3004089|ANT(3'')-IIa,GTGGTAACGGCGCAGTGGCGGTTTTCATGGCTTCTTGTTATGACAT...,(378) 28874510+,935,96.6049%,96.6049%,99.47%,1,2,99.574%,-4,0.0,GTGGTAACGGCGCAGTGGCGGTTTTCATGGCTTGTTATGACTGTTT...
3,gb|AF313472|+|15593-16397|ARO:3002639|APH(3'')-Ib,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...,(1326) 2007548+ (2129),804,100%,100%,99.88%,1,0,100%,0,0.0,TTGAATCGAACTAATATTTTTTTTGGTGAATCGCATTCTGACTGGT...
4,gb|AF024602|+|3155-3992|ARO:3002660|APH(6)-Id,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...,(2129) 2007548+ (2965),837,100%,100%,99.76%,2,0,100%,0,0.0,ATGTTCATGCCGCCTGTTTTTCCTGCTCATTGGCACGTTTCGCAAC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,gb|AJ222769.3|+|3686-5606|ARO:3000194|tetW,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,(565) 136860+ (2484),1920,100%,100%,98.8%,23,0,100%,0,0.0,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...
58,gb|KU736867.1|+|19653-21573|ARO:3004442|tet(W/...,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...,(565) 136860+ (2477),1913,99.6354%,99.6354%,96.45%,66,2,100%,0,0.0,ATGAAAATAATCAATATTGGAATTCTTGCCCATGTAGACGCTGGAA...
59,gb|AL009126.3|-|339155-339749|ARO:3003059|tmrB,ATGATCATTTGGATAAACGGGGCATTCGGTTCGGGAAAAACACAAA...,(46387) 2066834- (46980),594,100%,100%,98.82%,7,0,100%,0,0.0,ATGATCATTTGGATAAACGGGGCATTCGGTTCCGGAAAAACACAAA...
60,gb|U00096|-|2098446-2099613|ARO:3003577|ugd,ATGAAAATCACCATTTCCGGTACTGGCTATGTAGGCTTGTCAAACG...,"(523) 827680+, 120721+ (732)",1167,100%,100%,97.8718%,26,0,100%,0,0.0,ATGAAAATCACCATTTCCGGTACTGGCTATGTCGGCTTGTCAAACG...


In [None]:
# Calculate the edit distance between 'Sequence_x' (reference) and 'Sequence_y'
# (Bandage) for each row. Levenshtein.distance compares its two arguments as a
# whole, so it has to be called once per pair of strings; passing the two
# columns directly cannot produce one distance per row.
# Rows where either sequence is missing (not a string) get NaN.
merged_df_test['Distance'] = [
    Levenshtein.distance(reference, hit)
    if isinstance(reference, str) and isinstance(hit, str)
    else np.nan
    for reference, hit in zip(merged_df_test['Sequence_x'], merged_df_test['Sequence_y'])
]
# Create a new dataframe with unique values in the 'Query' column
df_unique = df.drop_duplicates(subset='Query', keep='first')


#merged_df_test.to_csv("CAMIH1_Results/TestDistance.tsv",sep="\t")
# NOTE(review): the file written below contains df_unique (queries and sequences),
# not merged_df_test, so the 'Distance' column is not saved - confirm this is intended
df_unique
df_unique.to_csv("CAMIM2_Resullts/TestDistance.tsv",sep="\t")

In [None]:
df_unique

In [None]:
#len(df.index)
Bandage_Combined1.dtypes

In [None]:
# Keep only the columns needed for path parsing. The explicit .copy() makes this
# an independent frame: later cells add columns to it, and assigning to a bare
# column selection triggers pandas' SettingWithCopyWarning.
Bandage_Combined_Query_Path = Bandage_Combined1[['Path','Query','Sequence']].copy()

## Extract the start and end position from the path in Bandage

In [None]:
# Extract the start position
# Regex: captures (non-greedily) the text between an opening and a closing parenthesis
pattern_path_start = r'\((.*?)\)' 

In [None]:
#Extract the node position
# Regex: captures a run of digits that is followed by optional whitespace and a '+' or '-' sign
pattern_path_node = r'\b(\d+)\s*[+-]'

In [None]:
# Collect, per row, the list of all digit runs in Path that pattern_path_node matches
Bandage_Combined_Query_Path['Extracted_Path'] = Bandage_Combined_Query_Path['Path'].apply(lambda x: re.findall(pattern_path_node, x))

In [None]:
# Turn each list of captured node numbers into one comma-separated string,
# e.g. ['827680', '120721'] -> "827680, 120721" (an empty list becomes "")
Bandage_Combined_Query_Path['Extracted_Path'] = Bandage_Combined_Query_Path['Extracted_Path'].apply(
    lambda node_ids: ', '.join(str(int(node_id)) for node_id in node_ids)
)


In [None]:

Bandage_Combined_Query_Path.to_csv("test4.tsv",sep="\t")

In [None]:
# Start = the text inside the first pair of parentheses in Path (first match of pattern_path_start)
Bandage_Combined_Query_Path['Start']=Bandage_Combined_Query_Path['Path'].str.extract(pattern_path_start, expand=False)
Bandage_Combined_Query_Path

In [None]:
# Extract the end position: take the last whitespace-separated token of Path
# NOTE(review): for a path without a trailing "(end)", e.g. "(378) 28874510+", the last
# token is the node name rather than a position - confirm this is intended
Bandage_Combined_Query_Path['End']= Bandage_Combined_Query_Path["Path"].str.split().str[-1]


In [None]:
# Peel the decorations off the last path token (parentheses first, then '+',
# then '-') and store what is left as an integer
end_tokens = Bandage_Combined_Query_Path['End']
for unwanted_chars in ('()', '+', '-'):
    end_tokens = end_tokens.str.strip(unwanted_chars)
Bandage_Combined_Query_Path['End'] = end_tokens.astype(int)
Bandage_Combined_Query_Path

In [None]:
# Put the columns of Bandage_Combined_Query_Path in presentation order
Bandage_Combined_Query_Path = Bandage_Combined_Query_Path[
    ['Query', 'Path', 'Start', 'Extracted_Path', 'End', 'Sequence']
]

# Every column that gets aggregated must hold text so its values can be joined
for text_column in ('Path', 'End', 'Start', 'Extracted_Path', 'Sequence'):
    Bandage_Combined_Query_Path[text_column] = Bandage_Combined_Query_Path[text_column].astype(str)

# Collapse all hits of the same query into one row; each column has its own separator
join_rules = {
    'Path': ' , '.join,
    'Start': ', '.join,
    'End': ', '.join,
    'Extracted_Path': ','.join,
    'Sequence': ' '.join,
}
merge_Bandage_df = Bandage_Combined_Query_Path.groupby('Query').agg(join_rules).reset_index()
merge_Bandage_df


In [None]:
# Add braces to the values that contain ','
def add_braces(value):
    """Return ``value`` wrapped in curly braces if it contains a comma, else unchanged."""
    return '{' + value + '}' if ',' in value else value
#merge_Bandage_df['Path'] = merge_Bandage_df['Path'].apply(add_braces)
#merge_Bandage_df['Start'] = merge_Bandage_df['Start'].apply(add_braces)
#merge_Bandage_df['End'] = merge_Bandage_df['End'].apply(add_braces)
#merge_Bandage_df['Sequence'] = merge_Bandage_df['Sequence'].apply(add_braces)

In [None]:
# Store the dataframe containing the Bandage results to a .tsv file
merge_Bandage_df.to_csv("Test1.tsv", sep="\t")

In [None]:
#Outer merge on 'Query': keeps every query from df and every aggregated Bandage row,
#so queries without a Bandage hit stay in the table with missing Bandage values
df_QueryAndResultsBandage=pd.merge(df,merge_Bandage_df, on='Query',how="outer")

## Compile the Queries and the results from Bandage in one dataframe

In [None]:
df_QueryAndResultsBandage

In [None]:
#Print the columns from the dataframe
df_QueryAndResultsBandage.columns

In [None]:
#Give the merged Bandage/query columns tool-specific names in a single rename call
bandage_column_names = {
    'Sequence_x': 'Sequence',
    'Sequence_y': 'Sequence_Bandage',
    'Length': 'Length_Bandage',
    'Path': 'Path_Bandage',
    'Start': 'Start_Bandage',
    'End': 'End_Bandage',
}
df_QueryAndResultsBandage.rename(columns=bandage_column_names, inplace=True)

## Save the Queries and Results from Bandage in one .tsv file

In [None]:
df_QueryAndResultsBandage.to_csv("CompiledQueriesBandage.tsv",sep="\t")
df_QueryAndResultsBandage

## This section is used to compare the results from SPAligner and the actual list of queries.

In [None]:
#Create a dataframe for SPAligner Combined1
# Column names applied to the SPAligner table, in file order
spaligner_columns = [
    "Query",
    "Start position of alignment on sequence",
    "End position of alignment on sequence",
    "Start position of alignment on the first edge of the path",
    "End position of alignment on the last edge of the path",
    "Path of alignment",
    "Length of the alignment on each edge of the path",
    "Sequence Length",
    "Sequence",
]
SPAligner_Combined1 = pd.read_csv('RealSPAligner.tsv', sep='\t', names=spaligner_columns)
SPAligner_Combined1

In [None]:
#get the columns from SPAligner
SPAligner_Combined1.columns

In [None]:
#Strip "[...]" sections from SPAligner query names so they match the Query column from Bandage
pattern = r'\[.*?\]'
def format_query(x):
    """Return ``x`` with every bracketed ``[...]`` section removed and trailing whitespace trimmed."""
    without_brackets = re.sub(pattern, "", x)
    return without_brackets.rstrip()
    


# Apply format_query to every SPAligner query name
SPAligner_Combined1['Query'] = SPAligner_Combined1['Query'].map(format_query)


In [None]:
#Get the query and sequence column from SPAligner_Combined1
# NOTE(review): this two-column selection is overwritten by the wider selection in the next cell
SPAligner_Combined_Query_Path=SPAligner_Combined1[['Query','Sequence']]


In [None]:
#Get the query, path start/end, sequence length, path and sequence columns from SPAligner_Combined1
# The explicit .copy() makes this an independent frame: the following cell renames
# its columns in place, which on a bare column selection triggers pandas'
# SettingWithCopyWarning.
SPAligner_Combined_Query_Path=SPAligner_Combined1[['Query','Start position of alignment on the first edge of the path', 'End position of alignment on the last edge of the path', 'Sequence Length',
'Path of alignment','Sequence']].copy()
SPAligner_Combined_Query_Path

In [None]:
#Give the SPAligner columns short, tool-specific names in a single rename call
spaligner_column_names = {
    'Start position of alignment on the first edge of the path': 'Start_SPAligner',
    'End position of alignment on the last edge of the path': 'End_SPAligner',
    'Sequence Length': "Length_SPAligner",
    'Path of alignment': "Path_SPAligner",
    'Sequence': "Sequence_SPAligner",
}
SPAligner_Combined_Query_Path.rename(columns=spaligner_column_names, inplace=True)

SPAligner_Combined_Query_Path

In [None]:
SPAligner_Combined_Query_Path

## Compile the Queries and the results from SPAligner in one dataframe

In [None]:
#Right merge on 'Query': every SPAligner row is kept and the reference sequence
#from df is attached where the query name matches
df_QueryAndResultsSPAligner=pd.merge(df,SPAligner_Combined_Query_Path, on='Query',how='right')

df_QueryAndResultsSPAligner

In [None]:
df_QueryAndResultsSPAligner.to_csv("TestCompiledQueriesSPAligner.tsv",sep="\t")

In [None]:
#Rename the suffixed sequence columns from the merge of SPAligner and the query list
# NOTE(review): the SPAligner sequence column was already renamed to 'Sequence_SPAligner'
# before the merge, so 'Sequence_x'/'Sequence_y' are probably absent and these calls
# change nothing - confirm
df_QueryAndResultsSPAligner.rename(columns = {'Sequence_x':'Sequence'}, inplace = True)
df_QueryAndResultsSPAligner.rename(columns = {'Sequence_y':'Sequence_SPAligner'}, inplace = True)

In [None]:
# Shift the SPAligner start position by one
# NOTE(review): presumably a 0-based to 1-based conversion - confirm against the SPAligner output format
# Re-running this cell adds one again, so run it once per kernel session
df_QueryAndResultsSPAligner['Start_SPAligner']=df_QueryAndResultsSPAligner['Start_SPAligner']+1

df_QueryAndResultsSPAligner

In [None]:
df_QueryAndResultsSPAligner.to_csv("TestCompiledQueriesSPAligner.tsv",sep="\t")

## This section is used to compare the results from GraphAligner and the actual list of queries.

In [None]:
#Create a dataframe for GraphAligner Combined1
# Column names applied to the GraphAligner table, in file order
graphaligner_columns = [
    "Query",
    "Query Length",
    "Query Start",
    "Query End",
    "Strand Relative Length",
    "Path Matching",
    "Path Length",
    "Start Position on Path",
    "End Position on Path",
    "Number of residues Matches",
    "Alignment Back Length",
    "Mapping Quality",
    "Column 1",
]
GraphAligner_Combined1 = pd.read_csv('RealGraphAligner.tsv', sep='\t', names=graphaligner_columns)
GraphAligner_Combined1

In [None]:
#Strip "[...]" sections from GraphAligner query names so they match the Query column from Bandage
pattern = r'\[.*?\]'
def format_query(x):
    """Return ``x`` with every bracketed ``[...]`` section removed and trailing whitespace trimmed."""
    without_brackets = re.sub(pattern, "", x)
    return without_brackets.rstrip()
    


# Apply format_query to every GraphAligner query name
GraphAligner_Combined1['Query'] = GraphAligner_Combined1['Query'].map(format_query)

In [None]:
# Replace the '>' and '<' markers in the path with spaces, one marker at a time
for orientation_marker in ('>', '<'):
    GraphAligner_Combined1['Path Matching'] = GraphAligner_Combined1['Path Matching'].str.replace(orientation_marker, " ")

## This section drops the rows with coverage below 50% from GraphAligner_Combined1

In [None]:
# Calculate the coverage for GraphAligner as a percentage:
# "Alignment Back Length" divided by "Query Length", times 100
GraphAligner_Combined1["Coverage"]=GraphAligner_Combined1["Alignment Back Length"]/GraphAligner_Combined1["Query Length"]*100

In [None]:
GraphAligner_Combined1

In [None]:
#Remove, in place, every GraphAligner row whose coverage is below 50
low_coverage_index = GraphAligner_Combined1.index[GraphAligner_Combined1['Coverage'] < 50]
GraphAligner_Combined1.drop(low_coverage_index, inplace=True)


In [None]:
# Shift the start position on the path by one
# NOTE(review): presumably a 0-based to 1-based conversion - confirm. Re-running this cell shifts the start again.
GraphAligner_Combined1['Start Position on Path']=GraphAligner_Combined1['Start Position on Path']+1
# Self-assignment: leaves the end position unchanged
GraphAligner_Combined1['End Position on Path']=GraphAligner_Combined1['End Position on Path']
GraphAligner_Combined1

In [None]:
#Concatenate the values from Path_Matching
# Split each path on whitespace and re-join the pieces with ", "
GraphAligner_Combined1['Path Matching'] = GraphAligner_Combined1['Path Matching'].apply(lambda x: ', '.join(x.split()))


In [None]:
#Get the specific columns from GraphAligner
# The explicit .copy() makes this an independent frame, so converting its columns
# does not trigger pandas' SettingWithCopyWarning.
GraphAligner_Combined_Query_Path = GraphAligner_Combined1[
    ['Query', 'Path Matching', 'Start Position on Path', 'End Position on Path']
].copy()
# Convert every selected column to string so the values can be joined per query later
GraphAligner_Combined_Query_Path = GraphAligner_Combined_Query_Path.astype(str)



In [None]:
#Collapse all GraphAligner alignments of the same query into one row by joining their values:
#paths are joined with a single space, start and end positions with ", "
merged_df_Graph = GraphAligner_Combined_Query_Path.groupby('Query').agg({'Path Matching':' '.join, 'Start Position on Path': ', '.join,'End Position on Path': ', '.join}).reset_index()
merged_df_Graph

In [None]:
# Give the aggregated GraphAligner columns tool-specific names in a single rename call
graphaligner_column_names = {
    'Path Matching': 'Path_GraphAligner',
    'Start Position on Path': 'Start_GraphAligner',
    'End Position on Path': 'End_GraphAligner',
}
merged_df_Graph.rename(columns=graphaligner_column_names, inplace=True)

In [None]:
merged_df_Graph

## Compile the Queries and the results from GraphAligner in one dataframe

In [None]:
#Left merge on 'Query': every query from df is kept; queries without a GraphAligner
#row get missing values in the GraphAligner columns
df_QueryAndResultsGraphAligner=pd.merge(df,merged_df_Graph, on='Query',how="left")
df_QueryAndResultsGraphAligner

## Compile all the results in one master Dataframe

In [None]:
# Outer-merge the Bandage and SPAligner tables on Query and Sequence, then add the
# GraphAligner table the same way, so a row is kept even when only some tools report the query
merged_df_1 = pd.merge(df_QueryAndResultsBandage, df_QueryAndResultsSPAligner, on=['Query', 'Sequence'], how='outer')
merged_df_master = pd.merge(merged_df_1, df_QueryAndResultsGraphAligner, on=['Query', 'Sequence'], how='outer')
#merged_df_master['Start_SPAligner']=merged_df_master['Start_SPAligner'].astype(float).astype(int)
merged_df_master

In [None]:
#Drop some columns
# The DataFrame.drop call that used to be here discarded its result, so it never
# removed anything from merged_df_master; its only possible effect was a KeyError
# when one of the listed columns was missing. The columns are dropped, with the
# result assigned back, in a later cell once the comparison columns are built.

#Rename Sequence_x
merged_df_master.rename(columns = {'Sequence_x':'Sequence_Query'}, inplace = True)



In [None]:
def _position_set(value):
    """Split a comma-separated position string into a set of stripped tokens.

    Missing values (None / NaN) and blank strings give an empty set.
    """
    if not isinstance(value, str):
        if value is None or pd.isna(value):
            return set()
        value = str(value)
    return {token.strip() for token in value.split(',') if token.strip()}


def compare_paths(Start_Bandage, End_Bandage,Start_GraphAligner,End_GraphAligner):
    """Classify how the Bandage start/end positions relate to GraphAligner's.

    Each argument is a comma-separated string of positions (one entry per hit)
    or a missing value.

    Returns:
        'Similar'   - both tools report exactly the same set of start positions
                      and the same set of end positions.
        'Full'      - every GraphAligner start and end position also appears
                      among the Bandage positions, but the sets are not equal.
        'Different' - anything else, including a missing value on either side.
    """
    bandage_starts = _position_set(Start_Bandage)
    bandage_ends = _position_set(End_Bandage)
    graph_starts = _position_set(Start_GraphAligner)
    graph_ends = _position_set(End_GraphAligner)

    # Nothing to compare when any side has no positions at all
    if not (bandage_starts and bandage_ends and graph_starts and graph_ends):
        return 'Different'

    # Whole positions are compared, so "1" no longer matches inside "100"
    if bandage_starts == graph_starts and bandage_ends == graph_ends:
        return 'Similar'
    if graph_starts <= bandage_starts and graph_ends <= bandage_ends:
        return 'Full'
    return 'Different'

In [None]:
# Label each row by passing its Bandage and GraphAligner start/end values to compare_paths
merged_df_master['BandageVSGraphAligner'] = merged_df_master.apply(lambda row: compare_paths(row['Start_Bandage'], row['End_Bandage'],row['Start_GraphAligner'],row['End_GraphAligner']), axis=1)

merged_df_master

In [None]:
#Compare the Start, and end path from Bandage and SPAligner
# After the outer merges, rows without an SPAligner hit hold NaN, which turns
# these integer columns into floats; a plain astype(str) would then give
# "3688.0", which never equals Bandage's "3688". Going through the nullable
# Int64 dtype prints positions as whole numbers (missing values become "<NA>").
for position_column in ('Start_SPAligner', 'End_SPAligner'):
    merged_df_master[position_column] = (
        pd.to_numeric(merged_df_master[position_column], errors='coerce')
        .astype('Int64')
        .astype(str)
    )

# True only when both the start and the end strings are identical for the two tools
merged_df_master['BandageVSSPAligner']=(merged_df_master['Start_Bandage'] == merged_df_master['Start_SPAligner']) & (merged_df_master['End_Bandage'] == merged_df_master['End_SPAligner'])



merged_df_master.dtypes

In [None]:
#Compare the Start, and end path from GraphAligner and SPAligner
# True only when both the start and the end values are equal for the two tools
merged_df_master['SPAlignerVSGraphAligner']=(merged_df_master['Start_SPAligner'] == merged_df_master['Start_GraphAligner']) & (merged_df_master['End_SPAligner'] == merged_df_master['End_GraphAligner'])

In [None]:
#Drop some columns
# errors='ignore' drops whichever of these columns exist and skips the others;
# without it a single absent column raises KeyError and nothing is dropped.
merged_df_master = merged_df_master.drop(
    ['Sequence_y', 'Length_x', 'Length_y', 'Sequence'],
    axis=1,
    errors='ignore',
)

## Save the Queries and  all the Results  in one .tsv file


In [None]:
merged_df_master.to_csv("QueriesAndResultsMasterList.tsv",sep='\t')

In [None]:
# Cast the Bandage sequences to str; note that missing values become the literal text 'nan'
merged_df_master['Sequence_Bandage']=merged_df_master['Sequence_Bandage'].astype(str)

In [None]:
def extract_closest_string(df):
    """For each row, pick the Bandage sequence closest to the query sequence.

    'Sequence_Bandage' is read as a space-separated list of candidate sequences
    and 'Sequence' as the query sequence. The candidate with the smallest
    Levenshtein distance to the query (the first one on ties) is written to a
    new 'Extracted string' column. Rows where the query or all candidates are
    missing keep the empty string.

    The frame is modified in place and also returned.
    """
    df['Extracted string'] = ''

    for index, row in df.iterrows():
        candidates = row['Sequence_Bandage']
        query = row['Sequence']

        # Missing values arrive as NaN/None here; they cannot be compared
        if not isinstance(candidates, str) or not isinstance(query, str):
            continue

        # Drop empty pieces and the 'nan' placeholder that astype(str) writes
        # for missing values, so it is never reported as a sequence
        candidates = [piece for piece in candidates.split(' ') if piece and piece != 'nan']
        if not candidates:
            continue

        # min() evaluates each distance once and keeps the first minimum
        df.at[index, 'Extracted string'] = min(
            candidates, key=lambda piece: Levenshtein.distance(piece, query)
        )

    return df

## This section saves the Nodes and Sequences from the gfa file

In [None]:
# gfapy is imported in this cell rather than with the other imports at the top of the notebook
import gfapy

# Open the GFA file
file_path = "graph1.gfa"
gfa = gfapy.Gfa.from_file(file_path)

# Number of segments in the loaded graph
num_segments = len(gfa.segments)

In [None]:
data_graph = []

In [None]:
# Store the name and sequence for each node from the graph.
# The list is rebuilt from scratch here, so re-running this cell does not
# append every segment a second time.
data_graph = [
    {"Name": segment.name, "Sequence": segment.sequence}
    for segment in gfa.segments
]

# Convert the list to a DataFrame
df_graph = pd.DataFrame(data_graph)

# Print the DataFrame
print(df_graph)

In [None]:
#Count the number of duplicates
#len(df_graph['Name'])-len(df_graph['Name'].drop_duplicates())

In [None]:
num_segments

In [None]:
#Save only one copy of each row to the dataframe
df_graph=df_graph.drop_duplicates()

In [None]:
df_graph

In [None]:
df_graph['Name']=df_graph['Name'].astype(str)


## Get the sequences unique to GraphAligner
## Get the dataframe containing the path from GraphAligner


In [None]:
# Load the tab-separated table of GraphAligner paths
df_GraphAligner_Paths= pd.read_csv('GraphAlignerNotEqual.tsv', sep='\t')
#GraphAlignerNotEqual.tsv is produced by another notebook: GraphAlignerBandageScript.ipynb

In [None]:
df_GraphAligner_Paths

In [None]:
# Bare expression that is not the last line of the cell, so it displays nothing
df_GraphAligner_Paths
# Sort the rows by index in place, then make sure the path column holds strings
df_GraphAligner_Paths.sort_index(inplace=True)
df_GraphAligner_Paths['Path Matching']=df_GraphAligner_Paths['Path Matching'].astype(str)

In [None]:
# Function to process the Path values
def process_path(path):
    """Return ``path`` with every '<' and '>' character replaced by a single space."""
    marker_to_space = str.maketrans('<>', '  ')
    return path.translate(marker_to_space)

In [None]:
df_GraphAligner_Paths['Path Matching']=df_GraphAligner_Paths['Path Matching'].apply(process_path).str.strip()


In [None]:
df_GraphAligner_Paths
# Create a new column 'Combined_Sequence' that concatenates sequences if Path has two values


In [None]:
# Split the space-separated node list into its first two nodes.
# Whitespace splitting plus positional .str access always yields exactly two
# columns: Path2 is NaN for single-node paths, and nodes beyond the second are
# ignored. An expand=True split assigned to two columns fails whenever the
# split does not produce exactly two columns.
path_nodes = df_GraphAligner_Paths['Path Matching'].str.split()
df_GraphAligner_Paths['Path1'] = path_nodes.str[0]
df_GraphAligner_Paths['Path2'] = path_nodes.str[1]

In [None]:
df_GraphAligner_Paths

In [None]:
# Get the rows where Path2 is not missing, i.e. paths with a second node
rows_not_none = df_GraphAligner_Paths[df_GraphAligner_Paths['Path2'].notna()]

# Display the resulting dataframe
rows_not_none

In [None]:
#Get the sequence for Path 1 and Path 2
# Join the paths with the graph nodes twice: first Path1 against Name, then Path2 against Name.
# Both merges use pandas' default inner join, so rows whose Path1 or Path2 has no matching node name are dropped.
# NOTE(review): df_graph's 'Name' and 'Sequence' columns are joined in twice, so pandas will suffix the clashing names - confirm what downstream code expects

merged_df= pd.merge(df_GraphAligner_Paths, df_graph, left_on='Path1', right_on='Name')
merged_df = pd.merge(merged_df, df_graph, left_on='Path2', right_on='Name')

# Display the merged frame (NOTE(review): no sequence concatenation happens in this cell)
merged_df

In [None]:
# NOTE(review): extract_sequences_Path1 and merged_df_Path1 are not defined in the visible part of this notebook - confirm where they come from
extracted_sequences_Path1_df = extract_sequences_Path1(merged_df_Path1)
extracted_sequences_Path1_df
#Save the dataframe to a tab-separated (.tsv) file
extracted_sequences_Path1_df.to_csv("Path1_TestSequences.tsv", sep="\t")