## This Script is used to get the unique sequences from GraphAligner.

In [1]:
# import the libraries
import numpy as np
import pandas as pd
import re 
from Bio import SeqIO
from collections import defaultdict
import gfapy

In [2]:
# Open the GFA file and parse it into a gfapy graph object
file_path = "graph1.gfa"
gfa = gfapy.Gfa.from_file(file_path)

# Number of segments found in the parsed graph
num_segments = len(gfa.segments)

In [3]:
# List that holds one Name/Sequence record per graph segment
data_graph = []

In [4]:
# Store the name and sequence for each node from the graph.
# The list is rebuilt from scratch in this cell, so running the cell more than
# once does not append a second copy of every segment to an already-filled list.
data_graph = [
    {"Name": segment.name, "Sequence": segment.sequence}
    for segment in gfa.segments
]

# Convert the list to a DataFrame. Naming the columns explicitly keeps 'Name'
# and 'Sequence' present even when the graph has no segments.
df_graph = pd.DataFrame(data_graph, columns=["Name", "Sequence"])

# Print the DataFrame
print(df_graph)

       Name                                           Sequence
0      1321  CGTTCCACCGGTTCTTACAGCCTGGTTACTCAGCAGCCGCTGGGTG...
1      1323  GTGCCTTACCACCCAGCGGCTGCTGAGTAACCAGGCTGTAAGAACC...
2      1325  GTGCCTTACCACCCAGCGGCTGCTGAGTAACCAGGCTGTAAGAACC...
3     32989  CTTAATATGAACCATCCAACTTTATGGGGTCAGTCCAGCAGCGCCG...
4       565  GGTTCGGCGGAGCTTACCGCGTCTTTTCGCGGTTAGCGGAGTGTGG...
...     ...                                                ...
2524  37173  GAACAAGGATCTAAGCTGTTTTAAGTTATGGGCAACGCAATGCACT...
2525  24893  TCTTAAGAGAGTGCATTGCGTTGCCCATAACTTAAAACAGCTTAGA...
2526  36779  TTTTCTCTGCAACCGAACCGGCTGTTTGTGTGAAGTGATTCACATC...
2527   6673  CTCGGCCCGACCCGAAGCCTGCAGGGATAAGTCGAAGGGACCGCGC...
2528  37823  CTCGGCCCGACCCGAAGCCTGCAGGGATAAGTCGAAGGGACCGCGC...

[2529 rows x 2 columns]


In [5]:
# Keep only the first occurrence of each identical (Name, Sequence) row
df_graph = df_graph.drop_duplicates(keep="first")

In [6]:
# GraphAlignerNotEqual.tsv is produced by another notebook: GraphAlignerBandageScript.ipynb
# (read_table uses a tab as its default separator)
df_GraphAligner_Paths = pd.read_table('GraphAlignerNotEqual.tsv')

In [7]:
# Display the table loaded from GraphAlignerNotEqual.tsv
df_GraphAligner_Paths

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start Position on Path,End Position on Path,Path Matching,Path_GraphAligner,Equal
0,0,gb|AF047479|+|1295-2087|ARO:3002603|aadA3,7593.0,1687,2477,>7591,7591,no
1,1,gb|AF156486|+|5012-5792|ARO:3002602|aadA2,7593.0,1699,2479,>7591,7591,no
2,3,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,7591.0,711,1500,>7593,7593,no
3,4,gb|AJ809407|+|118-898|ARO:3002620|aadA23,7591.0,731,1500,>7593,7593,no
4,5,gb|AM040708.1|+|1173-1965|ARO:3004704|aadA8b,7593.0,1687,2477,>7591,7591,no
5,6,gb|AM261837|+|73-865|ARO:3002619|aadA22,7591.0,711,1500,>7593,7593,no
6,10,gb|AY139603|+|106-898|ARO:3002608|aadA8,7593.0,1687,2477,>7591,7591,no
7,11,gb|AY171244|+|46-838|ARO:3002618|aadA21,7591.0,711,1500,>7593,7593,no
8,13,gb|BX664015.1|-|103017-103833|ARO:3002641|APH(...,264523479.0,75,891,<2645<22251,264522251,no
9,14,gb|CP003022|+|336788-337580|ARO:3003197|aadA25,7593.0,1687,2477,>7591,7591,no


In [8]:
# Function to blank out the '<' and '>' markers of a path value
def process_path(path):
    """Return ``path`` with every '<' and '>' character replaced by one space.

    Each marker becomes exactly one space, so '<2645<22251' comes back as
    ' 2645 22251'. Leading/trailing blanks are not removed here; the caller
    is expected to strip them.
    """
    marker_to_space = str.maketrans({'<': ' ', '>': ' '})
    return path.translate(marker_to_space)

In [9]:
# Replace the '<' / '>' markers by blanks, then trim the surrounding whitespace
df_GraphAligner_Paths['Path Matching'] = (
    df_GraphAligner_Paths['Path Matching']
    .apply(process_path)
    .str.strip()
)

In [10]:
# Split the cleaned path into its node names, one column per node.
path_nodes = df_GraphAligner_Paths['Path Matching'].str.split(' ', expand=True)

# Only Path1 and Path2 are handled by the rest of the notebook. Fail with a
# clear message instead of an opaque column-count mismatch when a path visits
# more than two nodes.
if path_nodes.shape[1] > 2:
    raise ValueError(
        "A value in 'Path Matching' contains more than two nodes; "
        "only Path1 and Path2 are supported"
    )

# reindex guarantees exactly two columns: when every path consists of a single
# node the split yields one column only, and Path2 is then filled with missing
# values instead of the assignment failing.
df_GraphAligner_Paths[['Path1', 'Path2']] = path_nodes.reindex(columns=[0, 1])

In [11]:
# Display the table, which now also has the Path1 and Path2 columns
df_GraphAligner_Paths

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start Position on Path,End Position on Path,Path Matching,Path_GraphAligner,Equal,Path1,Path2
0,0,gb|AF047479|+|1295-2087|ARO:3002603|aadA3,7593.0,1687,2477,7591,7591,no,7591,
1,1,gb|AF156486|+|5012-5792|ARO:3002602|aadA2,7593.0,1699,2479,7591,7591,no,7591,
2,3,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,7591.0,711,1500,7593,7593,no,7593,
3,4,gb|AJ809407|+|118-898|ARO:3002620|aadA23,7591.0,731,1500,7593,7593,no,7593,
4,5,gb|AM040708.1|+|1173-1965|ARO:3004704|aadA8b,7593.0,1687,2477,7591,7591,no,7591,
5,6,gb|AM261837|+|73-865|ARO:3002619|aadA22,7591.0,711,1500,7593,7593,no,7593,
6,10,gb|AY139603|+|106-898|ARO:3002608|aadA8,7593.0,1687,2477,7591,7591,no,7591,
7,11,gb|AY171244|+|46-838|ARO:3002618|aadA21,7591.0,711,1500,7593,7593,no,7593,
8,13,gb|BX664015.1|-|103017-103833|ARO:3002641|APH(...,264523479.0,75,891,2645 22251,264522251,no,2645,22251.0
9,14,gb|CP003022|+|336788-337580|ARO:3003197|aadA25,7593.0,1687,2477,7591,7591,no,7591,


## Get the rows whose path consists of a single node (only Path1 is set)

In [12]:
# Keep the rows that have no second node, i.e. where Path2 is missing
Path1_Only = df_GraphAligner_Paths.loc[lambda frame: frame['Path2'].isna()]

# Show the resulting dataframe
Path1_Only

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start Position on Path,End Position on Path,Path Matching,Path_GraphAligner,Equal,Path1,Path2
0,0,gb|AF047479|+|1295-2087|ARO:3002603|aadA3,7593.0,1687,2477,7591,7591,no,7591,
1,1,gb|AF156486|+|5012-5792|ARO:3002602|aadA2,7593.0,1699,2479,7591,7591,no,7591,
2,3,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,7591.0,711,1500,7593,7593,no,7593,
3,4,gb|AJ809407|+|118-898|ARO:3002620|aadA23,7591.0,731,1500,7593,7593,no,7593,
4,5,gb|AM040708.1|+|1173-1965|ARO:3004704|aadA8b,7593.0,1687,2477,7591,7591,no,7591,
5,6,gb|AM261837|+|73-865|ARO:3002619|aadA22,7591.0,711,1500,7593,7593,no,7593,
6,10,gb|AY139603|+|106-898|ARO:3002608|aadA8,7593.0,1687,2477,7591,7591,no,7591,
7,11,gb|AY171244|+|46-838|ARO:3002618|aadA21,7591.0,711,1500,7593,7593,no,7593,
9,14,gb|CP003022|+|336788-337580|ARO:3003197|aadA25,7593.0,1687,2477,7591,7591,no,7591,
10,15,gb|DQ393783|+|1799-2591|ARO:3002615|aadA15,7591.0,711,1498,7593,7593,no,7593,


In [13]:
# Attach the node sequence from the graph to every row that only has Path1
# (rows are matched on Path1 == Name)
merged_df_Path1 = Path1_Only.merge(df_graph, left_on='Path1', right_on='Name')
merged_df_Path1

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start Position on Path,End Position on Path,Path Matching,Path_GraphAligner,Equal,Path1,Path2,Name,Sequence
0,0,gb|AF047479|+|1295-2087|ARO:3002603|aadA3,7593.0,1687,2477,7591,7591,no,7591,,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...
1,1,gb|AF156486|+|5012-5792|ARO:3002602|aadA2,7593.0,1699,2479,7591,7591,no,7591,,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...
2,5,gb|AM040708.1|+|1173-1965|ARO:3004704|aadA8b,7593.0,1687,2477,7591,7591,no,7591,,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...
3,10,gb|AY139603|+|106-898|ARO:3002608|aadA8,7593.0,1687,2477,7591,7591,no,7591,,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...
4,14,gb|CP003022|+|336788-337580|ARO:3003197|aadA25,7593.0,1687,2477,7591,7591,no,7591,,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...
5,18,gb|FJ460181|+|1790-2582|ARO:3002617|aadA17,7593.0,1687,2479,7591,7591,no,7591,,7591,TGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAA...
6,3,gb|AF550679.1|-|80976-81768|ARO:3002601|aadA,7591.0,711,1500,7593,7593,no,7593,,7593,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...
7,4,gb|AJ809407|+|118-898|ARO:3002620|aadA23,7591.0,731,1500,7593,7593,no,7593,,7593,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...
8,6,gb|AM261837|+|73-865|ARO:3002619|aadA22,7591.0,711,1500,7593,7593,no,7593,,7593,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...
9,11,gb|AY171244|+|46-838|ARO:3002618|aadA21,7591.0,711,1500,7593,7593,no,7593,,7593,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...


In [14]:
def extract_sequences_Path1(dataframe):
    """Add a 'Result_node1' column holding the aligned part of each node sequence.

    For every row, 'Sequence' is sliced from 'Start Position on Path' to
    'End Position on Path'. The start is treated as 1-based and the end as
    inclusive (``sequence[start - 1:end]``).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'Start Position on Path', 'End Position on Path' and
        'Sequence'. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 'Result_node1' added.

    Notes
    -----
    The column is assigned positionally in one step, so rows that share an
    index label each keep their own slice and the column exists even when
    the frame has no rows. Positions stored as floats (e.g. 1687.0) are
    converted to integers before slicing.
    """
    starts = dataframe['Start Position on Path'].astype(int)
    ends = dataframe['End Position on Path'].astype(int)

    dataframe['Result_node1'] = [
        sequence[start - 1:end]
        for sequence, start, end in zip(dataframe['Sequence'], starts, ends)
    ]

    return dataframe

In [15]:
# Add the 'Result_node1' column to the rows that only have Path1
extracted_sequences_Path1_df = extract_sequences_Path1(merged_df_Path1)
# NOTE: this bare expression is not the last statement of the cell, so it displays nothing
extracted_sequences_Path1_df
# Save the dataframe to a tab-separated file
extracted_sequences_Path1_df.to_csv("TestSeq.tsv", sep="\t")

In [16]:
# Keep the rows that do have a second node, i.e. where Path2 is present
df_multiple = df_GraphAligner_Paths.loc[lambda frame: frame['Path2'].notna()]

# Show the resulting dataframe
df_multiple

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start Position on Path,End Position on Path,Path Matching,Path_GraphAligner,Equal,Path1,Path2
8,13,gb|BX664015.1|-|103017-103833|ARO:3002641|APH(...,264523479.0,75,891,2645 22251,264522251,no,2645,22251
21,28,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,,366,1419,5965 5967,59655967,no,5965,5967


In [17]:
# First attach the graph sequence of the Path1 node, then that of the Path2 node.
# The second merge adds the default _x / _y suffixes to the duplicated columns.
merged_df_multiple_Path1 = df_multiple.merge(
    df_graph, left_on='Path1', right_on='Name'
)
merged_df_multiple_Path2 = merged_df_multiple_Path1.merge(
    df_graph, left_on='Path2', right_on='Name'
)
merged_df_multiple_Path2

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start Position on Path,End Position on Path,Path Matching,Path_GraphAligner,Equal,Path1,Path2,Name_x,Sequence_x,Name_y,Sequence_y
0,13,gb|BX664015.1|-|103017-103833|ARO:3002641|APH(...,264523479.0,75,891,2645 22251,264522251,no,2645,22251,2645,AGAAAAACTCATCGAGCATCAAATGAAACTGCAATTTATTCATATC...,22251,ATCAGCAAAAGGGGATGATAAGTTTATCACCACCGACTATTTGCAA...
1,28,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,,366,1419,5965 5967,59655967,no,5965,5967,5965,GACGGGATCAGTACCGACGGTGATATGGGGCAAATGGTGGTCACCA...,5967,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...


In [18]:
# Give the two merged sequence columns explicit names
# (the Name_x / Name_y columns are left unchanged here)
merged_df_multiple_Path2.rename(
    columns={
        'Sequence_x': 'Sequence_Path1',
        'Sequence_y': 'Sequence_Path2',
    },
    inplace=True,
)
merged_df_multiple_Path2

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start Position on Path,End Position on Path,Path Matching,Path_GraphAligner,Equal,Path1,Path2,Name_x,Sequence_Path1,Name_y,Sequence_Path2
0,13,gb|BX664015.1|-|103017-103833|ARO:3002641|APH(...,264523479.0,75,891,2645 22251,264522251,no,2645,22251,2645,AGAAAAACTCATCGAGCATCAAATGAAACTGCAATTTATTCATATC...,22251,ATCAGCAAAAGGGGATGATAAGTTTATCACCACCGACTATTTGCAA...
1,28,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,,366,1419,5965 5967,59655967,no,5965,5967,5965,GACGGGATCAGTACCGACGGTGATATGGGGCAAATGGTGGTCACCA...,5967,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...


In [19]:
def extract_sequences_multiple_sequences(dataframe):
    """Add a 'Sequencefirst' column: the Path1 node sequence from the start position on.

    For every row, 'Sequence_Path1' is sliced from 'Start Position on Path'
    (treated as 1-based) to the end of the sequence. Nothing is trimmed from
    the end of the sequence in this function.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'Start Position on Path' and 'Sequence_Path1'. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 'Sequencefirst' added.

    Notes
    -----
    The column is assigned positionally in one step, so rows that share an
    index label each keep their own slice and the column exists even when
    the frame has no rows. Start positions stored as floats are converted
    to integers before slicing.
    """
    start_positions = dataframe['Start Position on Path'].astype(int)

    dataframe['Sequencefirst'] = [
        sequence[start - 1:]
        for sequence, start in zip(dataframe['Sequence_Path1'], start_positions)
    ]

    return dataframe

In [20]:
# Add the 'Sequencefirst' column and save the intermediate table as a tab-separated file
extracted_sequences_2 = extract_sequences_multiple_sequences(merged_df_multiple_Path2)
extracted_sequences_2.to_csv('TestSeque.tsv',sep='\t')

# Display the result
extracted_sequences_2

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start Position on Path,End Position on Path,Path Matching,Path_GraphAligner,Equal,Path1,Path2,Name_x,Sequence_Path1,Name_y,Sequence_Path2,Sequencefirst
0,13,gb|BX664015.1|-|103017-103833|ARO:3002641|APH(...,264523479.0,75,891,2645 22251,264522251,no,2645,22251,2645,AGAAAAACTCATCGAGCATCAAATGAAACTGCAATTTATTCATATC...,22251,ATCAGCAAAAGGGGATGATAAGTTTATCACCACCGACTATTTGCAA...,AGCCGTTTCTGTAATGAAGGAGAAAACTCACCGAGGCAGTTCCATA...
1,28,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,,366,1419,5965 5967,59655967,no,5965,5967,5965,GACGGGATCAGTACCGACGGTGATATGGGGCAAATGGTGGTCACCA...,5967,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...,ATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTCATGA...


In [21]:
def extract_sequences_multiple_sequences_path2(dataframe):
    """Add a 'SequencefSecond' column: the Path2 node sequence up to the end position.

    For every row, 'Sequence_Path2' is cut to its first
    'End Position on Path' characters (``sequence[:end]``).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'End Position on Path' and 'Sequence_Path2'. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 'SequencefSecond' added.

    Notes
    -----
    The column is assigned positionally in one step, so rows that share an
    index label each keep their own slice and the column exists even when
    the frame has no rows. End positions stored as floats are converted to
    integers before slicing.
    """
    # NOTE(review): the end position is used directly as an offset into the
    # Path2 node. If it is measured along the whole two-node path rather than
    # inside the Path2 node, this keeps too many bases - confirm the coordinates.
    end_positions = dataframe['End Position on Path'].astype(int)

    dataframe['SequencefSecond'] = [
        sequence[:end]
        for sequence, end in zip(dataframe['Sequence_Path2'], end_positions)
    ]

    return dataframe

In [22]:
# Add the 'SequencefSecond' column and save the intermediate table as a tab-separated file
extracted_sequences_3 = extract_sequences_multiple_sequences_path2(extracted_sequences_2)
extracted_sequences_3.to_csv('TestSeque3.tsv',sep='\t')

# Display the result
extracted_sequences_3

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start Position on Path,End Position on Path,Path Matching,Path_GraphAligner,Equal,Path1,Path2,Name_x,Sequence_Path1,Name_y,Sequence_Path2,Sequencefirst,SequencefSecond
0,13,gb|BX664015.1|-|103017-103833|ARO:3002641|APH(...,264523479.0,75,891,2645 22251,264522251,no,2645,22251,2645,AGAAAAACTCATCGAGCATCAAATGAAACTGCAATTTATTCATATC...,22251,ATCAGCAAAAGGGGATGATAAGTTTATCACCACCGACTATTTGCAA...,AGCCGTTTCTGTAATGAAGGAGAAAACTCACCGAGGCAGTTCCATA...,ATCAGCAAAAGGGGATGATAAGTTTATCACCACCGACTATTTGCAA...
1,28,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,,366,1419,5965 5967,59655967,no,5965,5967,5965,GACGGGATCAGTACCGACGGTGATATGGGGCAAATGGTGGTCACCA...,5967,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...,ATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTCATGA...,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...


In [23]:
def extract_sequences_multiple_sequences_overlap(dataframe, overlap=55):
    """Trim the trailing overlap from 'Sequencefirst' and record the lengths.

    Three columns are added for every row:

    * 'length'         - number of characters in 'Sequencefirst'
    * 'length_end'     - number of characters kept after trimming
    * 'FinalSequence1' - 'Sequencefirst' without its last ``overlap`` characters

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'Sequencefirst'. It is modified in place.
    overlap : int, optional
        Number of characters removed from the end of each sequence
        (default 55, the value that was previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the three columns added.

    Notes
    -----
    A sequence shorter than ``overlap`` is trimmed to an empty string and gets
    'length_end' 0; a negative slice bound would otherwise keep part of it.
    The columns are assigned positionally in one step, so rows sharing an index
    label keep their own values, and the two length columns hold integers.
    """
    sequences = list(dataframe['Sequencefirst'])
    lengths = [len(sequence) for sequence in sequences]
    # Clamp at zero so a short sequence never produces a negative slice bound
    kept_lengths = [max(length - overlap, 0) for length in lengths]

    dataframe['length'] = lengths
    dataframe['length_end'] = kept_lengths
    dataframe['FinalSequence1'] = [
        sequence[:kept] for sequence, kept in zip(sequences, kept_lengths)
    ]

    return dataframe
    

In [24]:
# Add the 'length', 'length_end' and 'FinalSequence1' columns
extracted_sequences_multiple = extract_sequences_multiple_sequences_overlap(extracted_sequences_3)

In [25]:
# Display the table with the trimmed Path1 sequences
extracted_sequences_multiple

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start Position on Path,End Position on Path,Path Matching,Path_GraphAligner,Equal,Path1,Path2,Name_x,Sequence_Path1,Name_y,Sequence_Path2,Sequencefirst,SequencefSecond,length,length_end,FinalSequence1
0,13,gb|BX664015.1|-|103017-103833|ARO:3002641|APH(...,264523479.0,75,891,2645 22251,264522251,no,2645,22251,2645,AGAAAAACTCATCGAGCATCAAATGAAACTGCAATTTATTCATATC...,22251,ATCAGCAAAAGGGGATGATAAGTTTATCACCACCGACTATTTGCAA...,AGCCGTTTCTGTAATGAAGGAGAAAACTCACCGAGGCAGTTCCATA...,ATCAGCAAAAGGGGATGATAAGTTTATCACCACCGACTATTTGCAA...,815.0,760.0,AGCCGTTTCTGTAATGAAGGAGAAAACTCACCGAGGCAGTTCCATA...
1,28,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,,366,1419,5965 5967,59655967,no,5965,5967,5965,GACGGGATCAGTACCGACGGTGATATGGGGCAAATGGTGGTCACCA...,5967,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...,ATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTCATGA...,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...,112.0,57.0,ATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTCATGA...


In [30]:
def extract_sequences_multiple_sequences_final(dataframe):
    """Add a 'FinalSequence' column: 'FinalSequence1' followed by 'SequencefSecond'.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the string columns 'FinalSequence1' and
        'SequencefSecond'. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 'FinalSequence' added.

    Notes
    -----
    The column is assigned positionally in one step, so rows that share an
    index label each keep their own concatenation and the column exists even
    when the frame has no rows.
    """
    dataframe['FinalSequence'] = [
        first_part + second_part
        for first_part, second_part in zip(
            dataframe['FinalSequence1'], dataframe['SequencefSecond']
        )
    ]

    return dataframe

In [31]:
# Add the 'FinalSequence' column (trimmed Path1 part followed by the Path2 part)
extracted_sequences_multiple_final =extract_sequences_multiple_sequences_final(extracted_sequences_multiple)

In [32]:
# Display the table with the concatenated sequences
extracted_sequences_multiple_final

Unnamed: 0.1,Unnamed: 0,Query,Path_Bandage,Start Position on Path,End Position on Path,Path Matching,Path_GraphAligner,Equal,Path1,Path2,Name_x,Sequence_Path1,Name_y,Sequence_Path2,Sequencefirst,SequencefSecond,length,length_end,FinalSequence1,FinalSequence
0,13,gb|BX664015.1|-|103017-103833|ARO:3002641|APH(...,264523479.0,75,891,2645 22251,264522251,no,2645,22251,2645,AGAAAAACTCATCGAGCATCAAATGAAACTGCAATTTATTCATATC...,22251,ATCAGCAAAAGGGGATGATAAGTTTATCACCACCGACTATTTGCAA...,AGCCGTTTCTGTAATGAAGGAGAAAACTCACCGAGGCAGTTCCATA...,ATCAGCAAAAGGGGATGATAAGTTTATCACCACCGACTATTTGCAA...,815.0,760.0,AGCCGTTTCTGTAATGAAGGAGAAAACTCACCGAGGCAGTTCCATA...,AGCCGTTTCTGTAATGAAGGAGAAAACTCACCGAGGCAGTTCCATA...
1,28,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,,366,1419,5965 5967,59655967,no,5965,5967,5965,GACGGGATCAGTACCGACGGTGATATGGGGCAAATGGTGGTCACCA...,5967,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...,ATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTCATGA...,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...,112.0,57.0,ATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTCATGA...,ATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTCATGA...


In [34]:
# Remove the columns that are not wanted in the final table
extracted_sequences_multiple_final = extracted_sequences_multiple_final.drop(
    columns=['Path_Bandage', 'Path_GraphAligner', 'Equal', 'Name_x', 'Name_y']
)

In [35]:
# Display the table after the columns were dropped
extracted_sequences_multiple_final

Unnamed: 0.1,Unnamed: 0,Query,Start Position on Path,End Position on Path,Path Matching,Path1,Path2,Sequence_Path1,Sequence_Path2,Sequencefirst,SequencefSecond,length,length_end,FinalSequence1,FinalSequence
0,13,gb|BX664015.1|-|103017-103833|ARO:3002641|APH(...,75,891,2645 22251,2645,22251,AGAAAAACTCATCGAGCATCAAATGAAACTGCAATTTATTCATATC...,ATCAGCAAAAGGGGATGATAAGTTTATCACCACCGACTATTTGCAA...,AGCCGTTTCTGTAATGAAGGAGAAAACTCACCGAGGCAGTTCCATA...,ATCAGCAAAAGGGGATGATAAGTTTATCACCACCGACTATTTGCAA...,815.0,760.0,AGCCGTTTCTGTAATGAAGGAGAAAACTCACCGAGGCAGTTCCATA...,AGCCGTTTCTGTAATGAAGGAGAAAACTCACCGAGGCAGTTCCATA...
1,28,gb|NG_050218.1|+|0-1061|ARO:3001044|TEM-181 P...,366,1419,5965 5967,5965,5967,GACGGGATCAGTACCGACGGTGATATGGGGCAAATGGTGGTCACCA...,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...,ATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTCATGA...,TGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGTATTC...,112.0,57.0,ATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTCATGA...,ATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTCATGA...


In [36]:
# NOTE(review): the old and the new column name are identical, so this rename changes nothing
extracted_sequences_multiple_final.rename(columns = {'Sequence_Path1':'Sequence_Path1'}, inplace = True)

In [37]:
# 'Sequence_Path2' is the full sequence of the Path2 node taken from the graph
extracted_sequences_multiple_final = extracted_sequences_multiple_final.rename(
    columns={'Sequence_Path2': 'Sequence_FromGraph_Path2'}
)

In [39]:
# 'Sequencefirst' is the Path1 node sequence from the start position onwards
extracted_sequences_multiple_final = extracted_sequences_multiple_final.rename(
    columns={'Sequencefirst': 'Sequence_FromGraph_Path1_No_Overlap'}
)

In [55]:
# 'length' is the number of characters in the untrimmed Path1 part
extracted_sequences_multiple_final = extracted_sequences_multiple_final.rename(
    columns={'length': 'Length_Sequence_Path1_No_Overlap'}
)


In [56]:
# 'length_end' is the length of the Path1 part after the trailing overlap was removed
extracted_sequences_multiple_final = extracted_sequences_multiple_final.rename(
    columns={'length_end': 'Length_Sequence_Path1_With_Overlap'}
)

In [57]:
# 'FinalSequence1' is the Path1 part with the trailing overlap removed
extracted_sequences_multiple_final = extracted_sequences_multiple_final.rename(
    columns={'FinalSequence1': 'Sequence_Path1_With_Overlap'}
)

In [59]:
# 'FinalSequence' is the trimmed Path1 part followed by the Path2 part
extracted_sequences_multiple_final = extracted_sequences_multiple_final.rename(
    columns={'FinalSequence': 'Concatenated_Sequence'}
)

In [60]:
# Write the final table to a tab-separated file
extracted_sequences_multiple_final.to_csv("FinalConcatenatedSequences.tsv",sep='\t')