# Smith-Waterman genome application

We will apply the Smith-Waterman algorithm to two specific sequences. In this example, we align the NC_000017.11 human scaffold with the whole gorilla genome.

In [1]:
import pandas as pd
import time
import numpy as np
from datetime import datetime as dt
from ipynb.fs.full.smith_waterman import *

In [3]:
human = pd.read_excel('C:/Users/bertr/LRZ Sync+Share/Syntheny-Graph (Pirmin Schlicke)/Code/Data/Tables_Filtered_IK.xlsx', 'Human')
gorilla = pd.read_excel('C:/Users/bertr/LRZ Sync+Share/Syntheny-Graph (Pirmin Schlicke)/Code/Data/Tables_Filtered_IK.xlsx', 'Gorilla')

# 1 min 40 s approx

In [4]:
# Only consider the gene classes
human['Gene'] = human['Locus'].str.split('(\d+)').str[0]
human.reset_index(inplace= True) # creates a column which will refers to the row number of the original file, hence we can identify the position of the gene

gorilla['Gene'] = gorilla['Locus'].str.split('(\d+)').str[0]
gorilla.reset_index(inplace = True)

# Filter out the genes we are not interested in
human = human[human['Gene'].str.contains('LOC') == False]
gorilla = gorilla[gorilla['Gene'].str.contains('LOC') == False]

#human['Origin'] = 'Human'
#gorilla['Origin'] = 'Gorilla'

### Smith-Waterman  
Prepare a dataset to implement S-W algorithm. For doing so, we will try to implement a small one, comparing two random scaffolds from human and gorilla.

In [6]:
sw_human = human.loc[human['Replicon Accession'].str.contains('NC_000017.11')]
#sw_human = human.loc[human['Replicon Accession'].str.contains('NC_000020.11'), ['Gene', '#Replicon Name', 'Replicon Accession']] # length: 611
sw_gorilla_scaffold = gorilla.loc[gorilla['Replicon Accession'].str.contains('NC_018435.2')]
#sw_gorilla_scaffold = gorilla.loc[gorilla['Replicon Accession'].str.contains('NC_018440.2'), 'Gene'] # length: 739
#sw_gorilla = gorilla.loc[gorilla['Gene'].str.contains('LOC') == False]
#sw_human.to_csv('C:/Users/bertr/LRZ Sync+Share/Thesis/Data/Intermidiate/sw_human.csv')
#sw_gorilla.to_csv('C:/Users/bertr/LRZ Sync+Share/Thesis/Data/Intermidiate/sw_gorilla.csv')

In [14]:
#if __name__ == "__main__":

# Reading the two required fasta sequences
#file_1_name, file_1 = fasta_reader("Sequence1.fasta")
#file_2_name, file_2  = fasta_reader("Sequence2.fasta")

file_1 = sw_human['Gene'].array
file_1_name = 'human_NC_000017.11'
file_2 = sw_gorilla_scaffold['Gene'].array
file_2_name = 'gorilla_NC_018435.2'

output_1, output_2  = [], []

# Executing the Smith Waterman local alignment algorithm
tic = time.perf_counter()
output_1, output_2 = smith_waterman(file_1, file_2)
toc = time.perf_counter()

print(file_1_name + ' length: ' + str(len(file_1)) + ' ' + output_1 + '\n' + file_2_name + ' length: ' + str(len(file_2)) + ' ' + output_2)
print(f"Computed in {toc - tic:0.4f} seconds")
print(f"Length of the common sequence: " + str(len(output_1)))


#human_NC_000017.11 DCDCDCDC
#gorilla_NC_018435.2 DCDCDCDC
#Computed in 28.7334 seconds

# human_NC_000017.11 length: 1318  CODHPRCMAFSPVMAFNIMEGDOLGMRMNXNMMITRBAAHLHBCSUTEAHWYEAHWYKRCOYMPPNIANPTIPCLSFRACSPLIRFPRPDCLTRDWRDWFNIPRESFNIPRESDYMSAPRNTRHPDACVOCIHGMSGMSRRSRSTMSGSMSGSTNMLTTEMHAFAPHAFAPHULCCDCCPARROROROROROROROROROROATAPSAPSAVPRTVPRTKPHSSNTCSNTCXATCMEPEAGTIGSGEAGTIPBCNPBCNKKMACKKMACKKMACPPPTAPTAFEZZFEZZBYCYFKNAEBUSNPSSNPSPBBYMTGGLNTMSXOLAPLEPBRRADEMLCXCDNYMZMTOMVDPTLGBMSPDLPKNIMENRHCCPGCLSFNRNFPONEGAPSATMACACNIFIKCLSPFZFNZPSUFNZPMICSPEBARPUNNIAPRCXHDLREDSIMPRLNDCSWLPIAMAFMAFMNPTIPAAIKAAIKCDNXTDEMDEMCCLSFAXFAXOXBFTKETXOLAKESANRCLCBCLSCLSCELCRGSARGSAGLDLVDACALVDFHPPARABAGPENDTCPLENDLCCLSXBYFIEFIESPGLRUENPACADTCKMEMTMEMTKNTKNTRCSLPMEMTNGLNMEPSCMEMTFGFBNRHCBTBZCLSRLOPFSFNTFSFNTPNESFIEDCUDPMUDPMXOSRXFTASTASGBHSPTAPTPTPTPARWBNFEHANDMDKMEMTAANBYCDHCDHCDHCDHCBANCKCPPARTBORTNC-YCUGXOLAXOLAEXOLAEXOLASEHREPPMAVMEMTSCROBBKRUACTCSAFPSAFPFRGNARCLSFEGHRAFDOABRKLPRFNRLEDNLEDNHYMCDCCEYDPSDSFMKIPKIPKIPNTNXTSPAFCPSUSRHDGSGPLGNRVCRSAGHYMHYMHYMHYMHYMHYMOCSMRPDAMEMTTRIPASIHSHANDHANDFNZPAMPAMDCOYMPAGHRA-CALESHXOCTRDCSHPMPTKETTKETTRDC
# gorilla            length: 17433 CODHPRCBNLFRSPVMAFNIMEGDOLGMRMNXNMMITRBAAHLHBCSUTEAHWY-KRCOYMPPNIANPTIPCLSFRACSPLIRFPRPDCLTRDW-FNIPRESFNIPRESDYMSAPRNTRHPDACVOCIHGMS-RRSRSTMSGSMSGSTNMLTTEMHAFAP-HULC-PAR----------ATAPSAPSAVPRTVPRTKPHSSNTC-XATCMEPEAGTIGSG-PBCN-KKMAC--P-PTA-FEZZ--YFKNAEBUSNPSSNPSPBBYMTGGLNTMSXOLAPLEPBRRADEMLCXCDNYMZMTOMVDPTLGBMSPDLPKNIMENRHCCPGCLSFNRNFPONEGAPSATMACACNIFIKCLSPFZFNZ-FNZPMICSPEBARPUNNIAPRCXHDLREDSIMPRLNDCSWLPIALPIAMAFMNPTIPAAIK-CDNXTDEM-CCLSFAX-OXBFTKETXOLAKESANRCLCBCLSCLS-RGSARGSAGLDLVDACALVDFHPPARABAGPENDTCPLENDLCCLSXBYFIE-SPGLRUENPACADTCK--KNT-RCSLPMEMTNGLNMEPSCMEMTFGFBNRHCBTBZ-RLOPFSFNTFSFNTPNESFIEDCUDPM-XOSRXFTAS-GBHSPTAPT--PARWBNFEHANDMDKMEMTAAN-DHCDHCDHCBANCKBANCKCPPARTBORTNCBORTNCYCUGXOLAXOLAEXOLA-SEHREPPMAVMEMTSCROBBKRUACTCSAFP-FRGNARCLSFEGHRAFDOABRKLPRFNRLEDN-HYMCDCCEYDPSDSFMKIPKIP--XTSPAFCPSUSRHDGSGPLGNRVCRSAGHYMHYM-----MRPDAMEMTTRIPASIHSHAND-FNZPAM-DCOYMPAGHRAPAGHRACALESH-TRDCSHPMPTKET-TRDC
# Computed in 471.4808 seconds


TypeError: can only concatenate str (not "list") to str

In [13]:
print(file_1_name + output_1 + '\n' + file_2_name + ' ' + output_2)

TypeError: can only concatenate str (not "list") to str