# Multiple Sequence Alignment using Genetic Algorithms

In [2]:
import pandas as pd
import numpy as np 
from random import randint, uniform
from dataclasses import dataclass

from Bio.Align import PairwiseAligner
from Bio.SubsMat.MatrixInfo import blosum62
from utils import Utils
from ga import GA_MSA
from vose_sampler import VoseAlias
from Bio.pairwise2 import align

In [3]:
# Utils.get_interval_gaps("A-----BC--", 3)

In [4]:
# seq1 = "CARTABLANCHE"
# seq2 = "CARTE"

In [5]:
sequences = ["ACTGCAACG", "ATCTGCTAG", "ACCCGAGACTG", "CGTAAACGT"]

In [6]:
# GA_MSA.compute_pairwise_alignments(sequences)

In [7]:
ga = GA_MSA(population_size=50, generations=100, min_generations=50, mutation_rate=0.05, gap_open_score=-2,gap_extend_score=-1)
ga.run(sequences=sequences)

Initializing Population
Running GA

Best solution:
ACTG-CAACG---
ATCTG----CTAG
ACCCGAG-ACT-G
--C-GTAAACGT-


(125.0, ['ACTG-CAACG---', 'ATCTG----CTAG', 'ACCCGAG-ACT-G', '--C-GTAAACGT-'])

In [2]:
S1 = "GKGDPKKPRGKMSSYAFFVQTSREEHKKKHPDASVNFSEFSKKCSERWKTMSAKEKGKFEDMAKADKARYEREMKTYIPPKGE";
S2 = "MQDRVKRPMNAFIVWSRDQRRKMALENPRMRNSEISKQLGYQWKMLTEAEKWPFFQEAQKLQAMHREKYPNYKYRPRRKAKMLPK";
S3 = "MKKLKKHPDFPKKPLTPYFRFFMEKRAKYAKLHPEMSNLDLTKILSKKYKELPEKKKMKYIQDFQREKQEFERNLARFREDHPDLIQNAKK";
S4 = "MHIKKPLNAFMLYMKEMRANVVAESTLKESAAINQILGRRWHALSREEQAKYYELARKERQLHMQLYPGWSARDNYGKKKKRKREK";

sequences = [S1, S2, S3, S4]

In [8]:
ga = GA_MSA(population_size=100, generations=800, min_generations=100, mutation_rate=0.05, gap_open_score=-2,gap_extend_score=-1)
best_val, best_alignment = ga.run(sequences=sequences)

--AEK--W-PFFQEA-Q-K-L-Q---AMHREKY--PNYKYR-PRRKAK-M--LPK
MKKLKKHP-DF-P-KKPLTPY--F--RFFMEK-RAK-YAK-LHPE-M-SNLDLTKI--LS-K-------------------KYKEL--PE-KK---KMKY-IQ-----DFQREK-QEFE-R-N-L-ARFRE---D-H-P-DL-IQ-NAK-K
---MH-IK-KP-----L-N-----AFMLY-MKEMR-A---NV-VAE--ST-L-KE--S-A------------A--IN-Q-ILGRR-WH-AL-SR--EEQAKYY-ELA-R---K-E--RQLHM---QLY-PGWS-ARDNY-GKKKKRKR-EK

Organism #72
Fitness Score: -1050.0
-G--KGDP----------K--KPRGKMSS-Y-----AF-F-VQ-TS---R---EEHKK--K-H-PD--AS-VN-FSEF----SKKCSER-W--KT-MSAK-EKGKFED-MAK-ADKAR-YE--R---E-M--KTYI-P----PK-------G------E--
----M-Q------DRV---KRPMN----AF--IV-WS-RDQ-RR-K--M-A-LEN-PR-M-R-N-SE--I--SK--QLGYQ-----WK---ML-T-E------AEK--WPFFQEA-Q-K--LQ-AMH---REKY---PN-YKYR---PRRKA-K--M-LPK
MKK-L-K-KHPDF---PKKPLTPYF----RFFMEK-R-AK--Y-A-KLH--PE-M-S-NLD--LTKILSK-KY-----KE-LPEK-KKMK---Y--IQDF--Q---REKQ-EFE-R-NL---A-R-FREDH-P-----D-LIQ-N--A-K-------K---
---MH--IKKP------LN------AFMLYM-K-EM-R-A--------------NV-V-AE--S-T-L-KES-A----------A-I-N--Q--ILGRRWHAL-SR-EEQ-AKYYEL

In [6]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC

sequence_records = list()
for i, aln in enumerate(best_alignment):
    sequence_records.append(SeqRecord(Seq(aln,
                       IUPAC.protein),
                   id="YP_025292." + str(i)))
with open("example.fasta", "w") as output_handle:
    SeqIO.write(sequence_records, output_handle, "fasta")

In [12]:
sequence_records

[SeqRecord(seq=Seq('GKGDP---KKPRGKMSSYAF--FVQTSR-----EEHKKKHPDASVNFSEF-SKK...-E-', IUPACProtein()), id='YP_025292.0', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('MQD-RVKR-PM-NAF----IV-WSR---DQRRKMA-LENPRMRN---SEI-SKQ...LPK', IUPACProtein()), id='YP_025292.1', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('MKKLKKHPDFPKKP----LTPYFRFFME--K-RAKYAKLHPEMS-NL-DLTKIL...K-K', IUPACProtein()), id='YP_025292.2', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('-----MHIKKP---LN--AFMLYMKEMRANVVAESTLKE--SAAIN--QILGR-...REK', IUPACProtein()), id='YP_025292.3', name='<unknown name>', description='<unknown description>', dbxrefs=[])]

In [14]:
best_val

-491.0