# Multiple Sequence Alignment using Genetic Algorithms

In [1]:
import pandas as pd
import numpy as np 
from random import randint, uniform
from dataclasses import dataclass

from Bio.Align import PairwiseAligner
from Bio.SubsMat.MatrixInfo import blosum62
from utils import Utils
from ga import GA_MSA
from vose_sampler import VoseAlias
from Bio.pairwise2 import align

In [2]:
# Toy input: four short A/C/G/T strings of unequal length (9, 9, 11 and 9 letters).
sequences = [
    "ACTGCAACG",
    "ATCTGCTAG",
    "ACCCGAGACTG",
    "CGTAAACGT",
]

In [5]:
# Configure the genetic-algorithm aligner for the toy input.
# debug=True presumably enables the verbose per-generation log shown below
# (confirm against GA_MSA).
ga = GA_MSA(
    population_size=26,
    generations=50,
    min_generations=15,
    termination_generations=15,
    mutation_rate=0.05,
    gap_open_score=-2,
    gap_extend_score=-1,
    debug=True,
)
# Kept as the last expression so the notebook displays whatever run() returns.
ga.run(sequences=sequences)

Input matrix:
ACTGCAACG
ATCTGCTAG
ACCCGAGACTG
CGTAAACGT
Initializing Population
Running GA


Generation 1

Population fitness: 616.0
Max Fitness: 73.0
Best Fitness: -inf
Updated Best Value

Gap Reduce Mutation
Original alignment: A-C--T-GCAACG
Original Score: 20.0
After Mutation: A-C--TGCAACG
Score: 20.0

No Mutation
Original alignments: ['A-CTG-C-AAC-G', 'ATCTGCT---A-G', 'ACCCGAGAC-T-G', '--C-G-TAAACGT']
Score: 73.0


Generation 2

Population fitness: 769.0
Max Fitness: 67.0
Best Fitness: 73.0

No Mutation
Original alignments: ['A-C--TG--C--------AAC--G', '--AT--CTGC-T--------AG--', 'AC-CCG-A----GAC--T-----G', '---C--G---T-A--AAC--G-T-']
Score: 27.0


Generation 3

Population fitness: 594.0
Max Fitness: 67.0
Best Fitness: 73.0

No Mutation
Original alignments: ['-ACT---G--CA--A-CG', 'A-TCTGCT-----A---G', 'A-CCCGA-GAC-T----G', '--C--GT----A-AACGT']
Score: 44.0


Generation 4

Population fitness: 833.0
Max Fitness: 56.0
Best Fitness: 73.0

Gap Extend Mutation
Original alignment: ATCTGCT

(117.0, ['A-CTG-CAACG--', 'ATCTG----CTAG', 'ACCCGAGA-CTG-', '--C-GTAAACG-T'])

In [7]:
# A fixed gapped arrangement of the four toy sequences (removing the '-'
# characters from each row gives back the corresponding input string).
alignments = [
    '-AC-T-GC-A----AC-G',
    'A-TCT-G-CTA-----G-',
    'AC--CC-G--AGACTG--',
    '--C--G-TAAACG-T---',
]
# Score this arrangement directly with the already-configured GA instance.
ga.calculate_fitness(alignments)

-77.0

In [3]:
# Four longer amino-acid-letter sequences used as the second, larger input.
S1 = "GKGDPKKPRGKMSSYAFFVQTSREEHKKKHPDASVNFSEFSKKCSERWKTMSAKEKGKFEDMAKADKARYEREMKTYIPPKGE"
S2 = "MQDRVKRPMNAFIVWSRDQRRKMALENPRMRNSEISKQLGYQWKMLTEAEKWPFFQEAQKLQAMHREKYPNYKYRPRRKAKMLPK"
S3 = "MKKLKKHPDFPKKPLTPYFRFFMEKRAKYAKLHPEMSNLDLTKILSKKYKELPEKKKMKYIQDFQREKQEFERNLARFREDHPDLIQNAKK"
S4 = "MHIKKPLNAFMLYMKEMRANVVAESTLKESAAINQILGRRWHALSREEQAKYYELARKERQLHMQLYPGWSARDNYGKKKKRKREK"

# NOTE: this rebinds `sequences`, replacing the short A/C/G/T list assigned
# earlier in the notebook; cells run after this one see these four strings.
sequences = [S1, S2, S3, S4]

In [4]:
# Re-create the aligner with a larger population and more generations for the
# longer input. termination_generations is not passed here, so GA_MSA's own
# default applies.
ga = GA_MSA(
    population_size=50,
    generations=100,
    min_generations=50,
    mutation_rate=0.05,
    gap_open_score=-2,
    gap_extend_score=-1,
    debug=True,
)
# run() returns a 2-tuple; keep both parts for the export and display cells.
best_val, best_alignment = ga.run(sequences=sequences)

MQD-R-VKR-P-------MN------A-F-IV-WS-R--D-QRRKMAL--E-N----P--R-MR-N-SE-IS--K-QLG--YQ-WKMLT--E---AEKWPFF-Q-E------A-QK--LQAM-H-R-EK-YPN--YKYR--PRRKAKMLPK', '--MK-KL-KKH-P-DFP-KKP-------LTP-YF-RFF-ME-K--RAKYAKLHPE-MSNL-DLTK---IL-SKKYK-ELPEKKKMKY--IQDFQRE-KQ-EFER-NLARF-RE---D-HPD-L-----------------IQNAK--K---', 'MHIKKP-L-NAFMLYMK-E-MRA-NVVA-ES---TL-KES--------AA---INQ--ILG-RRW-HALSR--------------------------E--E-QAKY-YELARKERQLHM--QLYPGWSA-RDN-Y--GKKKK-RKR-E--K']
Score: 0


Generation 38

Population fitness: -47191.0
Max Fitness: -815.0
Best Fitness: -230.0

No Mutation
Original alignments: ['GK------GD---P-KK-PRGKM-SSY-AF-FV-QT-SRE-EHKKKH-PDASVNFS-E--FSKKCS-E-RWKTMSAKEKGK--F-E---------------DMAKADK--------AR-YEREMKTYI-P----PKG-E', 'MQDRVKR-P--------M-N---A-F-I--V-WSR--DQRRKMA-L-ENP--RMRN-SEISKQ-LG-Y--QW-KMLT----EA--EKWPFFQE-----A--QK-L-QAMH-REK-YPNYKYR----P--RRK-AKMLPK', '---MKKLKKHPDFPK-KP--LTP--YFRFFM-EK-RA-KY-A-K-LHPEMS-NL--D---LTKIL-SKKY---K-ELPEKKKMKYIQD-FQ--REK--QEFERNLARFREDH--P---

In [6]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Bio.Alphabet was removed in Biopython 1.78, where importing it raises
# ImportError. Use it when it is available so older installs keep tagging the
# sequences as IUPAC protein; otherwise build plain Seq objects, which is all
# the FASTA writer needs.
try:
    from Bio.Alphabet import IUPAC
except ImportError:
    _seq_extra_args = ()
else:
    _seq_extra_args = (IUPAC.protein,)

# Wrap every row of the best alignment in a SeqRecord whose id is
# "YP_025292.<row index>".
sequence_records = [
    SeqRecord(Seq(aln, *_seq_extra_args), id="YP_025292." + str(i))
    for i, aln in enumerate(best_alignment)
]

# Write the records to example.fasta in the current working directory,
# overwriting any existing file of that name.
with open("example.fasta", "w") as output_handle:
    SeqIO.write(sequence_records, output_handle, "fasta")

In [12]:
# Display the list of SeqRecord objects that was written to example.fasta.
sequence_records

[SeqRecord(seq=Seq('GKGDP---KKPRGKMSSYAF--FVQTSR-----EEHKKKHPDASVNFSEF-SKK...-E-', IUPACProtein()), id='YP_025292.0', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('MQD-RVKR-PM-NAF----IV-WSR---DQRRKMA-LENPRMRN---SEI-SKQ...LPK', IUPACProtein()), id='YP_025292.1', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('MKKLKKHPDFPKKP----LTPYFRFFME--K-RAKYAKLHPEMS-NL-DLTKIL...K-K', IUPACProtein()), id='YP_025292.2', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('-----MHIKKP---LN--AFMLYMKEMRANVVAESTLKE--SAAIN--QILGR-...REK', IUPACProtein()), id='YP_025292.3', name='<unknown name>', description='<unknown description>', dbxrefs=[])]

In [14]:
# Display the first element of the tuple unpacked from ga.run(sequences=sequences)
# (presumably the fitness score of best_alignment; confirm against GA_MSA.run).
best_val

-491.0