# Multiple Sequence Alignment

## Single alignments

In [2]:
from Bio import AlignIO
alignment = AlignIO.read("files/PF05371_seed.sth", "stockholm")

In [4]:
print(alignment)

SingleLetterAlphabet() alignment with 7 rows and 52 columns
AEGDDP---AKAAFDSLQASATEYIGYAWAMVVVIVGATIGIKL...SKA Q9T0Q9_BPFD/1-49
AEGDDP---AKAAFNSLQASATEYIGYAWAMVVVIVGATIGIKL...SKA J7I0P6_BPM13/24-72
AEGDDP---AKAAFDSLQASATEYIGYAWAMVVVIVGATIGIKL...SKA CAPSD_BPZJ2/1-49
FAADDATSQAKAAFDSLTAQATEMSGYAWALVVLVVGATVGIKL...SRA CAPSD_BPIF1/22-73
AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIKL...SRA Q9T0Q8_BPIKE/1-52
DGTSTATSYATEAMNSLKTQATDLIDQTWPVVTSVAVAGLAIRL...SKA CAPSD_BPI22/32-83
AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIRL...SKA CAPSD_BPIKE/30-81


In [6]:
for record in alignment:
    print("%s - %s" % (record.seq, record.id))

AEGDDP---AKAAFDSLQASATEYIGYAWAMVVVIVGATIGIKLFKKFTSKA - Q9T0Q9_BPFD/1-49
AEGDDP---AKAAFNSLQASATEYIGYAWAMVVVIVGATIGIKLFKKFTSKA - J7I0P6_BPM13/24-72
AEGDDP---AKAAFDSLQASATEYIGYAWAMVVVIVGATIGIKLFKKFASKA - CAPSD_BPZJ2/1-49
FAADDATSQAKAAFDSLTAQATEMSGYAWALVVLVVGATVGIKLFKKFVSRA - CAPSD_BPIF1/22-73
AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIKLFKKFVSRA - Q9T0Q8_BPIKE/1-52
DGTSTATSYATEAMNSLKTQATDLIDQTWPVVTSVAVAGLAIRLFKKFSSKA - CAPSD_BPI22/32-83
AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIRLFKKFSSKA - CAPSD_BPIKE/30-81


In [8]:
from Bio import AlignIO
alignment = AlignIO.read("files/PF05371_seed.faa", "fasta")
print(alignment)

SingleLetterAlphabet() alignment with 7 rows and 52 columns
AEGDDP...AKAAFDSLQASATEYIGYAWAMVVVIVGATIGIKL...SKA Q9T0Q9_BPFD/1-49
AEGDDP...AKAAFNSLQASATEYIGYAWAMVVVIVGATIGIKL...SKA J7I0P6_BPM13/24-72
AEGDDP...AKAAFDSLQASATEYIGYAWAMVVVIVGATIGIKL...SKA CAPSD_BPZJ2/1-49
FAADDATSQAKAAFDSLTAQATEMSGYAWALVVLVVGATVGIKL...SRA CAPSD_BPIF1/22-73
AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIKL...SRA Q9T0Q8_BPIKE/1-52
DGTSTATSYATEAMNSLKTQATDLIDQTWPVVTSVAVAGLAIRL...SKA CAPSD_BPI22/32-83
AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIRL...SKA CAPSD_BPIKE/30-81


## Multiple alignments

In [14]:
alignment = AlignIO.read("files/single.phy", "phylip")
print(alignment)

SingleLetterAlphabet() alignment with 5 rows and 6 columns
AACAAC Alpha
AACCCC Beta
ACCAAC Gamma
CCACCA Delta
CCAAAC Epsilon


In [16]:
from Bio import AlignIO
alignments = AlignIO.parse("files/resampled.phy", "phylip")
for alignment in alignments:
    print(alignment)
print("")

SingleLetterAlphabet() alignment with 5 rows and 6 columns
AACAAC Alpha
AACCCC Beta
ACCAAC Gamma
CCACCA Delta
CCAAAC Epsilon
SingleLetterAlphabet() alignment with 5 rows and 6 columns
AAACCA Alpha
AAACCC Beta
ACCCCA Gamma
CCCAAC Delta
CCCAAA Epsilon
SingleLetterAlphabet() alignment with 5 rows and 6 columns
AAACAA Alpha
AAACCC Beta
ACCCAA Gamma
CCCACC Delta
CCCAAA Epsilon



Iterator vs. list

In [18]:
from Bio import AlignIO
alignments = list(AlignIO.parse("files/resampled.phy", "phylip"))
last_align = alignments[-1]
first_align = alignments[0]
print(last_align)

SingleLetterAlphabet() alignment with 5 rows and 6 columns
AAACAA Alpha
AAACCC Beta
ACCCAA Gamma
CCCACC Delta
CCCAAA Epsilon


## Writing alignments

In [19]:
from Bio.Alphabet import generic_dna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment


align1 = MultipleSeqAlignment([
    SeqRecord(Seq("ACTGCTAGCTAG", generic_dna), id="Alpha"),
    SeqRecord(Seq("ACT-CTAGCTAG", generic_dna), id="Beta"),
    SeqRecord(Seq("ACTGCTAGDTAG", generic_dna), id="Gamma"),
])

align2 = MultipleSeqAlignment([
    SeqRecord(Seq("GTCAGC-AG", generic_dna), id="Delta"),
    SeqRecord(Seq("GACAGCTAG", generic_dna), id="Epsilon"),
    SeqRecord(Seq("GTCAGCTAG", generic_dna), id="Zeta"),
])

align3 = MultipleSeqAlignment([
    SeqRecord(Seq("ACTAGTACAGCTG", generic_dna), id="Eta"),
    SeqRecord(Seq("ACTAGTACAGCT-", generic_dna), id="Theta"),
    SeqRecord(Seq("-CTACTACAGGTG", generic_dna), id="Iota"),
])

my_alignments = [align1, align2, align3]
print(my_alignments)

[<<class 'Bio.Align.MultipleSeqAlignment'> instance (3 records of length 12, DNAAlphabet()) at 7fc6e19e4128>, <<class 'Bio.Align.MultipleSeqAlignment'> instance (3 records of length 9, DNAAlphabet()) at 7fc6e19e42b0>, <<class 'Bio.Align.MultipleSeqAlignment'> instance (3 records of length 13, DNAAlphabet()) at 7fc6e19e4438>]


In [27]:
for record in my_alignments:
    print(record)

DNAAlphabet() alignment with 3 rows and 12 columns
ACTGCTAGCTAG Alpha
ACT-CTAGCTAG Beta
ACTGCTAGDTAG Gamma
DNAAlphabet() alignment with 3 rows and 9 columns
GTCAGC-AG Delta
GACAGCTAG Epsilon
GTCAGCTAG Zeta
DNAAlphabet() alignment with 3 rows and 13 columns
ACTAGTACAGCTG Eta
ACTAGTACAGCT- Theta
-CTACTACAGGTG Iota


In [29]:
from Bio import AlignIO
AlignIO.write(my_alignments, "files/my_example.phy", "phylip")

3

You can view the file in jupyterlab.

## Alignment as array

In [33]:
import numpy as np
from Bio import AlignIO
alignment = AlignIO.read("files/PF05371_seed.sth", "stockholm")
align_array = np.array([list(rec) for rec in alignment], np.character)
print("Array shape %i by %i" % align_array.shape)

Array shape 7 by 52


In [41]:
print(align_array[2])

[b'A' b'E' b'G' b'D' b'D' b'P' b'-' b'-' b'-' b'A' b'K' b'A' b'A' b'F'
 b'D' b'S' b'L' b'Q' b'A' b'S' b'A' b'T' b'E' b'Y' b'I' b'G' b'Y' b'A'
 b'W' b'A' b'M' b'V' b'V' b'V' b'I' b'V' b'G' b'A' b'T' b'I' b'G' b'I'
 b'K' b'L' b'F' b'K' b'K' b'F' b'A' b'S' b'K' b'A']


Corresponds to 3th record in alignment:
```
CAPSD_BPZJ2/1-49               AEGDDP...AKAAFDSLQASATEYIGYAWAMVVVIVGATIGIKLFKKFASKA
```

## Alignment tools

In [42]:
import Bio.Align.Applications
dir(Bio.Align.Applications)

['ClustalOmegaCommandline',
 'ClustalwCommandline',
 'DialignCommandline',
 'MSAProbsCommandline',
 'MafftCommandline',
 'MuscleCommandline',
 'PrankCommandline',
 'ProbconsCommandline',
 'TCoffeeCommandline',
 '_ClustalOmega',
 '_Clustalw',
 '_Dialign',
 '_MSAProbs',
 '_Mafft',
 '_Muscle',
 '_Prank',
 '_Probcons',
 '_TCoffee',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__']

In [63]:
from Bio.Align.Applications import ClustalwCommandline
clustalw_cline = ClustalwCommandline("clustalw", infile="files/opuntia.fasta")
print(clustalw_cline)

clustalw -infile=files/opuntia.fasta


In [64]:
stdout, stderr = clustalw_cline()

`optunia.aln` and `optunia.dnd` files are under `files` directory. delete `.aln` file in order to run the next command. The `dnd` file can be used to draw tree:

```python
from Bio import Phylo
tree = Phylo.read("files/opuntia.dnd", "newick")
Phylo.draw_ascii(tree)
```

In our installation, we also have Clustal Omega, let's run that too.

In [65]:
from Bio.Align.Applications import ClustalOmegaCommandline
clustalo_cline = ClustalOmegaCommandline("clustalo", infile="files/opuntia.fasta", 
                                        outfile="files/opuntia.aln", outfmt="clustal")
print(clustalo_cline)

clustalo -i files/opuntia.fasta -o files/opuntia.aln --outfmt clustal


In [67]:
stdout, stderr = clustalo_cline()

check the contents of `optunia.aln` file, either from the editor or by parsing it as below

In [68]:
from Bio import AlignIO
align = AlignIO.read("files/opuntia.aln", "clustal")
print(align)

SingleLetterAlphabet() alignment with 7 rows and 906 columns
TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191665
TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191664
TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191663
TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191661
TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191660
TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191659
TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191658
