In [1]:
from Bio import AlignIO

# Read the Clustal-format alignment file; the result is bound to `alignments`,
# which the later cells in this notebook read.
alignments = AlignIO.read("../data/protein.clustal_num", "clustal")
# Show only the first three rows to keep the output short.
print(alignments[:3])

Alignment with 3 rows and 411 columns
----MTSMTVDQIRRPLRAEGLATILAIGTANPANYITQADYPD...--- gi|13936397|dbj|BAB47195.1|
----MTSMTVDQIRRPLRAEGLATILAIGTANPANYITQADYPD...--- gi|13936395|dbj|BAB47194.1|
---MSSSITVDQIRKAQRAEGPATILAIGTATPANFIIQADYPD...SET gi|13936399|dbj|BAB47196.1|


In [4]:
pip install panel bokeh

Collecting panel
  Downloading panel-0.13.0-py2.py3-none-any.whl (15.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.6/15.6 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting bokeh
  Downloading bokeh-2.4.2-py3-none-any.whl (18.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.5/18.5 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting param>=1.12.0
  Downloading param-1.12.1-py2.py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 KB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyviz-comms>=0.7.4
  Downloading pyviz_comms-2.2.0-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 KB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting markdown
  Downloading Markdown-3.3.6-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.8/97.8 KB[0m 

In [2]:
# Substitution matrix taken from the alignment.
frequency = alignments.substitutions
# Keep only the rows/columns for the residues D, E, H, K and R.
observed_frequencies = frequency.select("DEHKR")
print(observed_frequencies)

       D      E      H      K      R
D 2360.0  255.5    7.5    0.5   25.0
E  255.5 3306.0   16.5   27.0    2.0
H    7.5   16.5 1235.0   16.0    8.5
K    0.5   27.0   16.0 3218.0  116.5
R   25.0    2.0    8.5  116.5 2079.0



In [4]:
import os, io, random
import string
import numpy as np

from Bio.Seq import Seq
from Bio.Align import MultipleSeqAlignment
from Bio import AlignIO, SeqIO

import panel as pn
import panel.widgets as pnw
pn.extension()

from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Plot, Grid, Range1d
from bokeh.models.glyphs import Text, Rect
from bokeh.layouts import gridplot

In [5]:
def _alignment_source(seqs, alph):
    """Build the per-cell ColumnDataSource for a list of equal-length sequences.

    Returns ``(source, n_cols, n_rows)``. The source holds one entry per
    residue with columns ``x`` (1-based column), ``y`` (0-based row),
    ``recty`` (``y + 0.5``, used for the rectangles), ``text`` and ``colors``.
    """
    n_cols = len(seqs[0])
    n_rows = len(seqs)
    text = [residue for seq in seqs for residue in seq]
    colors = get_colors(seqs, alph)

    # 2D grid of coordinates from the two 1D axes, flattened row by row so it
    # lines up with the flattened residue list above.
    xx, yy = np.meshgrid(np.arange(1, n_cols + 1), np.arange(0, n_rows, 1))
    gx = xx.ravel()
    gy = yy.flatten()
    source = ColumnDataSource(dict(x=gx, y=gy, recty=gy + .5, text=text, colors=colors))
    return source, n_cols, n_rows


def _text_panel(source, labels, x_range, fontsize, plot_width, plot_height):
    """Create a scrollable figure showing residues as text over coloured boxes."""
    panel = figure(title=None, plot_width=plot_width, plot_height=plot_height,
                   x_range=x_range, y_range=labels, tools="xpan,xwheel_pan,reset",
                   min_border=0, toolbar_location='below')
    glyph = Text(x="x", y="y", text="text", text_align='center', text_color="black",
                 text_font="monospace", text_font_size=fontsize)
    rects = Rect(x="x", y="recty", width=1, height=1, fill_color="colors",
                 line_color=None, fill_alpha=0.4)
    panel.add_glyph(source, glyph)
    panel.add_glyph(source, rects)

    panel.grid.visible = False
    panel.xaxis.major_label_text_font_style = "bold"
    panel.yaxis.minor_tick_line_width = 0
    panel.yaxis.major_tick_line_width = 0
    return panel


def view_alignment(aln, alph='dna', fontsize="9pt", plot_width=800, consensus=None):
    """Bokeh sequence alignment view.

    Parameters
    ----------
    aln : iterable of records exposing ``.seq`` and ``.id``
        The aligned sequences; all are assumed to have the length of the first.
    alph : str
        Colour scheme name passed to ``get_colors`` ('dna' or 'protein').
    fontsize : str
        Font size of the residue letters in the text panels.
    plot_width : int
        Width in pixels of every panel.
    consensus : iterable of records exposing ``.seq`` and ``.thr``, optional
        When given, an extra panel with one row per record is added below the
        alignment, labelled ``Consensus/<thr as percent>%``.

    Returns
    -------
    The bokeh ``gridplot`` stacking the overview strip, the alignment text
    panel and (if requested) the consensus panel.
    """
    # Sequence and id lists from the alignment object.
    seqs = [rec.seq for rec in aln]
    ids = [rec.id for rec in aln]
    source, N, S = _alignment_source(seqs, alph)

    x_range = Range1d(0, N + 1, bounds='auto')
    # The close-up panels start on at most the first 100 columns.
    view_range = (0, min(N, 100))

    # Entire sequence view (no text, with zoom).
    p = figure(title=None, plot_width=plot_width, plot_height=50,
               x_range=x_range, y_range=(0, S), tools="xpan, xwheel_zoom, reset, save",
               min_border=0, toolbar_location='below')
    rects = Rect(x="x", y="recty", width=1, height=1, fill_color="colors",
                 line_color=None, fill_alpha=0.6)
    p.add_glyph(source, rects)
    p.yaxis.visible = False
    p.grid.visible = False

    # Sequence text view with ability to scroll along the x axis.
    p1 = _text_panel(source, ids, view_range, fontsize, plot_width, S * 15 + 50)
    rows = [[p], [p1]]

    if consensus is not None:
        cons_seqs = [rec.seq for rec in consensus]
        # round(), not int(): e.g. 0.29 * 100 is 28.999... and must read "29%".
        labels = [f'Consensus/{round(rec.thr * 100)}%' for rec in consensus]
        cons_source, _, cons_rows = _alignment_source(cons_seqs, alph)
        # Share the alignment panel's x-range so both panels pan together.
        p2 = _text_panel(cons_source, labels, p1.x_range, fontsize, plot_width,
                         cons_rows * 15 + 30)
        rows.append([p2])

    return gridplot(rows, toolbar_location='below')

def get_colors(seqs, alph='dna'):
    """Return one fill colour per residue, in sequence-then-position order.

    ``alph`` selects the palette ('dna' or 'protein'). Any symbol missing from
    the chosen palette is coloured white ("#ffffff").
    """
    palettes = {
        'dna': {
            'A': "#e53319", 'C': "#197fe5", 'G': "#e5994c", 'T': "#19cc19",
        },
        'protein': {
            'A': "#197fe5", 'C': "#e57f7f", 'D': "#cc4ccc", 'E': "#cc4ccc",
            'F': "#197fe5", 'G': "#e5994c", 'H': "#19b2b2", 'I': "#197fe5",
            'K': "#e53319", 'L': "#197fe5", 'M': "#197fe5", 'N': "#19cc19",
            'P': "#cccc00", 'Q': "#19cc19", 'R': "#e53319", 'S': "#19cc19",
            'T': "#19cc19", 'V': "#197fe5", 'W': "#197fe5", 'Y': "#19b2b2",
        },
    }

    # Flatten all sequences into a single list of symbols first.
    residues = []
    for seq in list(seqs):
        residues.extend(seq)

    # The palette is looked up per residue, so an empty input never touches it.
    colors = []
    for residue in residues:
        colors.append(palettes[alph].get(residue, "#ffffff"))
    return colors

In [6]:
# Build the alignment view with the protein colour scheme.
p = view_alignment(alignments, alph='protein', plot_width=1048*2)
# Wrap the bokeh layout in a Panel pane; as the last expression it is the cell's output.
pn.pane.Bokeh(p)

In [6]:
from Bio.Align import AlignInfo

# Summary object for the alignment; a later cell calls dumb_consensus() on it.
summary_aligns = AlignInfo.SummaryInfo(alignments)

In [54]:
# Show the first ten rows of the alignment.
print(alignments[:10])

Alignment with 10 rows and 411 columns
----MTSMTVDQIRRPLRAEGLATILAIGTANPANYITQADYPD...--- gi|13936397|dbj|BAB47195.1|
----MTSMTVDQIRRPLRAEGLATILAIGTANPANYITQADYPD...--- gi|13936395|dbj|BAB47194.1|
---MSSSITVDQIRKAQRAEGPATILAIGTATPANFIIQADYPD...SET gi|13936399|dbj|BAB47196.1|
MTVLEESADASSRRLAQRANGPATVLAIGTANPANVFEQSSYPD...--- gi|12644515|sp|Q9MBB1|CHSY_EQU
--------------------------------------------...--- gi|14150838|gb|AAK54648.1|AF37
------MVTVEEFRRAQCAEGPATVMAIGTATPSNCVDQSTYPD...--- gi|13925890|gb|AAK49457.1|
-----------------------------------------YPD...--- gi|13919613|gb|AAK33142.1|
-----------------------------------------YPD...--- gi|13919597|gb|AAK33134.1|
------MVTVEEVRKAQRAEGPATILAIGTATPANCVDQSTYPD...--- gi|13774973|gb|AAK39114.1|AF35
------MVTVEEVRKAQRAEGPATILAIGTATPANCVDQSTYPD...--- gi|13774965|gb|AAK39110.1|AF35


In [7]:
# Consensus sequence computed with 0.9 passed as the threshold argument.
summary_aligns.dumb_consensus(0.9)

Seq('MTVXXXMVTVEEVRKAQRAEGPATILAIGTATPANCVXQSTYPDYYFRITXSXH...PHT')

In [8]:
from typing import NamedTuple

class Consensus(NamedTuple):
    """A consensus sequence paired with the threshold it is labelled with.

    view_alignment reads both fields: ``seq`` supplies the residues to draw and
    ``thr`` is multiplied by 100 to build the row label.
    """
    # The consensus sequence.
    seq: Seq
    # Threshold as a fraction (e.g. 0.7), shown as a percentage in the plot label.
    thr: float

def consensus_cal(aligns, thrs=(0.7,)):
    """Compute one consensus sequence per threshold for an alignment.

    Parameters
    ----------
    aligns
        The alignment to summarise; it is passed to ``AlignInfo.SummaryInfo``.
    thrs : iterable of float
        Thresholds handed to ``dumb_consensus``, one consensus per value.

    Returns
    -------
    list of Consensus
        One ``Consensus(seq, thr)`` record per threshold, in the order given.
    """
    # Summarise the alignment that was passed in (not the notebook-level global),
    # once, and reuse the summary for every threshold.
    summary = AlignInfo.SummaryInfo(aligns)
    # Pass each threshold through so the sequence matches the label it carries.
    return [Consensus(summary.dumb_consensus(thr), thr) for thr in thrs]

In [50]:
# Build consensus records for three thresholds. The name 'consesus' (sic) is
# the one the plotting cell below passes to view_alignment.
consesus = consensus_cal(alignments, thrs=[0.7, 0.6, 1.0])
# Inspect the first record.
consesus[0]           

Consensus(seq=Seq('MTVXXXMVTVEEVRKAQRAEGPATILAIGTATPANCVXQSTYPDYYFRITXSXH...PHT'), thr=0.7)

In [51]:
# Same view as before, now with the consensus rows added as an extra panel.
p = view_alignment(alignments, alph='protein', plot_width=1048*2, consensus=consesus)
pn.pane.Bokeh(p)

In [55]:
# Substitution matrix taken from the alignment.
frequency = alignments.substitutions
# Keep only the rows/columns for the residues D, E, H, K and R.
observed_frequencies = frequency.select("DEHKR")
print(observed_frequencies)

       D      E      H      K      R
D 2360.0  255.5    7.5    0.5   25.0
E  255.5 3306.0   16.5   27.0    2.0
H    7.5   16.5 1235.0   16.0    8.5
K    0.5   27.0   16.0 3218.0  116.5
R   25.0    2.0    8.5  116.5 2079.0



In [56]:
import numpy as np

# Divide by the grand total so the matrix sums to 1.
# NOTE: this updates observed_frequencies itself; from here on it holds
# fractions, not the counts printed in the previous cell.
observed_frequencies /= np.sum(observed_frequencies)
# Sum along axis 0 to get one frequency per residue.
residue_frequencies = np.sum(observed_frequencies, 0)
print(residue_frequencies.format("%.4f"))

D 0.2014
E 0.2743
H 0.0976
K 0.2569
R 0.1697



In [57]:
# Column vector times row vector: the outer product of the residue frequencies
# with themselves, used below as the expected frequency of each residue pair.
expected_frequencies = np.dot(residue_frequencies[:, None], residue_frequencies[None, :])
print(expected_frequencies.format("%.4f"))

       D      E      H      K      R
D 0.0406 0.0553 0.0197 0.0518 0.0342
E 0.0553 0.0753 0.0268 0.0705 0.0466
H 0.0197 0.0268 0.0095 0.0251 0.0166
K 0.0518 0.0705 0.0251 0.0660 0.0436
R 0.0342 0.0466 0.0166 0.0436 0.0288



In [58]:
# Base-2 logarithm of observed over expected frequency for every residue pair;
# `m` is used as the aligner's substitution matrix in the next cell.
m = np.log2(observed_frequencies/expected_frequencies)
print(m)

      D    E    H     K    R
D   2.1 -1.5 -5.1 -10.4 -4.2
E  -1.5  1.7 -4.4  -5.1 -8.3
H  -5.1 -4.4  3.3  -4.4 -4.7
K -10.4 -5.1 -4.4   1.9 -2.3
R  -4.2 -8.3 -4.7  -2.3  2.5



In [59]:
from Bio.Align import PairwiseAligner
# Pairwise aligner scored with the log-odds matrix computed above.
aligner = PairwiseAligner()
aligner.substitution_matrix = m
aligner.gap_score = -3.0

# Stored under its own name so the multiple sequence alignment bound to
# `alignments` is not overwritten; earlier cells that read it stay re-runnable.
pairwise_alignments = aligner.align("DEHEK", "DHHKK")
print(pairwise_alignments[0])

DEHEK
|.|.|
DHHKK

