In [None]:
import itertools as it
import numpy as np
from numpy import random

# The four nucleotide bases a sequence is built from.
BASES = ['A', 'C', 'G', 'T']
# Character that marks a gap in a sequence.
GAP = '-'

# Question 1

We draw $L$ bases from the discrete uniform distribution on $\{A, C, G, T\}$ to get the ancestral sequence.

In [None]:
def random_sequence(L):
    """Return a string of L bases, each drawn uniformly at random from BASES."""
    drawn = random.choice(BASES, size=L)
    return ''.join(drawn)

To simulate the evolution of a sequence $S$ with per-site mutation rate $\mu$, let $\lambda = L\frac{3}{4}\mu$ be the rate of observable mutations (i.e., base $x$ to $y$ where $x \neq y$) across the entire sequence. We draw the number of mutations over time $t$ from a Poisson distribution with parameter $t \lambda$. For each mutation, we select the affected site uniformly at random and change it to one of the three other bases, selected uniformly at random.

In [None]:
def evolve_sequence(S, mu, t):
    """Return a mutated copy of sequence S after t time units.

    The number of mutations is Poisson distributed with parameter
    t * len(S) * 3/4 * mu. Each mutation picks a site uniformly at random and
    replaces its base with one of the other bases in BASES, chosen uniformly.
    """
    sites = list(S)  # work on a mutable copy; strings cannot be edited in place
    n_sites = len(sites)
    rate = n_sites * 3 / 4 * mu
    n_mutations = random.poisson(t * rate)
    for _ in range(n_mutations):
        pos = random.randint(n_sites)
        alternatives = [base for base in BASES if base != sites[pos]]
        sites[pos] = random.choice(alternatives)
    return ''.join(sites)

Finally, to simulate a pair of "sibling" sequences that have diverged from a common ancestor $t$ time units ago, first we draw an ancestral sequence $A$ and then simulate two independent evolutionary processes starting with $A$ and generating $B$ and $C$.

In [None]:
def simulate_siblings(L, mu, t):
    """Simulate an ancestor of length L and two children evolved from it.

    Each child is produced by its own call to evolve_sequence on the same
    ancestor, with mutation rate mu over t time units.

    Returns the tuple (ancestor, first child, second child).
    """
    ancestor = random_sequence(L)
    first_child = evolve_sequence(ancestor, mu, t)
    second_child = evolve_sequence(ancestor, mu, t)
    return ancestor, first_child, second_child

Here, we simulate a pair of sequences with length $L = 50$ and mutation rate $\mu = 0.01$ for $t = 10$ time units.

In [None]:
# Sequence length, per-site mutation rate and divergence time for the example.
L = 50
mu = 0.01
t = 10
# A is the ancestor; B and C are the two children evolved from it.
A, B, C = simulate_siblings(L, mu, t)

In [None]:
# Display the ancestral sequence.
A

The sequence for the first child is

In [None]:
# Display the first child's sequence.
B

and the sequence for the second child is

In [None]:
# Display the second child's sequence.
C

The number of differences between the ancestor and child 1, the ancestor and child 2, and children 1 and 2 are, respectively,

In [None]:
def count_differences(A, B):
    """Return the number of positions at which A and B hold different items.

    Positions are compared pairwise with zip, so anything beyond the end of
    the shorter sequence is ignored.
    """
    mismatches = 0
    for left, right in zip(A, B):
        if left != right:
            mismatches += 1
    return mismatches
# Differences for: ancestor vs child 1, ancestor vs child 2, child 1 vs child 2.
count_differences(A, B), count_differences(A, C), count_differences(B, C)

The mean, or expected value, of the Poisson distribution is equal to its parameter. Furthermore, the sum of independent Poisson variables is also Poisson distributed, with parameter equal to the sum of their parameters. Therefore, the expected number of mutations for a single evolutionary process is $t\lambda = tL\frac{3}{4}\mu$ and thus the expected number of mutations separating two sibling sequences is $2tL\frac{3}{4}\mu$.

Here, we simulate $n = 100$ pairs of sequences with length $L = 100000$ and mutation rate $\mu = 0.0001$ for $t = 25$ time units and count the number of sites at which they differ.

In [23]:
# Number of sibling pairs, sequence length, mutation rate and divergence time.
n = 100
L = 100000
mu = 0.0001
t = 25

# d collects, for each simulated pair, the number of sites where the siblings differ.
d = []
for i in range(n):
    _, B, C = simulate_siblings(L, mu, t)
    d.append(count_differences(B, C))

The mean number of differing sites is

In [None]:
# Average number of differing sites over the simulated pairs.
np.mean(d)

and its variance is

In [None]:
# np.var defaults to ddof=0, i.e. the population (not sample) variance.
np.var(d)

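The number of differing sites is not Poisson distributed with parameter $2tL\frac{3}{4}\mu$: several mutations can hit the same site, and a site can mutate back to the base it started from, so the number of differing sites is at most the number of mutations. Next, we compare the empirical values $p_{ab}$, computed from the sites where one sibling has base $a$ and the other has base $b$, with their theoretical values.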

In [None]:
# Simulate a single sibling pair with these settings.
L = 10000
mu = 0.03
t = 10
_, B, C = simulate_siblings(L, mu, t)
# Both dictionaries are keyed by the ordered base pair (a, b).
empirical_p = {}
theoretical_p = {}
for a,b in it.product(BASES, repeat=2):
    # Number of sites where B has base a and C has base b, divided by L / 4.
    # NOTE(review): L / 4 looks like the expected, not the observed, number of
    # sites with base a in B -- confirm this normalisation is intended.
    empirical_p[(a,b)] = sum(1 for x, y in zip(B,C) if (a, b) == (x, y)) / (L / 4)
    # 1/4 + 3/4 * exp(-2 t mu) when a == b, and 1/4 - 1/4 * exp(-2 t mu) otherwise.
    theoretical_p[(a,b)] = 1/4 + (3/4 if a == b else -1/4) * np.exp(-2 * t * mu)

The empirical $p_{ab}$ values are

In [None]:
# Display the empirical values, keyed by base pair (a, b).
empirical_p

The theoretical $p_{ab}$ values are

In [None]:
# Display the theoretical values, keyed by base pair (a, b).
theoretical_p

# Question 2

In [None]:
# TODO: an indel can still land on columns touched by an earlier indel (for
# example a deletion over columns that are already gaps); decide whether such
# overlaps should be prevented.
def simulate_siblings_indel(L, mu, t):
    """Simulate an ancestor and two siblings with substitutions and indels.

    The three sequences come from simulate_siblings(L, mu, t). Then, for each
    child in turn, a Poisson number of insertions and a Poisson number of
    deletions are applied, each with mean L * t * mu / 10:

    * an insertion puts 3 random bases into the child and 3 GAP characters at
      the same position in the ancestor and in the other child;
    * a deletion overwrites up to 3 consecutive positions of the child with
      GAP characters (fewer when it starts near the end of the sequence).

    Every event changes all three sequences by the same length, so the
    returned strings have equal length and stay aligned column by column.

    Returns the tuple (A, B, C) of strings: ancestor, first child, second child.
    """
    indel_len = 3
    A, B, C = map(list, simulate_siblings(L, mu, t))
    # The expected number of events is the same for both children, so compute
    # it once outside the loop.
    lambd = L * t * mu / 10
    # X is the child receiving the indels; Y is the other child.
    for X, Y in it.permutations([B, C]):
        h_I = random.poisson(lambd)
        h_D = random.poisson(lambd)
        for _ in range(h_I):
            # Insert after one of the existing positions (index 1..len(X)).
            i = random.randint(len(X)) + 1
            A[i:i] = [GAP] * indel_len
            X[i:i] = list(random_sequence(indel_len))
            Y[i:i] = [GAP] * indel_len
        for _ in range(h_D):
            # Start at an existing position (0..len(X)-1) so that every
            # deletion removes at least one column.
            i = random.randint(len(X))
            j = min(i + indel_len, len(X))
            # Replace, rather than remove, the deleted positions so that X
            # keeps the same length as A and Y.
            X[i:j] = [GAP] * (j - i)
    return ''.join(A), ''.join(B), ''.join(C)

In [None]:
# Simulate siblings with indels (L = 50, mu = 0.01, t = 20) and print both children.
_, B, C = simulate_siblings_indel(50, 0.01, 20)
print(B)
print(C)

# Question 3

In [None]:
def align_overlap(A, B, S, d):
    """Return an overlap alignment of the sequences A and B.

    Only the overlapping region is scored: characters of either sequence that
    hang over before the overlap starts or after it ends cost nothing and are
    paired with '-' in the other sequence.

    A, B -- the sequences to align (strings).
    S    -- nested mapping; S[x][y] is the score for aligning x with y.
    d    -- score added for each gap inside the overlap (normally negative).

    Returns (alignment_A, alignment_B), two strings of equal length.
    If either input is empty there is no overlap, and the other sequence is
    returned paired with gaps only.

    Raises RuntimeError if backtracking finds a cell that none of the three
    recurrence cases explains (this indicates an inconsistent score matrix).
    """
    n, m = len(A), len(B)
    if n == 0 or m == 0:
        # Nothing to overlap: pad each side up to the other's length.
        return A + '-' * m, B + '-' * n

    # F[i+1][j+1] is the best score of an alignment ending with A[i] and B[j]
    # consumed. Row 0 and column 0 stay 0, which makes leading overhangs free.
    F = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n):
        for j in range(m):
            match = F[i][j] + S[A[i]][B[j]]
            delete = F[i][j + 1] + d
            insert = F[i + 1][j] + d
            F[i + 1][j + 1] = max(match, delete, insert)

    # The overlap must end at the last character of A or of B. Candidates are
    # given as sequence indices (i, j), so their score lives in F[i+1][j+1].
    boundary = it.chain(((i, m - 1) for i in range(n)),
                        ((n - 1, j) for j in range(m)))
    i, j = max(boundary, key=lambda ij: F[ij[0] + 1][ij[1] + 1])

    # Trailing overhang: at least one sequence is exhausted here, so on each
    # line one of the two terms is empty.
    tail_A = A[i + 1:] + '-' * (m - j - 1)
    tail_B = B[j + 1:] + '-' * (n - i - 1)

    # Backtrack through the overlap, collecting columns in reverse order.
    columns_A, columns_B = [], []
    while i >= 0 and j >= 0:
        score = F[i + 1][j + 1]
        if score == F[i][j] + S[A[i]][B[j]]:
            columns_A.append(A[i])
            columns_B.append(B[j])
            i -= 1
            j -= 1
        elif score == F[i][j + 1] + d:
            columns_A.append(A[i])
            columns_B.append(GAP)
            i -= 1
        elif score == F[i + 1][j] + d:
            columns_A.append(GAP)
            columns_B.append(B[j])
            j -= 1
        else:
            raise RuntimeError(
                'backtracking failed at cell ({}, {})'.format(i + 1, j + 1))

    # Leading overhang: backtracking stops with i == -1 or j == -1, leaving
    # A[:i+1] or B[:j+1] unaligned; pair those characters with gaps.
    head_A = A[:i + 1] + '-' * (j + 1)
    head_B = B[:j + 1] + '-' * (i + 1)

    alignment_A = head_A + ''.join(reversed(columns_A)) + tail_A
    alignment_B = head_B + ''.join(reversed(columns_B)) + tail_B
    return alignment_A, alignment_B

In [None]:
# Substitution scores: +2 for identical bases, -2 for different bases.
S = {x: {y: 2 if x == y else -2 for y in BASES} for x in BASES}
# Gap score passed to align_overlap.
d = -3
# Remove gap characters, then keep the first 35 bases of B and the last 35 of C.
Bp = B.replace(GAP, '')[:35]
Cp = C.replace(GAP, '')[-35:]

In [None]:
# Align the two fragments and print the aligned strings one above the other.
Ba, Ca = align_overlap(Bp, Cp, S, d)
print(Ba)
print(Ca)

In [None]:
# Repeat the alignment for gap scores -4, -3, -2 and -1 and print each result.
for d in range(-4, 0):
    Ba, Ca = align_overlap(Bp, Cp, S, d)
    print('d =', d)
    print(Ba, Ca, sep='\n')
    print()