In [None]:
import itertools as it
import numpy as np
from numpy import random as random

# The four nucleotide symbols used as the alphabet for every simulated sequence.
BASES = ['A', 'C', 'G', 'T']

# Question 1

In [None]:
# Question 1 parameters:
#   L  - length of the simulated sequences (number of sites)
#   mu - mutation-rate parameter passed to the substitution model
#   t  - amount of time each child evolves away from the ancestor
L, mu, t = 50, 0.01, 10

We draw $L$ bases independently from the discrete uniform distribution on $\{A, C, G, T\}$ to get the ancestral sequence.

In [None]:
def random_sequence(L):
    """Return a random sequence of ``L`` bases as a string.

    Each position is drawn independently and uniformly from ``BASES``.
    """
    drawn_bases = random.choice(BASES, size=L)
    return ''.join(drawn_bases)

In [None]:
def poisson_process(lambd):
    """Generate the successive event times of a homogeneous Poisson process.

    Parameters
    ----------
    lambd : float
        Rate of the process (expected number of events per unit time).

    Yields
    ------
    float
        Cumulative event times in non-decreasing order. The gaps between
        consecutive events are exponential draws with mean ``1 / lambd``.
        For a positive rate the generator never terminates on its own;
        callers must bound it (e.g. with ``itertools.takewhile``).

    Notes
    -----
    A rate of exactly zero yields no events at all (the generator is empty)
    rather than failing with ``ZeroDivisionError``. Negative rates are still
    passed through to ``random.exponential``, which rejects a negative scale.
    """
    if lambd == 0:
        # A process with rate zero never fires.
        return
    # Keep the rate and the exponential scale (its inverse) under separate
    # names instead of rebinding the parameter.
    mean_gap = 1.0 / lambd
    t = 0
    while True:
        t += random.exponential(mean_gap)
        yield t

In [None]:
def evolve_sequence(s, mu, t):
    """Return a copy of sequence ``s`` after random substitutions over time ``t``.

    Substitution events for the whole sequence arrive as a Poisson process
    with rate ``3/4 * mu * len(s)``. Each event picks one site uniformly at
    random and replaces its base with one of the other bases in ``BASES``,
    chosen uniformly.

    Parameters
    ----------
    s : str or sequence of str
        The starting sequence; it is copied, never modified in place.
    mu : float
        Mutation-rate parameter. The 3/4 factor presumably reflects a model
        in which a quarter of the rate-``mu`` events would redraw the same
        base -- NOTE(review): confirm against the intended model.
    t : float
        Length of the time interval; only events strictly before ``t`` apply.

    Returns
    -------
    str
        The evolved sequence, the same length as ``s``.
    """
    s = list(s)
    L = len(s)
    lambd = 3/4 * mu * L
    if lambd == 0:
        # Empty sequence or zero mutation rate: nothing can change, and a
        # zero rate would otherwise trigger a division by zero when the
        # event generator converts the rate into a mean waiting time.
        return ''.join(s)
    for _ in it.takewhile(lambda x: x < t, poisson_process(lambd)):
        i = random.randint(L)  # uniform site index in [0, L)
        s[i] = random.choice([b for b in BASES if b != s[i]])
    return ''.join(s)

def simulate_siblings(L, mu, t):
    """Simulate an ancestor and two children evolved independently from it.

    A random ancestral sequence of length ``L`` is drawn, then evolved twice
    (separately) with mutation parameter ``mu`` for time ``t``.

    Returns
    -------
    tuple of str
        ``(ancestor, child_1, child_2)``.
    """
    ancestor = random_sequence(L)
    children = [evolve_sequence(ancestor, mu, t) for _ in range(2)]
    return (ancestor, *children)

In [None]:
# Simulate one ancestor (A) and two children (B, C) with the parameters above.
A, B, C = simulate_siblings(L, mu, t)

In [None]:
# Ancestral sequence.
A

The sequence for the first child is

In [None]:
# Child 1: the first evolved copy of the ancestor A.
B

and the sequence for the second child is

In [None]:
# Child 2: the second evolved copy of the ancestor A.
C

The numbers of differences between the ancestor and child 1, the ancestor and child 2, and children 1 and 2 are, respectively,

In [None]:
def count_differences(x, y):
    """Count the positions at which sequences ``x`` and ``y`` differ.

    Positions are compared pairwise; if the sequences have different
    lengths, only the common prefix (the shorter length) is compared.
    """
    mismatches = 0
    for left, right in zip(x, y):
        if left != right:
            mismatches += 1
    return mismatches
# Pairwise mismatch counts: (A vs B, A vs C, B vs C).
count_differences(A, B), count_differences(A, C), count_differences(B, C)

[TODO Explain expected number of mutations.]

In [None]:
# Monte Carlo experiment: simulate n independent sibling pairs and record
# how many sites differ between the two children in each replicate.
n = 1000    # number of replicates
L = 1000    # sequence length
mu = 0.01   # mutation-rate parameter
t = 25      # evolution time for each child
d = []
for i in range(n):
    _, B, C = simulate_siblings(L, mu, t)
    d.append(count_differences(B, C))

The mean number of differing sites is

In [None]:
# Sample mean of the child-vs-child difference counts over the n replicates.
np.mean(d)

and its variance is

In [None]:
# np.var uses ddof=0 by default, i.e. this is the population (biased) variance.
np.var(d)

In [None]:
# Compare the observed frequencies of (base in B, base in C) pairs with the
# closed-form expression used for p_ab below.
L = 10000
mu = 0.03
t = 10
_, B, C = simulate_siblings(L, mu, t)

site_pairs = list(zip(B, C))       # aligned (B base, C base) at every site
expected_per_base = L / 4          # expected count of each base in B
decay = np.exp(-2 * t * mu)        # the two children are 2*t apart in total

empirical_p = {}
theoretical_p = {}
for a, b in it.product(BASES, repeat=2):
    empirical_p[(a,b)] = site_pairs.count((a, b)) / expected_per_base
    coefficient = 3/4 if a == b else -1/4
    theoretical_p[(a,b)] = 1/4 + coefficient * decay

The empirical $p_{ab}$ values are

In [None]:
# Keyed by (base in B, base in C).
empirical_p

The theoretical $p_{ab}$ values are

In [None]:
# Keyed by the same (a, b) base pairs as empirical_p.
theoretical_p

# Question 2

In [None]:
# TODO: the indel model is still simplified -- a deletion may land on gap
# characters that an insertion in the other child placed earlier.
def simulate_siblings_indel(L, mu, t):
    """Simulate an ancestor and two children with substitutions and indels.

    The three sequences are first produced by ``simulate_siblings`` and then
    kept as an alignment: every insertion adds three columns to all three
    sequences and every deletion overwrites sites with ``'-'``, so the
    returned strings always have equal length.

    Parameters
    ----------
    L : int
        Length of the ancestral sequence before indels.
    mu : float
        Mutation-rate parameter; the expected number of insertions (and,
        separately, of deletions) per child is ``L * t * mu / 10``.
    t : float
        Evolution time of each child.

    Returns
    -------
    tuple of str
        ``(A, B, C)``: gapped ancestor, child 1 and child 2.
    """
    A, B, C = map(list, simulate_siblings(L, mu, t))
    # Each child takes one turn as X (the lineage receiving indels) while the
    # other child is Y; the lists are mutated in place.
    for X, Y in it.permutations([B, C]):
        h_I = random.poisson(L * t * mu / 10)
        h_D = random.poisson(L * t * mu / 10)
        for _ in range(h_I):
            # Insert after a uniformly chosen existing site: i is in [1, len(X)].
            i = random.randint(len(X)) + 1
            A[i:i] = ['-'] * 3
            X[i:i] = list(random_sequence(3))
            Y[i:i] = ['-'] * 3
        for _ in range(h_D):
            # Start at a valid site index in [0, len(X)) so that at least one
            # site is always affected (the former "+ 1" could pick len(X),
            # which deleted nothing, and could never pick index 0).
            i = random.randint(len(X))
            j = min(i + 3, len(X))
            # Overwrite with exactly j - i gaps to keep the alignment length;
            # the former (i - j) was negative, giving an empty list that
            # removed the sites outright.
            X[i:j] = ['-'] * (j - i)
    A, B, C = (''.join(x) for x in (A, B, C))
    return A, B, C

In [None]:
# Small demonstration of the indel simulation: show both children, one per line.
_, B, C = simulate_siblings_indel(50, 0.01, 20)
print(B, C, sep='\n')

# Question 3

In [None]:
def align(A, B, S, d):
    NINF = -1e309
    F = [[0] * (len(B)+1) for _ in range(len(A)+1)]
    for i in range(len(A)+1):
        F[i][0] = d * i
    for j in range(len(B)+1):
        F[0][j] = d * j
    for i in range(len(A)):
        for j in range(len(B)):
            match = F[i-1][j-1] + S[A[i]][B[i]] if i >= 1 and j >= 1 else NINF
            delete = F[i-1][j] + d if i >= 1 else NINF
            insert = F[i][j-1] + d if j >= 1 else NINF
            F[i][j] = max(match, delete, insert)
    alignment_A = ''
    alignment_B = ''
    i = length(A)
    j = length(B)
    while i > 0 or j > 0:
        if i > 0 or and j > 0 and F[i][j] == F[i-1][j-1] + S[A[i]][B[j]]:
            alignment_A = A[i] + alignment_A
            alignment_B = B[i] + alignment_B
            i -= 1
            j -= 1
        elif i >= 0 and F[i][j] == F[i-1][j] + d:
            alignment_A = A[i] + alignment_A
            alignment_B = '-' + alignment_B
            i -= 1
        else:
            alignment_A = '-' + alignment_A
            alignment_B = B[i] + alignment_B