In [None]:
import itertools as it
from collections import Counter

import numpy as np
from numpy import random as random

BASES = ['A', 'C', 'G', 'T']

# Question 1

In [None]:
# Number of bases in the ancestral sequence (passed to random_sequence).
L = 50
# Mutation-rate parameter handed to evolve_sequence.
mu = 0.01
# Length of time over which each child sequence is evolved.
t = 10

We draw $L$ bases independently from the discrete uniform distribution on $\{A, C, G, T\}$ to get the ancestral sequence.

In [None]:
def random_sequence(L):
    """Return a string of ``L`` bases, each drawn uniformly at random from ``BASES``."""
    drawn = random.choice(BASES, size=L)
    return ''.join(drawn)

A = random_sequence(L)
A

In [None]:
def poisson_process(lambd):
    """Generate the successive event times of a Poisson process.

    Parameters
    ----------
    lambd : float
        Rate of the process (expected number of events per unit time).
        Must be non-negative.

    Yields
    ------
    float
        Cumulative event times, obtained by summing exponentially
        distributed waiting times with mean ``1 / lambd``.  The generator is
        infinite for a positive rate and empty for a rate of zero.

    Raises
    ------
    ValueError
        If ``lambd`` is negative (raised when the generator is first advanced).
    """
    if lambd < 0:
        raise ValueError("the rate of a Poisson process must be non-negative")
    if lambd == 0:
        # A process with rate zero never fires; without this guard the
        # inversion below fails with ZeroDivisionError.
        return
    # numpy's exponential() is parameterised by the scale (mean waiting
    # time), which is the inverse of the rate.
    mean_wait = 1.0 / lambd
    t = 0
    while True:
        t += random.exponential(mean_wait)
        yield t

In [None]:
def evolve_sequence(s, mu, t):
    """Return a copy of sequence ``s`` after random substitutions over time ``t``.

    Substitution events for the whole sequence arrive as a Poisson process
    with rate ``3/4 * mu * len(s)``.  Each event that falls before ``t``
    picks one site uniformly at random and replaces its base with one of
    the other bases in ``BASES``, chosen uniformly.
    """
    sites = list(s)
    n_sites = len(sites)
    rate = 3/4 * mu * n_sites
    for event_time in poisson_process(rate):
        # Stop at the first event that does not fall strictly before t.
        if not event_time < t:
            break
        pos = random.randint(n_sites)
        alternatives = [base for base in BASES if base != sites[pos]]
        sites[pos] = random.choice(alternatives)
    return ''.join(sites)

The sequence for the first child is

In [None]:
B = evolve_sequence(A, mu, t)
B

and the sequence for the second child is

In [None]:
C = evolve_sequence(A, mu, t)
C

The numbers of differences between the ancestor and child 1, the ancestor and child 2, and children 1 and 2 are, respectively,

In [None]:
def count_differences(x, y):
    """Count the aligned positions at which ``x`` and ``y`` hold different items.

    Positions are paired with ``zip``, so only the first ``min(len(x), len(y))``
    positions are compared.
    """
    mismatches = 0
    for left, right in zip(x, y):
        if left != right:
            mismatches += 1
    return mismatches
count_differences(A, B), count_differences(A, C), count_differences(B, C)

[TODO Explain expected number of mutations.]

In [None]:
# Replicated experiment: n independent ancestor/children triples.
n = 1000
L = 1000
mu = 0.01
t = 25
# d[i] holds the number of sites at which the two children of replicate i differ.
d = [0] * n
for i in range(n):
    # A fresh ancestor is drawn for every replicate; both children are
    # evolved separately from that same ancestor.
    A = random_sequence(L)
    B = evolve_sequence(A, mu, t)
    C = evolve_sequence(A, mu, t)
    d[i] = count_differences(B, C)

The mean number of differing sites is

In [None]:
np.mean(d)

and its variance is

In [None]:
np.var(d)

In [None]:
# One long ancestral sequence and two children evolved separately from it.
L = 10000
mu = 0.03
t = 10
A = random_sequence(L)
B = evolve_sequence(A, mu, t)
C = evolve_sequence(A, mu, t)
# Tally every aligned (base in B, base in C) pair in a single pass instead of
# rescanning both sequences once for each of the 16 base pairs.
pair_counts = Counter(zip(B, C))
empirical_p = {}
theoretical_p = {}
for a, b in it.product(BASES, repeat=2):
    # Observed joint frequency of (a, b), divided by 1/4, i.e. treating each
    # base as making up exactly a quarter of B.
    empirical_p[(a, b)] = pair_counts[(a, b)] / L / (1/4)
    # Model value for the same pair; the exponent uses 2 * t because B and C
    # have each been evolved for time t from the common ancestor.
    theoretical_p[(a, b)] = 1/4 + (3/4 if a == b else -1/4) * np.exp(-2 * t * mu)

The empirical $p_{ab}$ values are

In [None]:
empirical_p

The theoretical $p_{ab}$ values are

In [None]:
theoretical_p