In [1]:
import pandas as pd
import numpy as np
from scipy.linalg import expm

In [2]:
# Load the simulated data; both files are tab-separated.
# Y uses its first column as the row index, Z keeps the default integer index.
Y = pd.read_table("./acol_simulated_Y.txt", index_col=0)
Z = pd.read_table("./acol_simulated_Z_species.txt")

In [3]:
# Count (Z_state, Y_state) pairs: matrix[z, y] is the number of rows of Y with
# Y_state == y whose molecule has Z_state == z in Z.
#
# Build the molecule -> Z_state lookup once instead of filtering Z for every row
# of Y. Only the first Z entry per molecule is kept ("first match wins").
z_by_molecule = Z.drop_duplicates(subset="molecules").set_index("molecules")["Z_state"]
z_states = Y["molecules"].map(z_by_molecule)

# Fail loudly if a molecule in Y has no usable Z_state rather than miscounting.
if z_states.isna().any():
    missing = Y.loc[z_states.isna().to_numpy(), "molecules"].unique().tolist()
    raise KeyError(f"No Z_state found for molecules: {missing}")

z_values = z_states.astype(int).to_numpy()
y_values = Y["Y_state"].to_numpy()

# Rows are indexed by the Z state, columns by the Y state; states other than
# 0 and 1 are not counted.
matrix = np.zeros((2, 2), dtype=int)
for z_val in (0, 1):
    for y_val in (0, 1):
        matrix[z_val, y_val] = ((z_values == z_val) & (y_values == y_val)).sum()

In [4]:
# Divide every row of the count matrix by its row total to get row-wise proportions.
row_sums = np.sum(matrix, axis=1)
matrix_normalised = np.divide(matrix, row_sums.reshape(-1, 1))

This row-normalised count matrix (rows: state of $Z$, columns: state of $Y$) should then approximate the matrix at the specific branch length that we chose.

In [5]:
# Show the row-normalised count matrix (rows: Z state, columns: Y state).
print(matrix_normalised)

[[0.90277778 0.09722222]
 [0.18055556 0.81944444]]


Now we can calculate the actual matrix at the specific branch length. For the model validation, we have fixed all species branch lengths to the same value. That means that we can take any of the [species_branch_lengths_1, ..., species_branch_lengths_N] and use it as the branch length for all species.

The same holds for the $\mu$ of the species: we can take any of the [species_mu_1, ..., species_mu_N] and use it as the $\mu$ for all species.

In [6]:
# NOTE(review): the saved output of this cell is a FileNotFoundError for the grid
# file below — confirm it is present in the working directory before re-running.
species_branch_len = pd.read_csv(
    "acol_simulated_species_branch_length_grid.txt", sep="\t", index_col=0
)
params = pd.read_csv("acol_input_simulated.txt", sep="\t")


def _param_value(name):
    """Return the "value" entries of `params` whose "name" column equals `name`."""
    return params.loc[params["name"] == name, "value"]


# For the model validation all species branch lengths are fixed to the same value,
# so it is enough to look at the first one. The parameter value is used as a label
# into the branch-length grid; `.item()` collapses the result to a scalar and raises
# if the lookup does not yield exactly one entry.
branch_length = (
    species_branch_len.loc[_param_value("species_branch_lengths_1"), "branch_length"]
    .to_numpy()
    .item()
)

species_mu_1 = _param_value("species_mu_1_1").to_numpy().item()
species_mu_0 = _param_value("species_mu_0_1").to_numpy().item()

# Two-state rate matrix (each row sums to zero), scaled by the branch length and
# exponentiated to obtain the matrix at that branch length.
start_matrix = np.array([[-species_mu_1, species_mu_1], [species_mu_0, -species_mu_0]])
matrix_at_grid_position = branch_length * start_matrix
matrix_exp = expm(matrix_at_grid_position)
print(matrix_exp)

FileNotFoundError: [Errno 2] No such file or directory: 'acol_simulated_species_branch_length_grid.txt'

As we can see, the two matrices are rather similar. This indicates that the calculations for this part of the model seem to be correct.

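We can also check that the average value of $Z$ (i.e. the fraction of entries with $Z = 1$) is approximately equal to $\frac{\mu_1}{\mu_1 + \mu_0}$, which is the stationary probability of state 1 under the rate matrix used above.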

In [None]:
# Empirical mean of the Z_state column (the fraction of 1s if states are coded 0/1).
print(Z["Z_state"].mean())

0.5833333333333334


In [None]:
# Ratio mu_1 / (mu_1 + mu_0) from the input parameters, to compare with the
# empirical mean of Z_state.
species_mu_1 / (species_mu_1 + species_mu_0)

0.5454545454545455

Both values are also quite similar (0.583 vs. 0.545).