In [1]:
import pandas as pd
import numpy as np
from scipy.linalg import expm

In [2]:
# Tab-separated simulation outputs: Y uses its first column as the index,
# Z keeps the default integer index.
Y = pd.read_table("./acol_simulated_Y.txt", index_col=0)
Z = pd.read_table("./acol_simulated_Z_species.txt")

In [3]:
# Count how often each (Z_state, Y_state) combination occurs:
# rows of `matrix` are indexed by Z_state, columns by Y_state.
#
# Every row of Y is paired with the Z_state of the FIRST row of Z that has the
# same `molecules` value. The lookup table is built once instead of filtering
# Z again for every single row of Y.
# NOTE(review): if Z holds several rows per molecule, only the first one is
# used (same as the previous row-by-row lookup) — confirm this is intended.
z_state_by_molecule = Z.drop_duplicates(subset="molecules").set_index("molecules")["Z_state"]
z_states = Y["molecules"].map(z_state_by_molecule)

# Fail loudly when a molecule of Y cannot be matched to a usable Z_state.
unmatched = z_states.isna().to_numpy()
if unmatched.any():
    missing_molecules = pd.unique(Y["molecules"].to_numpy()[unmatched]).tolist()
    raise KeyError(
        f"molecules missing from Z or without a Z_state: {missing_molecules}"
    )

z_states = z_states.astype(int).to_numpy()
y_states = Y["Y_state"].to_numpy()

# States other than 0 and 1 are not counted.
matrix = np.zeros((2, 2), dtype=int)
for z in (0, 1):
    for y in (0, 1):
        matrix[z, y] = np.count_nonzero((z_states == z) & (y_states == y))

In [4]:
# Turn the counts into row-wise proportions: every row is divided by its own
# total, so each row with a non-zero total sums to 1 afterwards.
row_sums = matrix.sum(axis=1)
matrix_normalised = np.divide(matrix, row_sums.reshape(-1, 1))

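After normalising each row to sum to 1, `matrix_normalised` should be the empirical estimate of the transition matrix at the specific branch length that we chose: entry $(z, y)$ is the observed fraction of entries with state $y$ in `Y` among those whose state in `Z` is $z$.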

In [5]:
# Empirical proportions: rows are indexed by Z_state, columns by Y_state.
print(matrix_normalised)

[[0.87708333 0.12291667]
 [0.2625     0.7375    ]]


Now we can calculate the actual (model-implied) matrix at the specific branch length. For the model validation, we have fixed the branch lengths of all species to the same value. That means that we can take any one of `[species_branch_lengths_1, ..., species_branch_lengths_N]` and use it as the branch length for all species.

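The same can be said for the rates $\mu$ of the species, which the code below builds from `species_alpha` and `species_log_nu`: we can take any one of `[species_alpha_1, ..., species_alpha_N]` and `[species_log_nu_1, ..., species_log_nu_N]` (here the first of each) and use it for all species.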

In [None]:
# Theoretical transition matrix of the two-state model at the chosen branch length.
species_branch_len = pd.read_csv("acol_species_simulated.txt", sep="\t", index_col=0)
params = pd.read_csv("acol_input_simulated.txt", sep="\t")
# The table gets its own name so that `species_log_nu` is only ever bound to
# the extracted value, never to the whole DataFrame.
species_log_nu_table = pd.read_csv("./acol_simulated.txt", sep="\t", index_col=0)

# For the model validation all species branch lengths were fixed to the same
# value, so the first one is representative.
# `.to_numpy()` yields an array of the selected row's values; it is broadcast
# against the 2x2 matrix below (assumes one value per row — TODO confirm).
branch_length = species_branch_len.loc["species_branch_lengths_1"].to_numpy()

species_alpha = params[params["name"] == "species_alpha_1"]["value"].to_numpy().item()
species_log_nu = species_log_nu_table.loc["species_log_nu_1"].to_numpy()

# Unscaled rate matrix; each row sums to zero.
start_matrix = np.array(
    [[-species_alpha, species_alpha], [1 - species_alpha, species_alpha - 1]]
)
# Scale the rates by exp(log_nu) and by the branch length, then exponentiate
# to obtain the transition probabilities.
matrix_at_grid_position = branch_length * start_matrix * np.exp(species_log_nu)
matrix_exp = expm(matrix_at_grid_position)
# NOTE(review): if both printed rows are identical, the chain has already
# reached its stationary distribution (1 - alpha, alpha) at this rate and
# branch length, i.e. the result no longer depends on the start state —
# confirm the inputs before comparing with `matrix_normalised`.
print(matrix_exp)

[[0.6 0.4]
 [0.6 0.4]]


In [35]:
# Inspect the unscaled rate matrix built from species_alpha.
start_matrix

array([[-0.4,  0.4],
       [ 0.6, -0.6]])

In [23]:
# Load the full parameter table (default integer index) to look at its layout.
# NOTE(review): this rebinds `species_log_nu`, which an earlier cell set to the
# value of species_log_nu_1 — cells that need that value must run before this one.
species_log_nu = pd.read_table("./acol_simulated.txt")

In [24]:
# Display the parameter table to inspect its name/value layout.
species_log_nu

Unnamed: 0,name,value
0,molecules_mean_log_nu,0.5
1,molecules_var_log_nu,0.2
2,molecules_log_nu_1,2.5
3,molecules_log_nu_2,2.5
4,molecules_log_nu_3,2.5
...,...,...
191,species_alpha_32,0.4
192,species_alpha_33,0.4
193,species_alpha_34,0.4
194,species_alpha_35,0.4


As we can see, the empirical matrix (`matrix_normalised`) and the theoretical matrix (`matrix_exp`) are rather similar. This indicates that the calculations for this part of the model seem to be correct.

We can also check that the average value of $Z$ (i.e. the fraction of entries with $Z = 1$) is equal to $\frac{\mu_1}{\mu_1 + \mu_0}$.

In [None]:
# Mean of the Z_state column; this equals the fraction of entries with
# Z_state == 1 if the column only holds 0s and 1s (TODO confirm).
print(Z["Z_state"].mean())

In [None]:
# `species_mu_1` / `species_mu_0` are not defined by any earlier cell, so they
# are taken from the off-diagonal entries of `start_matrix`. The common scaling
# by branch length and exp(log_nu) cancels in the ratio below.
# NOTE(review): assumes mu_1 is the 0 -> 1 rate and mu_0 the 1 -> 0 rate — confirm.
species_mu_1 = start_matrix[0, 1]
species_mu_0 = start_matrix[1, 0]
species_mu_1 / (species_mu_1 + species_mu_0)

Both are also quite similar.