In [1]:
import pandas as pd
import numpy as np
from scipy.linalg import expm

In [2]:
# Load the Y table from a tab-separated file; the first column of the file
# becomes the DataFrame index.
Y = pd.read_csv("./acol_simulated_Y.txt", sep="\t", index_col=0)

In [3]:
# Display the loaded Y table to inspect its columns before they are used below.
Y

Unnamed: 0_level_0,Y_state,molecules,species,fraction_of_one
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,cholesterol,human,0.98878
1,1,cholesterol,mosquito,0.79070
2,0,cholesterol,fly,0.79210
3,1,cholesterol,worm,0.79220
4,1,cholesterol,bacteria,0.79228
...,...,...,...,...
427,0,lima,cat,0.24152
428,1,lima,fish,0.23654
429,1,lima,bird,0.24014
430,1,lima,plant,0.23508


In [4]:
# Load the Z table from a tab-separated file. No index column is requested,
# so the DataFrame gets pandas' default integer index.
Z = pd.read_csv("./acol_simulated_Z_species.txt", sep="\t")

In [5]:
# Display the loaded Z table to inspect its columns before they are used below.
Z

Unnamed: 0,molecules,species,position,Z_state
0,cholesterol,species,0,1
1,sugar,species,1,0
2,leucine,species,2,0
3,morphine,species,3,0
4,fentanyl,species,4,0
5,protein,species,5,0
6,hydrogen,species,6,0
7,marco,species,7,0
8,hello,species,8,0
9,cocaine,species,9,1


In [6]:
# Count how often each (Z_state, Y_state) pair occurs across the rows of Y:
# matrix[z, y] is the number of rows of Y whose molecule has Z_state == z in Z
# and whose own Y_state == y. Only the states 0 and 1 are counted.

# One Z_state per molecule; if a molecule is listed more than once in Z, the
# first entry is used.
z_state_by_molecule = Z.drop_duplicates(subset="molecules").set_index("molecules")[
    "Z_state"
]

# Look up the Z_state of every row of Y in one vectorised step instead of
# filtering Z once per row.
z_states = Y["molecules"].map(z_state_by_molecule)
missing = z_states.isna().to_numpy()
if missing.any():
    unknown = pd.unique(Y["molecules"].to_numpy()[missing]).tolist()
    raise KeyError(f"molecules in Y without a Z_state in Z: {unknown}")
z_states = z_states.astype(int).to_numpy()
y_states = Y["Y_state"].to_numpy()

matrix = np.zeros((2, 2), dtype=int)
for z_state in (0, 1):
    for y_state in (0, 1):
        matrix[z_state, y_state] = np.count_nonzero(
            (z_states == z_state) & (y_states == y_state)
        )

In [7]:
# Turn the counts into row-wise relative frequencies: every row of `matrix`
# is divided by its own total.
row_sums = np.sum(matrix, axis=1)
matrix_normalised = matrix / row_sums.reshape(-1, 1)

After row-normalisation, this matrix should approximate the transition matrix at the specific branch length that we chose.

In [12]:
# Print the row-normalised count matrix so it can be compared with the matrix
# exponential printed further below.
print(matrix_normalised)

[[0.93269231 0.06730769]
 [0.225      0.775     ]]


Now we can calculate the actual matrix at the specific branch length. For the model validation, we have fixed the branch lengths of all species to the same value. That means that we can take any one of [species_branch_lengths_1, ..., species_branch_lengths_N] and use it as the branch length for all species.

The same can be said for the $\mu$ of the species. We can take any of the [species_mu_1, ..., species_mu_N] and use it as the $\mu$ for all species.

In [9]:
# Read the branch-length grid (its first column becomes the index) and the
# parameter table, both tab-separated.
species_branch_len = pd.read_csv(
    "acol_simulated_species_branch_length_grid.txt", sep="\t", index_col=0
)
params = pd.read_csv("acol_input_simulated.txt", sep="\t")

# for the model validation, given that we fixed all species branch length to the same values, we can take the first one only
# The parameter's "value" is used as a label into the grid's index. `.item()`
# turns the result into a scalar and raises if the lookup does not yield
# exactly one entry, instead of silently broadcasting a longer array against
# the 2x2 matrix below.
branch_length = (
    species_branch_len.loc[
        params[params["name"] == "species_branch_lengths_1"]["value"]
    ]["branch_length"]
    .to_numpy()
    .item()
)

species_mu_1 = params[params["name"] == "species_mu_1_1"]["value"].to_numpy().item()
species_mu_0 = params[params["name"] == "species_mu_0_1"]["value"].to_numpy().item()

# 2x2 matrix whose rows each sum to zero; it is scaled by the branch length
# and then passed through the matrix exponential.
start_matrix = np.array([[-species_mu_1, species_mu_1], [species_mu_0, -species_mu_0]])
matrix_at_grid_position = branch_length * start_matrix
matrix_exp = expm(matrix_at_grid_position)
print(matrix_exp)

[[0.94323171 0.05676829]
 [0.21288108 0.78711892]]


As we can see, the empirical (row-normalised) matrix and the matrix exponential are rather similar. This indicates that the calculations for this part of the model seem to be correct.

We can also check that the average value of Z, i.e. the fraction of entries with $Z = 1$, is approximately equal to $\frac{\mu_1}{\mu_1 + \mu_0}$.

In [10]:
# Mean of the Z_state column of Z.
print(Z["Z_state"].mean())

0.2777777777777778


In [None]:
# Ratio of species_mu_1 to the sum of both rates, for comparison with the mean
# of Z_state printed above.
species_mu_1 / (species_mu_1 + species_mu_0)

0.2105263157894737