In [1]:
import numpy as np
import pygrgl
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

import sys
sys.path.append('/Users/adityasyam/grg_pheno_sim') 

from grg_pheno_sim.effect_size import sim_grg_causal_mutation, additive_effect_sizes, samples_to_individuals, normalize_genetic_values
from grg_pheno_sim.model import grg_causal_mutation_model
from grg_pheno_sim.split import split_genetic_values

from test_phenotype_sim.effect_size_test import test_additive_effect_sizes
from test_phenotype_sim.genetic_value_test import split_sample_df, split_causal_mutation_df, split_normalized_genetic_values



The following command only converts the gzipped VCF file into a GRG, which is then used for the phenotype simulation.

In [None]:
%%script bash --out /dev/null
# Build test-200-samples.grg from ../data/test-200-samples.vcf.gz with the `grg` command-line tool.
# NOTE(review): the meaning of --no-maf-flip, -p 10 and -t 2 is not documented here; confirm against `grg construct --help`.
echo "Test"
grg construct --no-maf-flip -p 10 -t 2 ../data/test-200-samples.vcf.gz --out-file test-200-samples.grg

In [3]:
# Load the GRG file written by the `grg construct` cell above; the relative path is resolved against the notebook's working directory.
grg_1 = pygrgl.load_immutable_grg("test-200-samples.grg") #loading in a sample grg stored in the same directory


In [None]:
# Seed the generator so the effect sizes printed below are identical on every
# Restart & Run All; an unseeded default_rng() pulls fresh entropy from the OS
# on each run. The value 1 matches the random_seed=1 used by the simulation cells.
rng = np.random.default_rng(1)

# Two-dimensional normal model: zero mean vector and identity covariance matrix
mean_1 = np.zeros(2)
cov_1 = np.eye(2)
model_normal_multi = grg_causal_mutation_model("multivariate normal", mean=mean_1, cov=cov_1)

#simulating effect sizes using just the model
num_causal = 1000  #number of causal sites

effect_sizes = model_normal_multi.sim_effect_size(num_causal, rng)

print("Simulated Effect Sizes:", effect_sizes)

In [None]:
# Simulate causal mutations on grg_1 with the multivariate normal model.
# random_seed=1 is passed to the simulator (presumably to make the draw reproducible; confirm against sim_grg_causal_mutation).
trait_df_normal_multi = sim_grg_causal_mutation(grg_1, num_causal=num_causal, model=model_normal_multi, random_seed=1)

trait_df_normal_multi #this is the pandas data frame showing the results using the normal distribution

In [None]:
# Compute per-sample-node genetic values from the GRG and the simulated causal-mutation table.
sample_nodes_df_multi = additive_effect_sizes(grg_1, trait_df_normal_multi)
sample_nodes_df_multi #the pandas dataframe containing only the sample nodes and their associated effect sizes; 400*n rows expected, where n is the number of unique causal mutation ids

In [None]:
# NOTE(review): return_print=True presumably makes the helper print the split dataframes instead of returning them; confirm against split_genetic_values.
split_genetic_values(sample_nodes_df_multi, return_print=True)

In [None]:
# Request the split result as a list so the individual dataframes can be indexed below.
# NOTE(review): split_genetic_df is bound to the same object as split_df_list and is not referenced anywhere else in the visible notebook; it looks like a leftover alias.
split_df_list = split_genetic_df = split_genetic_values(sample_nodes_df_multi, return_list=True)
split_df_list[0] #the first dataframe

In [None]:
# Display the second entry of the list returned by split_genetic_values(..., return_list=True).
split_df_list[1] #the second dataframe

In [None]:
# Split the sample-level genetic values and the causal-mutation table into
# index-addressable pieces; `num` is the number of pieces returned by split_sample_df.
df_dict, num = split_sample_df(sample_nodes_df_multi)
causal_mutation_df_dict = split_causal_mutation_df(trait_df_normal_multi)

# One placeholder slot per piece; each slot is filled in by the loop below.
ground_truth_genetic_values = dict.fromkeys(range(num))

for i in range(num):
    # Reference values from the test helper, compared against the additive_effect_sizes output
    expected = test_additive_effect_sizes(grg_1, causal_mutation_df_dict[i])
    ground_truth_genetic_values[i] = expected
    observed = df_dict[i]['genetic_value']
    r2 = r2_score(expected, observed)

    # Overlay observed and expected values on one axes to verify the additive function visually
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(range(len(observed)), observed, color='blue', label='Observed Genetic Value')
    ax.scatter(range(len(expected)), expected, color='red', alpha=0.6, label='Expected Genetic Value')

    ax.set_title(f'Comparison of Observed and Expected Genetic Values for Causal Mutation {i+1} \n$R^2$ Score: {r2:.2f}')
    ax.set_xlabel('Index')
    ax.set_ylabel('Genetic Value Size')
    ax.legend()
    ax.grid(True)

    plt.show()


The two graphs above show that the genetic values for each causal mutation are calculated as expected: they are cross-checked against values computed independently by recursively checking the ancestral mutation nodes in the GRG.

In [None]:
# Convert the sample-node-level dataframe into an individual-level one.
# NOTE(review): presumably each individual's sample nodes are combined into one row; confirm against samples_to_individuals.
individuals = samples_to_individuals(sample_nodes_df_multi)
individuals

In [None]:
# Normalize the individual-level genetic values; the next cell prints the resulting mean and standard deviation.
normalized_individual_multi_normal = normalize_genetic_values(individuals)
normalized_individual_multi_normal

In [None]:
normalized_dict, num = split_normalized_genetic_values(normalized_individual_multi_normal)

# Report the post-normalization mean and standard deviation of every split piece
for i in range(num):
    genetic_values = normalized_dict[i]['genetic_value']
    # Stringify with str() first so the printed text matches plain concatenation exactly
    mean_text = str(genetic_values.mean())
    std_text = str(genetic_values.std())
    print(f"The new mean of genetic values for causal mutation {i+1} is {mean_text}")
    print(f"The new standard deviation of genetic values for causal mutation {i+1} is {std_text}")



Now we demonstrate multivariate genetic value simulation for more than two causal mutations.

In [None]:
# Seed the generator so the effect sizes printed below are identical on every
# Restart & Run All; an unseeded default_rng() pulls fresh entropy from the OS
# on each run. The value 1 matches the random_seed=1 used by the simulation cells.
rng = np.random.default_rng(1)

# Three scale parameters for the multivariate exponential model
# (presumably one per simulated dimension; confirm against grg_causal_mutation_model).
scales_1 = np.array([1, 2, 5])
model_exp_multi = grg_causal_mutation_model("multivariate exponential", scales=scales_1, random_sign=False)

#simulating effect sizes using just the model
num_causal = 1000  #number of causal sites

effect_sizes = model_exp_multi.sim_effect_size(num_causal, rng)

print("Simulated Effect Sizes:", effect_sizes)

In [None]:
# Simulate causal mutations on grg_1 with the multivariate exponential model.
# random_seed=1 is passed to the simulator (presumably to make the draw reproducible; confirm against sim_grg_causal_mutation).
trait_df_exp_multi = sim_grg_causal_mutation(grg_1, num_causal=num_causal, model=model_exp_multi, random_seed=1)
trait_df_exp_multi


In [None]:
# Compute per-sample-node genetic values from the GRG and the exponential-model causal-mutation table.
sample_nodes_df_multi_exp = additive_effect_sizes(grg_1, trait_df_exp_multi)
sample_nodes_df_multi_exp #the pandas dataframe containing only the sample nodes and their associated effect sizes; 400*n rows expected, where n is the number of unique causal mutation ids

In [None]:
# Split the sample-level genetic values and the causal-mutation table into
# index-addressable pieces; `num` is the number of pieces returned by split_sample_df.
df_dict, num = split_sample_df(sample_nodes_df_multi_exp)
causal_mutation_df_dict = split_causal_mutation_df(trait_df_exp_multi)

# One placeholder slot per piece; each slot is filled in by the loop below.
ground_truth_genetic_values = dict.fromkeys(range(num))

for i in range(num):
    # Reference values from the test helper, compared against the additive_effect_sizes output
    expected = test_additive_effect_sizes(grg_1, causal_mutation_df_dict[i])
    ground_truth_genetic_values[i] = expected
    observed = df_dict[i]['genetic_value']
    r2 = r2_score(expected, observed)

    # Overlay observed and expected values on one axes to verify the additive function visually
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(range(len(observed)), observed, color='blue', label='Observed Genetic Value')
    ax.scatter(range(len(expected)), expected, color='red', alpha=0.6, label='Expected Genetic Value')

    ax.set_title(f'Comparison of Observed and Expected Genetic Values for Causal Mutation {i+1}\n$R^2$ Score: {r2:.2f}')
    ax.set_xlabel('Index')
    ax.set_ylabel('Genetic Value Size')
    ax.legend()
    ax.grid(True)

    plt.show()


The three graphs above demonstrate that the genetic values computed for all three causal mutations are exactly as predicted by the recursive check of the ancestral mutation nodes.

In [None]:
# Convert the sample-node-level dataframe into an individual-level one.
# NOTE(review): presumably each individual's sample nodes are combined into one row; confirm against samples_to_individuals.
individuals_exp_multi = samples_to_individuals(sample_nodes_df_multi_exp)
individuals_exp_multi

In [None]:
# Normalize the individual-level genetic values; the next cell prints the resulting mean and standard deviation.
normalized_individuals_multi_exp = normalize_genetic_values(individuals_exp_multi)
normalized_individuals_multi_exp

In [None]:
normalized_dict, num = split_normalized_genetic_values(normalized_individuals_multi_exp)

# Report the post-normalization mean and standard deviation of every split piece
for i in range(num):
    genetic_values = normalized_dict[i]['genetic_value']
    # Stringify with str() first so the printed text matches plain concatenation exactly
    mean_text = str(genetic_values.mean())
    std_text = str(genetic_values.std())
    print(f"The new mean of genetic values for causal mutation {i+1} is {mean_text}")
    print(f"The new standard deviation of genetic values for causal mutation {i+1} is {std_text}")

