In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import pygrgl

import sys
# Make the local grg_pheno_sim checkout importable for the imports below.
# NOTE(review): this is a hard-coded absolute path to one user's machine; change it to
# your own clone location (or install the package) before running this notebook.
sys.path.append('/Users/adityasyam/grg_pheno_sim') 

from grg_pheno_sim.effect_size import sim_grg_causal_mutation, additive_effect_sizes, samples_to_individuals, normalize_genetic_values
from grg_pheno_sim.model import grg_causal_mutation_model

from test_phenotype_sim.effect_size_test import test_additive_effect_sizes


The following command converts the gzipped VCF file into a GRG, which is then used for the phenotype simulation.

In [None]:
%%script bash --out /dev/null
# Build test-200-samples.grg in the current directory from ../data/test-200-samples.vcf.gz.
# `--out /dev/null` on the magic line is an option of IPython's %%script magic, not of bash:
# it captures the cell's stdout instead of displaying it in the notebook.
# NOTE(review): see `grg construct --help` for the meaning of --no-maf-flip, -p and -t.
echo "Test"
grg construct --no-maf-flip -p 10 -t 2 ../data/test-200-samples.vcf.gz --out-file test-200-samples.grg

In [3]:
# Load the sample GRG stored in the same directory as this notebook (the file name
# matches the --out-file written by the `grg construct` cell).
grg_1 = pygrgl.load_immutable_grg("test-200-samples.grg")


In [None]:
# Report the basic size of the loaded GRG: sample nodes, total nodes and mutations.
num = grg_1.num_samples
num_nodes = grg_1.num_nodes
num_mutations = grg_1.num_mutations

# `!s` formats with str(), exactly as the original string concatenation did.
print(f"The number of sample nodes in this grg is {num!s}")
print(f"The number of nodes in this grg is {num_nodes!s}")
print(f"The number of mutations in this grg is {num_mutations!s}")

In [None]:
# Draw effect sizes directly from the model (no GRG involved) to show what the
# chosen distribution produces.
# The generator is seeded so that Restart & Run All prints the same values every
# time; 1 is the same seed value passed as random_seed to sim_grg_causal_mutation.
rng = np.random.default_rng(1)

# Parameters of the normal distribution the effect sizes are drawn from.
mean_1 = 0.0
var_1 = 1.0
model_normal = grg_causal_mutation_model("normal", mean=mean_1, var=var_1)

# Simulating effect sizes using just the model.
num_causal = 1000  # number of causal sites

effect_sizes = model_normal.sim_effect_size(num_causal, rng)

print("Simulated Effect Sizes:", effect_sizes)

In [None]:

# Simulate effect sizes for `num_causal` causal mutations of the GRG under the normal model.
# NOTE(review): random_seed=1 presumably makes this draw reproducible — confirm in
# sim_grg_causal_mutation.
trait_df_normal = sim_grg_causal_mutation(grg_1, num_causal=num_causal, model=model_normal, random_seed=1)

# Display the pandas DataFrame holding the results for the normal distribution.
trait_df_normal

In [7]:
# Compute the genetic value of every sample node from the simulated effect sizes.
# NOTE(review): presumably effect sizes are accumulated down the graph to each sample
# node — confirm in additive_effect_sizes.
sample_nodes_df = additive_effect_sizes(grg_1, trait_df_normal)


In [None]:
# Display the pandas DataFrame containing only the sample nodes and their associated
# genetic values; 400 sample nodes are expected for this data set.
sample_nodes_df

In [None]:
# Reference genetic values from the test helper: it uses a recursive, hard-coded method
# that finds each sample node's ancestors, to check that the correct effect sizes are
# passed down.
ground_truth_genetic_value = test_additive_effect_sizes(grg_1, trait_df_normal)
ground_truth_genetic_value

In [None]:
# Verify the additive function: score the observed genetic values against the
# reference values (reference is y_true, observed is y_pred), then overlay both
# series by row index so any disagreement is visible.
observed_values = sample_nodes_df['genetic_value']
r2 = r2_score(ground_truth_genetic_value, observed_values)

fig, ax = plt.subplots(figsize=(10, 6))

# Observed values are drawn first; the semi-transparent expected values go on top.
ax.scatter(
    range(len(observed_values)),
    observed_values,
    color='blue',
    label='Observed Genetic Value',
)
ax.scatter(
    range(len(ground_truth_genetic_value)),
    ground_truth_genetic_value,
    color='red',
    alpha=0.6,
    label='Expected Genetic Value',
)

ax.set_title(f'Comparison of Observed and Expected Genetic Values\n$R^2$ Score: {r2:.2f}')
ax.set_xlabel('Index')
ax.set_ylabel('Genetic Value Size')
ax.legend()
ax.grid(True)

plt.show()

In [None]:
# Combine the sample nodes into individuals, giving one genetic value per individual.
individual_genetic_value_df = samples_to_individuals(sample_nodes_df)

# Expect half as many individuals as sample nodes, because the data are diploid.
individual_genetic_value_df

In [None]:
# Normalize the individual genetic values; according to the original author's note,
# the defaults set the mean to 0 and the variance to 1.
normalized_genetic_value_df = normalize_genetic_values(individual_genetic_value_df)
normalized_genetic_value_df

In [None]:
# Summary statistics of the normalized genetic values, printed to check the
# result of the normalization step.
normalized_values = normalized_genetic_value_df['genetic_value']
new_mean = normalized_values.mean()
new_std = normalized_values.std()
new_var = normalized_values.var()

# `!s` formats with str(), exactly as the original string concatenation did.
print(f"The new mean of the genetic values is {new_mean!s}")
print(f"The new standard deviation of the genetic values is {new_std!s}")
print(f"The new variance of the genetic values is {new_var!s}")