In [1]:
import numpy as np
import pygrgl
import matplotlib.pyplot as plt

import sys
# NOTE(review): machine-specific absolute path. Point this at your own
# grg_pheno_sim checkout before running, so that the grg_pheno_sim imports
# below can be resolved.
sys.path.append('/Users/adityasyam/grg_pheno_sim') 

from grg_pheno_sim.effect_size import sim_grg_causal_mutation, additive_effect_sizes, samples_to_individuals
from grg_pheno_sim.model import grg_causal_mutation_model
from grg_pheno_sim.noise_sim import sim_env_noise
from grg_pheno_sim.normalization import quantile_normalize



The following cell serves only to convert the gzipped VCF file into the GRG that is used for the phenotype simulation.

In [None]:
%%script bash --out /dev/null
# Build test-200-samples.grg in the working directory from the gzipped VCF
# using the `grg construct` command-line tool.
# NOTE(review): IPython's %%script --out option names a Python variable that
# captures stdout; it is not a file redirect, so "/dev/null" here is a variable
# name rather than a path -- confirm this is the intended way to hide output.
echo "Test"
grg construct --no-maf-flip -p 10 -t 2 ../data/test-200-samples.vcf.gz --out-file test-200-samples.grg

In [3]:
# Load the sample GRG written by the construct cell above; the relative path
# means it is read from the notebook's working directory.
grg_1 = pygrgl.load_immutable_grg("test-200-samples.grg")


First, we demonstrate quantile normalization on the phenotype dataframe of a univariate (single-trait) simulation.

In [4]:
# Parameters of the univariate "normal" effect-size model: mean 0, variance 1.
mean_1, var_1 = 0.0, 1.0
model_normal = grg_causal_mutation_model(
    "normal",
    mean=mean_1,
    var=var_1,
)

In [None]:

# Simulate causal-mutation effect sizes on the GRG (num_causal=1000, seed fixed at 1).
trait_df_normal = sim_grg_causal_mutation(
    grg_1,
    num_causal=1000,
    model=model_normal,
    random_seed=1,
)

# Turn the effect sizes into per-sample-node values, then into per-individual
# genetic values. These genetic values are NOT normalized.
sample_nodes_df = additive_effect_sizes(grg_1, trait_df_normal)
individual_genetic_value_df = samples_to_individuals(sample_nodes_df)

# Add environmental noise (h2=0.5) on top of the non-normalized genetic values.
phenotypes = sim_env_noise(individual_genetic_value_df, h2=0.5)
phenotype_df = phenotypes.phenotype_df

# Independent copies, consumed by the later normalization cells so that each
# example starts from the same un-normalized dataframe.
phenotype_df_1, phenotype_df_2 = phenotype_df.copy(), phenotype_df.copy()

phenotype_df


In [None]:
# Default call: quantile-normalize to the normal distribution. The result
# carries a "normalized_phenotype" column, which the next cell plots.
quantile_normalize_phenotype_df = quantile_normalize(phenotype_df)
quantile_normalize_phenotype_df

In [None]:
# Histogram of the normalized phenotypes, followed by their summary statistics.
normalized_phenotypes = quantile_normalize_phenotype_df["normalized_phenotype"]

plt.hist(normalized_phenotypes, bins=60)
plt.title("Normalized Phenotypes")
plt.show()

print("Mean of phenotypes: " + str(normalized_phenotypes.mean()))
print("Standard deviation of phenotypes: " + str(normalized_phenotypes.std()))

In [None]:
# Same normalization, run on a fresh copy with phenotype_normalize=False.
# NOTE(review): presumably this normalizes the genetic values instead of the
# phenotypes (as the result name suggests) -- confirm against quantile_normalize.
quantile_normal_genetic_df = quantile_normalize(phenotype_df_1, phenotype_normalize=False)
quantile_normal_genetic_df

In [None]:
# Run on the second fresh copy with normalize_both=True.
# NOTE(review): presumably this normalizes both the genetic values and the
# phenotypes -- confirm against quantile_normalize.
quantile_both_normalized = quantile_normalize(phenotype_df_2, normalize_both=True)
quantile_both_normalized

Now we show the same for a multivariate (multi-trait) simulation.

In [10]:
# Two-dimensional "multivariate normal" effect-size model: zero mean vector and
# identity covariance matrix.
# Note that mean_1 is rebound here from the scalar used in the univariate section.
mean_1, cov_1 = np.zeros(2), np.eye(2)
model_normal_multi = grg_causal_mutation_model(
    "multivariate normal",
    mean=mean_1,
    cov=cov_1,
)



In [None]:
# Simulate causal-mutation effect sizes with the multivariate model
# (num_causal=1000, seed fixed at 1).
trait_df_normal_multi = sim_grg_causal_mutation(
    grg_1,
    num_causal=1000,
    model=model_normal_multi,
    random_seed=1,
)

# Effect sizes -> per-sample-node values -> per-individual genetic values.
sample_nodes_df_multi = additive_effect_sizes(grg_1, trait_df_normal_multi)
individuals = samples_to_individuals(sample_nodes_df_multi)

# User-defined noise: simulated from the multivariate standard normal
# distribution (zero means, identity covariance matrix).
phenotypes_user_defined_multi_normal = sim_env_noise(
    individuals,
    user_defined=True,
    means=mean_1,
    cov=cov_1,
)
phenotypes_user_defined_multi_normal_df = phenotypes_user_defined_multi_normal.phenotype_df

# Independent copies, consumed by the later normalization cells so that each
# example starts from the same un-normalized dataframe.
phenotypes_user_defined_multi_normal_df_1, phenotypes_user_defined_multi_normal_df_2 = (
    phenotypes_user_defined_multi_normal_df.copy(),
    phenotypes_user_defined_multi_normal_df.copy(),
)

phenotypes_user_defined_multi_normal_df

In [None]:
# Default quantile normalization applied to the multivariate phenotype dataframe.
# The result is grouped by 'causal_mutation_id' in the next cell.
quantile_normalized_multi_pheno = quantile_normalize(phenotypes_user_defined_multi_normal_df)
quantile_normalized_multi_pheno

In [None]:
# Split the normalized dataframe into one sub-frame per 'causal_mutation_id'
# value (presumably one id per simulated trait -- confirm), each sorted by
# 'individual_id'. groupby yields its keys in sorted order and dicts preserve
# insertion order, so the frames below are visited in ascending id order.
df_dict_test = {k: v.sort_values('individual_id') for k, v in quantile_normalized_multi_pheno.groupby('causal_mutation_id')}

# Iterate over the grouped frames themselves instead of indexing with
# range(len(...)): the index-based lookup only works when the ids are exactly
# 0..n-1 and raises KeyError for any other labelling.
for temp_df in df_dict_test.values():
    plt.hist(temp_df["normalized_phenotype"], bins=60)
    plt.title("Normalized Phenotypes")
    plt.show()
    print("Mean of phenotypes: " + str(temp_df['normalized_phenotype'].mean()))
    print("Standard deviation of phenotypes: " + str(temp_df['normalized_phenotype'].std()))

In [None]:
# Multivariate counterpart of the phenotype_normalize=False example, run on a
# fresh copy of the un-normalized dataframe.
# NOTE(review): presumably this normalizes the genetic values instead of the
# phenotypes -- confirm against quantile_normalize.
quantile_normalized_multi_genetic = quantile_normalize(phenotypes_user_defined_multi_normal_df_1, phenotype_normalize=False)
quantile_normalized_multi_genetic

In [None]:
# Multivariate counterpart of the univariate normalize_both=True example, run
# on the second fresh copy of the un-normalized dataframe.
# The flag was previously False, which left this "both" example running with
# the option switched off; True matches the univariate cell and the result name.
# NOTE(review): presumably normalize_both=True normalizes both the genetic
# values and the phenotypes -- confirm against quantile_normalize.
quantile_normalized_multi_both = quantile_normalize(phenotypes_user_defined_multi_normal_df_2, normalize_both=True)
quantile_normalized_multi_both