In [1]:
import os
import random
import sys

import pygrgl

# Directory containing the grg_pheno_sim package. Set the GRG_PHENO_SIM_PATH
# environment variable to point at your own checkout; the original author's
# location is kept as the fallback so existing setups keep working unchanged.
GRG_PHENO_SIM_PATH = os.environ.get("GRG_PHENO_SIM_PATH", "/Users/adityasyam/grg_pheno_sim")

# Guard the append so re-running this cell does not add duplicate entries.
if GRG_PHENO_SIM_PATH not in sys.path:
    sys.path.append(GRG_PHENO_SIM_PATH)

from grg_pheno_sim.model import grg_causal_mutation_model
from grg_pheno_sim.binary_phenotype import sim_binary_phenotypes, sim_binary_phenotypes_custom


The following cell converts the compressed VCF file (`.vcf.gz`) into a GRG, which is then used for the phenotype simulation. The conversion is skipped if the GRG file already exists.

In [2]:
%%script bash --out /dev/null
# Build the GRG from the VCF only when test-200-samples.grg is not already
# present, so re-running the notebook does not repeat the construction step.
if [ ! -f test-200-samples.grg ]; then
  grg construct --no-maf-flip -p 10 -t 2 ../data/test-200-samples.vcf.gz --out-file test-200-samples.grg
fi

In [3]:
# --- Input graph ---
# Load the sample GRG stored in the same directory as this notebook.
grg_1 = pygrgl.load_immutable_grg("test-200-samples.grg")

# --- Effect-size model ---
model_type = "normal"
mean = 0
var = 1
model = grg_causal_mutation_model(model_type, mean=mean, var=var)

# --- Simulation settings ---
num_causal = 1000
random_seed = 1
normalize_genetic_values_before_noise = True
noise_heritability = 0.33
# A prevalence of 0.1 means that 1 in 10 individuals is a case.
population_prevalence = 0.1

# --- Output settings ---
# When True, the effect size of each mutation node is saved to a .par file.
effect_output_required = True
effect_path = 'univariate_sample_effect_sizes.par'
standardized_output = True
# Name of the file (in the current directory) the output is saved to.
output_path = 'normal_pheno_binary.phen'
# Set to True if column names are expected in the output file.
header = True

phenotypes_binary = sim_binary_phenotypes(
    grg_1,
    model,
    num_causal,
    population_prevalence,
    random_seed,
    normalize_genetic_values_before_noise=normalize_genetic_values_before_noise,
    noise_heritability=noise_heritability,
    save_effect_output=effect_output_required,
    effect_path=effect_path,
    standardized_output=standardized_output,
    path=output_path,
    header=header,
)


The initial effect sizes are 
     mutation_id  effect_size  causal_mutation_id
0             20    -1.810258                   0
1             28     1.151768                   0
2             62     1.681257                   0
3             76     2.346698                   0
4            119    -0.286668                   0
..           ...          ...                 ...
995        10862    -0.221163                   0
996        10874    -1.136983                   0
997        10879    -0.966133                   0
998        10883    -1.402602                   0
999        10889    -0.483777                   0

[1000 rows x 3 columns]
The genetic values of the individuals are 
     individual_id  genetic_value  causal_mutation_id
0                0     -16.501664                   0
1                1      -2.454348                   0
2                2     -17.303803                   0
3                3       6.641214                   0
4                4      -8.71021

In [4]:
# Display the result returned by sim_binary_phenotypes for prevalence 0.1.
phenotypes_binary

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,-0.221014,-0.126973,0
1,0,1,0.403924,0.904646,1
2,0,2,-0.256700,0.202063,0
3,0,3,0.808568,0.823311,1
4,0,4,0.125613,-0.365165,0
...,...,...,...,...,...
195,0,195,1.070838,-1.301230,0
196,0,196,1.249353,-1.056326,0
197,0,197,0.756180,-0.318990,0
198,0,198,0.483583,-0.695453,0


In [5]:
# Tally the 0 and 1 entries of the phenotype column, then report the share of
# 1s so it can be compared with the requested population prevalence.
binary_list = phenotypes_binary["phenotype"]
is_one = binary_list == 1
is_zero = binary_list == 0
num_ones = is_one.sum()
num_zeros = is_zero.sum()
observed_prevalence = num_ones / (num_ones + num_zeros)

print("Number of 0s:", num_zeros)
print("Number of 1s:", num_ones)
print("Population prevalence ratio observed: ", str(observed_prevalence))


Number of 0s: 181
Number of 1s: 19
Population prevalence ratio observed:  0.095


The observed population prevalence above (0.095) is close to the requested prevalence of 0.1, which indicates that our final phenotypes approximately reflect the expected prevalence.

We now demonstrate the use case for a higher population prevalence.

In [6]:
# Load the sample GRG stored in the same directory as this notebook.
grg_1 = pygrgl.load_immutable_grg("test-200-samples.grg")

# Effect-size model configuration.
model_type, mean, var = "normal", 0, 1
model = grg_causal_mutation_model(model_type, mean=mean, var=var)

# Positional simulation inputs.
num_causal = 1000
random_seed = 1
# A prevalence of 0.5 means that 1 in 2 individuals is a case.
population_prevalence_high = 0.5

# Keyword simulation inputs, declared individually so each stays inspectable.
normalize_genetic_values_before_noise = True
noise_heritability = 0.33
# When True, the effect size of each mutation node is saved to a .par file.
effect_output_required = True
effect_path = 'univariate_sample_effect_sizes.par'
standardized_output = True
# Name of the file (in the current directory) the output is saved to.
output_path = 'normal_pheno_binary_high_prevalence.phen'
# Set to True if column names are expected in the output file.
header = True

# Gather the keyword arguments in one place and unpack them into the call.
high_prev_options = {
    "normalize_genetic_values_before_noise": normalize_genetic_values_before_noise,
    "noise_heritability": noise_heritability,
    "save_effect_output": effect_output_required,
    "effect_path": effect_path,
    "standardized_output": standardized_output,
    "path": output_path,
    "header": header,
}

phenotypes_binary_high_prev = sim_binary_phenotypes(
    grg_1, model, num_causal, population_prevalence_high, random_seed, **high_prev_options
)


The initial effect sizes are 
     mutation_id  effect_size  causal_mutation_id
0             20    -1.810258                   0
1             28     1.151768                   0
2             62     1.681257                   0
3             76     2.346698                   0
4            119    -0.286668                   0
..           ...          ...                 ...
995        10862    -0.221163                   0
996        10874    -1.136983                   0
997        10879    -0.966133                   0
998        10883    -1.402602                   0
999        10889    -0.483777                   0

[1000 rows x 3 columns]
The genetic values of the individuals are 
     individual_id  genetic_value  causal_mutation_id
0                0     -16.501664                   0
1                1      -2.454348                   0
2                2     -17.303803                   0
3                3       6.641214                   0
4                4      -8.71021

In [7]:
# Display the result returned by sim_binary_phenotypes for prevalence 0.5.
phenotypes_binary_high_prev

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,-0.217243,-0.711398,0
1,0,1,0.397032,-1.227984,0
2,0,2,-0.252320,-0.161118,0
3,0,3,0.794773,0.204181,1
4,0,4,0.123469,-0.655030,0
...,...,...,...,...,...
195,0,195,1.052568,-0.770886,1
196,0,196,1.228038,0.888635,1
197,0,197,0.743278,-1.046228,0
198,0,198,0.475332,1.508828,1


In [8]:
# Tally the 0 and 1 entries of the high-prevalence phenotype column and report
# the share of 1s for comparison with the requested prevalence.
binary_list_high_prev = phenotypes_binary_high_prev["phenotype"]
num_zeros, num_ones = (
    (binary_list_high_prev == 0).sum(),
    (binary_list_high_prev == 1).sum(),
)
observed_prevalence_high = num_ones / (num_ones + num_zeros)

print("Number of 0s:", num_zeros)
print("Number of 1s:", num_ones)
print("Population prevalence ratio observed: ", str(observed_prevalence_high))


Number of 0s: 97
Number of 1s: 103
Population prevalence ratio observed:  0.515


Finally, we use custom effect sizes to compute the binary phenotypes.

In [9]:
# Draw one custom effect size per mutation, each a float in [0.0, 1.0).
# A dedicated, seeded generator makes the effect sizes identical on every
# "Restart & Run All" and leaves the global `random` state untouched.
custom_effects_seed = 1
custom_effects_rng = random.Random(custom_effects_seed)
random_effects = [custom_effects_rng.random() for _ in range(grg_1.num_mutations)] #list input


In [10]:
# Simulation settings for the custom-effect-size run.
population_prevalence_custom = 0.25
normalize_genetic_values_before_noise = True
noise_heritability = 0.33
standardized_output = True
# Name of the file (in the current directory) the output is saved to.
output_path = 'custom_pheno_binary.phen'

phenotypes_list_binary = sim_binary_phenotypes_custom(
    grg_1,
    random_effects,
    population_prevalence_custom,
    normalize_genetic_values_before_noise=normalize_genetic_values_before_noise,
    noise_heritability=noise_heritability,
    standardized_output=standardized_output,
    path=output_path,
)

The initial effect sizes are 
       mutation_id  effect_size  causal_mutation_id
0                0     0.949787                   0
1                1     0.526681                   0
2                2     0.616673                   0
3                3     0.862413                   0
4                4     0.357261                   0
...            ...          ...                 ...
10888        10888     0.138680                   0
10889        10889     0.576951                   0
10890        10890     0.013830                   0
10891        10891     0.736318                   0
10892        10892     0.144842                   0

[10893 rows x 3 columns]
The genetic values of the individuals are 
     individual_id  genetic_value  causal_mutation_id
0                0    1322.518877                   0
1                1    1364.073002                   0
2                2    1344.233740                   0
3                3    1382.769615                   0
4      

In [11]:
# Display the result returned by sim_binary_phenotypes_custom for prevalence 0.25.
phenotypes_list_binary

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,-1.032290,0.235108,0
1,0,1,0.231930,0.342843,0
2,0,2,-0.371649,0.996948,0
3,0,3,0.800746,1.378561,1
4,0,4,-0.302185,-0.019552,0
...,...,...,...,...,...
195,0,195,0.649143,-0.329171,0
196,0,196,-0.852008,-0.370296,0
197,0,197,-0.242326,-0.240666,0
198,0,198,0.000594,-0.225209,0


In [12]:
# Tally the 0 and 1 entries of the custom-effects phenotype column and report
# the share of 1s for comparison with the requested prevalence.
binary_list_custom = phenotypes_list_binary["phenotype"]
zero_mask = binary_list_custom == 0
one_mask = binary_list_custom == 1
num_zeros = zero_mask.sum()
num_ones = one_mask.sum()
observed_prevalence_custom = num_ones / (num_ones + num_zeros)

print("Number of 0s:", num_zeros)
print("Number of 1s:", num_ones)
print("Population prevalence ratio observed: ", str(observed_prevalence_custom))


Number of 0s: 151
Number of 1s: 49
Population prevalence ratio observed:  0.245
