In [1]:
import pygrgl
import random

from grg_pheno_sim.model import grg_causal_mutation_model
from grg_pheno_sim.binary_phenotype import sim_binary_phenotypes, sim_binary_phenotypes_custom


The following command only serves the purpose of converting the VCF zip file into a GRG that will be used for the phenotype simulation.

In [2]:
%%script bash --out /dev/null
if [ ! -f test-200-samples.grg ]; then
  grg construct -p 10 -t 2 ../data/test-200-samples.vcf.gz --out-file test-200-samples.grg
fi

In [3]:
grg_1 = pygrgl.load_immutable_grg("test-200-samples.grg") #loading in a sample grg stored in the same directory

model_type = "normal"
mean = 0
var = 1

model = grg_causal_mutation_model(model_type, mean=mean, var=var)

num_causal = 1000

random_seed = 1

normalize_genetic_values_before_noise = True

noise_heritability = 0.33

population_prevalence = 0.1 #this means 1 in 10 individuals are a case

effect_output_required = True #saves the effect sizes data for each mutation node in a .par file

effect_path = 'univariate_sample_effect_sizes.par'

standardized_output = True

output_path = 'normal_pheno_binary.phen' #define the path to be saved at, this output is saved in the file of this name in the same directory

header=True #set header parameter to true if column names are expected in output file

phenotypes_binary = sim_binary_phenotypes(grg_1, model, num_causal, population_prevalence, random_seed, normalize_genetic_values_before_noise=normalize_genetic_values_before_noise, noise_heritability=noise_heritability, save_effect_output=effect_output_required, effect_path=effect_path, standardized_output=standardized_output, path=output_path, header=header)


The initial effect sizes are 
     mutation_id  effect_size  causal_mutation_id
0             20    -1.810258                   0
1             28     1.151768                   0
2             62     1.681257                   0
3             76     2.346698                   0
4            119    -0.286668                   0
..           ...          ...                 ...
995        10862    -0.221163                   0
996        10874    -1.136983                   0
997        10879    -0.966133                   0
998        10883    -1.402602                   0
999        10889    -0.483777                   0

[1000 rows x 3 columns]
The genetic values of the individuals are 
     individual_id  genetic_value  causal_mutation_id
0                0     -16.501664                   0
1                1      -2.454348                   0
2                2     -17.303803                   0
3                3       6.641214                   0
4                4      -8.71021

In [4]:
phenotypes_binary

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,-0.216539,-0.795106,0
1,0,1,0.395746,1.072164,1
2,0,2,-0.251502,-0.203877,0
3,0,3,0.792197,-0.000972,0
4,0,4,0.123069,-0.063396,0
...,...,...,...,...,...
195,0,195,1.049157,0.677136,1
196,0,196,1.224058,0.972546,1
197,0,197,0.740869,1.314725,1
198,0,198,0.473792,-0.314411,0


In [5]:
binary_list = phenotypes_binary["phenotype"]
num_zeros = (binary_list == 0).sum()
num_ones = (binary_list == 1).sum()

print("Number of 0s:", num_zeros)
print("Number of 1s:", num_ones)
print("Population prevalence ratio observed: ", str(num_ones/(num_ones+num_zeros)))


Number of 0s: 177
Number of 1s: 23
Population prevalence ratio observed:  0.115


The observed population prevalence above indicates that our final phenotypes approximately reflect the expected prevalence of 0.1.

We now demonstrate the use case for a higher population prevalence.

In [6]:
grg_1 = pygrgl.load_immutable_grg("test-200-samples.grg") #loading in a sample grg stored in the same directory

model_type = "normal"
mean = 0
var = 1

model = grg_causal_mutation_model(model_type, mean=mean, var=var)

num_causal = 1000

random_seed = 1

normalize_genetic_values_before_noise = True

noise_heritability = 0.33

population_prevalence_high = 0.5 #this means 1 in 2 individuals are a case

effect_output_required = True #saves the effect sizes data for each mutation node in a .par file

effect_path = 'univariate_sample_effect_sizes.par'

standardized_output = True

output_path = 'normal_pheno_binary_high_prevalence.phen' #define the path to be saved at, this output is saved in the file of this name in the same directory

header=True #set header parameter to true if column names are expected in output file

phenotypes_binary_high_prev = sim_binary_phenotypes(grg_1, model, num_causal, population_prevalence_high, random_seed, normalize_genetic_values_before_noise=normalize_genetic_values_before_noise, noise_heritability=noise_heritability, save_effect_output=effect_output_required, effect_path=effect_path, standardized_output=standardized_output, path=output_path, header=header)


The initial effect sizes are 
     mutation_id  effect_size  causal_mutation_id
0             20    -1.810258                   0
1             28     1.151768                   0
2             62     1.681257                   0
3             76     2.346698                   0
4            119    -0.286668                   0
..           ...          ...                 ...
995        10862    -0.221163                   0
996        10874    -1.136983                   0
997        10879    -0.966133                   0
998        10883    -1.402602                   0
999        10889    -0.483777                   0

[1000 rows x 3 columns]
The genetic values of the individuals are 
     individual_id  genetic_value  causal_mutation_id
0                0     -16.501664                   0
1                1      -2.454348                   0
2                2     -17.303803                   0
3                3       6.641214                   0
4                4      -8.71021

In [7]:
phenotypes_binary_high_prev

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,-0.219080,0.386321,1
1,0,1,0.400390,0.561498,1
2,0,2,-0.254454,-0.534813,0
3,0,3,0.801493,0.646457,1
4,0,4,0.124513,-0.816948,0
...,...,...,...,...,...
195,0,195,1.061467,0.546481,1
196,0,196,1.238421,-0.094990,1
197,0,197,0.749563,0.177013,1
198,0,198,0.479351,-0.451529,1


In [8]:
binary_list_high_prev = phenotypes_binary_high_prev["phenotype"]
num_zeros = (binary_list_high_prev == 0).sum()
num_ones = (binary_list_high_prev == 1).sum()

print("Number of 0s:", num_zeros)
print("Number of 1s:", num_ones)
print("Population prevalence ratio observed: ", str(num_ones/(num_ones+num_zeros)))


Number of 0s: 103
Number of 1s: 97
Population prevalence ratio observed:  0.485


Finally, we use custom effect sizes to compute the binary phenotypes.

In [9]:
random_effects = [random.random() for _ in range(grg_1.num_mutations)] #list input


In [10]:
normalize_genetic_values_before_noise = True

noise_heritability = 0.33

population_prevalence_custom = 0.25

standardized_output = True

output_path = 'custom_pheno_binary.phen' #define the path to be saved at, this output is saved in the file of this name in the same directory

phenotypes_list_binary = sim_binary_phenotypes_custom(grg_1, random_effects, population_prevalence_custom, normalize_genetic_values_before_noise=normalize_genetic_values_before_noise, noise_heritability=noise_heritability, standardized_output=standardized_output, path=output_path)

The initial effect sizes are 
       mutation_id  effect_size  causal_mutation_id
0                0     0.763835                   0
1                1     0.153509                   0
2                2     0.524408                   0
3                3     0.401295                   0
4                4     0.552405                   0
...            ...          ...                 ...
10888        10888     0.908213                   0
10889        10889     0.030195                   0
10890        10890     0.542659                   0
10891        10891     0.264924                   0
10892        10892     0.780432                   0

[10893 rows x 3 columns]
The genetic values of the individuals are 
     individual_id  genetic_value  causal_mutation_id
0                0    1330.576786                   0
1                1    1366.883450                   0
2                2    1374.904500                   0
3                3    1398.877987                   0
4      

In [11]:
phenotypes_list_binary

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,-1.334423,0.107752,0
1,0,1,-0.321712,-0.077230,0
2,0,2,-0.097979,-0.331499,0
3,0,3,0.570719,-0.376596,0
4,0,4,-0.069611,-0.486038,0
...,...,...,...,...,...
195,0,195,0.407043,-1.018499,0
196,0,196,-0.522043,0.495159,0
197,0,197,-0.176018,-1.089302,0
198,0,198,-0.093755,0.412435,0


In [12]:
binary_list_custom = phenotypes_list_binary["phenotype"]
num_zeros = (binary_list_custom == 0).sum()
num_ones = (binary_list_custom == 1).sum()

print("Number of 0s:", num_zeros)
print("Number of 1s:", num_ones)
print("Population prevalence ratio observed: ", str(num_ones/(num_ones+num_zeros)))


Number of 0s: 151
Number of 1s: 49
Population prevalence ratio observed:  0.245
