In [1]:
import numpy as np
import pandas as pd
import pygrgl
import matplotlib.pyplot as plt
from scipy.stats import norm
import scipy.stats as stats
import random

import sys
sys.path.append('/Users/adityasyam/compgen/grg_pheno_sim') 

from grg_pheno_sim.model import grg_causal_mutation_model
from grg_pheno_sim.binary_phenotype import sim_binary_phenotypes, sim_binary_phenotypes_custom


In [2]:
# --- Input graph --------------------------------------------------------------
# Sample GRG stored in the same directory as this notebook.
grg_1 = pygrgl.load_immutable_grg("test-200-samples.vcf.gz.final.grg")

# --- Effect-size model --------------------------------------------------------
# Arguments handed to grg_causal_mutation_model below.
model_type = "normal"
mean = 0
var = 1
model = grg_causal_mutation_model(model_type, mean=mean, var=var)

# --- Simulation settings ------------------------------------------------------
num_causal = 1000
random_seed = 1
normalize_genetic_values_before_noise = True
noise_heritability = 0.33
population_prevalence = 0.1  # i.e. 1 in 10 individuals is a case

# --- Output settings ----------------------------------------------------------
effect_output_required = True  # save the per-mutation effect sizes to a .par file
effect_path = 'univariate_sample_effect_sizes.par'
standardized_output = True
output_path = 'normal_pheno_binary.phen'  # phenotype file, written in the same directory
header = True  # True when column names are expected in the output file

# --- Run the binary-phenotype simulation --------------------------------------
phenotypes_binary = sim_binary_phenotypes(
    grg_1,
    model,
    num_causal,
    population_prevalence,
    random_seed,
    normalize_genetic_values_before_noise=normalize_genetic_values_before_noise,
    noise_heritability=noise_heritability,
    save_effect_output=effect_output_required,
    effect_path=effect_path,
    standardized_output=standardized_output,
    path=output_path,
    header=header,
)


The initial effect sizes are 
     mutation_id  effect_size  causal_mutation_id
0             20    -1.810258                   0
1             28     1.151768                   0
2             62     1.681257                   0
3             76     2.346698                   0
4            119    -0.286668                   0
..           ...          ...                 ...
995        10862    -0.221163                   0
996        10874    -1.136983                   0
997        10879    -0.966133                   0
998        10883    -1.402602                   0
999        10889    -0.483777                   0

[1000 rows x 3 columns]
The genetic values of the individuals are 
     individual_id  genetic_value  causal_mutation_id
0                0     -15.227789                   0
1                1     -14.879777                   0
2                2      11.834985                   0
3                3      -3.905829                   0
4                4     -20.24068

In [3]:
# Show the table returned by sim_binary_phenotypes for the 0.1-prevalence run
# (bare last expression, so the notebook renders it as the cell output).
phenotypes_binary

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,-0.728110,-0.553799,0
1,0,1,-0.713080,0.542342,0
2,0,2,0.440680,-1.019672,0
3,0,3,-0.239136,0.207960,0
4,0,4,-0.944608,-1.290741,0
...,...,...,...,...,...
195,0,195,-0.479898,0.578204,0
196,0,196,-0.321614,0.674239,0
197,0,197,-0.938490,0.081610,0
198,0,198,0.367147,-0.319056,0


In [4]:
# Pull out the phenotype column and tally how many entries are 0 and how many are 1.
binary_list = phenotypes_binary["phenotype"]
num_zeros = binary_list.eq(0).sum()
num_ones = binary_list.eq(1).sum()

print("Number of 0s:", num_zeros)
print("Number of 1s:", num_ones)
# Observed prevalence = share of 1s among the entries counted above.
print("Population prevalence ratio observed: ", num_ones / (num_zeros + num_ones))


Number of 0s: 184
Number of 1s: 16
Population prevalence ratio observed:  0.08


The observed prevalence above (16 cases out of 200 individuals, i.e. 0.08) is close to the requested population prevalence of 0.1, so the simulated phenotypes approximately reflect the expected prevalence.

We now repeat the simulation with a higher population prevalence of 0.5.

In [5]:
# --- Input graph --------------------------------------------------------------
# Sample GRG stored in the same directory as this notebook.
grg_1 = pygrgl.load_immutable_grg("test-200-samples.vcf.gz.final.grg")

# --- Effect-size model --------------------------------------------------------
# Arguments handed to grg_causal_mutation_model below.
model_type = "normal"
mean = 0
var = 1
model = grg_causal_mutation_model(model_type, mean=mean, var=var)

# --- Simulation settings ------------------------------------------------------
num_causal = 1000
random_seed = 1
normalize_genetic_values_before_noise = True
noise_heritability = 0.33
population_prevalence_high = 0.5  # i.e. 1 in 2 individuals is a case

# --- Output settings ----------------------------------------------------------
effect_output_required = True  # save the per-mutation effect sizes to a .par file
effect_path = 'univariate_sample_effect_sizes.par'
standardized_output = True
output_path = 'normal_pheno_binary_high_prevalence.phen'  # phenotype file, written in the same directory
header = True  # True when column names are expected in the output file

# --- Run the binary-phenotype simulation at the higher prevalence -------------
phenotypes_binary_high_prev = sim_binary_phenotypes(
    grg_1,
    model,
    num_causal,
    population_prevalence_high,
    random_seed,
    normalize_genetic_values_before_noise=normalize_genetic_values_before_noise,
    noise_heritability=noise_heritability,
    save_effect_output=effect_output_required,
    effect_path=effect_path,
    standardized_output=standardized_output,
    path=output_path,
    header=header,
)


The initial effect sizes are 
     mutation_id  effect_size  causal_mutation_id
0             20    -1.810258                   0
1             28     1.151768                   0
2             62     1.681257                   0
3             76     2.346698                   0
4            119    -0.286668                   0
..           ...          ...                 ...
995        10862    -0.221163                   0
996        10874    -1.136983                   0
997        10879    -0.966133                   0
998        10883    -1.402602                   0
999        10889    -0.483777                   0

[1000 rows x 3 columns]
The genetic values of the individuals are 
     individual_id  genetic_value  causal_mutation_id
0                0     -15.227789                   0
1                1     -14.879777                   0
2                2      11.834985                   0
3                3      -3.905829                   0
4                4     -20.24068

In [6]:
# Show the table returned by sim_binary_phenotypes for the 0.5-prevalence run
# (bare last expression, so the notebook renders it as the cell output).
phenotypes_binary_high_prev

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,-0.712723,-1.134867,0
1,0,1,-0.698011,1.553244,1
2,0,2,0.431367,0.154638,1
3,0,3,-0.234083,0.195473,0
4,0,4,-0.924646,0.096591,0
...,...,...,...,...,...
195,0,195,-0.469757,0.576440,1
196,0,196,-0.314818,0.342873,1
197,0,197,-0.918657,0.522325,0
198,0,198,0.359389,-0.029525,1


In [7]:
# Pull out the phenotype column of the high-prevalence run and tally its 0s and 1s.
binary_list_high_prev = phenotypes_binary_high_prev["phenotype"]
num_zeros = binary_list_high_prev.eq(0).sum()
num_ones = binary_list_high_prev.eq(1).sum()

print("Number of 0s:", num_zeros)
print("Number of 1s:", num_ones)
# Observed prevalence = share of 1s among the entries counted above.
print("Population prevalence ratio observed: ", num_ones / (num_zeros + num_ones))


Number of 0s: 99
Number of 1s: 101
Population prevalence ratio observed:  0.505


Finally, we use custom effect sizes to compute the binary phenotypes.

In [8]:
# Build one custom effect size per mutation in the GRG, each a uniform draw from [0, 1).
# A dedicated generator seeded with the notebook's `random_seed` makes the custom effect
# sizes identical on every Restart & Run All, without reseeding the global `random` module
# (the unseeded global generator previously produced different values on every run).
effect_rng = random.Random(random_seed)
random_effects = [effect_rng.random() for _ in range(grg_1.num_mutations)] #list input


In [9]:
# --- Settings for the custom-effect-size run ----------------------------------
normalize_genetic_values_before_noise = True
noise_heritability = 0.33
population_prevalence_custom = 0.25
standardized_output = True
output_path = 'custom_pheno_binary.phen'  # phenotype file, written in the same directory

# --- Simulate binary phenotypes from the user-supplied effect sizes -----------
phenotypes_list_binary = sim_binary_phenotypes_custom(
    grg_1,
    random_effects,
    population_prevalence_custom,
    normalize_genetic_values_before_noise=normalize_genetic_values_before_noise,
    noise_heritability=noise_heritability,
    standardized_output=standardized_output,
    path=output_path,
)

The initial effect sizes are 
       mutation_id  effect_size  causal_mutation_id
0                0     0.117408                   0
1                1     0.861882                   0
2                2     0.498269                   0
3                3     0.917238                   0
4                4     0.460836                   0
...            ...          ...                 ...
10888        10888     0.573960                   0
10889        10889     0.038802                   0
10890        10890     0.415040                   0
10891        10891     0.181088                   0
10892        10892     0.813309                   0

[10893 rows x 3 columns]
The genetic values of the individuals are 
     individual_id  genetic_value  causal_mutation_id
0                0     912.739569                   0
1                1     979.725854                   0
2                2     905.551539                   0
3                3    1014.532636                   0
4      

In [10]:
# Show the table returned by sim_binary_phenotypes_custom for the custom-effect-size run
# (bare last expression, so the notebook renders it as the cell output).
phenotypes_list_binary

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,-0.608617,0.517785,0
1,0,1,0.193090,1.183318,1
2,0,2,-0.694645,0.164885,0
3,0,3,0.609666,0.766120,1
4,0,4,-0.561771,0.444871,0
...,...,...,...,...,...
195,0,195,-0.343251,-0.539703,0
196,0,196,-0.406970,-0.759601,0
197,0,197,0.750418,0.316494,1
198,0,198,0.331718,0.819459,1


In [11]:
# Pull out the phenotype column of the custom-effect-size run and tally its 0s and 1s.
binary_list_custom = phenotypes_list_binary["phenotype"]
num_zeros = binary_list_custom.eq(0).sum()
num_ones = binary_list_custom.eq(1).sum()

print("Number of 0s:", num_zeros)
print("Number of 1s:", num_ones)
# Observed prevalence = share of 1s among the entries counted above.
print("Population prevalence ratio observed: ", num_ones / (num_zeros + num_ones))


Number of 0s: 149
Number of 1s: 51
Population prevalence ratio observed:  0.255
