In [1]:
from grg_pheno_sim.multi_grg_phenotype import sim_phenotypes_multi_grg
from grg_pheno_sim.model import grg_causal_mutation_model


The following commands only convert the gzipped VCF file into the GRGs that will be used for phenotype simulation. Each command is skipped if its output GRG file already exists.

In [2]:
%%script bash --out /dev/null
# Build the first GRG from the test VCF, unless the output file is already present.
grg_out="test-200-samples.grg"
[ -f "$grg_out" ] || grg construct --no-maf-flip -p 10 -t 2 ../data/test-200-samples.vcf.gz --out-file "$grg_out"

In [3]:
%%script bash --out /dev/null
# Build the second GRG (same input VCF, different output name), unless it is already present.
grg_out="test-200-samples_copy.grg"
[ -f "$grg_out" ] || grg construct --no-maf-flip -p 10 -t 2 ../data/test-200-samples.vcf.gz --out-file "$grg_out"

In [4]:
%%script bash --out /dev/null
# Build the third GRG (same input VCF, different output name), unless it is already present.
grg_out="test-200-samples_last.grg"
[ -f "$grg_out" ] || grg construct --no-maf-flip -p 10 -t 2 ../data/test-200-samples.vcf.gz --out-file "$grg_out"

In [5]:
# GRG files to be loaded for the two-GRG simulations below.
grg_list = [
    "test-200-samples.grg",
    "test-200-samples_copy.grg",
]

We will first demonstrate loading all GRG files into RAM and simulating phenotypes. Causal mutations are sampled from each GRG, and the genetic values are obtained for the samples. The combined genetic value dataframe is the sum of the genetic values from each GRG. Noise is sampled at the end and added to obtain the phenotypes.

NOTE: It is necessary for each GRG to have the same number of samples.

In [6]:
# Causal mutation model of type "normal" with the mean and variance given here.
model_type = "normal"
mean, var = 0, 1
model = grg_causal_mutation_model(model_type, mean=mean, var=var)

# Simulation settings passed to sim_phenotypes_multi_grg.
num_causal_per_file = 1000
random_seed = 1
normalize_genetic_values_before_noise = False
noise_heritability = 0.33
load_all_into_RAM = True  # whether to load all GRGs into RAM together

# Effect output settings: two paths for the two GRGs in grg_list
# (presumably one output file per GRG -- confirm against the library docs).
save_effects = True
path_list = ['first_sample_effect_sizes.par', 'second_sample_effect_sizes.par']

multi_grg_uni_phenotypes = sim_phenotypes_multi_grg(
    grg_list,
    model,
    num_causal_per_file,
    random_seed,
    load_all_ram=load_all_into_RAM,
    normalize_genetic_values_before_noise=normalize_genetic_values_before_noise,
    noise_heritability=noise_heritability,
    save_effect_output=save_effects,
    effect_path_list=path_list,
)


Loaded test-200-samples.grg into RAM
Loaded test-200-samples_copy.grg into RAM
Genetic values for test-200-samples.grg are as follows:
     individual_id  genetic_value  causal_mutation_id
0                0      -7.904997                   0
1                1     -17.412037                   0
2                2     -10.563775                   0
3                3       0.961305                   0
4                4     -11.237289                   0
..             ...            ...                 ...
195            195     -24.737286                   0
196            196       6.195336                   0
197            197     -10.975352                   0
198            198      -7.242642                   0
199            199       3.335119                   0

[200 rows x 3 columns]
Genetic values for test-200-samples_copy.grg are as follows:
     individual_id  genetic_value  causal_mutation_id
0                0      33.221469                   0
1                1      

In [7]:
# Display the phenotype table returned by the all-in-RAM run (last expression is the cell output).
multi_grg_uni_phenotypes

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,0.664620,-0.184810,0.479810
1,0,1,-0.182382,0.223431,0.041048
2,0,2,-0.115355,0.724536,0.609181
3,0,3,0.612357,-0.216181,0.396176
4,0,4,-0.247630,1.110499,0.862869
...,...,...,...,...,...
195,0,195,-0.754646,-0.674379,-1.429025
196,0,196,0.685529,-0.340609,0.344920
197,0,197,0.280632,0.257394,0.538026
198,0,198,0.479682,0.246563,0.726245


We now perform similar simulations, but by loading the GRGs into RAM sequentially (instead of all together).

In [8]:
# Causal mutation model of type "normal" with the mean and variance given here.
model_type = "normal"
mean, var = 0, 1
model = grg_causal_mutation_model(model_type, mean=mean, var=var)

# Simulation settings passed to sim_phenotypes_multi_grg.
num_causal_per_file = 1000
random_seed = 1
normalize_genetic_values_before_noise = False
noise_heritability = 0.33
load_all_into_RAM = False  # False here: GRGs are not all loaded into RAM together

# Effect output settings: separate "seq" file names for this sequential-loading run.
save_effects = True
path_list = ['first_seq_sample_effect_sizes.par', 'second_seq_sample_effect_sizes.par']

multi_grg_uni_seq_phenotypes = sim_phenotypes_multi_grg(
    grg_list,
    model,
    num_causal_per_file,
    random_seed,
    load_all_ram=load_all_into_RAM,
    normalize_genetic_values_before_noise=normalize_genetic_values_before_noise,
    noise_heritability=noise_heritability,
    save_effect_output=save_effects,
    effect_path_list=path_list,
)

Loaded test-200-samples.grg into RAM
Genetic values for test-200-samples.grg are as follows:
     individual_id  genetic_value  causal_mutation_id
0                0       1.717704                   0
1                1      -0.620457                   0
2                2      -1.649852                   0
3                3      -0.351501                   0
4                4     -10.761766                   0
..             ...            ...                 ...
195            195      -9.304320                   0
196            196     -18.342839                   0
197            197      -2.093446                   0
198            198      -9.906869                   0
199            199     -11.833612                   0

[200 rows x 3 columns]
Loaded test-200-samples_copy.grg into RAM
Genetic values for test-200-samples_copy.grg are as follows:
     individual_id  genetic_value  causal_mutation_id
0                0      -4.058441                   0
1                1     -

In [9]:
# Display the phenotype table returned by the sequential-loading run.
multi_grg_uni_seq_phenotypes

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,0.212817,-0.289426,-0.076609
1,0,1,-0.149121,-1.846627,-1.995748
2,0,2,0.748310,-0.297404,0.450906
3,0,3,0.163687,-0.040549,0.123139
4,0,4,0.218012,0.259225,0.477236
...,...,...,...,...,...
195,0,195,0.536418,0.882987,1.419405
196,0,196,-1.148907,-0.508651,-1.657558
197,0,197,0.554018,1.141027,1.695046
198,0,198,-0.632607,-0.839414,-1.472021


Now, we demonstrate a case with binary phenotypes.

In [10]:
# Causal mutation model of type "normal" with the mean and variance given here.
model_type = "normal"
mean = 0
var = 1

model = grg_causal_mutation_model(model_type, mean=mean, var=var)

# Simulation settings passed to sim_phenotypes_multi_grg.
num_causal_per_file = 1000

random_seed = 1

normalize_genetic_values_before_noise = False

# Request binary (0/1) phenotypes; population_prevalence is forwarded to the
# simulator (the observed fraction of 1s is checked in the following cell).
binary = True

population_prevalence = 0.2

noise_heritability = 0.33

load_all_into_RAM = True  # this parameter decides whether to load all GRGs into RAM together

save_effects = True

# Use file names specific to the binary run. The previous names
# ('first_seq_...'/'second_seq_...') were copied from the sequential example,
# so this run silently overwrote that run's saved effect sizes and mislabelled
# its own output as "seq" even though load_all_ram is True here.
path_list = ['first_bin_sample_effect_sizes.par', 'second_bin_sample_effect_sizes.par']

multi_grg_uni_bin_phenotypes = sim_phenotypes_multi_grg(
    grg_list,
    model,
    num_causal_per_file,
    random_seed,
    load_all_ram=load_all_into_RAM,
    binary=binary,
    population_prevalence=population_prevalence,
    normalize_genetic_values_before_noise=normalize_genetic_values_before_noise,
    noise_heritability=noise_heritability,
    save_effect_output=save_effects,
    effect_path_list=path_list,
)

Loaded test-200-samples.grg into RAM
Loaded test-200-samples_copy.grg into RAM
Genetic values for test-200-samples.grg are as follows:
     individual_id  genetic_value  causal_mutation_id
0                0       5.542910                   0
1                1      13.209347                   0
2                2      26.327956                   0
3                3      11.614856                   0
4                4       4.805084                   0
..             ...            ...                 ...
195            195      28.201323                   0
196            196      30.937258                   0
197            197      20.565616                   0
198            198      29.476440                   0
199            199       4.852535                   0

[200 rows x 3 columns]
Genetic values for test-200-samples_copy.grg are as follows:
     individual_id  genetic_value  causal_mutation_id
0                0      18.235461                   0
1                1      

In [11]:
# Display the phenotype table returned by the binary-phenotype run.
multi_grg_uni_bin_phenotypes

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,0.056790,1.236824,1
1,0,1,0.618759,-1.282344,0
2,0,2,0.596563,-0.857566,0
3,0,3,-0.181517,-0.922881,0
4,0,4,-0.389107,-1.718923,0
...,...,...,...,...,...
195,0,195,0.520207,-0.983028,0
196,0,196,-0.195309,0.231177,0
197,0,197,0.455779,0.652949,1
198,0,198,0.446607,-0.252423,0


In [12]:
# Count the 0 and 1 entries in the simulated binary phenotype column and
# report the observed share of 1s.
binary_list = multi_grg_uni_bin_phenotypes["phenotype"]
num_zeros, num_ones = ((binary_list == label).sum() for label in (0, 1))
observed_prevalence = num_ones / (num_ones + num_zeros)

print("Number of 0s:", num_zeros)
print("Number of 1s:", num_ones)
print("Population prevalence ratio observed: ", str(observed_prevalence))


Number of 0s: 155
Number of 1s: 45
Population prevalence ratio observed:  0.225


Finally, we demonstrate a case with more than 2 GRGs. 

In [13]:
# GRG files for the three-GRG example.
new_grg_list = [
    "test-200-samples.grg",
    "test-200-samples_copy.grg",
    "test-200-samples_last.grg",
]


In [14]:
# Causal mutation model of type "normal" with the mean and variance given here.
model_type = "normal"
mean, var = 0, 1
model = grg_causal_mutation_model(model_type, mean=mean, var=var)

# Simulation settings passed to sim_phenotypes_multi_grg; no effect output
# arguments are passed in this example.
num_causal_per_file = 1000
random_seed = 1
normalize_genetic_values_before_noise = False
noise_heritability = 0.33
load_all_into_RAM = True  # whether to load all GRGs into RAM together

multi_grg_uni_phenotypes_three = sim_phenotypes_multi_grg(
    new_grg_list,
    model,
    num_causal_per_file,
    random_seed,
    load_all_ram=load_all_into_RAM,
    normalize_genetic_values_before_noise=normalize_genetic_values_before_noise,
    noise_heritability=noise_heritability,
)


Loaded test-200-samples.grg into RAM
Loaded test-200-samples_copy.grg into RAM
Loaded test-200-samples_last.grg into RAM
Genetic values for test-200-samples.grg are as follows:
     individual_id  genetic_value  causal_mutation_id
0                0      13.501031                   0
1                1       4.686923                   0
2                2      20.014314                   0
3                3      -2.238156                   0
4                4       1.618351                   0
..             ...            ...                 ...
195            195       5.880117                   0
196            196      14.228098                   0
197            197      11.817775                   0
198            198      10.572352                   0
199            199      10.320619                   0

[200 rows x 3 columns]
Genetic values for test-200-samples_copy.grg are as follows:
     individual_id  genetic_value  causal_mutation_id
0                0      -3.351209   

In [15]:
# Display the phenotype table returned by the three-GRG run.
multi_grg_uni_phenotypes_three

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,0.371903,-0.332674,0.039229
1,0,1,-0.437471,-0.122974,-0.560445
2,0,2,0.810494,-1.450295,-0.639801
3,0,3,-0.008917,0.219090,0.210172
4,0,4,0.382137,0.732645,1.114783
...,...,...,...,...,...
195,0,195,-0.304185,-0.706260,-1.010445
196,0,196,0.223450,0.586459,0.809909
197,0,197,-0.214687,0.196249,-0.018437
198,0,198,0.137708,-0.264065,-0.126357
