In [1]:
import os
import sys
from pathlib import Path

import numpy as np

# Make the local grg_pheno_sim checkout importable without hard-coding a
# user-specific absolute path. Set the GRG_PHENO_SIM_ROOT environment variable
# to point at the checkout explicitly; otherwise the parent of the current
# working directory is used.
# NOTE(review): the default assumes this notebook is run from a directory
# directly below the repository root (as the ../data paths in the cells below
# suggest) -- confirm against the repository layout.
REPO_ROOT = Path(os.environ.get("GRG_PHENO_SIM_ROOT", Path.cwd().parent)).resolve()
if str(REPO_ROOT) not in sys.path:
    # Appending (not prepending) keeps any installed copy of the package first.
    sys.path.append(str(REPO_ROOT))

from grg_pheno_sim.multi_grg_phenotype import sim_phenotypes_multi_grg
from grg_pheno_sim.model import grg_causal_mutation_model


The following commands only convert the gzipped VCF file into the GRGs that will be used for phenotype simulation.

In [2]:
%%script bash
# Build test-200-samples.grg from the gzipped VCF.
# The %%script `--out` option expects the name of a Python variable, not a file
# path, so stdout is discarded here with a shell redirect instead. Anything the
# command writes elsewhere (e.g. to stderr) is still shown below the cell.
grg construct --no-maf-flip -p 10 -t 2 ../data/test-200-samples.vcf.gz --out-file test-200-samples.grg > /dev/null

Construction took 40 ms
Wrote GRG to test-200-samples.vcf.gz.part0.tree0.grg in 0 ms
Construction took 63 ms
Wrote GRG to test-200-samples.vcf.gz.part0.tree1.grg in 0 ms
Construction took 0 ms
Mapping mutations took264 ms
Wrote GRG to test-200-samples.vcf.gz.part0.grg in 3 ms
Construction took 46 ms
Wrote GRG to test-200-samples.vcf.gz.part1.tree0.grg in 0 ms
Construction took 49 ms
Wrote GRG to test-200-samples.vcf.gz.part1.tree1.grg in 0 ms
Construction took 0 ms
Mapping mutations took330 ms
Wrote GRG to test-200-samples.vcf.gz.part1.grg in 3 ms
Construction took 52 ms
Wrote GRG to test-200-samples.vcf.gz.part2.tree0.grg in 1 ms
Construction took 48 ms
Wrote GRG to test-200-samples.vcf.gz.part2.tree1.grg in 0 ms
Construction took 0 ms
Mapping mutations took482 ms
Wrote GRG to test-200-samples.vcf.gz.part2.grg in 4 ms
Construction took 59 ms
Wrote GRG to test-200-samples.vcf.gz.part3.tree0.grg in 0 ms
Construction took 53 ms
Wrote GRG to test-200-samples.vcf.gz.part3.tree1.grg in 0 ms

In [3]:
%%script bash
# Build a second GRG (test-200-samples_copy.grg) from the same gzipped VCF so
# that the multi-GRG simulation has two input files.
# The %%script `--out` option expects the name of a Python variable, not a file
# path, so stdout is discarded here with a shell redirect instead. Anything the
# command writes elsewhere (e.g. to stderr) is still shown below the cell.
grg construct --no-maf-flip -p 10 -t 2 ../data/test-200-samples.vcf.gz --out-file test-200-samples_copy.grg > /dev/null

Construction took 37 ms
Wrote GRG to test-200-samples.vcf.gz.part0.tree0.grg in 0 ms
Construction took 53 ms
Wrote GRG to test-200-samples.vcf.gz.part0.tree1.grg in 0 ms
Construction took 0 ms
Mapping mutations took265 ms
Wrote GRG to test-200-samples.vcf.gz.part0.grg in 3 ms
Construction took 45 ms
Wrote GRG to test-200-samples.vcf.gz.part1.tree0.grg in 0 ms
Construction took 49 ms
Wrote GRG to test-200-samples.vcf.gz.part1.tree1.grg in 0 ms
Construction took 0 ms
Mapping mutations took323 ms
Wrote GRG to test-200-samples.vcf.gz.part1.grg in 3 ms
Construction took 52 ms
Wrote GRG to test-200-samples.vcf.gz.part2.tree0.grg in 0 ms
Construction took 47 ms
Wrote GRG to test-200-samples.vcf.gz.part2.tree1.grg in 0 ms
Construction took 0 ms
Mapping mutations took439 ms
Wrote GRG to test-200-samples.vcf.gz.part2.grg in 4 ms
Construction took 59 ms
Wrote GRG to test-200-samples.vcf.gz.part3.tree0.grg in 0 ms
Construction took 53 ms
Wrote GRG to test-200-samples.vcf.gz.part3.tree1.grg in 0 ms

In [4]:
# GRG files (written by the `grg construct` cells above) to load for simulation.
grg_list = [
    "test-200-samples.grg",
    "test-200-samples_copy.grg",
]

We will first demonstrate loading all GRG files into RAM at once and simulating phenotypes. Causal mutations are sampled from each GRG, and the genetic values are computed for the samples. The combined genetic value dataframe is the sum of each GRG's genetic values (for each causal mutation ID). Noise is sampled at the end and added to the combined genetic values to obtain the phenotypes.

NOTE: It is necessary for each GRG to have the same number of samples.

In [5]:
# --- Simulation settings -----------------------------------------------------
random_seed = 1
num_causal_per_file = 1000  # presumably the number of causal mutations drawn per GRG file
noise_heritability = [0.33, 0.25]  # two entries, matching the 2-dimensional model below
normalize_genetic_values_before_noise = False
load_all_into_RAM = True  # True: load every GRG into RAM together before simulating

# --- Causal mutation model ---------------------------------------------------
# Two-dimensional multivariate normal: zero means, unit variances, covariance 0.25.
model_type = "multivariate normal"
means = np.zeros(2)
cov = np.array(
    [
        [1, 0.25],
        [0.25, 1],
    ]
)
model = grg_causal_mutation_model(model_type, mean=means, cov=cov)

# --- Run the multi-GRG simulation --------------------------------------------
multi_grg_multi_phenotypes = sim_phenotypes_multi_grg(
    grg_list,
    model,
    num_causal_per_file,
    random_seed,
    load_all_ram=load_all_into_RAM,
    normalize_genetic_values_before_noise=normalize_genetic_values_before_noise,
    noise_heritability=noise_heritability,
)


Loaded test-200-samples.grg into RAM
Loaded test-200-samples_copy.grg into RAM
Genetic values for test-200-samples.grg are as follows:
     individual_id  genetic_value  causal_mutation_id
0                0       6.935723                   0
1                0       2.660461                   1
2                1      17.970039                   0
3                1       9.241984                   1
4                2      22.197880                   0
..             ...            ...                 ...
395            197     -17.156577                   1
396            198      -9.955634                   0
397            198     -21.257324                   1
398            199      16.791882                   0
399            199     -15.675249                   1

[400 rows x 3 columns]
Genetic values for test-200-samples_copy.grg are as follows:
     individual_id  genetic_value  causal_mutation_id
0                0      25.106253                   0
1                0      

In [6]:
# Display the phenotype dataframe returned by the all-in-RAM simulation above.
multi_grg_multi_phenotypes

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,0.509684,-0.552259,-0.042575
1,1,0,0.353197,1.174005,1.527202
2,0,1,0.771054,0.212180,0.983234
3,1,1,0.922897,0.975503,1.898400
4,0,2,0.661517,0.926252,1.587769
...,...,...,...,...,...
395,1,197,-0.434756,0.172404,-0.262352
396,0,198,-0.174865,-1.243312,-1.418177
397,1,198,-0.242194,0.198956,-0.043238
398,0,199,0.681555,-0.104545,0.577010


We now perform similar simulations, but by loading the GRGs into RAM sequentially (instead of all together).

In [7]:
# --- Simulation settings -----------------------------------------------------
random_seed = 1
num_causal_per_file = 1000  # presumably the number of causal mutations drawn per GRG file
noise_heritability = [0.33, 0.25]  # two entries, matching the 2-dimensional model below
normalize_genetic_values_before_noise = False
load_all_into_RAM = False  # False: do not load all GRGs into RAM together

# --- Causal mutation model ---------------------------------------------------
# Two-dimensional multivariate normal: zero means, unit variances, covariance 0.25.
model_type = "multivariate normal"
means = np.zeros(2)
cov = np.array(
    [
        [1, 0.25],
        [0.25, 1],
    ]
)
model = grg_causal_mutation_model(model_type, mean=means, cov=cov)

# --- Run the multi-GRG simulation --------------------------------------------
multi_grg_multi_seq_phenotypes = sim_phenotypes_multi_grg(
    grg_list,
    model,
    num_causal_per_file,
    random_seed,
    load_all_ram=load_all_into_RAM,
    normalize_genetic_values_before_noise=normalize_genetic_values_before_noise,
    noise_heritability=noise_heritability,
)


Loaded test-200-samples.grg into RAM
Genetic values for test-200-samples.grg are as follows:
     individual_id  genetic_value  causal_mutation_id
0                0      40.245559                   0
1                0      -1.377394                   1
2                1      23.919021                   0
3                1      -7.443861                   1
4                2      27.284434                   0
..             ...            ...                 ...
395            197       7.825733                   1
396            198      20.281161                   0
397            198       4.909584                   1
398            199      39.539234                   0
399            199       6.923056                   1

[400 rows x 3 columns]
Loaded test-200-samples_copy.grg into RAM
Genetic values for test-200-samples_copy.grg are as follows:
     individual_id  genetic_value  causal_mutation_id
0                0      17.657593                   0
1                0      

In [8]:
# Display the phenotype dataframe returned by the sequential-load simulation above.
multi_grg_multi_seq_phenotypes

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,0.759046,-0.167286,0.591760
1,1,0,0.592335,0.479757,1.072092
2,0,1,0.232256,0.384351,0.616606
3,1,1,0.004037,-0.167182,-0.163146
4,0,2,-0.030180,-0.373711,-0.403891
...,...,...,...,...,...
395,1,197,0.633479,1.351857,1.985336
396,0,198,-0.188104,-1.182265,-1.370369
397,1,198,0.197821,-0.596464,-0.398643
398,0,199,0.420914,0.632572,1.053486
