In [1]:
import os
import random
import sys

import numpy as np
import pandas as pd
import pygrgl

# Make the local grg_pheno_sim checkout importable.
# The location can be overridden with the GRG_PHENO_SIM_ROOT environment variable so the
# notebook also runs on machines other than the original author's; the original path is
# kept as the default so existing setups behave exactly as before.
GRG_PHENO_SIM_ROOT = os.environ.get('GRG_PHENO_SIM_ROOT', '/Users/adityasyam/grg_pheno_sim')
# Guard against appending the same entry again each time this cell is re-run.
if GRG_PHENO_SIM_ROOT not in sys.path:
    sys.path.append(GRG_PHENO_SIM_ROOT)

from grg_pheno_sim.phenotype import sim_phenotypes_custom


This notebook demonstrates how to supply custom effect sizes instead of using one of the distribution-based models provided in the library. In the univariate case, the effect sizes may be given as a list, a dictionary, or a pandas DataFrame. In the multivariate case, the effect sizes must be given as a pandas DataFrame that follows the format shown in the multivariate demos, so that it is compatible with the simulation library.

The following command only converts the gzip-compressed VCF file into a GRG, which is then used for the phenotype simulation. The conversion is skipped if the GRG file already exists.

In [2]:
%%script bash --out /dev/null
# Build the GRG from the sample VCF only when it has not been built yet;
# later runs reuse the existing file.
GRG_FILE=test-200-samples.grg
[ -f "$GRG_FILE" ] || grg construct --no-maf-flip -p 10 -t 2 ../data/test-200-samples.vcf.gz --out-file "$GRG_FILE"

In [3]:
# Load the sample GRG stored in the same directory as this notebook.
grg_1 = pygrgl.load_immutable_grg("test-200-samples.grg")

# Number of mutations in the GRG; every effect-size input below has this many entries.
n = grg_1.num_mutations

In [4]:
# Seed both random number generators used in this cell so that the custom effect sizes
# are identical on every Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# List input: one random effect size per mutation, drawn with random.random().
random_effects = [random.random() for _ in range(n)]

# List input with fixed (non-random) values: every mutation gets effect size 1.0.
specific_effects = [1.0] * n

# One draw per mutation from np.random.randn; reused for the dictionary and dataframe inputs.
effect_sizes = np.random.randn(n)

# Dictionary input: mutation id -> effect size.
mutation_dict = {i: effect_sizes[i] for i in range(n)}

# Dataframe input: one row per mutation with columns 'mutation_id' and 'effect_size'.
input_df = pd.DataFrame(list(mutation_dict.items()), columns=['mutation_id', 'effect_size'])

# Dataframe for building the simulation steps manually: same rows as input_df plus a
# 'causal_mutation_id' column set to 0. assign() returns a new frame, so input_df itself
# is left untouched and this cell is safe to re-run.
input_df_manual = input_df.assign(causal_mutation_id=0)


We first show custom effect sizes contained within a list.

In [5]:
# Simulation settings for the list-based input.
normalize_genetic_values_before_noise = True
noise_heritability = 0.33
standardized_output = True

# Path of the output file; it is saved under this name in the same directory.
output_path = 'custom_pheno.phen'

# Run the simulation with the fixed effect sizes supplied as a plain list.
phenotypes_list = sim_phenotypes_custom(
    grg_1,
    specific_effects,
    normalize_genetic_values_before_noise=normalize_genetic_values_before_noise,
    noise_heritability=noise_heritability,
    standardized_output=standardized_output,
    path=output_path,
)
phenotypes_list

The initial effect sizes are 
       mutation_id  effect_size  causal_mutation_id
0                0          1.0                   0
1                1          1.0                   0
2                2          1.0                   0
3                3          1.0                   0
4                4          1.0                   0
...            ...          ...                 ...
10888        10888          1.0                   0
10889        10889          1.0                   0
10890        10890          1.0                   0
10891        10891          1.0                   0
10892        10892          1.0                   0

[10893 rows x 3 columns]
The genetic values of the individuals are 
     individual_id  genetic_value  causal_mutation_id
0                0         2665.0                   0
1                1         2729.0                   0
2                2         2740.0                   0
3                3         2773.0                   0
4      

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,-1.100518,-1.031840,-2.132358
1,0,1,-0.087820,1.146894,1.059074
2,0,2,0.086238,-0.199308,-0.113070
3,0,3,0.608410,-0.503494,0.104916
4,0,4,-0.087820,-0.675030,-0.762850
...,...,...,...,...,...
195,0,195,0.782468,-0.906764,-0.124296
196,0,196,-0.815697,-0.201857,-1.017554
197,0,197,-0.467582,-0.914263,-1.381845
198,0,198,-0.404288,0.213116,-0.191172


We then show custom effect sizes contained within a dictionary.

In [6]:
# Simulation settings for the dictionary-based input.
normalize_genetic_values_before_noise = True
noise_heritability = 0.33

# No output options are passed here: by default, the standard .phen output will not be saved.
phenotypes_dict = sim_phenotypes_custom(
    grg_1,
    mutation_dict,
    normalize_genetic_values_before_noise=normalize_genetic_values_before_noise,
    noise_heritability=noise_heritability,
)
phenotypes_dict

The initial effect sizes are 
       mutation_id  effect_size  causal_mutation_id
0                0     1.722344                   0
1                1    -0.104128                   0
2                2     0.318194                   0
3                3     0.591837                   0
4                4    -0.657885                   0
...            ...          ...                 ...
10888        10888     0.037326                   0
10889        10889    -0.953928                   0
10890        10890    -0.381611                   0
10891        10891     0.371581                   0
10892        10892     1.106401                   0

[10893 rows x 3 columns]
The genetic values of the individuals are 
     individual_id  genetic_value  causal_mutation_id
0                0      29.720383                   0
1                1      49.414794                   0
2                2      35.848493                   0
3                3      22.604641                   0
4      

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,-0.000990,0.608092,0.607102
1,0,1,0.275782,0.237769,0.513551
2,0,2,0.085130,0.666636,0.751767
3,0,3,-0.100990,0.658592,0.557602
4,0,4,-0.058502,0.308486,0.249985
...,...,...,...,...,...
195,0,195,-0.063575,-1.179926,-1.243501
196,0,196,-0.797446,-1.504991,-2.302438
197,0,197,-0.075644,0.783435,0.707791
198,0,198,1.012758,-0.547250,0.465509


We finally show custom effect sizes contained within a pandas dataframe (the user need not add the causal mutation id column - that is handled internally).

In [7]:
# Simulation settings for the dataframe-based input.
normalize_genetic_values_before_noise = True
noise_heritability = 0.33

# input_df carries only 'mutation_id' and 'effect_size'; no causal mutation id column is supplied.
phenotypes_df = sim_phenotypes_custom(
    grg_1,
    input_df,
    normalize_genetic_values_before_noise=normalize_genetic_values_before_noise,
    noise_heritability=noise_heritability,
)
phenotypes_df

The initial effect sizes are 
       mutation_id  effect_size  causal_mutation_id
0                0     1.722344                   0
1                1    -0.104128                   0
2                2     0.318194                   0
3                3     0.591837                   0
4                4    -0.657885                   0
...            ...          ...                 ...
10888        10888     0.037326                   0
10889        10889    -0.953928                   0
10890        10890    -0.381611                   0
10891        10891     0.371581                   0
10892        10892     1.106401                   0

[10893 rows x 3 columns]
The genetic values of the individuals are 
     individual_id  genetic_value  causal_mutation_id
0                0      29.720383                   0
1                1      49.414794                   0
2                2      35.848493                   0
3                3      22.604641                   0
4      

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,-0.001081,-0.535615,-0.536696
1,0,1,0.301011,0.796810,1.097821
2,0,2,0.092918,0.212077,0.304995
3,0,3,-0.110229,0.356480,0.246251
4,0,4,-0.063853,0.308581,0.244727
...,...,...,...,...,...
195,0,195,-0.069391,0.646474,0.577083
196,0,196,-0.870397,-0.501351,-1.371748
197,0,197,-0.082564,0.290010,0.207446
198,0,198,1.105406,-0.468742,0.636664


Alternatively, the user can supply their custom effect sizes (enclosed within a compatible dataframe) and manually run each step of the simulation in sequence instead of using the `sim_phenotypes_custom` function. For this, the dataframe (for the univariate case) must be formed as shown for `input_df_manual` above.

Now, we show how the user can simulate custom phenotypes using custom noise.

In [8]:
# Simulation settings for the run with user-defined noise.
normalize_genetic_values_before_noise = True

# User-defined noise parameters, passed as user_mean / user_cov instead of noise_heritability.
# NOTE(review): std_1 is named like a standard deviation but is passed as user_cov;
# with a value of 1 the two coincide - confirm which quantity the library expects.
mean_1 = 0
std_1 = 1

phenotypes_df_user_noise = sim_phenotypes_custom(
    grg_1,
    input_df,
    normalize_genetic_values_before_noise=normalize_genetic_values_before_noise,
    user_mean=mean_1,
    user_cov=std_1,
)
phenotypes_df_user_noise

The initial effect sizes are 
       mutation_id  effect_size  causal_mutation_id
0                0     1.722344                   0
1                1    -0.104128                   0
2                2     0.318194                   0
3                3     0.591837                   0
4                4    -0.657885                   0
...            ...          ...                 ...
10888        10888     0.037326                   0
10889        10889    -0.953928                   0
10890        10890    -0.381611                   0
10891        10891     0.371581                   0
10892        10892     1.106401                   0

[10893 rows x 3 columns]
The genetic values of the individuals are 
     individual_id  genetic_value  causal_mutation_id
0                0      29.720383                   0
1                1      49.414794                   0
2                2      35.848493                   0
3                3      22.604641                   0
4      

Unnamed: 0,causal_mutation_id,individual_id,genetic_value,environmental_noise,phenotype
0,0,0,-0.001225,0.343012,0.341787
1,0,1,0.341003,-0.237615,0.103389
2,0,2,0.105263,0.166414,0.271677
3,0,3,-0.124874,-0.036775,-0.161650
4,0,4,-0.072337,1.339836,1.267499
...,...,...,...,...,...
195,0,195,-0.078611,-0.490675,-0.569286
196,0,196,-0.986039,0.914493,-0.071546
197,0,197,-0.093534,0.825908,0.732374
198,0,198,1.252271,0.019457,1.271728
