In [3]:
## Load the SNP metadata table and label its three columns (ID, POS, AF)
SNP_metadata <- setNames(
    read.table(file = "../../plink-182/data/snp_metadata.txt"),
    c("ID", "POS", "AF")
)
head(SNP_metadata)



ID,POS,AF
rs78200054,9411410,0.466853
rs71235073,9411500,0.459065
rs368646645,9411602,0.365415
rs71235074,9411645,0.44349
rs71235075,9411785,0.45607
rs71220884,9412503,0.901957


In [27]:
# Load the genotype table (no header row is requested, so columns get
# read.table()'s default V1, V2, ... names)
genotype_file <- "../../plink-182/data/chr_21_genotypes.tab"
mydata <- read.table(file = genotype_file)

In [28]:
# Label each genotype row with the SNP ID found at the same position in the
# metadata table, then preview the result
row.names(mydata) <- SNP_metadata[["ID"]]
head(mydata)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,⋯,V2495,V2496,V2497,V2498,V2499,V2500,V2501,V2502,V2503,V2504
rs78200054,1,2,1,1,1,1,1,1,2,1,⋯,1,1,1,1,1,1,2,1,1,1
rs71235073,0,2,1,1,0,1,1,1,2,1,⋯,0,1,1,1,1,1,2,1,1,1
rs368646645,0,0,1,1,1,1,1,1,0,1,⋯,1,1,1,1,1,1,0,1,1,1
rs71235074,1,2,1,1,1,1,1,1,2,1,⋯,1,1,1,1,1,1,1,1,1,1
rs71235075,1,2,1,1,1,1,1,1,2,1,⋯,0,1,1,1,1,1,2,1,1,1
rs71220884,1,2,2,2,2,1,2,2,2,2,⋯,2,2,2,2,2,2,2,2,2,2


In [72]:
# Load the per-sample metadata; the first line of the file holds the column names
sample_metadata_file <- "../../plink-182/data/sample_metadata.tsv"
mydataPop <- read.table(file = sample_metadata_file, header = TRUE)

In [73]:
# Preview the first rows of the sample metadata table
head(mydataPop)

id,population,superpopulation
HG00096,GBR,EUR
HG00097,GBR,EUR
HG00099,GBR,EUR
HG00100,GBR,EUR
HG00101,GBR,EUR
HG00102,GBR,EUR


In [74]:
# For each of the 10 simulations: draw 100 SNPs at random to act as causal
# variants, give each an effect size drawn from N(0, 1), and save the pairing
# as a tab-separated table.

set.seed(11123)

for (numsim in seq_len(10)) {
    # Row positions are drawn first, then the effect sizes (RNG order matters)
    picked_rows <- sample(NROW(mydata), size = 100)
    causal_SNP <- SNP_metadata$ID[picked_rows]
    effect_size <- rnorm(n = 100, mean = 0, sd = 1)

    info_causal_SNPs <- data.frame(causal_SNP = causal_SNP, effect_size = effect_size)

    write.table(
        info_causal_SNPs,
        file = paste0("Info about simulations/Causal SNPs/info_causal_SNPs_sim", numsim, ".txt"),
        row.names = FALSE,
        sep = "\t"
    )
}


In [75]:
# For each of the 10 simulations: draw one environmental effect per
# superpopulation from N(0, sd = 5), pin the "None" entry to 0, and save the
# result as a tab-separated table.

set.seed(432423)

for (numsim in seq_len(10)) {
    Population <- c("AFR", "EAS", "EUR", "SAS", "AMR", "None")

    # Five random effects for the named superpopulations, then 0 for "None"
    effect <- c(rnorm(n = 5, mean = 0, sd = 5), 0)

    info_population_effect <- data.frame(Population = Population, effect = effect)

    write.table(
        info_population_effect,
        file = paste0("Info about simulations/Population effects/info_population_effect_sim", numsim, ".txt"),
        row.names = FALSE,
        sep = "\t"
    )
}


In [90]:
## We simulate our phenotypes
##
## For each of the 10 simulations this cell reads the causal SNPs / effect sizes
## and the per-superpopulation effects from the "Info about simulations" files,
## then builds two phenotypes per sample:
##   * pheno     = genotypes at the causal SNPs %*% effect sizes + N(0, sd = 3) noise
##   * pheno_env = pheno + the effect assigned to the sample's superpopulation
## and writes each as a tab-separated .phen file (columns: sample, fam, phen).

set.seed(324324)

# Output folders; created if absent so write.table() cannot fail on a missing
# directory.
out_dir_noenv <- "SimulatedPhenotypes/Noenvironmenteffect"
out_dir_env <- "SimulatedPhenotypes/Yesenvironmenteffect"
for (out_dir in c(out_dir_noenv, out_dir_env)) {
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
}

# Genotype columns and sample-metadata rows are paired by position below, so
# they must agree in number.
stopifnot(NCOL(mydata) == NROW(mydataPop))

# Remember the original sample order: merge() sorts its result on the join
# key. This is loop-invariant, so it is set once here.
mydataPop$order_id <- seq_len(NROW(mydataPop))

for (numsim in seq_len(10)) {
    info_population_effect <- read.table(paste0("Info about simulations/Population effects/info_population_effect_sim", numsim, ".txt"), header = TRUE)
    info_causal_SNPs <- read.table(paste0("Info about simulations/Causal SNPs/info_causal_SNPs_sim", numsim, ".txt"), header = TRUE)

    # Index genotype rows by NAME. On R < 4.0 read.table() returns strings as
    # factors, and a factor used as a row index selects by its integer codes
    # rather than its labels, which would silently pick the wrong SNPs.
    causal_ids <- as.character(info_causal_SNPs$causal_SNP)
    missing_ids <- setdiff(causal_ids, rownames(mydata))
    if (length(missing_ids) > 0) {
        # An unknown row name would otherwise yield an all-NA genotype row
        stop("Simulation ", numsim, ": causal SNPs not found in genotype data: ",
             paste(missing_ids, collapse = ", "), call. = FALSE)
    }

    #Simulation with no environmental effects
    causal_genotypes <- t(mydata[causal_ids, , drop = FALSE])
    pheno <- causal_genotypes %*% info_causal_SNPs$effect_size + rnorm(NCOL(mydata), sd = 3)

    #Simulation with environmental effects
    merged_data_pop <- merge(mydataPop, info_population_effect, by.x = "superpopulation", by.y = "Population")
    merged_data_pop <- merged_data_pop[order(merged_data_pop$order_id), ]

    # merge() is an inner join: a sample whose superpopulation has no entry in
    # the effect table is dropped, and a duplicated entry fans rows out. Either
    # would misalign (or recycle) the effects against pheno, so fail loudly.
    if (!identical(merged_data_pop$order_id, mydataPop$order_id)) {
        stop("Simulation ", numsim, ": population effects do not map one-to-one onto samples",
             call. = FALSE)
    }

    pheno_env <- pheno + merged_data_pop$effect

    #We create files in format that plink understands
    mypheno <- data.frame(sample = mydataPop$id, fam = mydataPop$id, phen = pheno)
    mypheno_env <- data.frame(sample = mydataPop$id, fam = mydataPop$id, phen = pheno_env)

    write.table(mypheno, paste0(out_dir_noenv, "/pheno_sim", numsim, ".phen"), row.names = FALSE, sep = "\t", quote = FALSE)
    write.table(mypheno_env, paste0(out_dir_env, "/pheno_sim", numsim, ".phen"), row.names = FALSE, sep = "\t", quote = FALSE)
}

In [88]:
# Display the phenotype table (with environmental effects) left in the
# workspace by the most recent run of the simulation loop
mypheno_env

Unnamed: 0,sample,fam,phen
V1,HG00096,HG00096,-13.478764
V2,HG00097,HG00097,-10.097050
V3,HG00099,HG00099,-24.804879
V4,HG00100,HG00100,-27.936369
V5,HG00101,HG00101,-14.698855
V6,HG00102,HG00102,-21.705869
V7,HG00103,HG00103,-33.126240
V8,HG00105,HG00105,-20.554933
V9,HG00106,HG00106,-26.571126
V10,HG00107,HG00107,-19.281542
