# Generate synthetic data

In [1]:
import os
import sys
sys.path.append('..')

import utils.settings as settings

import pandas as pd
import numpy as np

from pandas_plink import read_plink1_bin, write_plink1_bin

from limix.qc import quantile_gaussianize

from cellregmap._simulate import (
    sample_persistent_effsizes,
    sample_persistent_effects,
    sample_gxe_effects,
    column_normalize
)

import scanpy as sc

In [2]:
rng = np.random.default_rng(123)

## Load data

In [3]:
adata = sc.read(settings.DATA_DIR + '/filtered/adata.h5ad')
genotypes = read_plink1_bin(settings.DATA_DIR + '/filtered/genotypes.bed')

Mapping files: 100%|██████████| 3/3 [00:11<00:00,  3.73s/it]


In [4]:
# sample cells from donors with most cells and sample genes uniformly
top_donors = adata.obs['donor_long_id'].value_counts()[:settings.N_DONORS].index
adata = adata[adata.obs['donor_long_id'].isin(top_donors), :]
adata.obs['donor_long_id'] = adata.obs['donor_long_id'].cat.remove_unused_categories()

cell_filter = adata.obs.groupby('donor_long_id').sample(n=settings.N_CELLS, random_state=42).index # sample settings.N_CELLS genes for simulation

gene_filter = adata.var_names.to_series().sample(settings.N_GENES, random_state=42) # sample settings.N_GENES genes for simulation

adata = adata[cell_filter, gene_filter]
adata

  adata.obs['donor_long_id'] = adata.obs['donor_long_id'].cat.remove_unused_categories()


View of AnnData object with n_obs × n_vars = 5000 × 500
    obs: 'assigned', 'auxDir', 'cell_filter', 'cell_name', 'compatible_fragment_ratio', 'day', 'donor', 'expected_format', 'experiment', 'frag_dist_length', 'gc_bias_correct', 'is_cell_control', 'is_cell_control_bulk', 'is_cell_control_control', 'library_types', 'libType', 'log10_total_counts', 'log10_total_counts_endogenous', 'log10_total_counts_ERCC', 'log10_total_counts_feature_control', 'log10_total_counts_MT', 'log10_total_features', 'log10_total_features_endogenous', 'log10_total_features_ERCC', 'log10_total_features_feature_control', 'log10_total_features_MT', 'mapping_type', 'mates1', 'mates2', 'n_alt_reads', 'n_total_reads', 'num_assigned_fragments', 'num_bias_bins', 'num_bootstraps', 'num_compatible_fragments', 'num_consistent_mappings', 'num_inconsistent_mappings', 'num_libraries', 'num_mapped', 'num_processed', 'num_targets', 'nvars_used', 'pct_counts_endogenous', 'pct_counts_ERCC', 'pct_counts_feature_control', 'pct_c

In [5]:
genotypes = genotypes[pd.Series(genotypes.sample.values).isin(top_donors), :] # restrict to top donors
genotypes = genotypes[:, genotypes.values.sum(0) / (2 * genotypes.shape[0]) > 0.05] # filter by MAF
genotypes = genotypes[:, genotypes.values.std(0) > 0.0] # remove low-variance SNPs

genotypes.shape

(50, 2610302)

## Sample SNPs for each gene from a different chromosome

In [6]:
gene_annotation = pd.read_csv(settings.DATA_DIR + '/gene_annotation.tsv', sep='\t')[['Gene stable ID', 'Chromosome/scaffold name']].drop_duplicates()
gene_annotation.columns = ['gene', 'chrom']
gene_annotation = gene_annotation.set_index('gene')

In [7]:
ensembl_ids = gene_filter.str.split('_', expand=True)[0]
adata.var['chrom'] = gene_annotation.loc[ensembl_ids, 'chrom'].tolist()

  adata.var['chrom'] = gene_annotation.loc[ensembl_ids, 'chrom'].tolist()


In [8]:
chroms = adata.var['chrom'].unique()
n_chroms = chroms.size
for i, chrom in enumerate(chroms):
    print('[' + ((i+1) * '=') + ((n_chroms - i-1) * ' ') + ']', end='\r')
    ids = adata.var['chrom'] == chrom
    variants = rng.choice(genotypes.snp[genotypes.chrom != chrom], settings.N_SNPS * ids.sum())
    adata.var.loc[ids, 'snpID'] = pd.DataFrame(variants.reshape((ids.sum(), settings.N_SNPS))).apply(lambda x: ','.join(x), 1).tolist()



In [9]:
snps = ','.join(adata.var['snpID'].tolist()).split(',')

In [10]:
genotypes = genotypes[:, pd.Series(genotypes.snp).isin(np.unique(snps))]
genotypes.shape

(50, 500)

## Save

In [11]:
write_plink1_bin(genotypes, settings.DATA_DIR + '/filtered/genotypes_sim.bed')

Writing BED: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]

Writing FAM... done.
Writing BIM... done.





In [12]:
adata.obsm['X_mofa'] = column_normalize(quantile_gaussianize(adata.obsm['X_mofa']))

## Simulate eQTLs

In [13]:
if not os.path.exists(settings.DATA_DIR + '/simulated'):
    os.mkdir(settings.DATA_DIR + '/simulated')
else:
    print('Warning: Dir exists')

In [14]:
def simulate(gvar, fev_gxc, ncontexts, adata=adata):
    path = settings.DATA_DIR + f'/simulated/gvar~{gvar}_fevgxc~{fev_gxc}_ncontexts~{ncontexts}'    
    print(path)
    
    if not os.path.exists(path):
        os.mkdir(path)
    else:
        return None
        
    y_persistent = np.zeros_like(adata.X)
    y_gxc = np.zeros_like(adata.X)

    # fraction of explained variance by GxC and G
    v_g = gvar * (1 - fev_gxc)
    v_gxc = gvar * fev_gxc

    # contexts with GxC
    C = adata.obsm['X_mofa'][:, :ncontexts] / np.sqrt(ncontexts)
    # simulate data for each gene
    for i, g in enumerate(adata.var_names):
        # SNPs for this gene
        G = genotypes[:, pd.Series(genotypes.snp).isin(adata.var.loc[g, 'snpID'].split(','))]
        # expand to cell-level
        G = G.sel(sample=adata.obs['donor_long_id'].tolist()).values
        G = column_normalize(G)
        
        # sample
        beta_g = sample_persistent_effsizes(settings.N_SNPS, [0], v_g, rng)
        y_persistent[:, i] = sample_persistent_effects(G, beta_g, v_g)
        y_gxc[:, i] = sample_gxe_effects(G, C, [0], v_gxc, rng)

    # map to Poisson rate and sample
    y_mean = np.exp(adata.X + y_persistent + y_gxc)
    y = rng.poisson(lam=y_mean)

    
    adata.layers['y_base'] = adata.X
    adata.layers['y_persistent'] = y_persistent
    adata.layers['y_gxc'] = y_gxc
    adata.X = y
    
    sc.write(path + '/adata.h5ad', adata)

### Simulated data 1: Vary FEV by GxC

In [15]:
for gvar in settings.GENETIC_VAR:    
    for fev_gxc in settings.FEV_GXC:        
        ncontexts = settings.NUM_CONTEXTS_DEFAULT
        _ = simulate(gvar=gvar, fev_gxc=fev_gxc, ncontexts=ncontexts, adata=adata.copy())  

../utils/../data/simulated/gvar~0.01_fevgxc~0.0_ncontexts~10
../utils/../data/simulated/gvar~0.01_fevgxc~0.25_ncontexts~10
../utils/../data/simulated/gvar~0.01_fevgxc~0.5_ncontexts~10
../utils/../data/simulated/gvar~0.01_fevgxc~0.75_ncontexts~10
../utils/../data/simulated/gvar~0.01_fevgxc~1.0_ncontexts~10
../utils/../data/simulated/gvar~0.025_fevgxc~0.0_ncontexts~10
../utils/../data/simulated/gvar~0.025_fevgxc~0.25_ncontexts~10
../utils/../data/simulated/gvar~0.025_fevgxc~0.5_ncontexts~10
../utils/../data/simulated/gvar~0.025_fevgxc~0.75_ncontexts~10
../utils/../data/simulated/gvar~0.025_fevgxc~1.0_ncontexts~10


### Simulated data 2: Vary number of contexts with GxC

In [16]:
for gvar in settings.GENETIC_VAR:    
    for ncontexts in settings.NUM_CONTEXTS:
        fev_gxc = settings.FEV_GXC_DEFAULT
        _ = simulate(gvar=gvar, fev_gxc=fev_gxc, ncontexts=ncontexts, adata=adata.copy())  

../utils/../data/simulated/gvar~0.01_fevgxc~0.5_ncontexts~2
../utils/../data/simulated/gvar~0.01_fevgxc~0.5_ncontexts~5
../utils/../data/simulated/gvar~0.01_fevgxc~0.5_ncontexts~10
../utils/../data/simulated/gvar~0.01_fevgxc~0.5_ncontexts~15
../utils/../data/simulated/gvar~0.01_fevgxc~0.5_ncontexts~20
../utils/../data/simulated/gvar~0.025_fevgxc~0.5_ncontexts~2
../utils/../data/simulated/gvar~0.025_fevgxc~0.5_ncontexts~5
../utils/../data/simulated/gvar~0.025_fevgxc~0.5_ncontexts~10
../utils/../data/simulated/gvar~0.025_fevgxc~0.5_ncontexts~15
../utils/../data/simulated/gvar~0.025_fevgxc~0.5_ncontexts~20


### Simulated data 3: Null

In [17]:
for gvar in settings.GENETIC_VAR + [0.0]:    
    ncontexts = settings.NUM_CONTEXTS_DEFAULT
    fev_gxc = 0.0
    _ = simulate(gvar=gvar, fev_gxc=fev_gxc, ncontexts=ncontexts, adata=adata.copy())  

../utils/../data/simulated/gvar~0.01_fevgxc~0.0_ncontexts~10
../utils/../data/simulated/gvar~0.025_fevgxc~0.0_ncontexts~10
../utils/../data/simulated/gvar~0.0_fevgxc~0.0_ncontexts~10
