In [1]:
# load required modules
import scanpy.api as sc
import besca as bc
import pandas as pd
import numpy as np
from matplotlib import pyplot
import os
import sys
# Seed NumPy's global RNG: simulate_bulk draws its cell fractions with
# np.random.uniform, so a fixed seed makes the simulated files reproducible
# when the notebook is run top to bottom on a fresh kernel.
np.random.seed(42)


In a future version of Scanpy, `scanpy.api` will be removed.
Simply use `import scanpy as sc` and `import scanpy.external as sce` instead.


No direct replacement for 'numba.targets' available. Visit https://gitter.im/numba/numba-dev to request help. Thanks!



# Simulate Bulk
Define the `simulate_bulk` function:

In [4]:
def simulate_bulk(gep, n=10, filename_sim='simulated_blk-ensg-gene-expr.csv', filename_truth='truth.csv',
                  dataset_name='GSE134809', sim_dir=None, truth_dir=None):
    """Simulate bulk RNA-seq samples from a gene expression profile (GEP).

    GEP should be normalised. Each simulated sample follows the linear model

    y = X*w^T

    where y=bulk, X=GEP (genes x cell types), w^T=cell fractions. The
    fractions of every sample are drawn from a uniform distribution with
    NumPy's global random generator and rescaled to sum to 1, so call
    ``np.random.seed`` beforehand for reproducible output.

    Args:
        gep(pandas.DataFrame): normalised gene expression profile (GEP);
            rows are genes (index), columns are cell types
        n(int): number of patients to simulate
        filename_sim(str): file name of the simulated bulk .csv
        filename_truth(str): file name of the truth .csv
        dataset_name(str): value written to the 'dataset.name' column of
            the truth file
        sim_dir(str): folder for the simulated bulk file; if None, the
            notebook-level variable ``outdir_simulated`` is used
        truth_dir(str): folder for the truth file; if None, the
            notebook-level variable ``outdir_truth`` is used

    Returns:
        Writes out 2 .csv files (``filename_sim`` and ``filename_truth``),
        both in the same format as the DREAM Tumor Deconvolution challenge i/o standards.

        Returns pandas.DataFrames of simulated bulk and truth:
        the bulk frame has a 'Gene' column followed by one column per
        subject ('S0', 'S1', ...); the truth frame has one row per
        (subject, cell type) with columns 'dataset.name', 'sample.id',
        'cell.type' and 'measured'.

    Raises:
        NameError: if an output folder is not passed and the corresponding
            notebook-level variable has not been defined.
    """
    # Fall back to the notebook-level output folders so that existing calls
    # (which never passed a folder) keep working unchanged.
    if sim_dir is None:
        sim_dir = outdir_simulated
    if truth_dir is None:
        truth_dir = outdir_truth

    genes = gep.index.values.tolist()
    cell_types = gep.columns.tolist()
    subjects = ['S{}'.format(i) for i in range(n)]

    # One row of random weights per subject, rescaled so each row sums to 1.
    # Drawing the whole (n x cell types) matrix at once consumes the random
    # stream in the same row-by-row order as drawing one subject at a time.
    fractions = np.random.uniform(size=(n, len(cell_types)))
    fractions /= fractions.sum(axis=1, keepdims=True)

    # (genes x cell types) . (cell types x subjects) -> (genes x subjects)
    bulk = gep.values.dot(fractions.T)

    sim_df = pd.DataFrame(bulk, columns=subjects)
    sim_df.insert(0, column='Gene', value=genes)
    sim_df.to_csv(os.path.join(sim_dir, filename_sim), header=True, index=False)

    # output truth file
    # Build the long-format table in one go instead of appending one frame
    # per subject (DataFrame.append no longer exists in pandas >= 2.0).
    truth_columns = ['dataset.name', 'sample.id', 'cell.type', 'measured']
    g_df = pd.DataFrame(
        {
            'dataset.name': dataset_name,
            'sample.id': [s for s in subjects for _ in cell_types],
            'cell.type': cell_types * n,
            # row-major flattening matches the subject-major row order above
            'measured': fractions.ravel(),
        },
        columns=truth_columns,
    )
    g_df.to_csv(os.path.join(truth_dir, filename_truth), header=True, index=False)

    return sim_df, g_df



In [3]:


# Gene expression matrix to simulate from. Can be generated using besca.export.generate_gep function
input_dir = './gep/'
input_file = 'segerstolpe_gep_sum.csv'
input_data = os.path.join(input_dir, input_file)

# Output folders for the simulated bulk and the matching truth file
outdir_simulated = './output/simulated'
outdir_truth = './output/truth'

# exist_ok=True creates a missing folder and is a no-op for an existing one,
# so no separate (race-prone) existence check is needed.
os.makedirs(outdir_simulated, exist_ok=True)
os.makedirs(outdir_truth, exist_ok=True)

# The 'NAME' column of the csv becomes the row index of the GEP.
gep = pd.read_csv(input_data, sep=',', index_col='NAME')
# Optional column drops, disabled by default; enable the one matching the
# columns present in your GEP file.
#gep = gep.drop(['Description', 'others'], axis=1)
#gep = gep.drop(['Description'], axis=1)

### Generate GEP and run simulate_bulk
**Note:** you must adapt the following paths and file names to your own setup:

`path_segerstolpe`\
`outdir_simulated`\
`outdir_truth`\
`filename_simbulk`\
`filename_truth`

As an example, we simulate bulk samples from the dataset of Segerstolpe et al.: https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-5061/

In [None]:
# Step 1: export a GEP for the Segerstolpe dataset.
# Placeholder path -- point it at your own annotated AnnData file.
path_segerstolpe = '/path/to/anndata/segerstolpe.h5ad'
adata_seg = sc.read(path_segerstolpe)

# Write the GEP into the current working directory, using the 'dblabel'
# column for cell annotations and ENSEMBL as the gene annotation.
bc.export.generate_gep(
    adata_seg,
    outpath=os.getcwd(),
    filename='segerstolpe_gep.csv',
    annot='ENSEMBL',
    column='dblabel',
)

In [None]:
# Load data
# Define directories

# Gene expression matrix to simulate from
input_dir = './'
input_file = 'segerstolpe_gep.csv'
input_data = os.path.join(input_dir, input_file)

# Output folders and file names for the simulated bulk and the truth
outdir_simulated = './output/simulated'
outdir_truth = './output/truth'
filename_simbulk = 'simulated_blk_segerstolpe.csv'
filename_truth = 'truth_segerstolpe.csv'

# exist_ok=True creates a missing folder and is a no-op for an existing one,
# so no separate (race-prone) existence check is needed.
os.makedirs(outdir_simulated, exist_ok=True)
os.makedirs(outdir_truth, exist_ok=True)

# The 'NAME' column of the csv becomes the row index of the GEP.
gep = pd.read_csv(input_data, sep=',', index_col='NAME')

In [9]:
# Simulate bulk samples from the loaded GEP; the call also writes the
# simulated bulk and truth csv files under the names configured above.
sim_df, truth_df = simulate_bulk(
    gep,
    filename_truth=filename_truth,
    filename_sim=filename_simbulk,
)

Generate the simulated bulk again, this time with HUGO gene symbols instead of ENSEMBL IDs.

In [None]:
# Output file names for the HUGO-symbol run (simulated bulk, then truth).
filename_sim_hugo, filename_truth_hugo = (
    'simulated_blk_segerstolpe_hugo.csv',
    'truth_segerstolpe_hugo.csv',
)

In [15]:
# Export a second GEP from the same AnnData object, this time annotated
# with gene symbols (annot='SYMBOL') rather than ENSEMBL IDs.
bc.export.generate_gep(
    adata_seg,
    outpath=os.getcwd(),
    filename='segerstolpe_hugo_gep.csv',
    annot='SYMBOL',
    column='dblabel',
)

Choosing column: [ dblabel ]for cell annotations
Loading gene expression from adata.raw.X
Calculating average expression per cell type per gene
segerstolpe_hugo_gep.csv exported successfully to file


In [17]:
# Re-point the loader at the HUGO-symbol GEP and replace `gep` with it.
input_file = 'segerstolpe_hugo_gep.csv'
input_dir = './'
input_data = os.path.join(input_dir, input_file)

# The 'NAME' column of the csv becomes the row index of the GEP.
gep = pd.read_csv(input_data, index_col='NAME', sep=',')

In [20]:
# Simulate bulk samples from the HUGO-symbol GEP; `sim_df` and `truth_df`
# from the ENSEMBL run are overwritten by this call.
sim_df, truth_df = simulate_bulk(
    gep,
    filename_truth=filename_truth_hugo,
    filename_sim=filename_sim_hugo,
)