In [10]:
import pandas as pd
import numpy as np
import scanpy as sc
import anndata

In [11]:
# File paths
# path_adata = "./manuscript_data/segerstolpe_annotated.h5ad"
#path_adata = "/pstore/data/bi/reference/scseq/E-MTAB-5061_Pancreas/analyzed/standard_workflow_besca2_0/standard_workflow_besca2_0_annotated.h5ad/"
#path_annot = '~/segerstolpe_annot.csv'

path_adata = '/pstore/data/biomics/ONC/70259_TLR7/scRNAseqMouseComb/analyzed/sw_besca2_0/sw_besca2_0.annotated.h5ad'
path_annot = '/pstore/data/biomics/ONC/70259_TLR7/scRNAseqMouseComb/analyzed/sw_besca2_0/sw_besca2_0.annot_tkt.csv'
annot_col = 'dblabel' # adata column containing annotations


In [12]:
# Read anndata objects
adata = sc.read(path_adata)

In [13]:
# Conversion to R changes cell annotations to factors 
# -> export cell type annotations to be later rewritten into the R obj
# here we are selecting ['dblabel'] as the annotation column. Modify this for the desired annotation column
annot = adata.obs[annot_col]
annot.to_csv(path_annot, index=False, header=False)


In [14]:
# For SCDC and MuSiC we want expressions stored in AnnData.raw us they contain all sequenced genes, 
# not just highly variable genes filtered further downstream of the BESCA workflow
obs = adata.obs
var = adata.raw.var
uns = adata.uns
raw = adata.raw
# raw.X contains CP10k log scaled data. Thus, we need to linearize using raw.X.expm1()
adata_raw = anndata.AnnData(raw.X.expm1(), obs=obs, var=var, uns=uns, raw=raw)

### Fix segerstolpe index - needs to contain patient id <a id='bisque_id'></a>

This is required by the BisqueRNA::SeuratToExpressionSet() R package: https://rdrr.io/cran/BisqueRNA/man/SeuratToExpressionSet.html \
The sample index has to contain subscriptable individual patient IDs

Example correct patient ID: **ERR1630619_T2D1**, where ERR1630619 is the cell UMI and T2D1 is the unique patient ID we want. 

We will then specify this in the R script that calls the `BisqueRNA::SeuratToExpressionSet()` method \
`BisqueRNA::SeuratToExpressionSet(seurat.object, delimiter='_', position='2', version = "v3")`

`delimiter` = Character to split cell names with to find individual ID. \
`position` = Integer indicating 1-indexed position of individual ID after splitting cell name with delimiter. R indexing starts from 1


In [15]:
# To Omit or find matching field
idx = adata_raw.obs.index
idx_new = []
for i, val in enumerate(idx):
    out = val + '_' +adata_raw.obs['Sample Characteristic[individual]'][i]
    idx_new.append(out)
    
adata_raw.obs.index = idx_new
adata_raw.obs.head()

KeyError: 'Sample Characteristic[individual]'

### Convert to SingleCellExperiment R Object

saves anndata as SCE object. Postprocess in R

based on tutorial here: https://github.com/LuckyMD/Code_snippets/blob/master/Seurat_to_anndata.ipynb

In [16]:
import anndata2ri
anndata2ri.activate()

In [17]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [18]:
%%R -i adata_raw
saveRDS(adata_raw, '/pstore/data/biomics/ONC/70259_TLR7/scRNAseqMouseComb/analyzed/sw_besca2_0/adata_raw_to_sce_tkt.RDS')

# Next: postprocess in R to convert SCE --> Eset
Use the `sce_to_eset.R` script. 
1. Edit the last two lines of the script for the correct input/output file paths:\
`seurat <- sce_to_seurat(sce_path='./segerstolpe_raw_sce.RDS', sc_anno_path='./segerstolpe_annot.csv', filename=NULL)`\
`sce_path` = path to SCE file generated above\
`sc_anno_path` = path to the .csv file containing cell annotations generated in the section above
\
\
`eset <- seurat_to_eset(seurat, delim='_', idx=2, filename='./segerstolpe_raw_eset.RDS')`\
`delim` = Character to split cell names with to find individual ID. \
`idx` = Integer indicating 1-indexed position of individual ID after splitting cell name with delimiter. R indexing starts from 1
2. Set the correct patient ID indexing as explained in the [section above](#bisque_id)
3. Run from cmd line: `env R_MAX_VSIZE=100Gb Rscript sce_to_eset.R`