## install scME

In [None]:
#first install scME
!python setup.py install

## scME 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import scvi
import scanpy as sc
import anndata as ad
sc.set_figure_params(figsize=(8, 8))
from scipy.io import mmread

%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'
import scme

In [4]:
#read the count data matrix of BMNC dataset
#read RNA data
rna_count=mmread('./example_data/rnacountdgc.txt').toarray()
genes=pd.read_csv("./example_data/genes.txt",index_col=0)
cellids=pd.read_csv("./example_data/cellids.txt",index_col=0)

#read protein data
protein_count=pd.read_csv('./example_data/adtcount.csv',index_col=0)

In [5]:
rna_count=pd.DataFrame(rna_count.T,index=cellids.values[:,0],columns=genes.values[:,0])
rna_count

Unnamed: 0,FO538757.2,AP006222.2,RP4-669L17.10,RP11-206L10.9,LINC00115,FAM41C,SAMD11,NOC2L,KLHL17,PLEKHN1,...,FAM19A5,RP3-522J7.6,CITF22-1A6.3,MOV10L1,MIR99AHG,AP000223.42,DSCR9,AP001626.2,AP001046.5,AC004556.1
a_AAACCTGAGCTTATCG-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a_AAACCTGAGGTGGGTT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a_AAACCTGAGTACATGA-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a_AAACCTGCAAACCTAC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a_AAACCTGCAAGGTGTG-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b_TTTGTCATCCGAGCCA-1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
b_TTTGTCATCCGTAGGC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
b_TTTGTCATCCTCGCAT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
b_TTTGTCATCGCCGTGA-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
#preprocess the data
#create adata object
rna=ad.AnnData(X=rna_count.values,obs=pd.DataFrame(index=rna_count.index), var=pd.DataFrame(index=rna_count.columns))
protein=ad.AnnData(X=protein_count.values,obs=pd.DataFrame(index=protein_count.index), var=pd.DataFrame(index=protein_count.columns))

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [8]:
# select highly variable genes
rna.layers["counts"] = rna.X.copy()
sc.pp.normalize_total(rna)
sc.pp.log1p(rna)
sc.pp.highly_variable_genes( 
    rna,
    n_top_genes=2000,
    flavor="seurat_v3",
    subset = True
)
rna.raw = rna
rna = rna[:, rna.var.highly_variable]




In [10]:
#PCA and clustering for RNA data
sc.pp.pca(rna,svd_solver='arpack')
sc.pp.neighbors(rna, n_neighbors=30,n_pcs=30)   
sc.tl.leiden(rna, key_added="rna_leiden",resolution=1)

In [11]:
def clr_normalize_each_cell(adata, inplace=True):
    """Normalize count vector for each cell, i.e. for each row of .X"""

    import numpy as np
    import scipy

    def seurat_clr(x):
        # TODO: support sparseness
        s = np.sum(np.log1p(x[x > 0]))
        exp = np.exp(s / len(x))
        return np.log1p(x / exp)

    if not inplace:
        adata = adata.copy()

    # apply to dense or sparse matrix, along axis. returns dense matrix
    adata.X = np.apply_along_axis(
        seurat_clr, 1, (adata.X.A if scipy.sparse.issparse(adata.X) else adata.X)
    )
    return adata

In [12]:
protein.layers["counts"] = protein.X.copy()
protein=clr_normalize_each_cell(protein)
sc.pp.pca(protein, svd_solver="arpack")
sc.pp.neighbors(protein, n_neighbors=10) 
sc.tl.leiden(protein, key_added="protein_leiden",resolution=1)

In [13]:
#create training dataset
rna.X=rna.layers["counts"]
protein.X=protein.layers["counts"]
traindataset=scme.AnnDataset(rna,protein,to_onehot=True)

In [17]:
#get scme model
model=scme.build_scme(rna_count,rna_count,traindataset,protein_dist="NB",if_preprocess=True)

TypeError: build_scme() got multiple values for argument 'protein_dist'