# Running SCellBOW on a simulated prostate cancer dataset

In [1]:
import SCellBOW as sb
import scanpy as sc
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to /home/namratab/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Import Dataset

In [2]:
# Load the source AnnData object; it is preprocessed and used for pretraining below.
source_path = "data/adata_source.h5ad"
adata_source = sc.read(source_path)
adata_source

AnnData object with n_obs × n_vars = 1334 × 2000

In [3]:
# Load the target AnnData object; it is preprocessed and clustered below.
target_path = "data/adata_target.h5ad"
adata_target = sc.read(target_path)
adata_target

AnnData object with n_obs × n_vars = 836 × 2000
    obs: 'subtype'

In [4]:
# create a copy of target dataset for phenotype algebra
# The copy is taken here, before adata_target is filtered, normalised and
# scaled in the clustering section, so adata_test keeps the values as loaded.
adata_test = adata_target.copy()
adata_test

AnnData object with n_obs × n_vars = 836 × 2000
    obs: 'subtype'

In [5]:
# Load the survival data (its obs carries 'subtype', 'time' and 'status').
adata_train = sc.read("data/adata_train.h5ad")

# This file contains duplicated gene names (AnnData warns about it on load).
# De-duplicate them, as is done for the source and target datasets, so that
# label-based gene lookups on adata_train are unambiguous.
adata_train.var_names_make_unique()
adata_train

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 81 × 17981
    obs: 'subtype', 'time', 'status'
    var: 'n_cells'

# SCellBOW Pretrained model

In [6]:
# Preprocess the source dataset for pretraining.
adata_source.var_names_make_unique()

# Drop cells expressing fewer than 10 genes and genes detected in fewer than 2 cells.
sc.pp.filter_cells(adata_source, min_genes=10)
sc.pp.filter_genes(adata_source, min_cells=2)

# Normalise every cell to a total of 10,000 counts, then log1p-transform.
sc.pp.normalize_total(adata_source, target_sum=1e4)
sc.pp.log1p(adata_source)

# Keep the 1,000 most variable genes. Slicing an AnnData yields a view, so
# take an explicit copy; otherwise sc.pp.scale has to turn the view into a
# real object implicitly and emits a warning while doing so.
sc.pp.highly_variable_genes(adata_source, n_top_genes=1000)
adata_source = adata_source[:, adata_source.var.highly_variable].copy()

# Scale each gene and clip the scaled values at 10.
sc.pp.scale(adata_source, max_value=10)
adata_source

  view_to_actual(adata)


AnnData object with n_obs × n_vars = 1334 × 1000
    obs: 'n_genes'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg'

In [7]:
# Pretrain the SCellBOW source model on the preprocessed source data.
# The model is written to the "pca" save directory, which later cells pass to
# SCellBOW_cluster and SCellBOW_algebra.
sb.SCellBOW_pretrain(
    adata_source,
    save_dir="pca",
    vec_size=300,
    n_worker=1,
    iter=20,
)

[ 2022-12-30 11:05:15.683988 ] The path to save directory is ./pca/
[ 2022-12-30 11:05:15.684017 ] Creating the source model.
[ 2022-12-30 11:05:15.695078 ] Creating the corpus.


  0%|          | 0/1334 [00:00<?, ?it/s]

[ 2022-12-30 11:05:16.460787 ] Corpus created with size = 1334
[ 2022-12-30 11:05:16.460830 ] Tagging the corpora.
[ 2022-12-30 11:05:19.917635 ] All corpuses tagged with length 1334
[ 2022-12-30 11:05:19.917697 ] Inititalize the SCellBOW source model.
[ 2022-12-30 11:05:19.917706 ] INFO - SCellBOW: vector size = 300
[ 2022-12-30 11:05:19.917716 ] INFO - SCellBOW: initial learning rate = 0.025
[ 2022-12-30 11:05:19.917729 ] INFO - SCellBOW: min_alpha = 0.00025
[ 2022-12-30 11:05:19.917739 ] INFO - SCellBOW: min_count = 1
[ 2022-12-30 11:05:19.917748 ] INFO - SCellBOW: number of cpu = 1
[ 2022-12-30 11:05:19.918177 ] Building vocabulary.
[ 2022-12-30 11:05:20.078269 ] Vocabulary built.
[ 2022-12-30 11:05:20.078291 ] Start training the neural network.
[ 2022-12-30 11:05:59.156754 ] Training SCellBOW source model finished.
[ 2022-12-30 11:05:59.162554 ] Model saved in directory  ./pca/
[ 2022-12-30 11:05:59.179103 ] Source model created!


<SCellBOW.SCellBOW.SCellBOW_pretrain at 0x7f9ea96ab790>

# SCellBOW Clustering

In [8]:
# Preprocess the target dataset for clustering.
adata_target.var_names_make_unique()

# Drop cells expressing fewer than 10 genes and genes detected in fewer than 2 cells.
sc.pp.filter_cells(adata_target, min_genes=10)
sc.pp.filter_genes(adata_target, min_cells=2)

# Normalise every cell to a total of 10,000 counts, then log1p-transform.
sc.pp.normalize_total(adata_target, target_sum=1e4)
sc.pp.log1p(adata_target)

# Flag the 1,000 most variable genes and stash the normalised, log-transformed
# data (all genes) in .raw before subsetting.
sc.pp.highly_variable_genes(adata_target, n_top_genes=1000)
adata_target.raw = adata_target

# Slicing an AnnData yields a view, so take an explicit copy; otherwise
# sc.pp.scale has to turn the view into a real object implicitly and emits a
# warning while doing so.
adata_target = adata_target[:, adata_target.var.highly_variable].copy()

# Scale each gene and clip the scaled values at 10.
sc.pp.scale(adata_target, max_value=10)

adata_target

  view_to_actual(adata)


AnnData object with n_obs × n_vars = 836 × 1000
    obs: 'subtype', 'n_genes'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg'

In [None]:
# Run SCellBOW clustering on the preprocessed target data, pointing it at the
# 'pca' directory where the pretrained source model was saved.
adata_output = sb.SCellBOW_cluster(
    adata_target,
    'pca',
).run()

[ 2022-12-30 11:05:59.244904 ] The path to save directory is ./pca/
[ 2022-12-30 11:05:59.245006 ] Begin SCellBOW: transfer learning.
MinMaxScaler(feature_range=(1, 10))
[ 2022-12-30 11:05:59.257085 ] Creating the corpus.


  0%|          | 0/836 [00:00<?, ?it/s]

[ 2022-12-30 11:05:59.729087 ] Tagging the corpora for transfer learning.
[ 2022-12-30 11:06:02.007222 ] All corpuses tagged with length = 836
[ 2022-12-30 11:06:02.007295 ] Updating the vocabulary.
[ 2022-12-30 11:06:02.127024 ] Vocabulary updated.
[ 2022-12-30 11:06:02.127047 ] Start transfer learning on the neural network.
[ 2022-12-30 11:06:53.060211 ] Weights of the neural network calibrated.
[ 2022-12-30 11:06:53.060281 ] Start infering the vectors for target dataset.


  0%|          | 0/836 [00:00<?, ?it/s]

In [None]:
# Inspect the object returned by SCellBOW_cluster(...).run().
adata_output

In [None]:
# UMAP of the target cells, coloured by the cluster labels stored in obs under
# 'clusters_<resolution>'.
resolution = 1.0
with plt.rc_context({'figure.figsize': (5, 5)}):
    sc.pl.umap(
        adata_output,
        color=f'clusters_{resolution}',
        title='UMAP visualisation',
        palette=plt.rcParams["axes.prop_cycle"],
        size=50,
        add_outline=True,
        legend_fontsize=14,
        legend_fontoutline=2,
    )

# SCellBOW Phenotype algebra

In [None]:
# Invoke phenotype algebra.
# Attach the clustering annotations to the untouched copy of the target data.
# Assign a copy of the obs table: assigning the DataFrame itself would make
# adata_test and adata_output share one object, so any column added or changed
# through adata_test would silently show up in adata_output as well.
adata_test.obs = adata_output.obs.copy()

# 'clusters_1.0' is the obs column holding the cluster labels plotted above.
median_score, predicted_score = sb.SCellBOW_algebra(
    adata_test,
    adata_train,
    "pca",
    type='clusters_1.0',
    bootstrap_samples=10,
    n_top_features=1000,
).run()

In [None]:
# Second value returned by SCellBOW_algebra(...).run().
predicted_score

In [None]:
# First value returned by SCellBOW_algebra(...).run().
median_score

In [None]:
# Order the groups by ascending median score and arrange the columns of
# predicted_score to match, so the boxes run from lowest to highest median.
# Rebinding the names (instead of sorting in place) leaves the object shown in
# the earlier cell untouched.
# NOTE(review): assumes median_score is a pandas Series whose index matches the
# column labels of the predicted_score DataFrame; confirm against SCellBOW.
median_score = median_score.sort_values(ascending=True)
predicted_score = predicted_score[median_score.index]

# Draw on an explicit Axes rather than relying on pyplot's current figure.
fig, ax = plt.subplots(figsize=(4, 4))
predicted_score.boxplot(patch_artist=True, notch=True, ax=ax)
ax.tick_params(axis="x", labelrotation=90, labelsize=10)
ax.set_ylabel("Predicted score")
plt.show()