In [None]:
# Author: Antti Kiviaho
# Date: 20.1.2023
# A script for running normalization and sample integration clustering.
# Uses the scib integration environment and pipeline:
#
#
# 1. Cell and gene filtering
# 2. scran normalization through the R interface using scib
# 3. batch-aware scaling with scib
# 4. batch-aware HVGs with scib
# 5. scanorama integration into PCA, clustering, UMAP

In [None]:
import numpy as np
import anndata as ad
import scanpy as sc
import pandas as pd
import seaborn as sns
import scib
import scanorama

import matplotlib.pyplot as plt
from scripts.utils import get_sample_ids, save_to_pickle

In [None]:
# Load the spaceranger output of every sample from the local ./results directory.
# The result is a dict mapping sample id -> AnnData object for that section.
samples = get_sample_ids()
adata_dict = {}
for sample_id in samples:
    # library_id is set to the sample id so each section's spatial data is keyed by its own name
    adata_sample = sc.read_visium(f'./results/{sample_id}/outs/', library_id=sample_id)
    # Make duplicated gene names unique before any downstream indexing by var_names
    adata_sample.var_names_make_unique()
    adata_dict[sample_id] = adata_sample

In [None]:
# Produce QC plots as done at https://scanpy-tutorials.readthedocs.io/en/latest/spatial/integration-scanorama.html
# One PNG with four histograms is written per sample into `dir_path`.
from pathlib import Path
dir_path = './plots/qc-plots-for-spatial-sections'
Path(dir_path).mkdir(parents=True, exist_ok=True)
for name in samples:
    adata = adata_dict[name]
    # Computed in place; the obs columns total_counts and n_genes_by_counts are read below
    sc.pp.calculate_qc_metrics(adata, inplace=True)
    fig, axs = plt.subplots(1, 4, figsize=(24, 6))
    fig.suptitle(f"Covariates for filtering: {name}")

    total_counts = adata.obs["total_counts"]
    n_genes = adata.obs["n_genes_by_counts"]

    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is kept here
    # so the binning of the saved plots does not change. Consider sns.histplot once the
    # seaborn version of the environment is confirmed.
    # Panels 0-1: total counts per spot (full range, then zoomed to < 20000)
    sns.distplot(total_counts, kde=False, ax=axs[0])
    sns.distplot(
        total_counts[total_counts < 20000],
        kde=False,
        bins=40,
        ax=axs[1],
    )
    # Panels 2-3: detected genes per spot (full range, then zoomed to < 4000)
    sns.distplot(n_genes, kde=False, bins=60, ax=axs[2])
    sns.distplot(
        n_genes[n_genes < 4000],
        kde=False,
        bins=60,
        ax=axs[3],
    )
    fig.tight_layout()
    fig.set_dpi(200)
    fig.savefig(dir_path+'/'+name+'_qc_metrics'+'.png')
    # Close (not just clear) the figure: plt.clf() leaves the figure object open, so one
    # large figure per sample would otherwise accumulate in memory over the loop.
    plt.close(fig)

In [None]:
def qc_and_normalize(adata, min_cells=5, min_counts=500):
    """Filter genes and spots of one sample, then normalize it with scib.

    Parameters
    ----------
    adata
        AnnData object of a single sample. The filters below are called without
        capturing a return value, so this function relies on them modifying
        `adata` in place.
    min_cells
        Forwarded to ``sc.pp.filter_genes`` as ``min_cells`` (default 5).
    min_counts
        Forwarded to ``sc.pp.filter_cells`` as ``min_counts`` (default 500).

    Returns
    -------
    The same `adata` object that was passed in.
    """
    # requires scib-pipeline-R4.0 conda environment !
    import scib
    # Gene filter runs first, so min_cells is evaluated before low-count spots are removed
    sc.pp.filter_genes(adata, min_cells=min_cells)
    sc.pp.filter_cells(adata, min_counts=min_counts)
    # precluster=False: presumably skips the clustering step before size-factor
    # estimation -- confirm against the scib documentation
    scib.preprocessing.normalize(adata,precluster=False)
    return adata

In [11]:
# Filter + normalize every sample and tag its spots with the sample id.
for sample_id in samples:
    adata = adata_dict[sample_id]

    # The 'sample_id' column is only added by this loop, so its presence means the
    # sample was already processed. Skipping keeps the cell idempotent: a re-run would
    # otherwise normalize the data a second time and prefix obs_names twice.
    if 'sample_id' in adata.obs.columns:
        continue

    adata = qc_and_normalize(adata)

    # add ids to the data for use after data concatenation
    adata.obs['sample_id'] = sample_id
    adata.obs_names = sample_id + '_' + adata.obs_names

    # Store the returned object explicitly instead of relying on in-place mutation
    adata_dict[sample_id] = adata

  values, tz_parsed = conversion.datetime_to_datetime64(data.ravel("K"))


Note! Performing log1p-transformation after normalization.
Note! Performing log1p-transformation after normalization.
Note! Performing log1p-transformation after normalization.
Note! Performing log1p-transformation after normalization.
Note! Performing log1p-transformation after normalization.
Note! Performing log1p-transformation after normalization.
Note! Performing log1p-transformation after normalization.
Note! Performing log1p-transformation after normalization.
Note! Performing log1p-transformation after normalization.
Note! Performing log1p-transformation after normalization.
Note! Performing log1p-transformation after normalization.
Note! Performing log1p-transformation after normalization.
Note! Performing log1p-transformation after normalization.
Note! Performing log1p-transformation after normalization.
Note! Performing log1p-transformation after normalization.
Note! Performing log1p-transformation after normalization.
Note! Performing log1p-transformation after normalizatio

In [17]:
# Persist the dict of per-sample normalized AnnData objects
normalized_pickle_path = 'normalized_visium_data.pickle'
save_to_pickle(adata_dict, normalized_pickle_path)

In [32]:
# Merge all samples into one AnnData object; 'sample_id' serves as the batch key below.
adata_concat = sc.concat(adata_dict)
adata_concat.obs['sample_id'] = adata_concat.obs['sample_id'].astype('category')

# Batch-aware HVG selection on the log-normalized, unscaled matrix. This is the same input
# the selection effectively received before, so the resulting gene list is unchanged.
hvg_list = scib.preprocessing.hvg_batch(adata_concat,batch_key='sample_id',target_genes=3000,flavor='seurat',adataOut=False)

# scale_batch builds and returns a new scaled AnnData instead of scaling its input in place.
# Previously the return value was discarded, so the integration ran on unscaled data.
adata_scaled = scib.preprocessing.scale_batch(adata_concat,batch='sample_id')

adata_scanorama = scib.integration.scanorama(adata_scaled,batch='sample_id',hvg=hvg_list)

save_to_pickle(adata_scanorama,'./data/visium_after_scanorama.pickle')

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  adata_scaled.layers[key] = tmp[key]


Using 27 HVGs from full intersect set
Using 33 HVGs from n_batch-1 set
Using 56 HVGs from n_batch-2 set
Using 64 HVGs from n_batch-3 set
Using 83 HVGs from n_batch-4 set
Using 104 HVGs from n_batch-5 set
Using 151 HVGs from n_batch-6 set
Using 203 HVGs from n_batch-7 set
Using 280 HVGs from n_batch-8 set
Using 426 HVGs from n_batch-9 set
Using 627 HVGs from n_batch-10 set
Using 943 HVGs from n_batch-11 set
Using 3 HVGs from n_batch-12 set
Using 3000 HVGs


['HBA2',
 'IGHG3',
 'IGHG1',
 'JCHAIN',
 'IGHA1',
 'IGKC',
 'MYH11',
 'TPM2',
 'ACTG2',
 'DES',
 'LYZ',
 'SYNPO2',
 'TPSB2',
 'TAGLN',
 'CTGF',
 'FLNC',
 'CLU',
 'VWF',
 'MGP',
 'MT-ATP8',
 'MYLK',
 'LMOD1',
 'CALD1',
 'SYNM',
 'SAMHD1',
 'SMTN',
 'MRVI1',
 'NEXN',
 'RGS5',
 'PRRX1',
 'PPP1R12B',
 'PRELP',
 'FN1',
 'TNS1',
 'FILIP1L',
 'CCDC80',
 'ZBTB20',
 'PALLD',
 'ITGA1',
 'MAP1B',
 'THBS2',
 'SFRP4',
 'COL1A2',
 'FHL1',
 'CRYAB',
 'MCAM',
 'ACTA2',
 'IGF1',
 'RNASE1',
 'IGHM',
 'THBS1',
 'UACA',
 'CCL2',
 'PTGIS',
 'CFD',
 'CNN1',
 'PPP1R14A',
 'IGLC3',
 'ADAMTS1',
 'PCP4',
 'C1QB',
 'CD52',
 'PODN',
 'ANKRD36C',
 'ZEB2',
 'NRP2',
 'MTRNR2L12',
 'COL8A1',
 'SPARCL1',
 'SPP1',
 'FYB1',
 'CXCL14',
 'CARMN',
 'HIST1H4C',
 'HIST1H1E',
 'CFB',
 'HLA-DRA',
 'DST',
 'AKAP12',
 'SYNE1',
 'AGR2',
 'INHBA',
 'ELN',
 'TRBC2',
 'CHRDL1',
 'FLNA',
 'SFRP1',
 'SULF1',
 'PTGDS',
 'HBB',
 'C11orf96',
 'ITGA8',
 'MSMB',
 'SORBS1',
 'A2M',
 'KRT7',
 'IGFBP6',
 'ITGA5',
 'MSRB3',
 'PPP1R12A',
 'HSPB