# Spatial transcriptomics unveils the in situ cellular and molecular hallmarks of the lung in fatal COVID-19

# Preprocessing and quality control (QC) of Visium spatial transcriptomics (ST) data

**Author:** Carlos A. Garcia-Prieto

* This notebook explains the preprocessing and QC steps performed with the Visium ST samples used in our study.
* We have followed single-cell best practices [tutorial.](https://www.sc-best-practices.org/preprocessing_visualization/quality_control.html)

## Import modules

In [1]:
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import anndata
import numpy as np
from numpy import mean
from numpy import std
from scipy.stats import median_abs_deviation
import os
import anndata2ri
import logging
from scipy.sparse import issparse
import rpy2.rinterface_lib.callbacks as rcb
import rpy2.robjects as ro

sc.settings.figdir = "./"
sc.settings.verbosity = 0
sc.settings.set_figure_params(
    dpi=80,
    facecolor="white",
    # color_map="YlGnBu",
    frameon=False,
)

rcb.logger.setLevel(logging.ERROR)
ro.pandas2ri.activate()
anndata2ri.activate()

%load_ext rpy2.ipython

## Read Visium ST data

In [2]:
#Set Space Ranger output directory
spaceranger="/Users/carlosgarciaprieto/Proyectos_IJC/Spatial/COVID/SpaceRanger"
#Set image directory
tiff="/Users/carlosgarciaprieto/Proyectos_IJC/Spatial/COVID/Images"

In [3]:
#Set Sample name
sample="L25P"

In [4]:
#Create annData object
adata = sc.read_visium(path=f"{spaceranger}/{sample}/",genome="GRCh38",source_image_path=f"{tiff}/{sample}.tif", library_id=f"{sample}")

  utils.warn_names_duplicates("var")


In [5]:
#Explore data
adata.var_names

Index(['SAMD11', 'NOC2L', 'KLHL17', 'PLEKHN1', 'PERM1', 'HES4', 'ISG15',
       'AGRN', 'RNF223', 'C1orf159',
       ...
       'SRY', 'ZFY', 'PCDH11Y', 'AMELY', 'TBL1Y', 'TSPY1', 'USP9Y', 'DDX3Y',
       'TMSB4Y', 'KDM5D'],
      dtype='object', length=17943)

In [6]:
#Mark genes for filtering
# mitochondrial genes
adata.var["mt"] = adata.var_names.str.startswith("MT-")
# ribosomal genes
adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes.
adata.var["hb"] = adata.var_names.str.contains(("^HB[^(P)]"))

In [7]:
adata.var

Unnamed: 0,gene_ids,feature_types,genome,mt,ribo,hb
SAMD11,ENSG00000187634,Gene Expression,GRCh38,False,False,False
NOC2L,ENSG00000188976,Gene Expression,GRCh38,False,False,False
KLHL17,ENSG00000187961,Gene Expression,GRCh38,False,False,False
PLEKHN1,ENSG00000187583,Gene Expression,GRCh38,False,False,False
PERM1,ENSG00000187642,Gene Expression,GRCh38,False,False,False
...,...,...,...,...,...,...
TSPY1,ENSG00000258992,Gene Expression,GRCh38,False,False,False
USP9Y,ENSG00000114374,Gene Expression,GRCh38,False,False,False
DDX3Y,ENSG00000067048,Gene Expression,GRCh38,False,False,False
TMSB4Y,ENSG00000154620,Gene Expression,GRCh38,False,False,False


In [8]:
adata.var["mt"].sum()

0

In [9]:
adata.var[adata.var["mt"]==True]

Unnamed: 0,gene_ids,feature_types,genome,mt,ribo,hb


In [10]:
adata.var["ribo"].sum()

0

In [11]:
adata.var[adata.var["ribo"]==True]

Unnamed: 0,gene_ids,feature_types,genome,mt,ribo,hb


In [12]:
adata.var["hb"].sum()

8

In [13]:
adata.var[adata.var["hb"]==True]

Unnamed: 0,gene_ids,feature_types,genome,mt,ribo,hb
HBEGF,ENSG00000113070,Gene Expression,GRCh38,False,False,True
HBS1L,ENSG00000112339,Gene Expression,GRCh38,False,False,True
HBG2,ENSG00000196565,Gene Expression,GRCh38,False,False,True
HBE1,ENSG00000213931,Gene Expression,GRCh38,False,False,True
HBM,ENSG00000206177,Gene Expression,GRCh38,False,False,True
HBA2,ENSG00000188536,Gene Expression,GRCh38,False,False,True
HBA1,ENSG00000206172,Gene Expression,GRCh38,False,False,True
HBQ1,ENSG00000086506,Gene Expression,GRCh38,False,False,True


In [14]:
#Set ENSEMBL gene ids as row names
adata.var['SYMBOL'] = adata.var_names
adata.var.set_index('gene_ids', drop=True, inplace=True)

In [15]:
#adata.var_names_make_unique()

In [16]:
#Data summary
adata

AnnData object with n_obs × n_vars = 4682 × 17943
    obs: 'in_tissue', 'array_row', 'array_col'
    var: 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'SYMBOL'
    uns: 'spatial'
    obsm: 'spatial'

## Explore data

In [17]:
adata.obs

Unnamed: 0,in_tissue,array_row,array_col
AAACAACGAATAGTTC-1,1,0,16
AAACAAGTATCTCCCA-1,1,50,102
AAACAATCTACTAGCA-1,1,3,43
AAACACCAATAACTGC-1,1,59,19
AAACAGAGCGACTCCT-1,1,14,94
...,...,...,...
TTGTTTCACATCCAGG-1,1,58,42
TTGTTTCATTAGTCTA-1,1,60,30
TTGTTTCCATACAACT-1,1,45,27
TTGTTTGTATTACACG-1,1,73,41


In [18]:
adata.var

Unnamed: 0_level_0,feature_types,genome,mt,ribo,hb,SYMBOL
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000187634,Gene Expression,GRCh38,False,False,False,SAMD11
ENSG00000188976,Gene Expression,GRCh38,False,False,False,NOC2L
ENSG00000187961,Gene Expression,GRCh38,False,False,False,KLHL17
ENSG00000187583,Gene Expression,GRCh38,False,False,False,PLEKHN1
ENSG00000187642,Gene Expression,GRCh38,False,False,False,PERM1
...,...,...,...,...,...,...
ENSG00000258992,Gene Expression,GRCh38,False,False,False,TSPY1
ENSG00000114374,Gene Expression,GRCh38,False,False,False,USP9Y
ENSG00000067048,Gene Expression,GRCh38,False,False,False,DDX3Y
ENSG00000154620,Gene Expression,GRCh38,False,False,False,TMSB4Y


In [19]:
adata.obsm

AxisArrays with keys: spatial

In [20]:
adata.uns

OrderedDict([('spatial',
              {'L25P': {'images': {'hires': array([[[0., 0., 0.],
                         [0., 0., 0.],
                         [0., 0., 0.],
                         ...,
                         [0., 0., 0.],
                         [0., 0., 0.],
                         [0., 0., 0.]],
                 
                        [[0., 0., 0.],
                         [0., 0., 0.],
                         [0., 0., 0.],
                         ...,
                         [0., 0., 0.],
                         [0., 0., 0.],
                         [0., 0., 0.]],
                 
                        [[0., 0., 0.],
                         [0., 0., 0.],
                         [0., 0., 0.],
                         ...,
                         [0., 0., 0.],
                         [0., 0., 0.],
                         [0., 0., 0.]],
                 
                        ...,
                 
                        [[0., 0., 0.],
             

### Calculate QC metrics

In [21]:
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=True)

In [22]:
adata

AnnData object with n_obs × n_vars = 4682 × 17943
    obs: 'in_tissue', 'array_row', 'array_col', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb'
    var: 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'SYMBOL', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'spatial'
    obsm: 'spatial'

## Create directory & save QC metrics

In [23]:
directory = f"{sample}"
# Check if the directory exists
if not os.path.exists(directory):
    # If it doesn't exist, create it
    os.makedirs(directory)

In [24]:
stats_genes = adata.obs["n_genes_by_counts"].describe() #the number of genes with positive counts in a barcode
stats_genes.to_csv(f"{sample}/{sample}_genes_stats_before_filtering.csv")

In [25]:
stats_library_size = adata.obs["total_counts"].describe() #total number of counts per barcode, this might also be known as library size
stats_library_size.to_csv(f"{sample}/{sample}_library_size_stats_before_filtering.csv")

In [26]:
stats_mt = adata.obs["pct_counts_mt"].describe() #proportion of total counts for a barcode which are mitochondrial
stats_mt.to_csv(f"{sample}/{sample}_mt_stats_before_filtering.csv")

## We now plot the three QC covariates n_genes_by_counts, total_counts and pct_counts_mt per sample to assess how well the respective cells were captured.

In [27]:
with plt.rc_context():  # Use this to set figure params like size and dpi
    sns.displot(adata.obs["total_counts"], kde=False, bins=100)
    plt.savefig(f"{sample}/{sample}_stats_library_size_before_filtering.png",dpi=300, format="png")
    plt.close()

In [28]:
with plt.rc_context():  # Use this to set figure params like size and dpi
    sns.displot(adata.obs["n_genes_by_counts"], kde=False, bins=100)
    plt.savefig(f"{sample}/{sample}_stats_gene_counts_before_filtering.png",dpi=300, format="png")
    plt.close()

In [29]:
with plt.rc_context():  # Use this to set figure params like size and dpi
    sc.pl.violin(adata,["n_genes_by_counts", "total_counts", "pct_counts_mt"],jitter=0.4,multi_panel=True, show=False)
    plt.savefig(f"{sample}/{sample}_counts_violin_before_filtering.png",dpi=300, format="png")
    plt.close()

In [30]:
with plt.rc_context():  # Use this to set figure params like size and dpi
    sc.pl.scatter(adata, "total_counts", "n_genes_by_counts", color="pct_counts_mt", show=False)
    plt.savefig(f"{sample}/{sample}_counts_scatter_before_filtering.png",dpi=300, format="png")
    plt.close()

## QC with automatic thresholding and filtering based on MAD (median absolute deviations). The MAD is given by the respective QC metric of an observation and describes a robust statistic of the variability of the metric. 

In [31]:
#First, we define a function that takes a metric, i.e. a column in .obs and the number of MADs that is still permissive within the filtering strategy.
def is_outlier(adata, metric: str, nmads: int):
    M = adata.obs[metric]
    outlier = (M < np.median(M) - nmads * median_abs_deviation(M)) | (
        np.median(M) + nmads * median_abs_deviation(M) < M
    )
    return outlier

In [32]:
#We now apply this function to the log1p_total_counts, log1p_n_genes_by_counts and pct_counts_in_top_20_genes QC covariates each with a threshold of 5 MADs.
adata.obs["outlier"] = (
    is_outlier(adata, "log1p_total_counts", 5)
    | is_outlier(adata, "log1p_n_genes_by_counts", 5)
    | is_outlier(adata, "pct_counts_in_top_20_genes", 5)
)
adata.obs.outlier.value_counts()

False    4477
True      205
Name: outlier, dtype: int64

In [33]:
#pct_counts_Mt is filtered with 3 MADs. Additionally, cells with a percentage of mitochondrial counts exceeding 8 % are filtered out.
adata.obs["mt_outlier"] = is_outlier(adata, "pct_counts_mt", 3) | (
    adata.obs["pct_counts_mt"] > 8
)
adata.obs.mt_outlier.value_counts()

False    4682
Name: mt_outlier, dtype: int64

## SAVE STATS BEFORE FILTERING

In [34]:
#We save stats before filtering
with open(f"{sample}/{sample}_stats_before_filtering.txt", "a") as f:
    print(f"Total number of cells before filtering: {adata.n_obs}", file=f)
    print(f"Total number of genes with positive counts before filtering: {adata.n_vars}", file=f)

In [35]:
#We now filter our AnnData object based on these two additional columns.
print(f"Total number of cells: {adata.n_obs}")
adata = adata[(~adata.obs.outlier) & (~adata.obs.mt_outlier)].copy()
print(f"Number of cells after filtering of low quality cells: {adata.n_obs}")

Total number of cells: 4682
Number of cells after filtering of low quality cells: 4477


## SAVE STATS AFTER FILTERING

In [36]:
#We save stats after filtering
with open(f"{sample}/{sample}_stats_after_filtering.txt", "a") as f:
    print(f"Number of cells after filtering of low quality cells: {adata.n_obs}", file=f)
    print(f"Number of genes after filtering of low quality cells: {adata.n_vars}", file=f)

In [37]:
#QC plots
with plt.rc_context():  # Use this to set figure params like size and dpi
    sns.displot(adata.obs["total_counts"], kde=False, bins=100)
    plt.savefig(f"{sample}/{sample}_stats_library_size_after_filtering.png",dpi=300, format="png")
    plt.close()

In [38]:
with plt.rc_context():  # Use this to set figure params like size and dpi
    sns.displot(adata.obs["n_genes_by_counts"], kde=False, bins=100)
    plt.savefig(f"{sample}/{sample}_stats_genes_counts_after_filtering.png",dpi=300, format="png")
    plt.close()

In [39]:
with plt.rc_context():  # Use this to set figure params like size and dpi
    sc.pl.violin(adata,["n_genes_by_counts", "total_counts", "pct_counts_mt"],jitter=0.4,multi_panel=True, show=False)
    plt.savefig(f"{sample}/{sample}_counts_violin_after_filtering.png",dpi=300, format="png")
    plt.close()

In [40]:
with plt.rc_context():  # Use this to set figure params like size and dpi
    sc.pl.scatter(adata, "total_counts", "n_genes_by_counts", color="pct_counts_mt", show=False)
    plt.savefig(f"{sample}/{sample}_counts_scatter_after_filtering.png",dpi=300, format="png")
    plt.close()

In [41]:
#Save filtering stats
stats_genes = adata.obs["n_genes_by_counts"].describe() #the number of genes with positive counts in a barcode
stats_genes.to_csv(f"{sample}/{sample}_genes_stats_after_filtering.csv")

In [42]:
stats_library_size = adata.obs["total_counts"].describe() #total number of counts per barcode, this might also be known as library size
stats_library_size.to_csv(f"{sample}/{sample}_library_size_stats_after_filtering.csv")

In [43]:
stats_mt = adata.obs["pct_counts_mt"].describe() #proportion of total counts for a barcode which are mitochondrial
stats_mt.to_csv(f"{sample}/{sample}_mt_stats_after_filtering.csv")

In [44]:
#AnnData object summary
adata

AnnData object with n_obs × n_vars = 4477 × 17943
    obs: 'in_tissue', 'array_row', 'array_col', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier'
    var: 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'SYMBOL', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'spatial'
    obsm: 'spatial'

## NORMALIZATION

In [45]:
#Save raw counts as layer
adata.layers["raw_counts"] = adata.X.copy()

### Shifted logarithm

In [46]:
scales_counts = sc.pp.normalize_total(adata, target_sum=None, inplace=False)
# log1p transform
adata.layers["log1p_norm"] = sc.pp.log1p(scales_counts["X"], copy=True)

In [47]:
#Plot normalized counts
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
p1 = sns.histplot(adata.obs["total_counts"], bins=100, kde=False, ax=axes[0])
axes[0].set_title("Total counts")
p2 = sns.histplot(adata.layers["log1p_norm"].sum(1), bins=100, kde=False, ax=axes[1])
axes[1].set_title("Shifted logarithm")
fig.savefig(f"{sample}/{sample}_counts_normalization_shifted_log.png",dpi=300, format="png")
plt.close(fig)

## FEATURE SELECTION USING DEVIANCE

### Deviance works on raw counts so there is no need to replace adata.X with one of the normalized layers.

In [48]:
#TIP when transforming from anndata to R (https://github.com/theislab/anndata2ri/issues/50): 
#Ensure all of the dtypes on any pandas dataframe within your anndata object are NOT object type. 
#Cast any numpy.ndarray objects within your anndata object to scipy.sparse.csc_matrix. 

In [49]:
#Create new object
adata_deviance = adata.copy()

In [50]:
#Transform object to int for R import
adata_deviance.obsm["spatial"] = adata_deviance.obsm["spatial"].astype(str).astype(int)

In [51]:
adata_deviance.obsm["spatial"]

array([[ 724, 1931],
       [5809, 6877],
       [1040, 3494],
       ...,
       [5270, 2533],
       [8101, 3322],
       [1447, 3955]])

In [52]:
# Delete uns as this can contain arbitrary objects which are difficult to convert
del adata_deviance.uns

In [53]:
adata_deviance

AnnData object with n_obs × n_vars = 4477 × 17943
    obs: 'in_tissue', 'array_row', 'array_col', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier'
    var: 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'SYMBOL', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    obsm: 'spatial'
    layers: 'raw_counts', 'log1p_norm'

In [54]:
%R library(scry)

array(['scry', 'tools', 'stats', 'graphics', 'grDevices', 'utils',
       'datasets', 'methods', 'base'], dtype='<U9')

In [55]:
#Similar to before, we save the AnnData object in our R environment (you may need to run this cell twice, i.e: sometimes it raises an error)
ro.globalenv["adata_deviance"] = adata_deviance

In [56]:
#We can now directly call feature selection with deviance on the non-normalized counts matrix and export the bionomial deviance values as a vector.
%R sce = devianceFeatureSelection(adata_deviance, assay="X")

AnnData object with n_obs × n_vars = 4477 × 17943
    obs: 'in_tissue', 'array_row', 'array_col', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier'
    var: 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'SYMBOL', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'binomial_deviance'
    obsm: 'spatial'
    layers: 'raw_counts', 'log1p_norm'

In [57]:
binomial_deviance = ro.r("rowData(sce)$binomial_deviance").T

In [58]:
#As a next step, we now sort the vector an select the top 6,000 highly deviant genes and save them as an additional column in .var as ‘highly_deviant’. 
#We additionally save the computed binomial deviance in case we want to sub-select a different number of highly variable genes afterwards.

In [59]:
idx = binomial_deviance.argsort()[-6000:]
mask = np.zeros(adata.var_names.shape, dtype=bool)
mask[idx] = True

adata.var["highly_deviant"] = mask
adata.var["binomial_deviance"] = binomial_deviance

In [60]:
adata

AnnData object with n_obs × n_vars = 4477 × 17943
    obs: 'in_tissue', 'array_row', 'array_col', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier'
    var: 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'SYMBOL', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_deviant', 'binomial_deviance'
    uns: 'spatial'
    obsm: 'spatial'
    layers: 'raw_counts', 'log1p_norm'

In [61]:
#Last, we visualise the feature selection results. We use a scanpy function to compute the mean and dispersion for each gene accross all cells.
sc.pp.highly_variable_genes(adata, layer="log1p_norm")

In [62]:
adata

AnnData object with n_obs × n_vars = 4477 × 17943
    obs: 'in_tissue', 'array_row', 'array_col', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier'
    var: 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'SYMBOL', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_deviant', 'binomial_deviance', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'spatial', 'hvg'
    obsm: 'spatial'
    layers: 'raw_counts', 'log1p_norm'

In [63]:
#We inspect our results by plotting dispersion versus mean for the genes and color by ‘highly_deviant’.
with plt.rc_context():  # Use this to set figure params like size and dpi
    sns.scatterplot(data=adata.var, x="means", y="dispersions", hue="highly_deviant", s=5)
    #ax.set_xlim(None, 1.5)
    #ax.set_ylim(None, 3)
    plt.savefig(f"{sample}/{sample}_highly_deviant_genes.png",dpi=300, format="png")
    plt.close()

In [64]:
#We observe that genes with a high mean expression are selected as highly deviant. 

## DIMENSIONALITY REDUCTION

In [65]:
#We will use a normalized representation of the dataset for dimensionality reduction and visualization, specifically the shifted logarithm.
adata.X = adata.layers["log1p_norm"]

### PCA

In [66]:
# setting highly variable as highly deviant to use scanpy 'use_highly_variable' argument in sc.pp.pca
adata.var["highly_variable"] = adata.var["highly_deviant"]
sc.pp.pca(adata, svd_solver="arpack", use_highly_variable=True)

### t-SNE

In [67]:
sc.tl.tsne(adata, use_rep="X_pca")

### UMAP

In [68]:
#We first calculate PCA and subsequently a neighborhood graph on our data.
sc.pp.neighbors(adata)
sc.tl.umap(adata)

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


### Inspecting QC metrics

In [69]:
#We can now also inspect the quality control metrics we calculated previously in our PCA, TSNE or UMAP plot and potentially identify low-quality cells.
with plt.rc_context():  # Use this to set figure params like size and dpi
    plt.rcParams["figure.figsize"] = (4, 4)
    sc.pl.pca_scatter(adata,color=["total_counts", "n_genes_by_counts","pct_counts_mt"], show=False)
    plt.savefig(f"{sample}/{sample}_PCA_dimensionality_reduction_QC.png",dpi=300, format="png")
    plt.close()

In [70]:
with plt.rc_context():  
    plt.rcParams["figure.figsize"] = (4, 4)
    sc.pl.tsne(adata,color=["total_counts", "n_genes_by_counts","pct_counts_mt"], show=False)
    plt.savefig(f"{sample}/{sample}_tSNE_dimensionality_reduction_QC.png",dpi=300, format="png")
    plt.close()

In [71]:
with plt.rc_context(): 
    plt.rcParams["figure.figsize"] = (4, 4)
    sc.pl.umap(adata,color=["total_counts", "n_genes_by_counts","pct_counts_mt"], show=False)
    plt.savefig(f"{sample}/{sample}_UMAP_dimensionality_reduction_QC.png",dpi=300, format="png")
    plt.close()

## CLUSTERING

In [72]:
#We call Leiden algorithm
sc.tl.leiden(adata)

In [73]:
#The default resolution parameter in scanpy is 1.0. 
#However, in many cases the analyst may want to try different resolution parameters to control the coarseness of the clustering. 
sc.tl.leiden(adata, key_added="leiden_res0_25", resolution=0.25)
sc.tl.leiden(adata, key_added="leiden_res0_5", resolution=0.5)
#sc.tl.leiden(adata, key_added="leiden_res1", resolution=1.0)

In [74]:
#Visualize clustering
with plt.rc_context():  # Use this to set figure params like size and dpi
    plt.rcParams["figure.figsize"] = (4, 4)
    sc.pl.umap(adata,color=["leiden_res0_25", "leiden_res0_5", "leiden"],legend_loc="on data", show=False)
    plt.savefig(f"{sample}/{sample}_leiden_community_res.png",dpi=300, format="png")
    plt.close()

  cax = scatter(
  cax = scatter(
  cax = scatter(


## PLOTTING QC ON TOP OF VISIUM ST IMAGES

In [75]:
#Transform to int
adata.obsm["spatial"] = adata.obsm["spatial"].astype(int)

In [76]:
with plt.rc_context():  
    plt.rcParams["figure.figsize"] = (8, 8)
    sc.pl.spatial(adata, img_key="hires", color=["total_counts", "n_genes_by_counts", "leiden"], show=False)
    plt.savefig(f"{sample}/{sample}_QC_histology.png",dpi=300, format="png")
    plt.close()

## CLUSTER MARKER GENES

In [77]:
with plt.rc_context():  
    plt.rcParams["figure.figsize"] = (4, 6)
    sc.tl.rank_genes_groups(adata, "leiden", method="t-test")
    sc.pl.rank_genes_groups_heatmap(adata, n_genes=5, groupby="leiden", show_gene_labels=True, show=False, gene_symbols="SYMBOL") #groups="3" for specific cluster
    plt.savefig(f"{sample}/{sample}_leiden_marker_genes.png",dpi=300, format="png")
    plt.close()

## SAVE PREPROCESSED ANNDATA OBJECT

In [78]:
#Save anndata with clustering results
adata.write_h5ad(f'{sample}/{sample}_preprocessed.h5ad', compression='gzip')