# Spatial transcriptomics unveils the in situ cellular and molecular hallmarks of the lung in fatal COVID-19

# Concatenate preprocessed Visium ST samples into a single AnnData object and re-normalise the data.

**Author:** Carlos A. Garcia-Prieto

* This notebook explains the concatenation of the individually preprocessed Visium ST samples into a single object. Because of the concatenation we also need to re-normalise the data.
* See the [concatenation tutorial](https://anndata.readthedocs.io/en/stable/concatenation.html) for more details.
* We have followed single-cell best practices [tutorial](https://www.sc-best-practices.org/preprocessing_visualization/quality_control.html) for preprocessing the concatenated object.

## Import modules

In [1]:
import anndata
import matplotlib.pyplot as plt
import scanpy as sc
import pandas as pd
import seaborn as sns
import scanpy.external as sce
import numpy as np
import os
import anndata2ri
import logging
import seaborn as sns
import rpy2.rinterface_lib.callbacks as rcb
import rpy2.robjects as ro

# Silence scanpy's own log messages.
sc.settings.verbosity = 0
#sc.settings.set_figure_params(
#    dpi=80,
#    facecolor="white",
#    frameon=False,
#)

# Only let rpy2's R callback logger report messages at ERROR level or above.
rcb.logger.setLevel(logging.ERROR)
# Activate the automatic Python <-> R converters for pandas and AnnData objects;
# they are needed when `adata_deviance` is handed to R later in the notebook.
ro.pandas2ri.activate()
anndata2ri.activate()

# (Re)load the rpy2 IPython extension that provides the %R magic used below.
%reload_ext rpy2.ipython

In [2]:
# Allow up to 500 columns in DataFrame displays so the wide .obs tables are shown in full.
pd.set_option('display.max_columns', 500)

## Read Visium ST preprocessed data

In [3]:
# Read the individually preprocessed Visium sample objects, one .h5ad per sample.
# `anndata.read_h5ad` is used explicitly: the bare `anndata.read` alias is
# deprecated. The paths are plain strings (no f-string needed, nothing is interpolated).
# The variable names are kept as-is because the concatenation cell refers to them.
adata_L2P = anndata.read_h5ad('L2P/L2P_preprocessed.h5ad')
adata_L5P = anndata.read_h5ad('L5P/L5P_preprocessed.h5ad')
adata_L14P = anndata.read_h5ad('L14P/L14P_preprocessed.h5ad')
adata_L19P = anndata.read_h5ad('L19P/L19P_preprocessed.h5ad')
adata_L24P = anndata.read_h5ad('L24P/L24P_preprocessed.h5ad')
adata_L3C = anndata.read_h5ad('L3C/L3C_preprocessed.h5ad')
adata_L14C = anndata.read_h5ad('L14C/L14C_preprocessed.h5ad')
adata_L2C = anndata.read_h5ad('L2C/L2C_preprocessed.h5ad')
adata_L11P = anndata.read_h5ad('L11P/L11P_preprocessed.h5ad')
adata_L12P = anndata.read_h5ad('L12P/L12P_preprocessed.h5ad')
adata_CONTROL2 = anndata.read_h5ad('CONTROL2/CONTROL2_preprocessed.h5ad')
adata_HRC2 = anndata.read_h5ad('HRC2/HRC2_preprocessed.h5ad')
adata_HRC4 = anndata.read_h5ad('HRC4/HRC4_preprocessed.h5ad')
adata_HRC5 = anndata.read_h5ad('HRC5/HRC5_preprocessed.h5ad')
adata_HRC6 = anndata.read_h5ad('HRC6/HRC6_preprocessed.h5ad')
adata_HRC8 = anndata.read_h5ad('HRC8/HRC8_preprocessed.h5ad')
adata_HRC10 = anndata.read_h5ad('HRC10/HRC10_preprocessed.h5ad')
adata_HRC11 = anndata.read_h5ad('HRC11/HRC11_preprocessed.h5ad')
adata_HRC12 = anndata.read_h5ad('HRC12/HRC12_preprocessed.h5ad')
adata_HRC13 = anndata.read_h5ad('HRC13/HRC13_preprocessed.h5ad')
adata_HRC16 = anndata.read_h5ad('HRC16/HRC16_preprocessed.h5ad')
adata_HRC17 = anndata.read_h5ad('HRC17/HRC17_preprocessed.h5ad')
adata_HRC18 = anndata.read_h5ad('HRC18/HRC18_preprocessed.h5ad')

In [4]:
# Sample groupings used to annotate every spot after concatenation.

# Disease stage of the COVID-19 samples, plus the control samples.
acute = ["L5P", "L14P", "L24P", "L12P", "HRC2", "HRC4", "HRC18"]
proliferative = [
    "L2P", "L19P", "L11P",
    "HRC5", "HRC6", "HRC8", "HRC10", "HRC11", "HRC12", "HRC13", "HRC16", "HRC17",
]
controls = ["L3C", "L14C", "L2C", "CONTROL2"]
# Every COVID-19 sample is either acute or proliferative.
covid = acute + proliferative

# Country of origin.
sweden = ["L2P", "L19P", "L11P", "L5P", "L14P", "L24P", "L12P", "L3C", "L14C", "L2C"]
spain = [
    "CONTROL2",
    "HRC2", "HRC4", "HRC5", "HRC6", "HRC8", "HRC10",
    "HRC11", "HRC12", "HRC13", "HRC16", "HRC17", "HRC18",
]

# Batches: the Swedish samples are split over two batches.
batch1 = ["L2P", "L5P", "L14P", "L19P", "L24P", "L3C", "L14C"]
batch2 = ["L2C", "L11P", "L12P"]
# The third batch is made up of exactly the Spanish samples (kept as its own list object).
batch3 = list(spain)

## Concatenate preprocessed objects

In [5]:
# Map every sample ID to its preprocessed object. The keys end up in the new
# .obs["sample"] column and are appended (after "_") to each spot barcode, and the
# insertion order of this mapping is the order in which the samples are stacked.
preprocessed_by_sample = {
    "L2P": adata_L2P,
    "L5P": adata_L5P,
    "L14P": adata_L14P,
    "L19P": adata_L19P,
    "L24P": adata_L24P,
    "L3C": adata_L3C,
    "L14C": adata_L14C,
    "L2C": adata_L2C,
    "L11P": adata_L11P,
    "L12P": adata_L12P,
    "CONTROL2": adata_CONTROL2,
    "HRC2": adata_HRC2,
    "HRC4": adata_HRC4,
    "HRC5": adata_HRC5,
    "HRC6": adata_HRC6,
    "HRC8": adata_HRC8,
    "HRC10": adata_HRC10,
    "HRC11": adata_HRC11,
    "HRC12": adata_HRC12,
    "HRC13": adata_HRC13,
    "HRC16": adata_HRC16,
    "HRC17": adata_HRC17,
    "HRC18": adata_HRC18,
}

# Stack the spots of all samples (axis 0) with an outer join on the genes; per-gene
# annotations and .uns entries are merged with the "unique" strategy.
adata_concat_hrc_sq = anndata.concat(
    preprocessed_by_sample,
    axis=0,
    join="outer",
    merge="unique",
    uns_merge="unique",
    label="sample",
    index_unique="_",
    pairwise=False,
)

In [6]:
# Explore the concatenated object: display its dimensions and the keys stored in
# obs / var / uns / obsm / layers.
adata_concat_hrc_sq 

AnnData object with n_obs × n_vars = 91068 × 17943
    obs: 'in_tissue', 'array_row', 'array_col', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'leiden', 'leiden_res0_25', 'leiden_res0_5', 'sample'
    var: 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'SYMBOL'
    uns: 'dendrogram_leiden', 'hvg', 'leiden', 'neighbors', 'pca', 'rank_genes_groups', 'spatial', 'tsne', 'umap'
    obsm: 'X_pca', 'X_tsne', 'X_umap', 'spatial'
    layers: 'log1p_norm', 'raw_counts'

In [7]:
# Per-spot annotations; barcodes now carry the sample ID as a suffix and the new
# 'sample' column records where each spot came from.
adata_concat_hrc_sq.obs

Unnamed: 0,in_tissue,array_row,array_col,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_20_genes,total_counts_mt,log1p_total_counts_mt,pct_counts_mt,total_counts_ribo,log1p_total_counts_ribo,pct_counts_ribo,total_counts_hb,log1p_total_counts_hb,pct_counts_hb,outlier,mt_outlier,leiden,leiden_res0_25,leiden_res0_5,sample
AAACAACGAATAGTTC-1_L2P,1,0,16,2182,7.688455,4029.0,8.301521,15.984115,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2.197225,0.198560,False,False,3,2,3,L2P
AAACAAGTATCTCCCA-1_L2P,1,50,102,1369,7.222566,2590.0,7.859799,22.818533,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1.945910,0.231660,False,False,1,0,1,L2P
AAACAATCTACTAGCA-1_L2P,1,3,43,1987,7.594884,3673.0,8.209036,19.656956,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.791759,0.136129,False,False,2,0,1,L2P
AAACACCAATAACTGC-1_L2P,1,59,19,1889,7.544332,3206.0,8.073092,14.441672,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2.197225,0.249532,False,False,7,0,0,L2P
AAACAGAGCGACTCCT-1_L2P,1,14,94,1501,7.314553,2612.0,7.868254,22.511485,0.0,0.0,0.0,0.0,0.0,0.0,16.0,2.833213,0.612557,False,False,0,0,0,L2P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTTCACATCCAGG-1_HRC18,1,58,42,432,6.070738,520.0,6.255750,14.615385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,False,False,3,0,0,HRC18
TTGTTTCATTAGTCTA-1_HRC18,1,60,30,379,5.940171,444.0,6.098074,15.540541,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.693147,0.225225,False,False,3,0,0,HRC18
TTGTTTCCATACAACT-1_HRC18,1,45,27,467,6.148468,565.0,6.338594,15.752212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,False,False,1,0,0,HRC18
TTGTTTGTATTACACG-1_HRC18,1,73,41,621,6.432940,842.0,6.736967,17.102138,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.386294,0.356295,False,False,4,0,2,HRC18


In [8]:
# Per-gene annotations of the concatenated object.
adata_concat_hrc_sq.var

Unnamed: 0_level_0,feature_types,genome,mt,ribo,hb,SYMBOL
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ENSG00000187634,Gene Expression,GRCh38,False,False,False,SAMD11
ENSG00000188976,Gene Expression,GRCh38,False,False,False,NOC2L
ENSG00000187961,Gene Expression,GRCh38,False,False,False,KLHL17
ENSG00000187583,Gene Expression,GRCh38,False,False,False,PLEKHN1
ENSG00000187642,Gene Expression,GRCh38,False,False,False,PERM1
...,...,...,...,...,...,...
ENSG00000258992,Gene Expression,GRCh38,False,False,False,TSPY1
ENSG00000114374,Gene Expression,GRCh38,False,False,False,USP9Y
ENSG00000067048,Gene Expression,GRCh38,False,False,False,DDX3Y
ENSG00000154620,Gene Expression,GRCh38,False,False,False,TMSB4Y


In [9]:
#adata_concat_hrc_sq.var = adata_concat_hrc_sq.var[["SYMBOL","feature_types","genome"]]

In [10]:
#Add metadata columns
# Membership of each spot's sample in the predefined sample groups, computed once.
spot_sample = adata_concat_hrc_sq.obs['sample']
in_covid = spot_sample.isin(covid)
in_acute = spot_sample.isin(acute)
in_proliferative = spot_sample.isin(proliferative)
in_spain = spot_sample.isin(spain)
in_batch1 = spot_sample.isin(batch1)
in_batch2 = spot_sample.isin(batch2)
in_batch3 = spot_sample.isin(batch3)

# Anything that is neither batch1 nor batch2 is labelled batch3.
batch_label = np.where(in_batch1, 'batch1', np.where(in_batch2, 'batch2', 'batch3'))
# Acute takes precedence, then proliferative; everything else is a control.
subtype_label = np.where(
    in_acute,
    'Acute',
    np.where(in_proliferative, 'Proliferative', 'Control'),
)

# The order of these assignments fixes the order of the new .obs columns.
adata_concat_hrc_sq.obs['condition'] = np.where(in_covid, 'COVID-19', 'Control')
adata_concat_hrc_sq.obs['condition_acute'] = in_acute
adata_concat_hrc_sq.obs['condition_proliferative'] = in_proliferative
adata_concat_hrc_sq.obs['origin'] = np.where(in_spain, 'Spain', 'Sweden')
adata_concat_hrc_sq.obs['batch1'] = in_batch1
adata_concat_hrc_sq.obs['batch2'] = in_batch2
adata_concat_hrc_sq.obs['batch3'] = in_batch3
adata_concat_hrc_sq.obs['batch'] = batch_label
adata_concat_hrc_sq.obs['condition_subtype'] = subtype_label


In [11]:
# Sanity check: number of spots per (condition subtype, sample) pair, which also
# shows that every sample received exactly one subtype label.
adata_concat_hrc_sq.obs[['condition_subtype',"sample"]].value_counts()

condition_subtype  sample  
Proliferative      HRC10       4668
                   L11P        4621
Acute              L12P        4555
Proliferative      L2P         4524
Acute              HRC18       4462
                   HRC2        4459
Proliferative      HRC12       4364
                   HRC17       4341
Acute              HRC4        4311
Control            L3C         4265
Proliferative      HRC11       4072
                   HRC8        4042
                   HRC16       3978
                   HRC5        3946
Control            L2C         3894
Proliferative      HRC6        3843
                   HRC13       3760
Acute              L14P        3594
Proliferative      L19P        3470
Acute              L5P         3446
Control            L14C        3299
Acute              L24P        3124
Control            CONTROL2    2030
dtype: int64

### We make a copy of the AnnData object to start working

In [12]:
# Work on an independent copy so the freshly concatenated object stays untouched.
adata = adata_concat_hrc_sq.copy()

### Because of the concatenation we need to re-normalise the data. Here we just normalise using global scaling by the total counts per cell and shifted logarithm.

In [13]:
# List the layers carried over from the per-sample preprocessing.
adata.layers

Layers with keys: log1p_norm, raw_counts

In [14]:
# Put the raw counts back into X (as a copy, so X and the layer do not share data);
# the normalisation below and the deviance step both start from X.
adata.X = adata.layers["raw_counts"].copy()

In [15]:
# Normalise by total counts per spot across the whole concatenated object.
# inplace=False returns the result instead of modifying adata, so adata.X keeps
# the raw counts; the scaled matrix is read from the returned "X" entry.
scales_counts = sc.pp.normalize_total(adata, target_sum=None, inplace=False)
# log1p transform
# The shifted-log values overwrite the per-sample 'log1p_norm' layer inherited from concatenation.
adata.layers["log1p_norm"] = sc.pp.log1p(scales_counts["X"], copy=True)

In [16]:
# Display the object after adding the metadata columns and re-normalising.
adata

AnnData object with n_obs × n_vars = 91068 × 17943
    obs: 'in_tissue', 'array_row', 'array_col', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'leiden', 'leiden_res0_25', 'leiden_res0_5', 'sample', 'condition', 'condition_acute', 'condition_proliferative', 'origin', 'batch1', 'batch2', 'batch3', 'batch', 'condition_subtype'
    var: 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'SYMBOL'
    uns: 'dendrogram_leiden', 'hvg', 'leiden', 'neighbors', 'pca', 'rank_genes_groups', 'spatial', 'tsne', 'umap'
    obsm: 'X_pca', 'X_tsne', 'X_umap', 'spatial'
    layers: 'log1p_norm', 'raw_counts'

# Preprocess concatenated data
#### It is always recommended to look at the raw data before performing any integration. This can give some indication of how big any batch effects are and what might be causing them (and therefore which variables to consider as the batch label).

## FEATURE SELECTION USING DEVIANCE

### Deviance works on raw counts so there is no need to replace adata.X with one of the normalized layers.

In [17]:
#TIP when transforming from anndata to R (https://github.com/theislab/anndata2ri/issues/50): 
#Ensure all of the dtypes on any pandas dataframe within your anndata object are NOT object type. 
#Cast any numpy.ndarray objects within your anndata object to scipy.sparse.csc_matrix. 

In [18]:
# Create a separate copy that can be stripped down for the conversion to R
# without touching `adata`.
adata_deviance = adata.copy()

In [19]:
# Inspect the spatial coordinates before casting them for the R import.
adata_deviance.obsm["spatial"]

array([[ 692, 1779],
       [5769, 6704],
       [1008, 3338],
       ...,
       [5429, 2431],
       [8255, 3224],
       [1609, 3843]])

In [20]:
# Cast the spatial coordinates to a plain integer array for the R import
# (values are converted to strings first and then parsed as int).
# NOTE(review): a direct .astype(int) looks equivalent for integer-valued input — confirm
# whether the string round-trip is still needed.
adata_deviance.obsm["spatial"] = adata_deviance.obsm["spatial"].astype(str).astype(int)

In [21]:
# Confirm the coordinates are unchanged after the cast.
adata_deviance.obsm["spatial"]

array([[ 692, 1779],
       [5769, 6704],
       [1008, 3338],
       ...,
       [5429, 2431],
       [8255, 3224],
       [1609, 3843]])

In [22]:
# Delete uns as this can contain arbitrary objects which are difficult to convert.
# Only the copy used for the R conversion is affected; `adata` keeps its uns.
del adata_deviance.uns

In [23]:
# Display the stripped-down object that will be sent to R (no uns any more).
adata_deviance

AnnData object with n_obs × n_vars = 91068 × 17943
    obs: 'in_tissue', 'array_row', 'array_col', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'leiden', 'leiden_res0_25', 'leiden_res0_5', 'sample', 'condition', 'condition_acute', 'condition_proliferative', 'origin', 'batch1', 'batch2', 'batch3', 'batch', 'condition_subtype'
    var: 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'SYMBOL'
    obsm: 'X_pca', 'X_tsne', 'X_umap', 'spatial'
    layers: 'log1p_norm', 'raw_counts'

In [24]:
# Load the R package scry, which provides devianceFeatureSelection.
%R library(scry)

array(['scry', 'tools', 'stats', 'graphics', 'grDevices', 'utils',
       'datasets', 'methods', 'base'], dtype='<U9')

In [25]:
# Save the AnnData object in the R global environment under the same name.
# Known issue: this assignment sometimes raises an error on the first attempt and
# succeeds when the cell is simply run again.
ro.globalenv["adata_deviance"] = adata_deviance

In [26]:
# We can now directly call feature selection with deviance on the non-normalised counts
# matrix (X holds the raw counts). The binomial deviance values are exported as a
# vector in the next cell.
%R sce = devianceFeatureSelection(adata_deviance, assay="X")

AnnData object with n_obs × n_vars = 91068 × 17943
    obs: 'in_tissue', 'array_row', 'array_col', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'leiden', 'leiden_res0_25', 'leiden_res0_5', 'sample', 'condition', 'condition_acute', 'condition_proliferative', 'origin', 'batch1', 'batch2', 'batch3', 'batch', 'condition_subtype'
    var: 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'SYMBOL', 'binomial_deviance'
    obsm: 'X_pca', 'X_tsne', 'X_umap', 'spatial'
    layers: 'log1p_norm', 'raw_counts'

In [27]:
# Pull the per-gene binomial deviance out of the R object's rowData back into Python.
# NOTE(review): presumably a 1-D array with one value per gene in the same order as
# adata.var_names — the indexing in the next cell relies on that.
binomial_deviance = ro.r("rowData(sce)$binomial_deviance").T

In [28]:
#As a next step, we sort the vector and select the top 6,000 highly deviant genes, saving them as an additional column in .var called ‘highly_deviant’. 
#We additionally save the computed binomial deviance in case we want to sub-select a different number of highly deviant genes afterwards.

In [29]:
# Number of genes to keep, ranked by binomial deviance.
n_top_deviant = 6000

# Positions of the genes with the largest deviance (argsort is ascending, so take the tail).
idx = np.argsort(binomial_deviance)[-n_top_deviant:]

# Boolean mask over all genes, True only at the selected positions.
mask = np.zeros(adata.var_names.shape, dtype=bool)
mask[idx] = True

# Store the flag and the raw deviance values on the full object (in this column order).
adata.var["highly_deviant"] = mask
adata.var["binomial_deviance"] = binomial_deviance

In [30]:
# Last, we visualise the feature selection results. We use a scanpy function to compute
# the mean and dispersion for each gene across all spots, based on the 'log1p_norm' layer.
# Side effect: this also adds a dispersion-based 'highly_variable' flag to adata.var.
sc.pp.highly_variable_genes(adata, layer="log1p_norm")

In [31]:
# Display the object; .var now holds the deviance and dispersion-based columns.
adata

AnnData object with n_obs × n_vars = 91068 × 17943
    obs: 'in_tissue', 'array_row', 'array_col', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'leiden', 'leiden_res0_25', 'leiden_res0_5', 'sample', 'condition', 'condition_acute', 'condition_proliferative', 'origin', 'batch1', 'batch2', 'batch3', 'batch', 'condition_subtype'
    var: 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'SYMBOL', 'highly_deviant', 'binomial_deviance', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'dendrogram_leiden', 'hvg', 'leiden', 'neighbors', 'pca', 'rank_genes_groups', 'spatial', 'tsne', 'umap'
    obsm: 'X_pca', 'X_tsne', 'X_umap', 'spatial'
    layers: 'log1p_norm', 'raw_counts'

## Create directory to save results with the concatenated object

In [32]:
folder = "Concatenation_6000_hdg" # Name of the folder where outputs for the concatenated AnnData are saved; also used as the prefix of the figure file names

In [33]:
directory = f"{folder}"
# Create the output directory (and any missing parents). exist_ok=True makes this a
# no-op when the directory is already there, which replaces the separate
# exists-check and avoids failing if the directory appears between check and creation.
os.makedirs(directory, exist_ok=True)

In [34]:
# Inspect the selection: dispersion versus mean for every gene, coloured by the
# 'highly_deviant' flag. The figure is written to disk and closed, not shown inline.
highly_deviant_png = f"{folder}/{folder}_highly_deviant_genes.png"
with plt.rc_context():  # isolates any rc changes made while plotting
    sns.scatterplot(
        x="means",
        y="dispersions",
        hue="highly_deviant",
        s=5,
        data=adata.var,
    )
    plt.savefig(highly_deviant_png, format="png", dpi=300)
    plt.close()

In [35]:
#We observe that genes with a high mean expression are selected as highly deviant. 

## We will create an object with just the highly deviant (most informative) genes to use for integration.

In [36]:
# Keep only the genes flagged as highly deviant; .copy() turns the view into an
# independent object that can be modified freely.
adata_hdg = adata[:, adata.var["highly_deviant"]].copy()
adata_hdg

AnnData object with n_obs × n_vars = 91068 × 6000
    obs: 'in_tissue', 'array_row', 'array_col', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'leiden', 'leiden_res0_25', 'leiden_res0_5', 'sample', 'condition', 'condition_acute', 'condition_proliferative', 'origin', 'batch1', 'batch2', 'batch3', 'batch', 'condition_subtype'
    var: 'feature_types', 'genome', 'mt', 'ribo', 'hb', 'SYMBOL', 'highly_deviant', 'binomial_deviance', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'dendrogram_leiden', 'hvg', 'leiden', 'neighbors', 'pca', 'rank_genes_groups', 'spatial', 'tsne', 'umap'
    obsm: 'X_pca', 'X_tsne', 'X_umap', 'spatial'
    layers: 'log1p_norm', 'raw_counts'

### DIMENSIONALITY REDUCTION

In [37]:
#We will use a normalized representation of the dataset for dimensionality reduction and visualization, specifically the shifted logarithm.
# Assign a copy so that X and the 'log1p_norm' layer do not share the same matrix
# (any later in-place change to X would otherwise silently alter the layer too);
# this mirrors how X was set from 'raw_counts' earlier in the notebook.
adata_hdg.X = adata_hdg.layers["log1p_norm"].copy()

### PCA

In [38]:
# setting highly variable as highly deviant
#adata_hdg.var["highly_variable"] = adata_hdg.var["highly_deviant"]

In [39]:
# Cross-tabulate the dispersion-based 'highly_variable' flag against the deviance-based
# 'highly_deviant' flag for the genes kept in adata_hdg.
adata_hdg.var[["highly_variable", "highly_deviant"]].value_counts()

highly_variable  highly_deviant
False            True              5331
True             True               669
dtype: int64

In [40]:
# adata_hdg.var still carries the dispersion-based 'highly_variable' flag computed on
# the full object, and only 669 of the 6,000 highly deviant genes are flagged.
# By default sc.pp.pca restricts itself to the flagged genes whenever that column
# exists, so without an explicit override the PCA (and the t-SNE, neighbours, UMAP and
# Leiden steps built on it) would silently use 669 genes instead of the 6,000 selected
# by deviance. Turn the restriction off so every gene in adata_hdg contributes.
# NOTE(review): newer scanpy releases deprecate `use_highly_variable` in favour of
# `mask_var` (mask_var=None should be the equivalent) — confirm against the installed version.
sc.pp.pca(adata_hdg, svd_solver="arpack", use_highly_variable=False)

### t-SNE

In [41]:
# Compute the t-SNE embedding from the PCA representation stored in obsm['X_pca'].
sc.tl.tsne(adata_hdg, use_rep="X_pca")

### UMAP

In [42]:
# PCA was computed above; here we build the neighbourhood graph and then the UMAP
# embedding, which requires that graph.
sc.pp.neighbors(adata_hdg)
sc.tl.umap(adata_hdg)

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


### Inspecting potential batch effect to identify batch variables to consider for integration

In [43]:
# Inspect potential batch effects in PCA space: one panel each for batch, condition
# subtype and sample. The figure is saved to the output folder and closed.
pca_colour_keys = ["batch", "condition_subtype", "sample"]
pca_png = f"{folder}/{folder}_PCA_dimensionality_reduction_hdg_batch.png"
with plt.rc_context({"figure.figsize": (4, 4)}):  # panel size applies only inside this block
    sc.pl.pca_scatter(adata_hdg, color=pca_colour_keys, wspace=1, show=False)
    plt.savefig(pca_png, format="png", dpi=300)
    plt.close()

  cax = scatter(
  cax = scatter(
  cax = scatter(


In [44]:
# Same batch-effect inspection on the t-SNE embedding; saved to disk and closed.
tsne_colour_keys = ["batch", "condition_subtype", "sample"]
tsne_png = f"{folder}/{folder}_tSNE_dimensionality_reduction_hdg_batch.png"
with plt.rc_context({"figure.figsize": (4, 4)}):  # panel size applies only inside this block
    sc.pl.tsne(adata_hdg, color=tsne_colour_keys, wspace=1, show=False)
    plt.savefig(tsne_png, format="png", dpi=300)
    plt.close()

  cax = scatter(
  cax = scatter(
  cax = scatter(


In [45]:
# Same batch-effect inspection on the UMAP embedding; saved to disk and closed.
umap_colour_keys = ["batch", "condition_subtype", "sample"]
umap_png = f"{folder}/{folder}_UMAP_dimensionality_reduction_hdg_batch.png"
with plt.rc_context({"figure.figsize": (4, 4)}):  # panel size applies only inside this block
    sc.pl.umap(adata_hdg, color=umap_colour_keys, wspace=1, show=False)
    plt.savefig(umap_png, format="png", dpi=300)
    plt.close()

  cax = scatter(
  cax = scatter(
  cax = scatter(


### CLUSTERING

In [46]:
# Run Leiden clustering on the neighbourhood graph at resolution 1.0; the labels are
# stored under a dedicated key so the per-sample 'leiden' columns are not overwritten.
sc.tl.leiden(adata_hdg, key_added="leiden_concat_hdg_res1", resolution=1.0)

In [47]:
#The default resolution parameter in scanpy is 1.0. 
#However, in many cases the analyst may want to try different resolution parameters to control the coarseness of the clustering. 
# Two coarser clusterings (resolution 0.25 and 0.5), each stored under its own key.
sc.tl.leiden(adata_hdg, key_added="leiden_concat_hdg_res0_25", resolution=0.25)
sc.tl.leiden(adata_hdg, key_added="leiden_concat_hdg_res0_5", resolution=0.5)
#sc.tl.leiden(adata, key_added="leiden_res1", resolution=1.0)

In [48]:
# Visualise the three Leiden clusterings on the UMAP, with cluster labels drawn on
# the data; the figure is saved to the output folder and closed.
leiden_keys = ["leiden_concat_hdg_res0_25", "leiden_concat_hdg_res0_5", "leiden_concat_hdg_res1"]
leiden_png = f"{folder}/{folder}_leiden_community_res_hdg.png"
with plt.rc_context({"figure.figsize": (4, 4)}):  # panel size applies only inside this block
    sc.pl.umap(adata_hdg, color=leiden_keys, legend_loc="on data", show=False)
    plt.savefig(leiden_png, format="png", dpi=300)
    plt.close()

  cax = scatter(
  cax = scatter(
  cax = scatter(


### Save concatenated anndata object with highly deviant genes

In [49]:
# Write the highly-deviant-gene object (X holds the log1p-normalised values at this
# point; raw counts remain in the 'raw_counts' layer) as a gzip-compressed .h5ad.
adata_hdg.write_h5ad(f'{folder}/Concatenation_adata_hdg_6000.h5ad', compression='gzip')