# Spatial transcriptomics unveils the in situ cellular and molecular hallmarks of the lung in fatal COVID-19

# Spatial dependencies as a function of niche composition across COVID-19 induced DAD progression

**Author:** Carlos A. Garcia-Prieto

* This notebook explains the preparation of the spatially mapped and deconvoluted Visium ST data with cell2location for the analysis of spatial intercellular dependencies as a function of spot composition using [NCEM](https://ncem.readthedocs.io/en/latest).
* We followed the NCEM benchmarks [tutorial](https://github.com/theislab/ncem_benchmarks/blob/main/notebooks/data_preparation/deconvolution/cell2location_human_lymphnode.ipynb) on preparing NCEM input data from deconvoluted Visium ST data.

## Import modules

In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import sys
import scanpy as sc
import anndata as ad
import pandas as pd
pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import cell2location
import scvi
from matplotlib import rcParams
rcParams['pdf.fonttype'] = 42 # enables correct plotting of text
import seaborn as sns
import decoupler as dc

Global seed set to 0


In [2]:
# Root folder holding the cell2location results
results_folder = "/mnt/beegfs/cgarcia/Spatial/COVID19/cell2location/HLCA_publication/HLCA/"

# Folders of the reference signature run and of the spatial mapping run
ref_run_name = results_folder + 'reference_signatures_finest'
run_name = results_folder + 'cell2location_map_finest'

# Folder of the LIANA bivariate results (located under the same root folder)
bivariate_folder = results_folder + 'liana/bivariate/'

## Read cell type specific gene expression needed for NCEM

In [3]:
# Read the AnnData object from the cell2location mapping run folder.
# Later cells rely on this object providing: one layer per cell type plus a
# "raw_counts" layer, obsm['q05_cell_abundance_w_sf'], obsm['spatial'],
# uns['mod']['factor_names'], obs['sample'] and a var['SYMBOL'] column.
adata_file = f"{run_name}/ad_vis_post_distrib_finest_cell_type_gene_expr.h5ad"
adata_vis = sc.read_h5ad(adata_file)

## Read WikiPathways gene sets
To focus the analysis on biologically relevant genes, we selected the gene sets of the WikiPathways collection from the Molecular Signatures Database (MSigDB).

In [4]:
# Get the MSigDB resource through decoupler
msigdb = dc.get_resource('MSigDB')
# Select the WikiPathways collection and drop duplicated (gene set, gene) pairs.
# .copy() turns the filtered result into an independent frame, so the column
# assignment below does not write into a slice of the full resource
# (which would raise pandas' SettingWithCopyWarning).
msigdb = (
    msigdb[msigdb['collection'] == 'wikipathways']
    .drop_duplicates(['geneset', 'genesymbol'])
    .copy()
)
# Remove only the leading 'WP_' prefix. Splitting on 'WP_' and taking element 1
# would truncate any gene set name that contains 'WP_' a second time and would
# raise an IndexError for a name without the prefix.
msigdb['geneset'] = msigdb['geneset'].str.replace(r'^WP_', '', regex=True)
# Write WikiPathways gene sets (create the output folder first so the cell
# also works on a fresh run)
msigdb_dir = f"{bivariate_folder}anndataWith005MinimumExpr"
os.makedirs(msigdb_dir, exist_ok=True)
msigdb.to_csv(f"{msigdb_dir}/msigdb_WP_Paper.csv", index=True)

In [5]:
# Load the WikiPathways gene set table written in the previous step;
# the first CSV column holds the saved row index.
wp_csv_path = f"{bivariate_folder}anndataWith005MinimumExpr/msigdb_WP_Paper.csv"
msigdb_WP = pd.read_csv(wp_csv_path, index_col=0)

In [6]:
# Retrieve the unique gene symbols of all WikiPathways gene sets.
# sorted(...unique()) gives the same set of symbols as list(set(...)) but in a
# deterministic order (set iteration order of strings changes between Python
# sessions); dropna() keeps missing entries out of the gene list.
msigdb_WP_genes = sorted(msigdb_WP["genesymbol"].dropna().unique())

## Filter out cell type marker genes
We filtered out cell type-specific marker genes, which were computed with the `rank_genes_groups_df()` function in scanpy using an adjusted p-value < 0.05 and a minimum log fold change > 2.

In [7]:
# Load the table of cell type marker genes stored with the reference signatures
markers_csv = f"{ref_run_name}/Marker_genes_filter_pvaladj005_logfc2.csv"
markers = pd.read_csv(markers_csv, index_col=0)
# Unique marker gene names (a gene may be listed more than once in the table)
markers_filter = pd.unique(markers['names'])

In [8]:
# Flag the genes to keep: True for every gene whose current var_name is NOT in
# the marker gene list. The actual subsetting happens in a later cell.
# NOTE(review): the comparison uses var_names as they are at this point (before
# the SYMBOL column becomes the index), so the 'names' column of the marker
# table presumably uses the same identifiers — confirm.
adata_vis.var["keep"] = ~adata_vis.var_names.isin(markers_filter)

In [9]:
# Set Hugo symbols as row names of var:
# first keep the current var_names in a 'gene_ids' column, then make the
# 'SYMBOL' column the index (drop=False keeps SYMBOL as a regular column too).
# NOTE(review): the symbols are not checked for duplicates or missing values
# here — verify that var_names are still unique after this step.
adata_vis.var['gene_ids'] = adata_vis.var_names
adata_vis.var.set_index('SYMBOL', drop=False, inplace=True)

In [10]:
## Remove cell type marker genes
# Subsetting an AnnData returns a view of the parent object. The following
# cells assign to .var of the result, which would force an implicit
# view-to-copy conversion; .copy() makes the subset an independent object
# explicitly.
adata_vis = adata_vis[:, adata_vis.var.keep].copy()

## Select Wikipathways gene sets (1614 genes shared)

In [11]:
#Select msigdb Wiki Pathways (WP) genes (1614 genes in total)
# Overwrites the 'keep' flag: True for genes whose var_name (the SYMBOL index
# set in an earlier cell) is among the WikiPathways gene symbols.
adata_vis.var["keep"] = adata_vis.var_names.isin(msigdb_WP_genes)

In [12]:
#Select WikiPathways gene sets
# Subsetting returns a view of the parent AnnData; the next cell assigns to .X,
# which would force an implicit view-to-copy conversion. .copy() makes the
# subset an independent object explicitly.
adata_vis = adata_vis[:, adata_vis.var.keep].copy()

# Prepare data for NCEM analysis

In [13]:
# Set raw counts: replace X with a copy of the "raw_counts" layer, so that
# later changes to X do not modify the layer itself.
adata_vis.X = adata_vis.layers["raw_counts"].copy()

In [14]:
# Select the 45 cell type layers.
# np.unique returns the SORTED unique values of the first 46 layer names, so the
# position passed to np.delete below refers to that sorted order, not to the
# order in which the layers were stored.
# NOTE(review): the slice bound (46) and the deleted position (44) are
# hard-coded and depend on the exact layer names of this object — presumably
# position 44 is the log-counts layer; re-check if the input layers change.
cell_types = np.unique(list(adata_vis.layers.keys())[:46]) #finest
cell_types = np.delete(cell_types,44) #Remove logcounts finest

In [15]:
# Extract cell type abundances: using 5% quantile (representing confident cell abundance)
# Copies obsm['q05_cell_abundance_w_sf'] into adata_vis.obs, creating one
# column per name listed in uns['mod']['factor_names'].
# NOTE(review): assumes the obsm columns are ordered like factor_names — confirm.
adata_vis.obs[adata_vis.uns['mod']['factor_names']] = adata_vis.obsm['q05_cell_abundance_w_sf']

In [16]:
# Per-spot table with one column per cell type. The values are the q05
# abundances copied into obs in the previous cell; no normalization is applied here.
prop = adata_vis.obs.loc[:, cell_types]

## Select cell types with credible differential abundance according to scCODA results and select most abundant cell types (22 cell types in total for NCEM analysis)

In [17]:
# Cell types used for the NCEM analysis: those with credible differential
# abundance according to scCODA plus the most abundant cell types (22 in total).
# The order of this list defines the column order of the NCEM proportions and
# node type matrices built later.
cell_types_filtered = [
    'Alveolar fibroblasts',
    'Alveolar Mph CCL3+',
    'Alveolar Mph MT-positive',
    'AT1',
    'Adventitial fibroblasts',
    'CD4 T cells',
    'CD8 T cells',
    'EC aerocyte capillary',
    'EC arterial',
    'EC general capillary',
    'EC venous pulmonary',
    'Interstitial Mph perivascular',
    'Monocyte-derived Mph',
    'NK cells',
    'Non-classical monocytes',
    'Peribronchial fibroblasts',
    'Pericytes',
    'Plasma cells',
    'Subpleural fibroblasts',
    'T cells proliferating',
    'SM activated stress response',
    'AT2',
]

In [18]:
# Create the directory where the NCEM results are saved.
# os.makedirs also creates missing parent folders (so '<results_folder>ncem' is
# created as well), and exist_ok=True makes the call a no-op when the folder is
# already there, so no separate os.path.exists() check is needed.
directory = f'{results_folder}ncem/Celltype22_Markers_Wiki_1614'
os.makedirs(directory, exist_ok=True)

In [19]:
# Keep only the selected cell types, in the order given by cell_types_filtered.
# DataFrame.filter(items=...) silently skips names that are not columns, so a
# misspelled or missing cell type would only surface later as a shape mismatch.
# Check explicitly and fail here with a clear message instead.
missing_cell_types = [name for name in cell_types_filtered if name not in prop.columns]
if missing_cell_types:
    raise KeyError(f"Selected cell types not found in the abundance table: {missing_cell_types}")
prop_filtered = prop.filter(items=cell_types_filtered)

In [20]:
# From here on, cell_types and prop refer to the selected cell types only
# (both names are rebound; the unfiltered versions are no longer reachable
# through them).
cell_types, prop = cell_types_filtered, prop_filtered

In [21]:
# Per-spot sample labels taken from obs.
# NOTE(review): `samples` is not referenced again in the cells shown in this
# notebook; the NCEM collection step reads adata_vis.obs['sample'] directly.
samples = adata_vis.obs["sample"]

## Collect NCEM data

In [22]:
#Prepare data for NCEM
# Every spot is replicated once per selected cell type. Row block k (rows
# k*n_spots .. (k+1)*n_spots - 1) holds the expression layer of cell type k,
# paired with the spot-level proportions, coordinates and sample labels.
n_spots = prop.shape[0]
n_types = len(cell_types)

# Spot-level proportions, repeated for each cell type block
proportions = pd.DataFrame(np.concatenate([prop] * n_types), columns=cell_types)

# Cell type specific expression: the dense layer of each cell type, stacked by rows
cell_expression = pd.DataFrame(
    np.concatenate([adata_vis.layers[name].toarray() for name in cell_types]),
    columns=adata_vis.var_names,
)

# One-hot node type matrix: every row of block k has a 1 in column k
node_types = pd.DataFrame(np.repeat(np.eye(n_types), n_spots, axis=0), columns=cell_types)

# Spot coordinates and sample labels, repeated for each cell type block
spatial = pd.DataFrame(np.concatenate([adata_vis.obsm['spatial']] * n_types))
sample = pd.DataFrame(np.concatenate([adata_vis.obs['sample']] * n_types))

In [23]:
# Build the NCEM AnnData object: X is the stacked cell type specific expression
# (one row per spot and cell type, one column per gene).
# anndata is already imported as `ad` in the import cell at the top of the
# notebook, so no additional import is needed here.
adata_ncem = ad.AnnData(cell_expression)
# Spot-level annotations aligned row by row with X
adata_ncem.obsm['proportions'] = np.array(proportions)
adata_ncem.obsm['node_types'] = np.array(node_types)
adata_ncem.obsm['spatial'] = np.array(spatial)
# Note: obs is replaced as a whole in a later cell, which discards this column;
# the sample labels are added to obs again after that replacement.
adata_ncem.obs['sample'] = np.array(sample)

# Identity mapping from node type name to display name
adata_ncem.uns["node_type_names"] = {x: x for x in cell_types}

## Preprocess NCEM data

In [24]:
# Gene filter with a minimum of 0 cells.
# NOTE(review): with min_cells=0 no gene should be removed; presumably this call
# is kept for the per-gene annotation that scanpy adds to var — confirm.
sc.pp.filter_genes(adata_ncem, min_cells=0)

In [25]:
# Store the sample labels in obsm as well (array built from the single-column
# `sample` DataFrame), in addition to the obs column.
adata_ncem.obsm['sample'] = np.array(sample)

In [26]:
# Keep the untransformed cell type specific expression in a layer.
# .copy() is required: without it the layer references the same array object as
# X, and the in-place sc.pp.log1p applied to X in the next cell would
# log-transform this layer too.
adata_ncem.layers["Cell_expression"] = adata_ncem.X.copy()

In [27]:
# Log-transform X as log(1 + x).
# NOTE(review): log1p modifies X in place by default; any layer that shares the
# same array object as X is changed as well.
sc.pp.log1p(adata_ncem)

In [28]:
# Recover the cell type label of every row from the one-hot node type matrix:
# idxmax along the columns returns the name of the column holding the 1.
node_type_labels = list(adata_ncem.uns['node_type_names'].values())
h_0 = pd.DataFrame(adata_ncem.obsm['node_types'], columns=node_type_labels)
target_labels = h_0.idxmax(axis=1).to_numpy()
# reset_index() adds the running row number as an 'index' column
target_type = pd.DataFrame({"target_cell": target_labels}).reset_index()

In [29]:
# Replace obs as a whole with the target cell type table (columns 'index' and
# 'target_cell'). Obs columns assigned before this point are discarded.
adata_ncem.obs = target_type

In [30]:
# Add the sample labels to obs again, since obs was replaced in the previous cell.
adata_ncem.obs['sample'] = np.array(sample)

## Save NCEM data

In [31]:
# Write the prepared NCEM AnnData object to the NCEM results folder
ncem_h5ad_name = "cell2location_ad_vis_finest_ncem_HUGO_scCODA_22_filter_celltype_markers_Wiki_1614_Paper.h5ad"
adata_file = f"{results_folder}ncem/Celltype22_Markers_Wiki_1614/{ncem_h5ad_name}"
adata_ncem.write(adata_file)

### Modules and their versions used for this analysis


Useful for debugging and reporting issues.

In [32]:
# List the modules imported in this session together with their versions
# (see the output below), to document the software environment of the analysis.
cell2location.utils.list_imported_modules()

sys 3.9.16 (main, Jan 11 2023, 16:05:54) 
[GCC 11.2.0]
re 2.2.1
ipykernel._version 6.9.1
json 2.0.9
jupyter_client._version 7.2.2
traitlets._version 5.1.1
traitlets 5.1.1
logging 0.5.1.2
platform 1.0.8
_ctypes 1.1.0
ctypes 1.1.0
zmq.sugar.version 23.2.0
zmq.sugar 23.2.0
zmq 23.2.0
argparse 1.1
tornado 6.1
zlib 1.0
colorama 0.4.6
_curses b'2.2'
dateutil._version 2.8.2
dateutil 2.8.2
six 1.16.0
_decimal 1.70
decimal 1.70
jupyter_core.version 4.10.0
jupyter_core 4.10.0
entrypoints 0.4
jupyter_client 7.2.2
ipykernel 6.9.1
IPython.core.release 8.4.0
executing.version 0.8.3
executing 0.8.3
pure_eval.version 0.2.2
pure_eval 0.2.2
stack_data.version 0.2.0
stack_data 0.2.0
pygments 2.11.2
ptyprocess 0.7.0
pexpect 4.8.0
IPython.core.crashhandler 8.4.0
pickleshare 0.7.5
backcall 0.2.0
decorator 5.1.1
_sqlite3 2.6.0
sqlite3.dbapi2 2.6.0
sqlite3 2.6.0
wcwidth 0.2.5
prompt_toolkit 3.0.20
parso 0.8.3
jedi 0.18.1
urllib.request 3.9
IPython.core.magics.code 8.4.0
IPython 8.4.0
setuptools._distutils 3.9