Environment: This script should be run with the `python_plant_pathogen_atlas` environment using the devcontainer `docker_python`

In [None]:
import scanpy as sc
import os
import pandas as pd
import numpy as np
from merfishing.core import clustering
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import glob
import diopy

In [None]:
# Root folders used by the rest of the notebook; both are relative paths.
data_folder, output_folder = '../../data', '../../outputs'

In the following cell, we convert our DC3000 multiome object into an anndata object. This will allow us to use the scanpy package to perform the clustering and visualization of the data.

In [None]:
# Convert the DC3000 object from .h5 to .h5ad and tag every cell with
# modality 'seq'. The .h5 source is deleted once the .h5ad has been written,
# so on a re-run (e.g. Restart & Run All) the .h5 no longer exists; in that
# case the previously written .h5ad is loaded instead of failing.
h5_path = os.path.join(data_folder, 'temp_objects', 'DC3000_alone.h5')
h5ad_path = os.path.join(data_folder, 'temp_objects', 'DC3000_alone.h5ad')

if os.path.exists(h5_path):
    seq_data = diopy.input.read_h5(file = h5_path)
    seq_data.obs['modality'] = 'seq'
    seq_data.write(h5ad_path)

    # Only reached after a successful write, so the source can be dropped.
    # Catch OSError only: a bare `except:` would also hide KeyboardInterrupt
    # and genuine programming errors.
    try:
        os.remove(h5_path)
    except OSError as err:
        print(f'Could not remove DC3000_alone.h5: {err}')
elif os.path.exists(h5ad_path):
    print('DC3000_alone.h5 already removed')
    seq_data = sc.read(h5ad_path)
else:
    raise FileNotFoundError(
        f'Neither {h5_path} nor {h5ad_path} exists; cannot load the DC3000 object.'
    )

We can now put in the path to our DC3000 Baysor segmentation folder to read in the MERFISH data.

In [None]:
# One entry per Baysor segmentation folder under <data_folder>/segmentations.
list_of_baysor_segmentations = [
    os.path.join(data_folder, 'segmentations', run_name)
    for run_name in ('kt56',)
]

In [None]:
# Build one AnnData per Baysor segmentation folder from its cell-stats CSV and
# counts TSV, and save it as <folder>/adatas/adata.h5ad.
for input_file in list_of_baysor_segmentations:
    # Locate both inputs up front so a missing file produces a clear error
    # instead of an IndexError from indexing an empty glob result.
    stats_matches = glob.glob(os.path.join(input_file, '*_cell_stats.csv'))
    counts_matches = glob.glob(os.path.join(input_file, '*_counts.tsv'))
    if not stats_matches:
        raise FileNotFoundError(f'No *_cell_stats.csv file found in {input_file}')
    if not counts_matches:
        raise FileNotFoundError(f'No *_counts.tsv file found in {input_file}')

    # exist_ok=True tolerates an existing directory while still surfacing real
    # failures (e.g. permissions) that a bare `except:` would have hidden.
    os.makedirs(os.path.join(input_file, 'adatas'), exist_ok=True)

    cell_stats = pd.read_csv(stats_matches[0], index_col=0)
    cell_counts = pd.read_csv(counts_matches[0], sep='\t', index_col=0)

    # The counts table is transposed so that its rows become AnnData columns
    # (var) and its columns become AnnData rows (obs).
    # NOTE(review): this assumes the rows of cell_stats are in the same order
    # as the columns of cell_counts -- nothing here checks it; TODO confirm.
    adata = sc.AnnData(
        X=cell_counts.values.T,
        obs=cell_stats,
        var=pd.DataFrame(index=cell_counts.index.values),
    )
    # Re-assigned as in the original cell; presumably redundant with the
    # `obs=` argument above -- TODO confirm before removing.
    adata.obs = cell_stats
    adata.write(os.path.join(input_file, 'adatas', 'adata.h5ad'))

Let's store the raw counts in an adata layer, and perform log normalization. It is often advised to use raw counts when processing spatial data, so we want to hold onto them.

In [None]:
# For each segmentation: keep the raw counts in a layer, normalise and
# log-transform X, save the result, then delete the intermediate adata.h5ad.
for input_file in list_of_baysor_segmentations:
    raw_adata_path = os.path.join(input_file, 'adatas', 'adata.h5ad')
    preprocessed_path = os.path.join(input_file, 'adatas', 'preprocessed_00.h5ad')

    adata = sc.read(raw_adata_path)

    # Store a *copy* of X: assigning adata.X directly would share the same
    # array, and the in-place normalisation / log1p below could then overwrite
    # the "raw" counts layer as well.
    adata.layers['counts'] = adata.X.copy()
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)
    adata.write(preprocessed_path)

    # Best-effort cleanup; only filesystem errors are tolerated.
    try:
        os.remove(raw_adata_path)
    except OSError as err:
        print(f'Could not remove original adata: {err}')

The following cell performs QC filtering on the DC3000 data.

In [None]:
# QC thresholds. They do not change between batches, so they are defined once
# here rather than on every loop iteration.
min_transcript_threshold=50
max_transcript_threshold=1200
min_cell_area = 5
max_cell_area = 15000
min_tc_area = 0.01
max_tc_area = 10

# Filter each segmentation by cell area, transcript count and transcript
# density (n_transcripts / area), then save the filtered object and delete
# the preprocessed_00.h5ad intermediate.
for input_file in list_of_baysor_segmentations:
    experiment = sc.read(os.path.join(input_file, 'adatas', 'preprocessed_00.h5ad'))
    print(experiment)

    # Captured before any filter is applied so the "before" figure below
    # really describes the unfiltered data.
    n_cells_before_qc = experiment.n_obs

    experiment = experiment[(experiment.obs['area'] > min_cell_area) & (experiment.obs['area'] < max_cell_area), :]
    # .copy() turns the view into a real object so the new obs column can be
    # assigned without an implicit copy-on-write of the view.
    experiment = experiment[(experiment.obs['n_transcripts'] > min_transcript_threshold) & (experiment.obs['n_transcripts'] < max_transcript_threshold), :].copy()
    experiment.obs['transcript_counts_div_cell_area'] = experiment.obs['n_transcripts']/experiment.obs['area']

    print('QC metrics for batch '+os.path.basename(input_file))
    print(f'{n_cells_before_qc} cells before QC filtering')

    # Drop the top 0.5% of cells by transcript density. np.percentile raises
    # on an empty array, so skip this step if nothing survived so far.
    if experiment.n_obs > 0:
        density = experiment.obs['transcript_counts_div_cell_area']
        experiment = experiment[density < np.percentile(density, 99.5)]

    experiment = experiment[(experiment.obs['transcript_counts_div_cell_area'] > min_tc_area) & (experiment.obs['transcript_counts_div_cell_area'] < max_tc_area), :].copy()
    print(f'{len(experiment.obs.index)} cells after QC filtering')

    # Not used further in this cell; kept because later cells may read these
    # notebook-level names -- TODO confirm and remove if unused.
    df = pd.DataFrame(experiment.X, columns=experiment.var.index.values, index=experiment.obs.index.values)
    metadata = experiment.obs

    experiment.write(os.path.join(input_file, 'adatas', 'preprocessed_and_filtered_01.h5ad'))

    # Best-effort cleanup; only filesystem errors are tolerated.
    try:
        os.remove(os.path.join(input_file, 'adatas', 'preprocessed_00.h5ad'))
    except OSError as err:
        print(f'Could not remove preprocessed_00.h5ad: {err}')


The following cell will convert gene IDs to gene symbols in the preprocessed and filtered object. It also removes blank barcodes from the cell-by-gene matrix.

In [None]:
# Load the filtered object for the first segmentation, drop 'Blank' features,
# rename the remaining features from gene IDs to gene symbols, save the result
# and delete the preprocessed_and_filtered_01.h5ad intermediate.
input_file = list_of_baysor_segmentations[0]
filtered_01_path = os.path.join(input_file, 'adatas', 'preprocessed_and_filtered_01.h5ad')

spatial_data = sc.read(filtered_01_path)
spatial_data.obs['batch'] = os.path.basename(input_file)

# .copy() materialises the subset so the in-place calls below operate on a
# real object rather than a view.
spatial_data = spatial_data[:, ~spatial_data.var.index.str.contains('Blank')].copy()
spatial_data.obs_names_make_unique()
spatial_data.var_names_make_unique()

gene_symbols = pd.read_csv(os.path.join(data_folder, 'useful_files', 'geneID_to_geneName_MERSCOPE_panel1.txt'), sep='\t', index_col=0)

# Match the feature index against gene_symbols' gene_id once and reuse it,
# instead of recomputing the same merge for every derived list.
id_to_symbol = spatial_data.var.merge(gene_symbols, left_index=True, right_on='gene_id')
new_indices = id_to_symbol.index.tolist()
gene_names = id_to_symbol.gene_name.tolist()
gene_id = id_to_symbol.gene_id.tolist()

# Keep only features that have an entry in the mapping table.
spatial_data = spatial_data[:, gene_id].copy()
new_vars = spatial_data.var.merge(gene_symbols, left_index=True, right_on='gene_id')
new_vars.index = new_vars.gene_name.tolist()
new_vars = new_vars.drop(['gene_name'], axis=1)
spatial_data.var = new_vars
# Different gene IDs can map to the same symbol; the earlier uniqueness pass
# ran on IDs, so repeat it now that the names are symbols.
spatial_data.var_names_make_unique()

spatial_data.write(os.path.join(input_file, 'adatas', 'preprocessed_and_filtered_02.h5ad'))

# Best-effort cleanup; only filesystem errors are tolerated.
try:
    os.remove(filtered_01_path)
except OSError as err:
    print(f'Could not remove preprocessed_and_filtered_01.h5ad: {err}')