Environment: This script should be run with the `python_plant_pathogen_atlas` environment using the devcontainer `docker_python`

In [None]:
import scanpy as sc
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import glob
from merfishing.core import clustering
import diopy

In [None]:
# Project directories, given relative to the notebook's working directory:
# results go to output_folder, while inputs and intermediate objects are
# read from (and written to) data_folder.
output_folder = r'../../outputs'
data_folder = r'../../data'

# Create the results directory up front; exist_ok makes re-running this cell safe.
os.makedirs(output_folder, exist_ok=True)

##### Let's first convert the scMultiome object (containing both replicates) into an h5ad file so that it can be loaded more easily in Python

In [None]:
# Both the .h5 input and the converted .h5ad output live in <data_folder>/temp_objects.
temp_objects_dir = os.path.join(data_folder, 'temp_objects')

# Load the .h5 export with diopy, label every observation with modality 'seq',
# and write the object back out in h5ad format.
seq_data = diopy.input.read_h5(file=os.path.join(temp_objects_dir, 'AvrRpt2_alone2.h5'))
seq_data.obs['modality'] = 'seq'
seq_data.write(os.path.join(temp_objects_dir, 'seq_reference_00.h5ad'))

##### Here we create adata objects from the Baysor segmentation outputs

In [None]:
# One folder per Baysor segmentation, all located under <data_folder>/segmentations.
list_of_baysor_segmentations = [
    os.path.join(data_folder, 'segmentations', seg_name)
    for seg_name in ('mock', '4hr_avr', '6hr_avr', '9hr_avr', 'avrrpt24')
]

In [None]:
# Build one AnnData object per Baysor segmentation folder and save it as
# <segmentation folder>/adatas/adata.h5ad.
for input_file in list_of_baysor_segmentations:
    # Locate the Baysor outputs first so that a missing or misnamed segmentation
    # folder fails with a clear message instead of an opaque IndexError.
    cell_stats_files = glob.glob(os.path.join(input_file, '*_cell_stats.csv'))
    cell_counts_files = glob.glob(os.path.join(input_file, '*_counts.tsv'))
    if not cell_stats_files or not cell_counts_files:
        raise FileNotFoundError(
            f'Expected a *_cell_stats.csv and a *_counts.tsv file in {input_file}'
        )

    # exist_ok=True tolerates re-runs, while genuine failures (e.g. permissions)
    # still raise instead of being reported as "already exists".
    os.makedirs(os.path.join(input_file, 'adatas'), exist_ok=True)

    cell_stats = pd.read_csv(cell_stats_files[0], index_col=0)
    cell_counts = pd.read_csv(cell_counts_files[0], sep='\t', index_col=0)

    # The counts table is indexed by its first column (used as var names below),
    # so it is transposed to give one row per cell.
    # NOTE(review): assumes the columns of the counts table are in the same cell
    # order as the rows of cell_stats — confirm against the Baysor output.
    adata = sc.AnnData(X=cell_counts.values.T, obs = cell_stats, var = pd.DataFrame(index = cell_counts.index.values))
    # NOTE(review): obs was already passed to the constructor, so this
    # re-assignment looks redundant — confirm before removing.
    adata.obs = cell_stats
    adata.write(os.path.join(input_file, 'adatas', 'adata.h5ad'))

##### We store the raw counts in a `counts` layer, normalize and log-transform the data, and save each object as `preprocessed_00.h5ad`

In [None]:
# For every segmentation: keep the raw counts in a layer, normalise and
# log-transform X, and save the result next to the input object.
for input_file in list_of_baysor_segmentations:
    adata_dir = os.path.join(input_file, 'adatas')
    adata = sc.read(os.path.join(adata_dir, 'adata.h5ad'))

    # Snapshot the raw counts before X is modified by the steps below.
    adata.layers['counts'] = adata.X.copy()

    # Library-size normalisation (scanpy defaults) followed by log1p.
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)

    adata.write(os.path.join(adata_dir, 'preprocessed_00.h5ad'))

##### Now we apply QC filters to all objects to remove low-quality cells

In [None]:
# QC thresholds are identical for every segmentation, so they are defined once
# instead of being re-assigned on each loop iteration.
min_transcript_threshold = 50    # cells must have more than this many transcripts
max_transcript_threshold = 1200  # ... and fewer than this many
min_cell_area = 5                # bounds on the 'area' column of obs
max_cell_area = 15000
min_tc_area = 0.01               # bounds on transcripts per unit cell area
max_tc_area = 10

# Filter each preprocessed object and save it as preprocessed_and_filtered_01.h5ad.
for input_file in list_of_baysor_segmentations:
    #load in the current adata
    experiment = sc.read(os.path.join(input_file, 'adatas', 'preprocessed_00.h5ad'))

    print('QC metrics for batch '+os.path.basename(input_file))
    print(f'{len(experiment.obs.index)} cells before QC filtering')

    #remove blank barcodes
    # .copy() turns the view into a real object, so adding an obs column below
    # does not rely on anndata's implicit view-to-copy conversion (and its warning).
    experiment = experiment[:, ~experiment.var.index.str.contains('Blank')].copy()

    # Per-cell transcript total from the raw counts layer (blank barcodes excluded).
    # ravel() keeps the result 1-D even if the layer is a sparse matrix, whose
    # row sums come back as an (n, 1) matrix.
    experiment.obs['n_transcripts'] = np.asarray(experiment.layers['counts'].sum(axis=1)).ravel()

    #remove cells that fall outside of area constraints
    experiment = experiment[(experiment.obs['area'] > min_cell_area) & (experiment.obs['area'] < max_cell_area), :]

    #remove cells that fall outside of transcript count constraints
    # .copy() again because a new obs column is added right after the subsetting.
    experiment = experiment[(experiment.obs['n_transcripts'] > min_transcript_threshold) & (experiment.obs['n_transcripts'] < max_transcript_threshold), :].copy()

    experiment.obs['transcript_counts_div_cell_area'] = experiment.obs['n_transcripts']/experiment.obs['area']

    # Drop the densest cells: everything at or above the 99.5th percentile,
    # computed on the cells that survived the filters above.
    experiment = experiment[experiment.obs['transcript_counts_div_cell_area'] < np.percentile(experiment.obs['transcript_counts_div_cell_area'], 99.5)]

    # Finally apply the absolute bounds on transcript density.
    experiment = experiment[(experiment.obs['transcript_counts_div_cell_area'] > min_tc_area) & (experiment.obs['transcript_counts_div_cell_area'] < max_tc_area), :]
    print(f'{len(experiment.obs.index)} cells after QC filtering')

    # NOTE(review): df and metadata are not used within this cell; they are kept
    # in case later cells read them — confirm and remove if unused.
    df = pd.DataFrame(experiment.X, columns=experiment.var.index.values, index=experiment.obs.index.values)
    metadata = experiment.obs

    sc.pl.violin(experiment, keys=['n_transcripts', 'area', 'transcript_counts_div_cell_area'])

    experiment.write(os.path.join(input_file, 'adatas', 'preprocessed_and_filtered_01.h5ad'))
