# Quality Control

This notebook contains the code to reproduce the quality control of a set of ST rat liver samples transfected with AAV2 and AAV9.

In [None]:
import scanpy as sc
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import squidpy as sq
import re
from wrapper_functions import *
# Apply seaborn's default theme to the matplotlib figures created below.
sns.set()

In [None]:
# Automatically re-load wrapper functions after an update
# Find details here: https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
# Mode 2 reloads all imported modules before each cell is executed, so edits to
# the wrapper functions module are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Report the versions of scanpy and its dependencies for reproducibility.
sc.logging.print_versions()

# Default figure appearance: white background, 6 x 6 inch panels.
sc.set_figure_params(
    facecolor="white",
    figsize=(6, 6),
)

# Logging level 3 also prints scanpy's hints in addition to errors, warnings and info.
sc.settings.verbosity = 3

## Parameters to be set

We set up some parameters that are used by the wrapper functions. 

In [None]:
# Settings consumed by the wrapper functions.
# NOTE(review): the introduction of this notebook describes rat liver samples,
# while the organism is set to mouse here -- confirm which one is intended.
organism = Organism.mouse
analyze_params = Analyze(
    protocol=Protocol.FF,  # presumably the fresh-frozen protocol -- TODO confirm
    organism=organism,
)

## Define Input, sample names and metadata

We here define the location of the raw data and the most relevant metadata associated to the samples under consideration. 

In [None]:
# Input/output locations: edit these two placeholders before running.
inpath = 'your_inpath_folder'  # Replace with the location of your samples
outpath = 'where_your_data_will_be_saved'  # Replace with the output location

# The filtered AnnData objects are saved in an "analyzed" folder located in
# the directory the notebook is executed from.
root_path = os.getcwd()
results_folder = os.path.join(root_path, 'analyzed')

In [None]:
# Sample identifiers. The first field matches the Slide_ID and the last field
# the numeric part of the Individual_ID given in the metadata below; the order
# of this list defines the order of every metadata vector.
mysamples = [
    # slides SN275 / SN275b
    "SN275_B1_202", "SN275_C1_151", "SN275b_D1_153",
    # slide SN326
    "SN326_B1_352", "SN326_C1_152", "SN326_D1_203",
    # slide SN327
    "SN327_A1_252", "SN327_C1_305", "SN327_D1_204",
    # slide SN382
    "SN382_A1_355", "SN382_B1_303", "SN382_C1_251",
    # slide SN387
    "SN387_B1_103",
    # slide SN393
    "SN393_A1_102", "SN393_B1_253", "SN393_D1_351",
    # slide SN394
    "SN394_B1_304", "SN394_C1_101",
]

In [None]:
# Per-sample annotations, one row per sample, listed in the same order as
# `mysamples`. Columns: (condition, gender, slide, animal ID, batch).
_sample_annotations = [
    ("AAV2-CMV-GFP", "Male",   "SN275",  "I202", "Batch1"),  # SN275_B1_202
    ("Untreated",    "Female", "SN275",  "I151", "Batch1"),  # SN275_C1_151
    ("Untreated",    "Female", "SN275b", "I153", "Batch5"),  # SN275b_D1_153
    ("AAV9-CMV-GFP", "Female", "SN326",  "I352", "Batch4"),  # SN326_B1_352
    ("Untreated",    "Female", "SN326",  "I152", "Batch4"),  # SN326_C1_152
    ("AAV2-CMV-GFP", "Male",   "SN326",  "I203", "Batch4"),  # SN326_D1_203
    ("AAV2-CMV-GFP", "Female", "SN327",  "I252", "Batch5"),  # SN327_A1_252
    ("AAV9-CMV-GFP", "Male",   "SN327",  "I305", "Batch5"),  # SN327_C1_305
    ("AAV2-CMV-GFP", "Male",   "SN327",  "I204", "Batch5"),  # SN327_D1_204
    ("AAV9-CMV-GFP", "Female", "SN382",  "I355", "Batch2"),  # SN382_A1_355
    ("AAV9-CMV-GFP", "Male",   "SN382",  "I303", "Batch2"),  # SN382_B1_303
    ("AAV2-CMV-GFP", "Female", "SN382",  "I251", "Batch2"),  # SN382_C1_251
    ("Untreated",    "Male",   "SN387",  "I103", "Batch2"),  # SN387_B1_103
    ("Untreated",    "Male",   "SN393",  "I102", "Batch3"),  # SN393_A1_102
    ("AAV2-CMV-GFP", "Female", "SN393",  "I253", "Batch3"),  # SN393_B1_253
    ("AAV9-CMV-GFP", "Female", "SN393",  "I351", "Batch3"),  # SN393_D1_351
    ("AAV9-CMV-GFP", "Male",   "SN394",  "I304", "Batch3"),  # SN394_B1_304
    ("Untreated",    "Male",   "SN394",  "I101", "Batch3"),  # SN394_C1_101
]


def _annotation_column(position):
    """Return one annotation field of every sample as a pandas Categorical."""
    return pd.Categorical([row[position] for row in _sample_annotations])


myconditions = _annotation_column(0)
myGender = _annotation_column(1)
myslides = _annotation_column(2)
myanimalID = _annotation_column(3)
mybatch = _annotation_column(4)

In [None]:
# Gather the annotation vectors into one table with one row per sample,
# indexed by the sample name.
metadata = dict(
    Condition=myconditions,
    Gender=myGender,
    Slide_ID=myslides,
    Individual_ID=myanimalID,
    Batch_ID=mybatch,
)
metadata_df = pd.DataFrame(data=metadata, index=mysamples)

In [None]:
# Display the metadata table to check that every sample got the expected annotations.
metadata_df

## 1.1 Quality Control: Global Metrics

We first take a look at the global metrics produced by the SpaceRanger pipeline for each sample and plot them together as barplots for comparison. We can color the barplots by the different values in our metadata in order to detect batch- or condition-related effects.

In [None]:
# Collect the global QC metrics of all samples found under `inpath`.
# NOTE(review): presumably parsed from each sample's SpaceRanger summary output -- confirm in wrapper_functions.
globalQC_df = get_global_QCmetrics(inpath, mysamples)

In [None]:
# Display the collected global QC metrics.
globalQC_df

We first color the barplots by Visium slide and then by batch number to see if there is any trend.

In [None]:
# Barplots of every global QC metric (all columns of globalQC_df), grouped by slide.
get_barplot_qc(globalQC_df, myslides, globalQC_df.columns.values)

In [None]:
# Same barplots of every global QC metric, this time grouped by batch.
get_barplot_qc(globalQC_df, mybatch, globalQC_df.columns.values)

## 1.2 Image-based Quality control

We are now going to look more closely at the QC details of the individual samples. We will explore potential contamination issues in the spots not covered by tissue, the number of counts and genes per spot, and the percentage of mitochondrial genes in the different regions of the samples. This analysis and the associated plots will help us set up some parameters to filter out low-quality spots or genes expressed in a very limited number of spots. We will also check the spatial location of spots with large amounts of hemoglobin-related genes, which are indicative of spots localized in blood vessels rather than overlaying cells.

In [None]:
%%capture --no-display
# %%capture --no-display: Removes warnings for this cell
# Here we want to hide this warning: 'UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.'

# Build one AnnData object per sample, twice: once with the wrapper's default
# count file and once from the raw feature-barcode matrix.
# NOTE(review): the default presumably is the tissue-filtered matrix -- confirm in wrapper_functions.
adatas_filter = generate_adata_objects(path = inpath, samples_names = mysamples, metadata = metadata_df, analyze_params=analyze_params)
adatas_raw = generate_adata_objects(path = inpath, samples_names = mysamples, metadata = metadata_df, analyze_params=analyze_params, count_file='raw_feature_bc_matrix.h5')

In [None]:
# Inspect the per-spot annotations (obs table) of the first sample.
adatas_filter[0].obs

In [None]:
# Run the per-sample QC analysis and plots, comparing the default and the raw
# objects; the keyword arguments name the obs columns holding the sample,
# condition and batch labels.
perform_qc_analysis(
    adatas_filter,
    adatas_raw,
    color_map="Reds",
    sample_id="Sample_ID",
    condition_name="Condition",
    batch_name="Batch_ID",
)

We are going to implement some QC filtering of spots inspired by the cutoffs proposed in the following publication: 
*"The spatiotemporal program of zonal liver regeneration following acute injury"*

https://pubmed.ncbi.nlm.nih.gov/35659879/

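We will however rely on the mean absolute deviation around the mean (MAD), which is supposed to be more robust to outliers than the standard deviation. Note that this is not the median absolute deviation (https://en.wikipedia.org/wiki/Median_absolute_deviation), which shares the same acronym. 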

In [None]:
def _mean_abs_deviation(values):
    """Return the mean absolute deviation of a pandas Series around its mean.

    Replacement for ``Series.mad()``, which was deprecated in pandas 1.5 and
    removed in pandas 2.0. Missing values are skipped, as ``Series.mad()`` did
    by default, so the thresholds below are numerically unchanged.
    """
    return (values - values.mean()).abs().mean()


# Filter every sample on its own: all thresholds are derived from the
# distribution of that sample's spots.
# NOTE: this cell modifies `adatas_filter` in place; re-create the objects
# before re-running it, otherwise already filtered data is filtered again.
for i, adata in enumerate(adatas_filter):

    print(adata.obs["Sample_ID"].unique())

    ## Based on UMIs

    # Mean and mean absolute deviation of the UMI counts for the current AnnData
    mean_umi = adata.obs['total_counts'].mean()
    mad_umi = _mean_abs_deviation(adata.obs['total_counts'])

    # UMI thresholds (asymmetric: 2 deviations below and 3 above the mean)
    threshold_umi_min = mean_umi - 2 * mad_umi
    threshold_umi_max = mean_umi + 3 * mad_umi

    # Filter spots based on the thresholds

    print(f"# threshold_umi_min: {threshold_umi_min}")
    print(f"# Spots before removing min_counts: {adata.n_obs}")
    sc.pp.filter_cells(adata, min_counts=threshold_umi_min, inplace=True)
    print(f"# Spots after removing min_counts: {adata.n_obs}")

    print(f"# threshold_umi_max: {threshold_umi_max}")
    sc.pp.filter_cells(adata, max_counts=threshold_umi_max, inplace=True)
    print(f"# Spots after removing max_counts: {adata.n_obs}")

    ## Based on Mitochondrial content

    # Mean and mean absolute deviation of the mitochondrial fraction, computed
    # on the spots that passed the UMI filters above
    mean_mito_fraction = adata.obs['pct_counts_mt'].mean()
    mad_mito_fraction = _mean_abs_deviation(adata.obs['pct_counts_mt'])

    # Threshold to filter out spots with a high mitochondrial gene fraction
    threshold_mito = mean_mito_fraction + 4 * mad_mito_fraction

    print(f"# threshold_mito: {threshold_mito}")
    # Copy the subset so that the in-place gene filter below works on a real
    # AnnData object and not on a view of the unfiltered one.
    adata = adata[adata.obs["pct_counts_mt"] <= threshold_mito, :].copy()
    print(f"# Spots after removing high mito content: {adata.n_obs}")

    # In addition, we remove genes which are expressed in less than 10 of the remaining spots.

    print(f"# Genes before filter: {adata.n_vars}")
    sc.pp.filter_genes(adata, min_cells=10, inplace=True)
    print(f"# Genes after filter: {adata.n_vars}")

    # Store the filtered AnnData back to the list
    adatas_filter[i] = adata

In [None]:
# Folder with one morphology annotation CSV per sample. Each file is read for
# its 'Barcode' and 'Morphology' columns; spots labelled 'Exclude' are dropped.
path_exclude_spots = f"{outpath}/morphology_csv"

for a in range(len(adatas_filter)):
    current_sample = np.asarray(adatas_filter[a].obs["Sample_ID"].unique())

    print(current_sample)

    print(f"# Spots before removing excluded spots: {adatas_filter[a].n_obs}")

    # NOTE(review): the "Morpholgy_" prefix (sic) is kept unchanged because it
    # must match the annotation file names on disk -- confirm before renaming.
    path_current_sample = "Morpholgy_" + current_sample[0] + ".csv"
    df = pd.read_csv(os.path.join(path_exclude_spots, path_current_sample))

    # Barcodes flagged for exclusion in the annotation file
    excluded_barcodes = df.loc[df['Morphology'] == 'Exclude', 'Barcode'].tolist()
    keep_spot = ~adatas_filter[a].obs_names.isin(excluded_barcodes)

    # Copy so that the list holds a real AnnData object rather than a view
    adatas_filter[a] = adatas_filter[a][keep_spot, :].copy()
    print(f"# Spots after removing excluded spots: {adatas_filter[a].n_obs}")


We finally save the data for future retrieval in other scripts

In [None]:
# Make sure the output folder exists: writing an .h5ad file into a missing
# directory fails, and nothing above creates `results_folder`.
os.makedirs(results_folder, exist_ok=True)

# Write one .h5ad file per sample, named after the sample's Sample_ID.
for current_adata in adatas_filter:
    current_sample = np.asarray(current_adata.obs["Sample_ID"].unique())
    filename = 'adata_filter_' + current_sample[0] + '.h5ad'
    current_adata.write(os.path.join(results_folder, filename))

In [None]:
# Export this notebook to HTML; expects the notebook to be saved as
# 00_Quality_Control.ipynb in the current working directory.
! jupyter nbconvert --to html 00_Quality_Control.ipynb