In [None]:
import query_construction_utils_01 as qcu
import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad

%matplotlib inline
import matplotlib.pyplot as plt

# Reference Construction

## 1. Data Input

In [None]:
#### Read the reference AnnData object from its .h5ad file
adata_ref = sc.read_h5ad(
    filename="/tscc/lustre/ddn/scratch/aopatel/mtg_ref/Reference_MTG_RNAseq_final-nuclei.2022-06-07.h5ad",
)
# Bare trailing expression so the notebook renders the AnnData summary
adata_ref

## 2. Quality Control (QC) and Preprocessing

In [None]:
#### Initial QC via the project helper in query_construction_utils_01;
#### the returned object replaces adata_ref
adata_ref = qcu.quality_controller(
    adata_ref,
    min_genes=1000,
    is_indexed=False,
)

In [None]:
#### Note: the author recorded no cell loss with min_genes=500 (137303 -- presumably the cell count).
#### NOTE(review): the preceding QC cell passes min_genes=1000, not 500 -- confirm which threshold this note describes.
# Bare expression so the notebook renders the AnnData summary after QC
adata_ref

In [None]:
#### Violin plots of the per-cell QC metrics, one panel per metric
sc.pl.violin(
    adata_ref,
    keys=[
        "n_genes_by_counts",
        "total_counts",
        "pct_counts_mt",
        "pct_counts_ribo",
        "pct_counts_hb",
        "pct_counts_in_top_20_genes",
    ],
    jitter=0.4,
    multi_panel=True,
    show=True,
)

    

In [None]:
#### Preprocess with the project helper; the returned object replaces adata_ref.
#### NOTE(review): mt_thresh / hb_thresh presumably bound pct_counts_mt / pct_counts_hb -- confirm in query_construction_utils_01.
adata_ref = qcu.pre_processor(
    adata_ref,
    mt_thresh=5,
    hb_thresh=1,
    is_list=False,
    MADS=False,
)

In [None]:
#### Drop genes detected in fewer than 25 cells
# The return value is not captured: filter_genes updates adata_ref itself by default.
sc.pp.filter_genes(adata_ref, min_cells=25)

# AnnData is laid out cells x genes, so n_vars is the gene count and n_obs the cell count
print(f"Number of genes after filtering: {adata_ref.n_vars}")
print(f"Number of cells after filtering: {adata_ref.n_obs}")

## 3. Change Select Attributes to Match Query

In [None]:
#### Align obs columns with the query dataset for the scVI model

# Expand the single-letter sex codes into spelled-out labels.
adata_ref.obs['sex'] = adata_ref.obs['donor_sex_label'].map({'M': 'male', 'F': 'female'})

# Series.map leaves NaN for every value that is not a key of the dict (and for
# missing input). Surface that here instead of letting NaN flow silently into
# the saved reference.
n_sex_missing = int(adata_ref.obs['sex'].isna().sum())
if n_sex_missing:
    print(f"WARNING: {n_sex_missing} cells have no 'sex' value (donor_sex_label missing or not 'M'/'F')")

# Expose the donor identifier under the column name 'individualID'.
adata_ref.obs = adata_ref.obs.rename(columns={'external_donor_name_label': 'individualID'})

# Every reference cell receives the same constant value for these columns.
adata_ref.obs['ADNC'] = 'Not AD'

adata_ref.obs['libraryBatch'] = 'REF'

adata_ref.obs['Consensus clinical diagnosis'] = 'Neurotypical'

## 4. Finishing Touches

In [None]:
#### Drop cells whose subclass_label is missing, printing the NaN count before and after
print("Empty count: ", adata_ref.obs['subclass_label'].isna().sum())

# Keep only annotated cells; .copy() materialises the subset instead of keeping a view
adata_ref = adata_ref[adata_ref.obs['subclass_label'].notna()].copy()

print("Empty count (after removal): ", adata_ref.obs['subclass_label'].isna().sum())

In [None]:
#### Build an integer age column by deleting the ' yrs' text from age_label
adata_ref.obs['age_numeric'] = (
    adata_ref.obs['age_label']
    .str.replace(' yrs', '')
    .astype(int)
)


In [None]:
#### Sample id = everything after the first '-' in sample_name
sample_name_parts = adata_ref.obs["sample_name"].str.split("-", n=1)
adata_ref.obs["sample"] = sample_name_parts.str.get(1)
# Show how many distinct samples resulted
adata_ref.obs["sample"].nunique()

In [None]:
#### Persist the finished reference AnnData as an .h5ad file
adata_ref.write_h5ad(
    filename="/tscc/lustre/ddn/scratch/aopatel/adata_ref.h5ad",
)