In [None]:
import reference_construction_utils as rcu
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scanpy as sc
import anndata as ad

# Reference Construction

## 1. Create AnnData Object

In [None]:
#### Load the reference dataset from disk
# Path of the reference .h5ad file read by this notebook.
reference_h5ad_path = "/tscc/lustre/ddn/scratch/aopatel/mtg_ref/Reference_MTG_RNAseq_final-nuclei.2022-06-07.h5ad"
adata_ref = sc.read_h5ad(reference_h5ad_path)
# Bare last expression so the notebook shows a summary of the loaded object.
adata_ref

In [None]:
# Peek at the first 100 entries of the reference object's variable names.
leading_var_names = adata_ref.var_names[:100]
print(leading_var_names)

## 2. Perform Quality Control

In [None]:
#### Initial QC
# Options forwarded verbatim to the project helper rcu.QC_performer.
# NOTE(review): `filter=False` presumably means nothing is dropped at this
# step — confirm against reference_construction_utils.
qc_options = {
    "index_is_gene_symbol": True,
    "show_violin_plot": True,
    "filter": False,
}
adata_ref = rcu.QC_performer(adata_ref, **qc_options)

In [None]:
# Display the object's summary again now that the QC helper has returned it.
adata_ref

In [None]:
#### Drop genes that are expressed in too few cells
# A gene must be expressed in at least this many cells to be kept.
MIN_CELLS_PER_GENE = 25
sc.pp.filter_genes(adata_ref, min_cells=MIN_CELLS_PER_GENE)
# Per-cell filtering is commented out; the call is kept here for reference:
#sc.pp.filter_cells(adata_ref, min_genes=250)

# Only a gene filter is applied in this cell; both dimensions are reported.
n_cells_remaining, n_genes_remaining = adata_ref.shape
print(f"Number of genes after filtering: {n_genes_remaining}")
print(f"Number of cells after filtering: {n_cells_remaining}")

## 3. Change Select Attributes to Match Query

In [None]:
#### Align obs metadata with the columns used for the scVI model
# Translate the single-letter donor sex codes into full words; any code that is
# not a key of this mapping becomes NaN (pandas Series.map behaviour).
sex_code_to_word = {'M': 'male', 'F': 'female'}
adata_ref.obs['sex'] = adata_ref.obs['donor_sex_label'].map(sex_code_to_word)

# Rename the donor identifier column.
donor_column_rename = {'external_donor_name_label': 'individualID'}
adata_ref.obs = adata_ref.obs.rename(columns=donor_column_rename)

# Columns that receive the same constant value for every cell of the reference.
# Dict order is the order in which the columns are added.
constant_obs_columns = {
    'ADNC': 'Not AD',
    'libraryBatch': 'REF',
    'Consensus clinical diagnosis': 'Neurotypical',
}
for obs_column, fill_value in constant_obs_columns.items():
    adata_ref.obs[obs_column] = fill_value

## 4. Finishing Touches

In [None]:
#### Keep only cells that carry a subclass annotation
# Boolean mask over cells: True where 'subclass_label' is present.
has_subclass = adata_ref.obs['subclass_label'].notna()
print("Empty count: ", (~has_subclass).sum())

# Subset to annotated cells; .copy() turns the view into a standalone object.
adata_ref = adata_ref[has_subclass].copy()

# Recount on the subsetted object to confirm nothing unannotated remains.
remaining_missing = adata_ref.obs['subclass_label'].isna().sum()
print("Empty count (after removal): ", remaining_missing)

In [None]:
# Build an integer age column by removing the ' yrs' text from the age labels.
# NOTE(review): assumes every label looks like '<integer> yrs'; a missing or
# differently formatted label would make the integer cast fail — confirm.
age_text = adata_ref.obs['age_label'].str.replace(' yrs','')
adata_ref.obs['age_numeric'] = age_text.astype(int)


In [None]:
#### Write the finished reference object to disk
# Output location of the processed reference .h5ad file.
final_reference_path = "/tscc/lustre/ddn/scratch/aopatel/adata_ref.h5ad"
adata_ref.write_h5ad(final_reference_path)