# Query Construction (snRNAseq)

In [None]:
import query_construction_utils_01 as qcu
import pandas as pd
import scanpy as sc
import anndata as ad
import doubletdetection

%matplotlib inline
import matplotlib.pyplot as plt


## 1. Data Input

###  A. Load data and metadata

In [None]:
# Metadata table; it is passed both to the file reader below and to the
# metadata-attachment step later in the notebook.
meta_data = pd.read_csv('mtg_meta_data.csv')

## Input directory -- change to match the upstream tool:
#   cellbender  -> '/tscc/lustre/ddn/scratch/aopatel/cellbender_results/'
#   Cell Ranger -> '/tscc/lustre/ddn/scratch/aopatel/mtg_h5_for_analytics'
base_dir = '/tscc/lustre/ddn/scratch/aopatel/mtg_h5_for_analytics'

# Load one object per sample. The directory above is the Cell Ranger one,
# hence cellbender=False.
adatas = qcu.read_h5_files(
    meta_data,
    base_dir,
    cellbender=False,
    filtered=True,
    make_unique=True,
)


<div class="alert alert-block alert-info">
<b>CHECK DUPLICATED GENES. In the MTG, the duplicated gene symbols are 'MKKS', 'DNAJC9-AS1', 'DDX11L16', 'TNFRSF10A-DT', 'LINC01605', 'LINC02256', 'LSP1P5', 'RAET1E-AS1', 'LINC03025', 'NPIPA9', 'PRICKLE2-AS1', 'ARMCX5-GPRASP2', 'SPATA13', 'ELFN2', 'LINC01238', 'GPR84-AS1', 'LINC00484', 'LINC03023', 'LINC03021', 'LINC01115', 'GOLGA8M'. They are duplicated in 127/127 samples. We will not change them.</b>

</div>

In [None]:
# Run the project's uniqueness check over every loaded sample.
# NOTE(review): presumably this reports duplicated gene symbols per sample;
# what it prints or returns is defined in query_construction_utils_01 -- confirm there.
qcu.check_unique(adatas)

In [None]:
#### Calculate number of cells in each adata object BEFORE pre-processing

# NOTE(review): the file name says "cellbender", but this notebook loads the
# Cell Ranger matrices (see base_dir and cellbender=False in the loading cell).
# Confirm the name is intended before comparing against other runs.
path= "pre_pre_processing_cellbender_adatas_sizes.csv"
qcu.cell_count_caclulator(adatas, path)

### B. Add metadata to each sample (object in adatas[] list)

<div class="alert alert-block alert-info">
<b>Check the size of each object in adatas and make sure the metadata is added correctly!</b>

</div>


In [None]:
# Attach the metadata from `meta_data` to each object in `adatas`; the helper's
# return value replaces the list.
# NOTE(review): how samples are matched to metadata rows is decided inside
# meta_data_adder -- verify the matching rule in the helper.
adatas=qcu.meta_data_adder(adatas,meta_data)

In [None]:
#### Sanity Check!
# Print the .obs table of one sample to eyeball the result of the previous cell.
# The hard-coded index 101 requires at least 102 loaded samples
# (it appears to be an arbitrary pick -- adjust for smaller cohorts).
print(adatas[101].obs)

## 2. Quality Control (QC) and Preprocessing

### A. Initial view and assessment

In [None]:
#### Pre-QC floor (min_genes=1000): get rid of cells that do not express
#### at least 1000 genes, before any other QC is applied.

adatas = [
    qcu.quality_controller(adata, min_genes=1000, is_indexed=True)
    for adata in adatas
]



In [None]:
#### Violin plot visualization

# Draw the violin plots for every sample. The loop variable is named `adata`
# rather than `ad`: a module-level `for ad in ...` rebinds `ad` and would
# clobber the `import anndata as ad` alias for the rest of the notebook.
for adata in adatas:
    qcu.violin_plots(adata)
    

### B. Preprocessing

In [None]:
# Run the project pre-processing step on every sample with mt_thresh=5 and
# hb_thresh=1 (presumably percent cut-offs for the mitochondrial and
# hemoglobin QC metrics -- confirm in the helper).
adatas = [
    qcu.pre_processor(adata, mt_thresh=5, hb_thresh=1)
    for adata in adatas
]

In [None]:
#### OPTIONAL: For toggling through different QC metrics
#path= "mt_5_pre_pre_processing_cellbender_adatas_sizes.csv"
#qcu.cell_count_caclulator(adatas, path)

### C. Doublet removal (Scrublet & DoubletDetection)

In [None]:
#### Build the DoubletDetection classifier (clf) that is handed to the
#### doublet-removal step in the next cell

clf = doubletdetection.BoostClassifier(
    n_jobs=-1,
    n_iters=10,
    pseudocount=0.1,
    standard_scaling=True,
    clustering_algorithm="leiden",
)

In [None]:
#### The expected doublet rate is estimated from the technology used, the
#### estimated input and the expected yield.
#### Please check the associated helper functions page for more info.

adatas = [
    qcu.de_doubletor(
        adata,
        to_filter=True,
        expected_doublet_rate=0.08,
        p_thresh=1e-16,
        voter_thresh=0.5,
        clf=clf,
    )
    for adata in adatas
]

In [None]:
#### Calculate number of cells in each adata object AFTER pre-processing

# NOTE(review): the file name says "cellbender", but this notebook loads the
# Cell Ranger matrices (see base_dir and cellbender=False in the loading cell).
# Confirm the name is intended before comparing against other runs.
path= "post_pre_processing_cellbender_adatas_sizes.csv"
qcu.cell_count_caclulator(adatas, path)

### D. Concatenation

In [None]:
#### Merge all per-sample objects into one
# index_unique="-" makes the cell index unique, because the same barcode may
# occur in more than one sample.
merged_adata = sc.concat(
    adatas,
    join='outer',
    index_unique="-",
)


In [None]:
# Display the merged object's summary as the cell output.
merged_adata

In [None]:
#### sc.concat drops the .var annotations.
#### It is critical to re-attach the .var data CORRECTLY!

## stack the .var DataFrame of every sample (outer join keeps all columns)
all_var = pd.concat([adata.var for adata in adatas], join="outer")
## keep only the first row seen for each gene
all_var = all_var.loc[~all_var.index.duplicated()]

## re-attach, ordered to match the genes of the merged object
merged_adata.var = all_var.loc[merged_adata.var_names]

In [None]:
# Print the merged object again, now that .var has been re-attached.
print(merged_adata)

### E. Gene filter and recalculation of QC metrics (it's okay if the percentages change, since we are removing some genes)

In [None]:
#### Gene filter: keep only genes detected in at least 25 cells
sc.pp.filter_genes(merged_adata, min_cells=25)

#### QC metrics are recomputed now that the samples are merged
sc.pp.calculate_qc_metrics(
    merged_adata,
    qc_vars=["mt", "ribo", "hb"],
    percent_top=[20],
    log1p=True,
    inplace=True,
)

In [None]:
## Another sanity check!
merged_adata.var

<div class="alert alert-block alert-info">
<b>Save this file. After the reference is constructed, we will use this file as the query for cell type classification.</b>

</div>

In [None]:
# Save progress up to this point, just in case
# Writes the merged object to an .h5ad file at a hard-coded scratch path;
# adjust the path when running in a different environment.
merged_adata.write_h5ad("/tscc/lustre/ddn/scratch/aopatel/preprocessed_adata_mtg.h5ad")