# Query Construction (snRNAseq)

In [None]:
import query_construction_utils as qcu
import pandas as pd
import scanpy as sc
import anndata as ad
import numpy as np
import scanpy.external as sce
import scrublet as scr
import scipy.sparse as sparse


import random
# Seed the global RNGs for reproducibility.
# random.seed() only affects Python's stdlib generator; code that draws from
# numpy.random's global state needs its own seed.
random.seed(11)
np.random.seed(11)

## 1. Data Processing

###  A. Load Data and Metadata

In [None]:
# Read the sample metadata table, then hand it to the project loader together
# with the directory holding the .h5 files (filtered=True is forwarded as-is).
# NOTE(review): presumably returns one AnnData per metadata row — confirm in
# query_construction_utils.read_h5_files.
meta_data = pd.read_csv('mtg_meta_data.csv')
adatas = qcu.read_h5_files(
    meta_data=meta_data,
    base_dir='/tscc/lustre/ddn/scratch/aopatel/mtg_h5',
    filtered=True,
)

<div class="alert alert-block alert-info">
<b> CHECK DUPLICATED GENES. In the MTG, the duplicated gene symbols are 'MKKS', 'DNAJC9-AS1', 'DDX11L16', 'TNFRSF10A-DT', 'LINC01605', 'LINC02256', 'LSP1P5', 'RAET1E-AS1', 'LINC03025', 'NPIPA9', 'PRICKLE2-AS1', 'ARMCX5-GPRASP2', 'SPATA13', 'ELFN2', 'LINC01238', 'GPR84-AS1', 'LINC00484', 'LINC03023', 'LINC03021', 'LINC01115', 'GOLGA8M'. They are duplicated in 127/127 samples. We will not change them.</b>

</div>

In [None]:
# Report, per loaded object, whether its variable names are unique and, if
# not, which names occur more than once.
for idx, sample in enumerate(adatas):
    names = sample.var_names
    if not names.is_unique:
        freq = names.value_counts()
        repeated = freq[freq > 1]
        print(f"AnnData object {idx}: Variable names are not unique. Duplicated names: {list(repeated.index)}")
        continue
    print(f"AnnData object {idx}: Variable names are unique.")

<div class="alert alert-block alert-info">
<b> Check the size of each object in adatas: ensure the number of "genes" is the same across samples and that the gene identifiers are in the correct format. In the MTG, all samples have 38,606 gene symbols.</b>

</div>


In [None]:
# Print every loaded object next to its list position so the shapes can be
# compared by eye.
for position, sample in enumerate(adatas):
    print(f"Index: {position}, Value: {sample}")

### B. Add Metadata to each sample (object in adatas[] list)

In [None]:
# Validate the sample identifier column before it is used as a lookup key:
# every 10X_ID must be present and must appear only once.
sample_id_column = meta_data["10X_ID"]

has_duplicate_ids = sample_id_column.duplicated().any()
if has_duplicate_ids:
    raise ValueError("meta_data contains duplicate 10X_IDs – fix your CSV!")

has_missing_ids = sample_id_column.isna().any()
if has_missing_ids:
    raise ValueError("meta_data contains missing 10X_IDs")

In [None]:
# Attach the sample-level metadata row to every cell of each sample.
# The 10X_ID-indexed lookup table is built once, not once per sample.
meta_by_sample = meta_data.set_index('10X_ID')

for adata in adatas:
    # Every cell of a sample is expected to carry the same 10X_ID; verify it
    # instead of silently trusting the first row.
    ids_in_sample = adata.obs['10X_ID'].unique()
    if len(ids_in_sample) != 1:
        raise ValueError(
            f"Expected exactly one 10X_ID per sample, found {len(ids_in_sample)}: {list(ids_in_sample)}"
        )
    sample_id = ids_in_sample[0]

    # Fail with an explicit message when the sample has no metadata row.
    if sample_id not in meta_by_sample.index:
        raise KeyError(f"10X_ID {sample_id!r} not found in meta_data")

    meta_row = meta_by_sample.loc[sample_id]                    # exact lookup by sample id
    adata.obs = adata.obs.assign(**meta_row.to_dict())          # adds every metadata column to every cell in this sample

In [None]:
# Spot check: print the per-cell table of one sample after the metadata join
inspected_sample = adatas[107]
print(inspected_sample.obs)

### C. Change index column of each object in adatas[] to gene_id

In [None]:
# Switch the var index of every sample to its `gene_ids` column, keeping the
# previous index labels in a `gene_symbols` column.
#
# All samples are validated before any of them is modified, so a failure
# cannot leave the list half-converted. Samples that were already converted
# (e.g. when this cell is re-run) are skipped.
to_convert = []
for adata in adatas:
    # Already indexed by gene_ids: nothing left to do for this sample
    if adata.var.index.name == "gene_ids" and "gene_ids" not in adata.var.columns:
        continue

    # Ensure required column exists
    if "gene_ids" not in adata.var.columns:
        raise ValueError("Expected 'gene_ids' column in adata.var")

    # Check uniqueness before switching
    if not adata.var["gene_ids"].is_unique:
        raise ValueError("gene_ids are not unique—cannot safely set as index")

    to_convert.append(adata)

for adata in to_convert:
    # Move gene symbols to a separate column if not already present
    if "gene_symbols" not in adata.var.columns:
        adata.var["gene_symbols"] = adata.var.index

    # Set gene_ids as index
    adata.var.set_index("gene_ids", inplace=True)


In [None]:
# Spot check: display the var table of one sample (left as a bare expression
# so the notebook renders it).
adatas[120].var

## 2. Quality Control (QC) and Final Merge

### A. Initial view and assessment

In [None]:
# Run the project's pre-QC overview helper and keep the list it returns for
# the following steps.
# NOTE(review): presumably annotates each sample with QC metrics — confirm in
# query_construction_utils.pre_QC_view.
adatas = qcu.pre_QC_view(adatas)

### B. Filtering

In [None]:
# Apply the per-cell QC thresholds through the project helper; it returns the
# filtered samples together with a summary table.
# NOTE(review): mt_thresh / hb_thresh are presumably mitochondrial and
# hemoglobin percentage cut-offs — confirm in query_construction_utils.
filtered_adatas, summary_df = qcu.QC_filtering(
    adatas=adatas,
    min_genes=800,  # minimum number of non-zero valued genes a cell must have to be kept
    mt_thresh=5,
    hb_thresh=1,
)

### C. Doublet Removal

In [None]:
# Doublet step via the project helper, with an expected doublet rate of 0.08.
# The returned list replaces filtered_adatas.
filtered_adatas, doublet_summary_df = qcu.run_scrublet(
    filtered_adatas,
    expected_doublet_rate=0.08,
)


In [None]:
# Spot check: display one sample after QC filtering and the doublet step
# (left as a bare expression so the notebook renders it).
filtered_adatas[90]

In [None]:
# Filter cells with really high gene counts, sample by sample.
#
# The loop variable is deliberately NOT called `ad`: that name is the alias of
# the `anndata` module imported at the top of the notebook, and rebinding it
# here would break any later `ad.<...>` call.
UPPER_GENE_PERCENTILE = 99.75  # cells above this per-sample percentile are dropped

filtered_adatas_99 = []

for sample in filtered_adatas:
    if sample.n_obs == 0:
        # A percentile is not meaningful on an empty sample; keep the empty
        # object so list positions stay aligned with filtered_adatas.
        print("Sample before: 0, after: 0 (removed 0)")
        filtered_adatas_99.append(sample.copy())
        continue

    gene_counts = sample.obs['n_genes_by_counts']

    # per-sample upper threshold on the number of detected genes
    thresh = np.percentile(gene_counts, UPPER_GENE_PERCENTILE)

    # boolean mask of the cells to keep
    keep = gene_counts <= thresh
    n_kept = int(keep.sum())
    print(f"Sample before: {sample.n_obs}, after: {n_kept} (removed {sample.n_obs - n_kept})")

    # subset (copy so the result is not a view of the input)
    filtered_adatas_99.append(sample[keep].copy())

### D. Concatenation

In [None]:
# Merge all per-sample objects into a single AnnData.
# index_unique="-" is passed because barcodes may be similar between samples.
merged_adata = sc.concat(
    filtered_adatas_99,
    join='outer',
    index_unique="-",
)


In [None]:
# Display a summary of the merged object (bare expression so the notebook renders it)
merged_adata

In [None]:
# Rebuild the var annotation table of the merged object from the per-sample
# var tables.
per_sample_var = [sample.var for sample in filtered_adatas_99]
stacked_var = pd.concat(per_sample_var, join="outer")

# keep only the first row seen for each index label
is_repeat = stacked_var.index.duplicated()
all_var = stacked_var[~is_repeat]

# align the rows to the merged object's variable order before assigning
merged_adata.var = all_var.loc[merged_adata.var_names]

In [None]:
# Print the merged object's summary after its var table has been re-attached
print(merged_adata)

### E. Gene filter

In [None]:
# Drop genes that are not detected in at least 25 cells, then report how many
# genes remain.
sc.pp.filter_genes(merged_adata, min_cells=25)
n_genes_kept = merged_adata.n_vars
print(f"Number of genes after filtering: {n_genes_kept}")

In [None]:
# Derive an integer age per cell; the label '90+' is mapped to 90.
# Casting to str first keeps this working when samplingAge was parsed as a
# numeric column (i.e. no '90+' entries), where the .str accessor would raise
# AttributeError. Missing ages still raise at the final integer cast.
age_labels = (
    merged_adata.obs['samplingAge']
    .astype(str)
    .str.strip()
    .str.replace('90+', '90', regex=False)
)
merged_adata.obs['age_numeric'] = pd.to_numeric(age_labels).astype(int)


In [None]:
# Display the merged object again; obs should now include 'age_numeric'
merged_adata

<div class="alert alert-block alert-info">
<b> Save the final file (the one we are going to use for clustering and analysis!)</b>

</div>


In [None]:
# Checkpoint: write the merged, gene-filtered object to disk before any
# normalization is applied.
output_path = "/tscc/lustre/ddn/scratch/aopatel/fin_adata_mtg.h5ad"
merged_adata.write_h5ad(output_path)

### F. Check UMAP prior to any Batch Correction

In [None]:
# Keep an untouched copy of the current matrix in a layer before X is transformed
merged_adata.layers["counts"] = merged_adata.X.copy()

# Normalize each cell to a total of 1e4, then log1p-transform.
# Scaling is currently disabled (call left commented out below).
sc.pp.normalize_total(merged_adata, target_sum=1e4)
sc.pp.log1p(merged_adata)
# sc.pp.scale(merged_adata, max_value=10)

# Highly variable genes with flavor "seurat_v3": it takes raw data only, so it
# reads from the untouched "counts" layer rather than the transformed X.
sc.pp.highly_variable_genes(
    merged_adata,
    n_top_genes=2000,
    flavor="seurat_v3",
    layer="counts",
    batch_key='libraryBatch',
)

# PCA restricted to the highly variable genes, followed by the
# variance-ratio plot for the 50 components.
# NOTE(review): newer scanpy releases deprecate use_highly_variable in favour
# of mask_var — confirm against the installed version before changing.
sc.tl.pca(merged_adata, n_comps=50, use_highly_variable=True)
sc.pl.pca_variance_ratio(merged_adata, log=True, n_pcs=50)

In [None]:
# Build the neighbor graph, cluster with Leiden, and create the UMAP embedding.
# The same random_state (11) is passed to every step.
sc.pp.neighbors(
    merged_adata,
    n_neighbors=15,  # n_neighbors=30 can be used as well
    random_state=11,
)
sc.tl.leiden(
    merged_adata,
    resolution=0.50,  # adjust resolution as needed
    key_added='leiden',
    random_state=11,
)

# min_dist was changed for optimal graphing
sc.tl.umap(merged_adata, min_dist=0.15, random_state=11)

In [None]:
# Plot the UMAP embedding, colored by the 'leiden' cluster labels
sc.pl.umap(merged_adata, color="leiden", size=2)

In [None]:
# NOTE: these two helpers will only work on SEA-AD data (per the original
# author's note; the reason is not visible in this notebook — see
# query_construction_utils).
qcu.umi_distribution(merged_adata)
qcu.umi_distribution_diagnosis(merged_adata)
