In [None]:
# parameters

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import anndata as ad
# import scanpy as sc
# import scipy.stats
# from statsmodels.stats.multitest import multipletests
# from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
# Font type 42 for the PDF/PS backends plus a 300-dpi default for every figure.
plt.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'figure.dpi': 300,
})

from datetime import datetime 
# Seed NumPy's legacy global RNG so code drawing from it repeats across fresh runs.
np.random.seed(13)

# Presumably the directory where figures are saved; not referenced in the visible cells.
image_path = "/home/x-aklein2/projects/aklein/BICAN/BG/images"

# Timestamp taken once when this cell runs, at minute resolution.
current_datetime = datetime.strftime(datetime.now(), "%Y-%m-%d_%H:%M")

In [None]:
# Read the h5ad file into memory; the bare name on the last line shows its summary repr.
_adata_rna_file = "/home/x-aklein2/projects/aklein/BICAN/data/reference/AIT/AIT_PU.h5ad"
adata_rna = ad.read_h5ad(_adata_rna_file)
adata_rna

In [None]:
# Annotation columns in adata_rna.obs whose number of distinct values is reported below.
levels = ["Class", "Subclass", "Group", "Cluster"]
for _level in levels:
    _n_categories = adata_rna.obs[_level].nunique()
    print(f"Level: {_level}, Unique categories: {_n_categories}")

## Dispersion-based approach

In [None]:
# Folder that receives every CSV written by the cells below; created (with parents) if absent.
output_path = Path("/home/x-aklein2/projects/aklein/BICAN/BG/data/cluster_dispersion")
os.makedirs(output_path, exist_ok=True)

In [None]:
from sklearn.svm import SVR
def calculate_hvf_svr(adata, max_cells=50000, min_cells=20, random_state=None):
    """Score genes by their dispersion relative to an SVR-fitted mean/dispersion trend.

    For every gene the dispersion (variance / mean across cells) is computed, an
    RBF-kernel SVR is fitted to predict log2(dispersion) from log2(mean), and the
    residual (observed minus predicted log2 dispersion) is reported as the score.

    Parameters
    ----------
    adata
        AnnData-like object exposing ``n_obs``, ``obs``, ``X`` and ``var_names``.
        ``X`` may be a sparse matrix (anything with ``.toarray()``) or a dense array.
    max_cells : int
        If ``adata`` has more cells than this, a random subset of this size is used.
    min_cells : int
        Minimum number of cells required (checked after any subsampling).
    random_state : optional
        Passed to ``DataFrame.sample`` when subsampling. ``None`` (default) keeps the
        previous behaviour of drawing from the global NumPy random state.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene``, ``svr_score`` and ``dispersion``, sorted by ``svr_score``
        in descending order.

    Raises
    ------
    ValueError
        If fewer than ``min_cells`` cells are available.
    """
    if adata.n_obs > max_cells:
        sampled_cells = adata.obs.sample(max_cells, random_state=random_state).index
        adata = adata[sampled_cells, :].copy()
    if adata.n_obs < min_cells:
        raise ValueError(f"Not enough cells ({adata.n_obs}) to calculate HVF.")

    # X may be sparse or already dense; only the sparse form offers .toarray().
    # toarray() returns a fresh array and expr is never modified, so no extra copy is made.
    matrix = adata.X
    expr = matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)

    expr_mean = np.mean(expr, axis=0)
    expr_var = np.var(expr, axis=0, ddof=1)
    # The 1e-8 offsets keep the division and the logarithms finite for genes with zero mean/variance.
    dispersion = expr_var / (expr_mean + 1e-8)
    log2_disp = np.log2(dispersion + 1e-8)
    log2_expr_mean = np.log2(expr_mean + 1e-8)
    # Single-feature design matrix: one row per gene, one column (log2 mean expression).
    X = np.vstack([log2_expr_mean]).T

    # Kernel width scales inversely with the number of genes (rows of X).
    svr_gamma = 1000 / X.shape[0]
    svr = SVR(kernel='rbf', C=1.0, gamma=svr_gamma)
    svr.fit(X, log2_disp)

    # Positive score: the gene is more dispersed than the trend predicts at its mean expression.
    score = log2_disp - svr.predict(X)

    hvf_df = pd.DataFrame({
        "gene": adata.var_names,
        "svr_score": score,
        "dispersion": dispersion
    }).sort_values("svr_score", ascending=False)

    return hvf_df

In [None]:
# Dataset-wide baseline table. The ratio step later in the notebook reads hvf_svr_all.csv,
# so it must exist on a fresh run; the computation is skipped when a cached copy is on disk.
_fout_all = output_path / "hvf_svr_all.csv"
if not _fout_all.exists():
    calculate_hvf_svr(adata_rna).to_csv(_fout_all)

# From here on only the cluster level is processed.
levels = ["Cluster"]

In [None]:
# For every cell type at every requested level, compute the HVF table on that cell type's
# cells only and write it to hvf_svr_<level>_<cell type>.csv inside output_path.
for _level in levels:
    _labels = adata_rna.obs[_level]
    for _cell_type in _labels.unique().tolist():
        # Build the membership mask once; it serves both the cell count and the subset.
        _in_type = _labels == _cell_type
        print(f"Level: {_level}, Cell type: {_cell_type}, #cells: {_in_type.sum()}")
        try:
            hvf_df = calculate_hvf_svr(adata_rna[_in_type, :])
        except ValueError as e:
            # Raised by calculate_hvf_svr when the cell type has too few cells.
            print(f"  Skipping {_cell_type} due to error: {e}")
            continue
        # "/" and " " are replaced so the label is usable as a file name;
        # str() keeps this working for non-string labels.
        out_ct = str(_cell_type).replace("/", "_").replace(" ", "_")
        fout = output_path / f"hvf_svr_{_level}_{out_ct}.csv"
        hvf_df.to_csv(fout)

In [None]:
### TODO:
# - Load the results at the class, subclass, group, and cluster levels.
# - For each gene, look at the ratio of both its dispersion and its score to the dispersion / score calculated on the entire dataset.
# - For each gene, express the difference between the new and original values on a min-max scale where the min is 0 and the max is the original value.
# - (Account for edge cases where the dispersion was 0 in both the old and the new calculation!)
# - For varying thresholds np.arange(0, 2, 0.05), get the count per cell type of genes that are above the given threshold.
# - Plot a histogram of those counts colored by level (a separate histogram for each threshold!).

In [None]:
# Compare each per-cell-type HVF table against the dataset-wide baseline and write one
# hvf_svr_ratios_<level>_<cell type>.csv per cell type, indexed by gene.
hvf_df_all = pd.read_csv(output_path / "hvf_svr_all.csv", index_col=0).set_index("gene")
for _level in levels:
    # dropna(): a missing label has no .replace and no per-type file to load.
    for _cell_type in adata_rna.obs[_level].dropna().unique().tolist():
        out_ct = str(_cell_type).replace("/", "_").replace(" ", "_")
        try:
            hvf_df = pd.read_csv(output_path / f"hvf_svr_{_level}_{out_ct}.csv", index_col=0)
        except FileNotFoundError as e:
            # No table was written for this cell type.
            print(f"  Skipping {_cell_type} due to error: {e}")
            continue
        hvf_df = hvf_df.set_index("gene")
        # Inner join on gene; overlapping columns get the _sub (cell type) / _all (baseline) suffixes.
        hvf_df = hvf_df.join(hvf_df_all, lsuffix="_sub", rsuffix="_all", how="inner")
        hvf_df['dispersion_ratio'] = hvf_df['dispersion_sub'] / (hvf_df['dispersion_all'] + 1e-8)
        # NOTE(review): svr_score is a signed residual (log2 dispersion minus the SVR prediction),
        # so this ratio can flip sign or blow up when the baseline score is close to zero -
        # TODO confirm this is the intended comparison.
        hvf_df['score_ratio'] = hvf_df['svr_score_sub'] / (hvf_df['svr_score_all'] + 1e-8)
        # Min-max scale each ratio across the genes of this cell type.
        for _ratio_col in ('dispersion_ratio', 'score_ratio'):
            _lo = hvf_df[_ratio_col].min()
            _hi = hvf_df[_ratio_col].max()
            hvf_df[f"{_ratio_col}_mm"] = (hvf_df[_ratio_col] - _lo) / (_hi - _lo + 1e-8)
        fout = output_path / f"hvf_svr_ratios_{_level}_{out_ct}.csv"
        hvf_df.to_csv(fout)

In [None]:
# Build all_levels[level][cell_type] = [(threshold, n_genes_dispersion, n_genes_score), ...]:
# for each threshold, the number of genes whose scaled ratio exceeds it.
# NOTE(review): the *_mm columns are min-max scaled when the ratio tables are built, so they
# stay below 1 and every threshold >= 1 counts zero genes - TODO confirm whether the raw
# ratio columns were meant here.
all_levels = {}
_thresholds = np.arange(0, 2.05, 0.05)
for _level in levels:
    level_specific = {}
    # dropna(): a missing label has no .replace and no ratio file to load.
    for _cell_type in adata_rna.obs[_level].dropna().unique().tolist():
        out_ct = str(_cell_type).replace("/", "_").replace(" ", "_")
        try:
            # The ratio tables are written with the gene names as their index, so the first
            # CSV column already is the gene index. A further set_index("gene") would raise
            # KeyError because "gene" is no longer a column.
            hvf_df = pd.read_csv(output_path / f"hvf_svr_ratios_{_level}_{out_ct}.csv", index_col=0)
        except FileNotFoundError as e:
            print(f"  Skipping {_cell_type} due to error: {e}")
            continue
        level_specific[_cell_type] = [
            (
                thr,
                (hvf_df['dispersion_ratio_mm'] > thr).sum(),
                (hvf_df['score_ratio_mm'] > thr).sum(),
            )
            for thr in _thresholds
        ]
    all_levels[_level] = level_specific