In [None]:
import os
from pathlib import Path
import ast

import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import scvi

from spida.P.setup_adata import multi_round_clustering, _calc_embeddings

import matplotlib.pyplot as plt
import seaborn as sns
from spida.pl import plot_categorical, plot_continuous
# Draw every matplotlib axes in this notebook on a white background.
plt.rcParams['axes.facecolor'] = 'white'

from datetime import datetime 
# Timestamp string for this run. NOTE(review): the "%H:%M" part puts a ':' in the
# string, which some filesystems reject in file names; the variable is not
# referenced in the cells visible here.
current_datetime = datetime.now().strftime("%Y-%m-%d_%H:%M")

In [None]:
#parameters
# Run configuration. The `#papermill_description=` markers used in later cells
# suggest this notebook is executed through papermill, which would override these
# defaults — TODO confirm.
# EXPERIMENT names the dataset; it is used in the input path and as the
# sub-directory for images and models.
EXPERIMENT = "PU" 
# prefix is combined with EXPERIMENT to form the input directory name.
prefix = "BICAN_BG"
# suffix is not referenced in the cells visible here.
suffix = "proseg_fv38_filt"
# Root directory that holds the per-experiment .h5ad file.
output_dir = "/home/x-aklein2/projects/aklein/BICAN/BG/data/annotation"
# Base directories; a later cell appends EXPERIMENT to both and converts them to Path.
image_path = "/home/x-aklein2/projects/aklein/BICAN/BG/images/annotations"
model_path = "/home/x-aklein2/projects/aklein/BICAN/BG/data/annotation/models"

In [None]:
# Input AnnData file: <output_dir>/<prefix>_<EXPERIMENT>/<EXPERIMENT>.h5ad
adata_path = Path(f"{output_dir}/{prefix}_{EXPERIMENT}/{EXPERIMENT}.h5ad")

In [None]:
# ref_adata_path = Path(f"/home/x-aklein2/projects/aklein/BICAN/data/reference/AIT/AIT_{REF_EXP}_filtered.h5ad")

# Table of differentially expressed genes; its rows are turned into marker sets
# further down.
deg_path = Path("/home/x-aklein2/projects/aklein/BICAN/data/reference/DEGs/summary_neuron.csv")
degs = pd.read_csv(deg_path)

# Narrow the base locations to this experiment. Both names are rebound from str
# to Path, so running this cell a second time appends EXPERIMENT again.
image_path = Path(image_path, EXPERIMENT)
model_path = Path(model_path, EXPERIMENT, "clustering.pt")

# Make sure the image directory and the directory that will hold the model exist.
for _out_dir in (image_path, model_path.parent):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
#papermill_description=Reading AnnData
# Load the experiment's AnnData object from adata_path; the bare `adata` on the
# last line displays its summary as the cell output.
adata = ad.read_h5ad(adata_path)
adata

In [None]:
# From SCVI SCANVI tutorial 
def get_score(normalized_adata, gene_set):
    """Returns the score per cell given a dictionary of + and - genes.

    The score of a cell is the sum of its expression over the positive genes
    minus the sum of its expression over the negative genes. A gene listed more
    than once is counted once per occurrence.

    Parameters
    ----------
    normalized_adata
      anndata dataset that has been log normalized and scaled to mean 0, std 1.
      Its ``.X`` may be stored either as a sparse matrix (anything exposing
      ``toarray()``) or as a dense array.
    gene_set
      a dictionary with two keys: 'positive' and 'negative'
      each key should contain a list of genes
      for each gene in gene_set['positive'], its expression will be added to the score
      for each gene in gene_set['negative'], its expression will be subtracted from its score

    Returns
    -------
    array of length of n_cells containing the score per cell

    Notes
    -----
    Genes are looked up one at a time with ``normalized_adata[:, gene]``; a gene
    that the object does not contain makes that lookup fail, so filter the gene
    lists beforehand if that can happen.
    """

    def _dense_column(values):
        # Scaled data is usually dense while raw/normalised layers are often
        # sparse; only call toarray() when the container actually provides it.
        if hasattr(values, "toarray"):
            values = values.toarray()
        return np.asarray(values).ravel()

    score = np.zeros(normalized_adata.n_obs)
    for gene in gene_set["positive"]:
        score += _dense_column(normalized_adata[:, gene].X)
    for gene in gene_set["negative"]:
        score -= _dense_column(normalized_adata[:, gene].X)
    return score


def get_cell_mask(normalized_adata, gene_set, n_top=20):
    """Get cell mask.

    Calculates the score per cell for a list of genes, then returns a mask for
    the ``n_top`` cells with the highest scores.

    Parameters
    ----------
    normalized_adata
      anndata dataset that has been log normalized and scaled to mean 0, std 1
    gene_set
      a dictionary with two keys: 'positive' and 'negative'
      each key should contain a list of genes
      for each gene in gene_set['positive'], its expression will be added to the score
      for each gene in gene_set['negative'], its expression will be subtracted from its score
    n_top
      number of top-scoring cells to select (default 20). If it is larger than
      the number of cells, every cell is selected; 0 selects no cell.

    Returns
    -------
    Boolean array of length n_cells that is True for the ``n_top`` cells with
    the highest scores over the entire dataset

    Raises
    ------
    ValueError
      if ``n_top`` is negative
    """
    if n_top < 0:
        raise ValueError("n_top must be non-negative")

    score = get_score(normalized_adata, gene_set)
    mask = np.zeros(normalized_adata.n_obs, dtype=bool)
    # Guard against n_top == 0: a slice of [-0:] would select every cell.
    if n_top > 0:
        mask[score.argsort()[-n_top:]] = True
    return mask

In [None]:
# First, longer candidate list of non-neuronal marker genes. It contains repeated
# entries ('CFAP299' and 'CFAP54' each appear twice) and is overwritten by the
# shorter `nonneuron` list assigned in a later cell before anything reads it.
nonneuron = ['COBLL1', 'ARHGAP29', 'EBF1', 'ARHGAP15', 'PTPRC', 'MBNL1', 'PCDH9', 'BCAS1', 'PDE4B', 'MSI2', 'GLIS3', 'NPAS3', 'CFAP299', 'SPAG17', 'CFAP54', 'CFAP299', 'CFAP54', 'TMEM232', 'NTM', 'LSAMP', 'CTNND2', 'KLHL1', 'CA10', 'CNTNAP2', 'TRPM3', 'DPP6', 'RGS6', 'ADGRV1', 'EPHA6', 'CADM1']

In [None]:
# Inspection only: list the per-cell metadata columns available in adata.obs.
adata.obs.columns

In [None]:
# Hand-picked marker gene lists for the two broad classes. They are used as the
# "positive" gene sets of the manually defined `markers` dictionary.
nonneuron = ["MAG", "MOBP", "BCAS1", "EBF1", "NPAS3", "COBLL1", "GAB1", "DOCK5", "GLIS3"]
neuron = ["RBFOX3", "GAD1", "GAD2", "CHAT", "VIP", "PVALB", "SST", "RORB", "LAMP5", "LHX6", "LHX8", "FOXP2", "PENK", "CELF2", "ST18", "MEIS2"]

In [None]:
# Inspection only: is MEIS2 among the genes present in adata.var_names?
'MEIS2' in adata.var_names

In [None]:
# ['SKAP1', 'FYN', 'PRKCH']	['PTPRC', 'MBNL1', 'ARHGAP15']
# ['NAV3', 'SYNDIG1', 'PLXDC2']	['PLXDC2', 'LRMDA', 'DOCK4']
# ['F13A1', 'ITSN1', 'FRMD4B']	['LRMDA', 'SLC9A9', 'DOCK2']
# ['PTPRZ1', 'LRRC4C', 'SNTG1']	['TNR', 'PTPRZ1', 'VCAN']
# ['CTNNA3', 'ST18', 'MAP7']	['CTNNA3', 'ST18', 'MBP']
# ['ARHGAP24', 'PLXDC2', 'DOCK8']	['CTNNA3', 'ST18', 'DOCK4']
# ['TMEM108', 'TRIO', 'FYN']	['BCAS1', 'TNS3', 'FYN']
# ['FLT1', 'ABCB1', 'ATP10A']	['FLT1', 'ABCB1', 'ELOVL7']
# ['DLC1', 'PDE8B', 'GPC5']	['DLC1', 'EPS8', 'CALD1']
# ['FLRT2', 'BICC1', 'ABCA8']	['LAMA2', 'CEMIP', 'UACA']
# ['AKAP6', 'MYH11', 'SLIT3']	['IGFBP7', 'LPP', 'TAGLN']

In [None]:
# Marker sets in the {"positive": [...], "negative": [...]} layout that get_score
# reads. The negative lists are left empty; the opposing gene list is commented out.
# NOTE(review): `markers` is reset to {} and rebuilt from `degs` in a later cell,
# so this dictionary is discarded before scoring — confirm which one is intended.
markers = {"neuron" : {"positive": neuron, "negative": []}, #nonneuron},
           "nonneuron" : {"positive": nonneuron, "negative":[]},# neuron}
}

In [None]:
# Upper bound on how many up-regulated and how many down-regulated genes are kept
# per cell type when the marker sets are built from `degs`.
top_n = 50

In [None]:
# Build one marker set per row of the DEG table. The 'top_upregulated' and
# 'top_downregulated' cells hold Python-literal lists stored as text, so they are
# parsed with ast.literal_eval and truncated to the first `top_n` genes.
markers = {}
for _row_idx, deg_row in degs.iterrows():
    cell_type_name = deg_row['cell_type']
    up_genes = ast.literal_eval(deg_row['top_upregulated'])[:top_n]
    down_genes = ast.literal_eval(deg_row['top_downregulated'])[:top_n]
    print(cell_type_name, up_genes, down_genes)
    markers[cell_type_name] = {"positive": up_genes, "negative": down_genes}

In [None]:
# Put a copy of the 'volume_norm' layer into X and scale it in place with scanpy,
# clipping values at 10. The scoring cell that follows reads this scaled X and
# afterwards restores X from 'volume_norm'.
adata.X = adata.layers['volume_norm'].copy()
sc.pp.scale(adata, max_value=10)

In [None]:
#papermill_description=Calculating seed labels with marker genes
# Score every marker set against the scaled matrix currently held in adata.X.
# Each result is kept in `scores` and also written to adata.obs as
# "<cell_type>_score".
scores = {}
for cell_type in markers:
    gene_list = markers[cell_type]
    score = get_score(adata, gene_list)
    scores[cell_type] = score
    adata.obs[cell_type + "_score"] = score

# Keep the scaled matrix as its own layer, then restore X from 'volume_norm'.
adata.layers['scaled'] = adata.X.copy()
adata.X = adata.layers['volume_norm'].copy()

In [None]:
# Per-cell difference between the neuron and nonneuron scores; positive values
# mean the neuron score is the larger one.
# NOTE(review): requires 'neuron_score' and 'nonneuron_score' in adata.obs, i.e.
# `markers` must contain the keys "neuron" and "nonneuron" when the scores are
# computed — presumably these are cell_type values in the DEG table; confirm.
score_diff = adata.obs['neuron_score'] - adata.obs['nonneuron_score']
adata.obs['Neuron_vs_Nonneuron'] = score_diff

In [None]:
# One histogram per panel: neuron score, nonneuron score, and their difference.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
score_panels = (
    (scores['neuron'], 'Neuron Score Distribution'),
    (scores['nonneuron'], 'Nonneuron Score Distribution'),
    (score_diff, 'Neuron - Nonneuron Score Distribution'),
)
for panel_ax, (panel_values, panel_title) in zip(axes, score_panels):
    sns.histplot(panel_values, ax=panel_ax)
    panel_ax.set_title(panel_title)
# plt.savefig(image_path / f"{prefix}_{EXPERIMENT}_neuron_nonneuron_score
plt.show()

In [None]:
# Label cells by the neuron-minus-nonneuron score difference: below -1 becomes
# "nonneuron", above 1 becomes "neuron". Cells in between are not assigned here.
neuron_margin = adata.obs['Neuron_vs_Nonneuron']
is_nonneuron_cell = neuron_margin < -1
is_neuron_cell = neuron_margin > 1
adata.obs.loc[is_nonneuron_cell, "is_neuron"] = "nonneuron"
adata.obs.loc[is_neuron_cell, "is_neuron"] = "neuron"

In [None]:
# For each 'is_neuron' label, count how many cells carry each 'allcools_Subclass'
# value, and keep the counts as a one-column DataFrame.
composition = adata.obs.groupby('is_neuron')['allcools_Subclass'].value_counts().to_frame()

In [None]:
# Show the first ten rows of the subclass counts for each label, neuron first.
for class_label in ('neuron', 'nonneuron'):
    display(composition.loc[class_label].head(n=10))