In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc

import matplotlib.pyplot as plt
import seaborn as sns
from PyComplexHeatmap import *

### Functions

In [None]:
def _plot_overlap_heatmap(use_adata, ref_col, qry_col, image_path=None, current_datetime=None):
    """
    Plot how each query label is distributed across the reference labels.

    Rows are `qry_col` labels and columns are `ref_col` labels; each cell holds the
    fraction of that row's cells carrying the column's label. Rows are grouped (and
    visually split) by the reference label in which they have their largest
    fraction, following the column order.

    Parameters
    ----------
    use_adata : AnnData
        Object whose `.obs` contains both label columns.
    ref_col : str
        `.obs` column used for the heatmap columns.
    qry_col : str
        `.obs` column used for the heatmap rows. Its category codes are used as
        numeric prefixes in the row labels; non-categorical columns are converted.
    image_path : str or path-like, optional
        If given, the figure is saved to this path before it is shown.
    current_datetime : optional
        Accepted for backward compatibility; not used.

    Returns
    -------
    None
    """
    obs = use_adata.obs

    # Count cells per (query, reference) pair. The counts column is named explicitly
    # because pandas < 2.0 leaves it unnamed after value_counts().
    vc = obs.loc[:, [qry_col, ref_col]].value_counts().rename('count').reset_index()
    row_totals = vc.groupby(qry_col)['count'].sum()
    vc['N'] = vc[qry_col].map(row_totals).astype(int)
    # Query labels without any cell (e.g. unused categories) cannot be normalised
    # and have no row label code, so they are left out of the heatmap.
    vc = vc.loc[vc['N'] > 0].copy()
    vc['fraction'] = vc['count'] / vc['N']
    data = vc.pivot(index=qry_col, columns=ref_col, values='fraction')

    # Assign every row to the reference label where its fraction is largest.
    df_rows = data.index.to_series().to_frame()
    cols = data.columns.tolist()
    max_idx = np.argmax(data.fillna(0).values, axis=1)
    df_rows["GROUP"] = [cols[i] for i in max_idx]

    # Order rows group by group, following the column order of the heatmap.
    use_rows = []
    for col in cols:
        use_rows.extend(df_rows.loc[df_rows['GROUP'] == col, qry_col].unique().tolist())
    df_rows = df_rows.loc[use_rows]

    # Category code of each query label, read straight from the categories instead
    # of de-duplicating the whole obs table.
    qry_categories = obs[qry_col].astype('category').cat.categories
    ct2code = {cat: code for code, cat in enumerate(qry_categories)}
    df_rows['Label'] = [f"{ct2code[x]}: {x}" for x in df_rows[qry_col].tolist()]

    # Plot
    row_ha=HeatmapAnnotation(
        label=anno_label(df_rows.Label,colors='black',relpos=(0,0.5)),
        axis=0,orientation='right',
    )

    plt.figure(figsize=(24,12))
    ClusterMapPlotter(
        data.loc[df_rows.index.tolist()],row_cluster=False,col_cluster=False,cmap='Reds',
        right_annotation=row_ha,row_split=df_rows['GROUP'],row_split_gap=0.5,
        row_split_order=df_rows['GROUP'].unique().tolist(),
        show_rownames=False,show_colnames=True,yticklabels=True,xticklabels=True,
        xticklabels_kws=dict(labelrotation=-60,labelcolor='blue',labelsize=10),
        yticklabels_kws=dict(labelcolor='red',labelsize=10),
        annot=True,fmt='.2g',linewidth=0.05,linecolor='gold',linestyle='-:',
        label='fraction',legend_kws=dict(extend='both',extendfrac=0.1),
        xlabel=ref_col,ylabel=qry_col,
        xlabel_kws=dict(color='blue',fontsize=14,labelpad=5),xlabel_side='top',
        ylabel_kws=dict(color='red',fontsize=14,labelpad=5), # increase the padding manually via labelpad (points)
        # xlabel_bbox_kws=dict(facecolor='green'),
        # ylabel_bbox_kws=dict(facecolor='chocolate',edgecolor='red'),
        # standard_scale=0,
    )
    # Save before show(): some backends leave the figure empty once it has been shown.
    if image_path is not None:
        plt.savefig(image_path, bbox_inches='tight')
    plt.show()
    plt.close()


def plot_regional_composition_stacked(adata, region_col='brain_region', subclass_col='RNA.Subclass', 
                                     palette=None, figsize=(10, 6), dpi=300, 
                                     title="Regional Subclass Composition", show_percentages=True):
    """
    Create a stacked barplot showing the cumulative distribution of subclasses across brain regions.
    
    Parameters:
    -----------
    adata : AnnData
        Annotated data object containing observations
    region_col : str, default 'brain_region'
        Column name for brain regions (one bar per region)
    subclass_col : str, default 'RNA.Subclass'
        Column name for subclass annotations (one stacked segment per subclass)
    palette : dict, optional
        Mapping subclass -> color. If None, `adata.uns['AIT_subclass_palette']` is
        used when present, otherwise colors are sampled from matplotlib's tab20.
        Subclasses missing from the palette are drawn in '#888888'.
    figsize : tuple, default (10, 6)
        Figure size (width, height)
    dpi : int, default 300
        Figure resolution
    title : str, default "Regional Subclass Composition"
        Plot title
    show_percentages : bool, default True
        If True, bar segments are percentages of each region's cells; otherwise raw counts
    
    Returns:
    --------
    fig, ax : matplotlib figure and axes objects
    """
    
    # Cell counts per (region, subclass). observed=False keeps every category
    # combination and makes pandas' long-standing default explicit.
    composition_data = (
        adata.obs.groupby([region_col, subclass_col], observed=False)
        .size()
        .to_frame(name="count")
        .reset_index()
    )
    
    # Convert to percentage if requested
    if show_percentages:
        # Vectorised per-region total, aligned row by row with composition_data.
        region_totals = composition_data.groupby(region_col, observed=False)['count'].transform('sum')
        composition_data['percentage'] = (composition_data['count'] / region_totals) * 100
        value_col = 'percentage'
        ylabel = 'Percentage (%)'
    else:
        value_col = 'count'
        ylabel = 'Cell Count'
    
    # Regions as rows, subclasses as columns; regions without cells end up as zeros.
    pivot_data = composition_data.pivot(index=region_col, columns=subclass_col, values=value_col).fillna(0)
    
    # Set up color palette
    if palette is None:
        if hasattr(adata, 'uns') and 'AIT_subclass_palette' in adata.uns:
            palette = adata.uns['AIT_subclass_palette']
        else:
            # Fall back to evenly spaced samples of the tab20 colormap.
            n_colors = len(pivot_data.columns)
            palette = {cat: plt.cm.tab20(i/n_colors) for i, cat in enumerate(pivot_data.columns)}
    
    # Create the plot
    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
    
    # Stack one segment per subclass on top of the running total of each bar.
    bottom = np.zeros(len(pivot_data))
    for subclass in pivot_data.columns:
        heights = pivot_data[subclass].to_numpy()
        ax.bar(pivot_data.index, heights, bottom=bottom, 
               label=subclass, color=palette.get(subclass, '#888888'),
               edgecolor='white', linewidth=0.5)
        # New array rather than in-place add, so bars already drawn never share state.
        bottom = bottom + heights
    
    # Formatting
    ax.set_xlabel(region_col.replace('_', ' ').title())
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    
    # Rotate x-axis labels when there are many regions
    if len(pivot_data.index) > 6:
        ax.tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    
    return fig, ax

# Read Files

In [None]:
# All annotation inputs/outputs live in one directory; every path below is that
# directory plus a file name.
_ANNOT_DIR = "/home/x-aklein2/projects/aklein/BICAN/BG/data/annotation/BICAN_BG_ALL"

# RNA objects (per-cell annotations and joint ref/query integrations)
rna_neu_path = f"{_ANNOT_DIR}/BG_pfv8_neu.h5ad"
rna_neu_joint_path = f"{_ANNOT_DIR}/BG_pfv8_neu_joint.h5ad"
rna_nn_path = f"{_ANNOT_DIR}/BG_pfv8_nn.h5ad"
rna_nn_joint_path = f"{_ANNOT_DIR}/BG_pfv8_nn_joint.h5ad"
# Methylation joint objects and their cluster-to-subclass mapping tables
mc_neu_path = f"{_ANNOT_DIR}/BG_pfv8_mc_neu_joint.h5ad"
mc_nn_path = f"{_ANNOT_DIR}/BG_pfv8_mc_nn_joint.h5ad"
mc_neu_ct_mapping = f"{_ANNOT_DIR}/BG_pfv8_mc_neu_cell_mapping.tsv"
mc_nn_ct_mapping = f"{_ANNOT_DIR}/BG_pfv8_mc_nn_cell_mapping.tsv"

# Combined-modality outputs. These resolve to the same files as rna_neu_path / rna_nn_path.
neu_path = f"{_ANNOT_DIR}/BG_pfv8_neu.h5ad"
nn_path = f"{_ANNOT_DIR}/BG_pfv8_nn.h5ad"

## RNA Neurons

In [None]:
# Per-cell RNA neuron object, plus the joint (Modality == ref/query) object that
# is used below to map integrated clusters to reference subclasses.
rna_neu_adata = ad.read_h5ad(rna_neu_path)
rna_neu_joint_adata = ad.read_h5ad(rna_neu_joint_path)

In [None]:
# Share of cells whose cluster-level and cell-level subclass labels are identical.
c2c_labels = rna_neu_adata.obs['c2c_allcools_label_Subclass'].astype(str)
filt_labels = rna_neu_adata.obs['allcools_Subclass_filt'].astype(str)
n_agree = (c2c_labels == filt_labels).sum()
print("Overall agreement b/w cluster 2 cluster and cell 2 cell : %.3f%%" % 
      (n_agree / rna_neu_adata.obs.shape[0] * 100))

In [None]:
# _plot_overlap_heatmap(rna_neu_adata, ref_col="c2c_allcools_label_Subclass", qry_col="allcools_Subclass_filt")
# _plot_overlap_heatmap(rna_neu_adata, ref_col="allcools_Subclass_filt", qry_col="c2c_allcools_label_Subclass")

In [None]:
ref_batch_key = "ref"
qry_batch_key = "query"
integrated_col = "integrated_leiden"
ref_col = "Subclass"
qry_col = "all_round1_leiden"

# Per-modality slices of the joint object, computed once and reused below.
ref_obs = rna_neu_joint_adata.obs.query(f"Modality=='{ref_batch_key}'")
qry_obs = rna_neu_joint_adata.obs.query(f"Modality=='{qry_batch_key}'")

# For every integrated cluster keep only its most frequent label and that label's proportion.
ref_vc=ref_obs.groupby(integrated_col)[ref_col].value_counts(normalize=True).sort_values(ascending=False).reset_index()
spatial_vc=qry_obs.groupby(integrated_col)[qry_col].value_counts(normalize=True).sort_values(ascending=False).reset_index()
ref_vc.drop_duplicates(integrated_col,keep='first',inplace=True)
spatial_vc.drop_duplicates(integrated_col,keep='first',inplace=True)
ref_vc.rename(columns={'proportion':f'{ref_col}_proportion'},inplace=True)
spatial_vc.rename(columns={'proportion':f'{qry_col}_proportion'},inplace=True)
df_map=pd.concat([ref_vc.set_index(integrated_col),spatial_vc.set_index(integrated_col)],axis=1)
df_map[f'{ref_col}_cell_count']=df_map.index.to_series().map(ref_obs.groupby(integrated_col)[ref_col].count()).astype(int)
df_map[f'{qry_col}_cell_count']=df_map.index.to_series().map(qry_obs.groupby(integrated_col)[qry_col].count()).astype(int)
all_cat = set(df_map[ref_col].unique().tolist())

# Clusters dominated (> 75 %) by a single reference subclass are trusted at cluster level.
keeper_clusters = df_map.index[df_map[f'{ref_col}_proportion'] > 0.75].tolist()
df_rna = rna_neu_adata.obs[['allcools_Subclass_filt', 'c2c_allcools_label_Subclass', 'allcools_Subclass', 'allcools_Subclass_transfer_score', 'integrated_leiden']].copy()
display(df_rna.head())

# Three per-cell "votes"; "U" means the rule makes no call for that cell.
df_rna['final_Subclass'] = "U"
filt_label = df_rna['allcools_Subclass_filt'].astype(object)
c2c_label = df_rna['c2c_allcools_label_Subclass'].astype(object)
# eql_col: filtered cell-level label, only where it equals the cluster-level label
df_rna['eql_col'] = filt_label.where(filt_label == c2c_label, "U")
# keeper_clust: cluster-level label, only inside a keeper cluster
df_rna['keeper_clust'] = c2c_label.where(df_rna['integrated_leiden'].isin(keeper_clusters), "U")
# ff: unfiltered cell-level label, only when its transfer score exceeds 0.75
df_rna['ff'] = df_rna['allcools_Subclass'].astype(object).where(df_rna['allcools_Subclass_transfer_score'] > 0.75, "U")

du = pd.concat((df_rna['eql_col'].value_counts().to_frame(name='eql_cols'),
           df_rna['keeper_clust'].value_counts().to_frame(name="keeper_clust"),
           df_rna['ff'].value_counts().to_frame(name='ff')),
        axis=1).fillna(0).astype(int)

# Combine the votes. np.select takes the first matching condition, which mirrors an
# if/elif chain: both unset -> eql_col; both agree -> that label; only one set ->
# that one; otherwise the two disagree -> "unknown".
ff_unset = (df_rna['ff'] == "U").to_numpy()
keeper_unset = (df_rna['keeper_clust'] == "U").to_numpy()
votes_agree = (df_rna['ff'] == df_rna['keeper_clust']).to_numpy()
ff_vote = df_rna['ff'].to_numpy(dtype=object)
keeper_vote = df_rna['keeper_clust'].to_numpy(dtype=object)
eql_vote = df_rna['eql_col'].to_numpy(dtype=object)
df_rna['final_Subclass'] = np.select(
    [ff_unset & keeper_unset, votes_agree, ff_unset, keeper_unset],
    [eql_vote, ff_vote, keeper_vote, ff_vote],
    default="unknown",
)

df_rna['final_Subclass'] = df_rna['final_Subclass'].replace("U", "unknown")
final_annots = df_rna['final_Subclass'].unique()
print("Missing final annotations: ", all_cat - set(final_annots), set(df_rna['allcools_Subclass'].unique()) - set(final_annots))

vc_ef = df_rna[['eql_col', 'ff', 'keeper_clust', 'final_Subclass']].value_counts().reset_index()
vc_ef

In [None]:
# Copy the consensus label and the transfer score onto the AnnData object.
final_labels = df_rna['final_Subclass'].astype("category")
rna_neu_adata.obs['RNA.Subclass'] = final_labels.cat.remove_unused_categories().copy()
transfer_scores = df_rna['allcools_Subclass_transfer_score'].copy()
rna_neu_adata.obs['RNA.Subclass.Prob'] = transfer_scores
rna_neu_adata.obs['RNA.Subclass'].value_counts()

In [None]:
### Entropy of the (rounded) transfer-score distribution, one value per RNA subclass
from scipy.stats import entropy

subclass_entropies = {}
for _class in rna_neu_adata.obs['RNA.Subclass'].cat.categories:
    class_mask = rna_neu_adata.obs['RNA.Subclass'] == _class
    probs = rna_neu_adata.obs.loc[class_mask, 'RNA.Subclass.Prob']
    # Histogram of scores rounded to 3 decimals; entropy() normalises the counts itself.
    score_counts = probs.round(3).value_counts().sort_index()
    subclass_entropies[_class] = entropy(score_counts)
    print(f"Class: {_class}, Entropy: {subclass_entropies[_class]}")

In [None]:
# Bar chart of the per-subclass entropies, highest first.
df_rna_entropy = pd.DataFrame(
    index=subclass_entropies.keys(),
    data=subclass_entropies.values(),
    columns=['Entropy'],
).sort_values(by='Entropy', ascending=False)
fig, ax = plt.subplots(figsize=(6,4), dpi=200)
bars = sns.barplot(
    data=df_rna_entropy, x=df_rna_entropy.index, y='Entropy', ax=ax,
    edgecolor='black', linewidth=0.5, color='coral',
)
tick_labels = bars.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=45, horizontalalignment='right', fontsize=8)
ax.set_title("Entropy of RNA Subclass assignment probabilities")
ax.set_xlabel("RNA Subclass")
plt.tight_layout()
plt.show()

In [None]:
# Cells per (RNA.Subclass, donor)
donor_composition = (
    rna_neu_adata.obs
    .groupby(['RNA.Subclass', 'donor'])
    .size()
    .to_frame(name="count")
    .reset_index()
)

# Share of each donor within a subclass, in percent
subclass_totals = donor_composition.groupby('RNA.Subclass')['count'].sum()
per_row_total = donor_composition.groupby('RNA.Subclass')['count'].transform('sum')
donor_composition['percentage'] = (donor_composition['count'] / per_row_total) * 100

# Subclasses as rows, donors as columns
pivot_data = donor_composition.pivot(index='RNA.Subclass', columns='donor', values='percentage').fillna(0)

# One stacked bar per subclass
fig, ax = plt.subplots(figsize=(12, 8), dpi=200)
pivot_data.plot(
    kind='bar', stacked=True, ax=ax,
    edgecolor='black', linewidth=0.5,
    colormap='tab10',
)

ax.set_title("Donor Composition (%) for each RNA.Subclass", fontsize=14, fontweight='bold')
ax.set_xlabel("RNA.Subclass", fontsize=12)
ax.set_ylabel("Percentage (%)", fontsize=12)
ax.legend(title='Donor', bbox_to_anchor=(1.05, 1), loc='upper left')

# Slanted tick labels keep long subclass names readable
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Write the object, now carrying RNA.Subclass / RNA.Subclass.Prob, back to the
# file it was read from (rna_neu_path is overwritten in place).
rna_neu_adata.write_h5ad(rna_neu_path)

## Methylation Neurons

In [None]:
# Joint methylation object and the cluster -> subclass mapping table
# (tab-separated, first column used as the index).
mc_neu_adata = ad.read_h5ad(mc_neu_path)
df_map = pd.read_csv(mc_neu_ct_mapping, sep='\t', header=0, index_col=0)

In [None]:
# Disabled: recomputing the cluster mapping from the joint object. The mapping is
# read from the TSV file instead, so the lines below are kept for reference only.
# ref_vc=mc_neu_adata.obs.query(f"Modality=='mC'").groupby('leiden')["Subclass"].value_counts(normalize=True).sort_values(ascending=False).reset_index()
# spatial_vc=mc_neu_adata.obs.query(f"Modality=='MERSCOPE'").groupby('leiden')["all_round1_leiden"].value_counts(normalize=True).sort_values(ascending=False).reset_index()
# ref_vc.drop_duplicates('leiden',keep='first',inplace=True)
# spatial_vc.drop_duplicates('leiden',keep='first',inplace=True)
# ref_vc.rename(columns={'proportion':f'Subclass_proportion'},inplace=True)
# spatial_vc.rename(columns={'proportion':f'all_round1_leiden_proportion'},inplace=True)
# df_map=pd.concat([ref_vc.set_index('leiden'),spatial_vc.set_index('leiden')],axis=1)
# df_map[f'Subclass_cell_count']=df_map.index.to_series().map(mc_neu_adata.obs.query(f"Modality=='mC'").groupby('leiden')["Subclass"].count()).astype(int)
# df_map[f'all_round1_leiden_cell_count']=df_map.index.to_series().map(mc_neu_adata.obs.query(f"Modality=='MERSCOPE'").groupby('leiden')["all_round1_leiden"].count()).astype(int)
# Every subclass present in the unfiltered mapping; used later to report which
# subclasses are lost by the filters.
all_subclass = df_map['Subclass'].unique().tolist()

In [None]:
# Reload the mapping, then report after each filter: number of clusters, number of
# subclasses, subclasses lost so far, and the fraction of MERSCOPE cells still covered.
df_map = pd.read_csv(mc_neu_ct_mapping, sep='\t', header=0, index_col=0)
n_merscope_cells = mc_neu_adata.obs.query("Modality=='MERSCOPE'").shape[0]
print(df_map.shape[0], df_map['Subclass'].nunique(), df_map['all_round1_leiden_cell_count'].sum() / n_merscope_cells)

# Require more than 20 cells from each side in a cluster
df_map = df_map.query("Subclass_cell_count > 20 & all_round1_leiden_cell_count > 20")
lost_subclasses = set(all_subclass) - set(df_map['Subclass'].unique().tolist())
print(df_map.shape[0], df_map['Subclass'].nunique(), lost_subclasses, df_map['all_round1_leiden_cell_count'].sum() / n_merscope_cells)

# Require the top subclass to make up more than half of the cluster
df_map = df_map.query("Subclass_proportion > 0.5")
lost_subclasses = set(all_subclass) - set(df_map['Subclass'].unique().tolist())
print(df_map.shape[0], df_map['Subclass'].nunique(), lost_subclasses, df_map['all_round1_leiden_cell_count'].sum() / n_merscope_cells)

In [None]:
# Cluster-level label: look each cell's leiden cluster up in the filtered mapping;
# clusters that did not survive the filters become "unknown".
cluster_to_subclass = df_map['Subclass'].to_dict()
leiden_ids = mc_neu_adata.obs['leiden'].astype(int)
mc_neu_adata.obs['MC.Subclass.ct'] = (
    leiden_ids.map(cluster_to_subclass).fillna("unknown").astype('category')
)
mc_neu_adata.obs['MC.Subclass.ct'].value_counts()

In [None]:
# Keep only the MERSCOPE cells; from here on mc_neu_adata no longer holds the
# other modality of the joint object.
mc_neu_adata = mc_neu_adata[mc_neu_adata.obs['Modality'] == "MERSCOPE"].copy()
mc_neu_adata

In [None]:
# Cell-level label: the inferred subclass, with calls below 0.6 probability set to "unknown".
inferred = mc_neu_adata.obs['infer_Subclass'].astype("category").cat.remove_unused_categories()
mc_neu_adata.obs['MC.Subclass'] = inferred.copy()
mc_neu_adata.obs['MC.Subclass.Prob'] = mc_neu_adata.obs['infer_Subclass_prob'].copy()
# "unknown" has to be a category before it can be assigned
mc_neu_adata.obs['MC.Subclass'] = mc_neu_adata.obs['MC.Subclass'].cat.add_categories("unknown")
low_confidence = mc_neu_adata.obs['MC.Subclass.Prob'] < 0.6
mc_neu_adata.obs.loc[low_confidence, 'MC.Subclass'] = "unknown"
mc_neu_adata.obs['MC.Subclass'].value_counts()

In [None]:
# Share of cells whose cluster-level and cell-level methylation labels are identical.
ct_labels = mc_neu_adata.obs['MC.Subclass.ct'].astype(str)
cell_labels = mc_neu_adata.obs['MC.Subclass'].astype(str)
n_agree = (ct_labels == cell_labels).sum()
print("Overall agreement b/w cluster 2 cluster and cell 2 cell : %.3f%%" % 
      (n_agree / mc_neu_adata.obs.shape[0] * 100))

In [None]:
# _plot_overlap_heatmap(mc_neu_adata, ref_col="MC.Subclass.ct", qry_col="MC.Subclass")
# _plot_overlap_heatmap(mc_neu_adata, ref_col="MC.Subclass", qry_col="MC.Subclass.ct")

In [None]:
# Clusters whose top subclass exceeds 75 % are trusted at cluster level.
keeper_clusters = df_map.query("Subclass_proportion>0.75").index.tolist()
# Working copy of just the columns needed to build the consensus label.
df_mc = mc_neu_adata.obs[['MC.Subclass', 'MC.Subclass.Prob', 'MC.Subclass.ct', 'leiden']].copy()
df_mc.head()

In [None]:
# Three per-cell "votes"; "U" means the rule makes no call for that cell.
# Built with vectorised where()/isin() instead of a row-wise apply.
df_mc['final_Subclass'] = "U"
cell_label = df_mc['MC.Subclass'].astype(object)
cluster_label = df_mc['MC.Subclass.ct'].astype(object)
# eql_col: cell-level label, only where it equals the cluster-level label
df_mc['eql_col'] = cell_label.where(cell_label == cluster_label, "U")
# keeper_clust: cluster-level label, only inside a keeper cluster
df_mc['keeper_clust'] = cluster_label.where(df_mc['leiden'].astype(int).isin(keeper_clusters), "U")
# ff: cell-level label, only when its probability exceeds 0.75
df_mc['ff'] = cell_label.where(df_mc['MC.Subclass.Prob'] > 0.75, "U")
# vc_ef = df_mc[['eql_col', 'ff', 'keeper_clust']].value_counts().reset_index()
# vc_ef

In [None]:
# Label counts under each of the three rules, side by side (0 where a label is absent).
vote_columns = (('eql_col', 'eql_cols'), ('keeper_clust', "keeper_clust"), ('ff', 'ff'))
du = (
    pd.concat(
        tuple(df_mc[src].value_counts().to_frame(name=dst) for src, dst in vote_columns),
        axis=1,
    )
    .fillna(0)
    .astype(int)
)

In [None]:
# Combine the three votes into final_Subclass without a per-cell Python loop.
# np.select takes the first matching condition, which mirrors an if/elif chain:
#   both ff and keeper_clust unset -> eql_col
#   both agree                     -> that label
#   only one of them is set        -> the one that is set
#   otherwise (they disagree)      -> "unknown"
ff_unset = (df_mc['ff'] == "U").to_numpy()
keeper_unset = (df_mc['keeper_clust'] == "U").to_numpy()
votes_agree = (df_mc['ff'] == df_mc['keeper_clust']).to_numpy()
ff_vote = df_mc['ff'].to_numpy(dtype=object)
keeper_vote = df_mc['keeper_clust'].to_numpy(dtype=object)
eql_vote = df_mc['eql_col'].to_numpy(dtype=object)
df_mc['final_Subclass'] = np.select(
    [ff_unset & keeper_unset, votes_agree, ff_unset, keeper_unset],
    [eql_vote, ff_vote, keeper_vote, ff_vote],
    default="unknown",
)

In [None]:
# Any remaining "U" placeholder becomes "unknown"; then list labels that no cell ended up with.
df_mc['final_Subclass'] = df_mc['final_Subclass'].replace("U", "unknown")
final_annots = df_mc['final_Subclass'].unique()
assigned = set(final_annots)
missing_from_ct = set(df_mc['MC.Subclass.ct'].unique()) - assigned
missing_from_cell = set(df_mc['MC.Subclass'].unique()) - assigned
print("Missing final annotations: ",  missing_from_ct, missing_from_cell)
# How often each combination of votes produced each final label
vote_cols = ['eql_col', 'ff', 'keeper_clust', 'final_Subclass']
vc_ef = df_mc[vote_cols].value_counts().reset_index()
vc_ef

In [None]:
# Replace the cell-level label on the AnnData object with the consensus label.
final_labels = df_mc['final_Subclass'].astype("category")
mc_neu_adata.obs['MC.Subclass'] = final_labels.cat.remove_unused_categories().copy()
mc_probs = df_mc['MC.Subclass.Prob'].copy()
mc_neu_adata.obs['MC.Subclass.Prob'] = mc_probs
mc_neu_adata.obs['MC.Subclass'].value_counts()

In [None]:
### Entropy of the (rounded) probability distribution, one value per MC subclass
from scipy.stats import entropy

subclass_entropies = {}
for _class in mc_neu_adata.obs['MC.Subclass'].cat.categories:
    class_mask = mc_neu_adata.obs['MC.Subclass'] == _class
    probs = mc_neu_adata.obs.loc[class_mask, 'MC.Subclass.Prob']
    # Histogram of probabilities rounded to 3 decimals; entropy() normalises the counts itself.
    score_counts = probs.round(3).value_counts().sort_index()
    subclass_entropies[_class] = entropy(score_counts)
    print(f"Class: {_class}, Entropy: {subclass_entropies[_class]}")

In [None]:
# Bar chart of the per-subclass entropies, highest first.
# (The frame keeps the name df_rna_entropy although it holds the MC values here.)
df_rna_entropy = pd.DataFrame(
    index=subclass_entropies.keys(),
    data=subclass_entropies.values(),
    columns=['Entropy'],
).sort_values(by='Entropy', ascending=False)
fig, ax = plt.subplots(figsize=(6,4), dpi=200)
bars = sns.barplot(
    data=df_rna_entropy, x=df_rna_entropy.index, y='Entropy', ax=ax,
    edgecolor='black', linewidth=0.5, color='coral',
)
tick_labels = bars.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=45, horizontalalignment='right', fontsize=8)
ax.set_title("Entropy of MC Subclass assignment probabilities")
ax.set_xlabel("MC Subclass")
plt.tight_layout()
plt.show()

In [None]:
# Write the annotated object back to mc_neu_path, the file it was read from.
# NOTE(review): mc_neu_adata was narrowed to Modality == "MERSCOPE" in an earlier
# cell, so this replaces the joint file with the MERSCOPE-only subset; confirm
# that overwriting the input is intended.
mc_neu_adata.write_h5ad(mc_neu_path)

## All Neurons

In [None]:
# rna_neu_adata = ad.read_h5ad(rna_neu_path)
# mc_neu_adata = ad.read_h5ad(mc_neu_path)

In [None]:
# Remove MC columns if they are already present so the join below cannot clash on
# column names; errors='ignore' makes this a no-op when they are absent.
rna_neu_adata.obs.drop(columns=['MC.Subclass', 'MC.Subclass.Prob'], errors='ignore', inplace=True)

In [None]:
# Combined object: the RNA object plus the methylation consensus columns, joined on
# the obs index (cells absent from mc_neu_adata get NaN in the MC columns).
neu_adata = rna_neu_adata.copy()
neu_adata.obs = neu_adata.obs.join(mc_neu_adata.obs[['MC.Subclass', 'MC.Subclass.Prob']])
# Placeholder value; a later cell overwrites this column with the combined label.
neu_adata.obs['Combined.Subclass'] = "Unassigned"
# (unfinished, disabled statement kept as found)
# neu_adata.obs.loc[~neu_adata.obs['RNA.Subclass'].

In [None]:
# Share of cells with identical RNA and MC labels (a missing MC label compares as the string "nan").
rna_labels = neu_adata.obs['RNA.Subclass'].astype(str)
mc_labels = neu_adata.obs['MC.Subclass'].astype(str)
n_agree = (rna_labels == mc_labels).sum()
print("Overall agreement b/w two modalities : %.3f%%" % 
      (n_agree / neu_adata.obs.shape[0] * 100))

In [None]:
# Combined label: the RNA call wins unless it is "unknown", in which case the MC
# call is used. Vectorised instead of a row-wise apply over the whole obs table.
rna_label = neu_adata.obs['RNA.Subclass'].astype(object)
mc_label = neu_adata.obs['MC.Subclass'].astype(object)
neu_adata.obs['Combined.Subclass'] = rna_label.where(rna_label != "unknown", mc_label)
# Store as categorical, as the non-neuron section does, so .cat-based code
# (e.g. _plot_overlap_heatmap) works on this column too.
neu_adata.obs['Combined.Subclass'] = neu_adata.obs['Combined.Subclass'].astype("category").cat.remove_unused_categories().copy()

In [None]:
# neu_adata.obs['Combined.Subclass'].value_counts()

In [None]:
# _plot_overlap_heatmap(neu_adata, ref_col="RNA.Subclass", qry_col="Combined.Subclass")
# _plot_overlap_heatmap(neu_adata, ref_col="MC.Subclass", qry_col="Combined.Subclass")

In [None]:
# Save the combined neuron object.
# NOTE(review): neu_path is the same string as rna_neu_path, so this overwrites
# the RNA neuron file with the combined object; confirm that is intended.
neu_adata.write_h5ad(neu_path)

## RNA Non-neurons

In [None]:
# Per-cell RNA non-neuron object, plus the joint (Modality == ref/query) object
# used below to map integrated clusters to reference subclasses.
rna_nn_adata = ad.read_h5ad(rna_nn_path)
rna_nn_joint_adata = ad.read_h5ad(rna_nn_joint_path)

In [None]:
# Share of cells whose cluster-level and cell-level subclass labels are identical.
c2c_labels = rna_nn_adata.obs['c2c_allcools_label_Subclass'].astype(str)
filt_labels = rna_nn_adata.obs['allcools_Subclass_filt'].astype(str)
n_agree = (c2c_labels == filt_labels).sum()
print("Overall agreement b/w cluster 2 cluster and cell 2 cell : %.3f%%" % 
      (n_agree / rna_nn_adata.obs.shape[0] * 100))

In [None]:
# _plot_overlap_heatmap(rna_nn_adata, ref_col="c2c_allcools_label_Subclass", qry_col="allcools_Subclass_filt")
# _plot_overlap_heatmap(rna_nn_adata, ref_col="allcools_Subclass_filt", qry_col="c2c_allcools_label_Subclass")

In [None]:
ref_batch_key = "ref"
qry_batch_key = "query"
integrated_col = "integrated_leiden"
ref_col = "Subclass"
qry_col = "all_round1_leiden"

# Per-modality slices of the joint object
ref_obs = rna_nn_joint_adata.obs.query(f"Modality=='{ref_batch_key}'")
qry_obs = rna_nn_joint_adata.obs.query(f"Modality=='{qry_batch_key}'")

# For every integrated cluster: its most frequent label and that label's proportion
ref_vc = (
    ref_obs.groupby(integrated_col)[ref_col]
    .value_counts(normalize=True)
    .sort_values(ascending=False)
    .reset_index()
    .drop_duplicates(integrated_col, keep='first')
    .rename(columns={'proportion': f'{ref_col}_proportion'})
)
spatial_vc = (
    qry_obs.groupby(integrated_col)[qry_col]
    .value_counts(normalize=True)
    .sort_values(ascending=False)
    .reset_index()
    .drop_duplicates(integrated_col, keep='first')
    .rename(columns={'proportion': f'{qry_col}_proportion'})
)

# One row per integrated cluster, with the cell count of each side added
df_map = pd.concat([ref_vc.set_index(integrated_col), spatial_vc.set_index(integrated_col)], axis=1)
cluster_ids = df_map.index.to_series()
df_map[f'{ref_col}_cell_count'] = cluster_ids.map(ref_obs.groupby(integrated_col)[ref_col].count()).astype(int)
df_map[f'{qry_col}_cell_count'] = cluster_ids.map(qry_obs.groupby(integrated_col)[qry_col].count()).astype(int)
all_cat = set(df_map[ref_col].unique().tolist())

In [None]:
# Clusters whose top reference subclass exceeds 80 % are trusted at cluster level
# (the neuron section uses 0.75).
keeper_clusters = df_map.query("Subclass_proportion>0.8").index.tolist()
# Working copy of just the columns needed to build the consensus label.
df_rna = rna_nn_adata.obs[['allcools_Subclass_filt', 'c2c_allcools_label_Subclass', 'allcools_Subclass', 'allcools_Subclass_transfer_score', 'integrated_leiden']].copy()
df_rna.head()

In [None]:
# Three per-cell "votes"; "U" means the rule makes no call for that cell.
# Built with vectorised where()/isin() instead of a row-wise apply.
df_rna['final_Subclass'] = "U"
filt_label = df_rna['allcools_Subclass_filt'].astype(object)
c2c_label = df_rna['c2c_allcools_label_Subclass'].astype(object)
# eql_col: filtered cell-level label, only where it equals the cluster-level label
df_rna['eql_col'] = filt_label.where(filt_label == c2c_label, "U")
# keeper_clust: cluster-level label, only inside a keeper cluster
df_rna['keeper_clust'] = c2c_label.where(df_rna['integrated_leiden'].isin(keeper_clusters), "U")
# ff: unfiltered cell-level label, only when its transfer score exceeds 0.8
df_rna['ff'] = df_rna['allcools_Subclass'].astype(object).where(df_rna['allcools_Subclass_transfer_score'] > 0.8, "U")
# vc_ef = df_rna[['eql_col', 'ff', 'keeper_clust']].value_counts().reset_index()
# vc_ef.head()
# Label counts under each rule, side by side (0 where a label is absent)
du = pd.concat((df_rna['eql_col'].value_counts().to_frame(name='eql_cols'),
           df_rna['keeper_clust'].value_counts().to_frame(name="keeper_clust"),
           df_rna['ff'].value_counts().to_frame(name='ff')),
        axis=1).fillna(0).astype(int)

In [None]:
# Combine the three votes into final_Subclass without a per-cell Python loop.
# np.select takes the first matching condition, which mirrors an if/elif chain:
#   both ff and keeper_clust unset -> eql_col
#   both agree                     -> that label
#   only one of them is set        -> the one that is set
#   otherwise (they disagree)      -> "unknown"
ff_unset = (df_rna['ff'] == "U").to_numpy()
keeper_unset = (df_rna['keeper_clust'] == "U").to_numpy()
votes_agree = (df_rna['ff'] == df_rna['keeper_clust']).to_numpy()
ff_vote = df_rna['ff'].to_numpy(dtype=object)
keeper_vote = df_rna['keeper_clust'].to_numpy(dtype=object)
eql_vote = df_rna['eql_col'].to_numpy(dtype=object)
df_rna['final_Subclass'] = np.select(
    [ff_unset & keeper_unset, votes_agree, ff_unset, keeper_unset],
    [eql_vote, ff_vote, keeper_vote, ff_vote],
    default="unknown",
)

In [None]:
# Any remaining "U" placeholder becomes "unknown"; then list labels that no cell ended up with.
df_rna['final_Subclass'] = df_rna['final_Subclass'].replace("U", "unknown")
final_annots = df_rna['final_Subclass'].unique()
assigned = set(final_annots)
missing_from_map = all_cat - assigned
missing_from_cell = set(df_rna['allcools_Subclass'].unique()) - assigned
print("Missing final annotations: ", missing_from_map, missing_from_cell)
# How often each combination of votes produced each final label
vote_cols = ['eql_col', 'ff', 'keeper_clust', 'final_Subclass']
vc_ef = df_rna[vote_cols].value_counts().reset_index()
vc_ef

In [None]:
# Copy the consensus label and the transfer score onto the AnnData object.
final_labels = df_rna['final_Subclass'].astype("category")
rna_nn_adata.obs['RNA.Subclass'] = final_labels.cat.remove_unused_categories().copy()
transfer_scores = df_rna['allcools_Subclass_transfer_score'].copy()
rna_nn_adata.obs['RNA.Subclass.Prob'] = transfer_scores
rna_nn_adata.obs['RNA.Subclass'].value_counts()

In [None]:
### Entropy of the (rounded) transfer-score distribution, one value per RNA subclass
from scipy.stats import entropy

subclass_entropies = {}
for _class in rna_nn_adata.obs['RNA.Subclass'].cat.categories:
    class_mask = rna_nn_adata.obs['RNA.Subclass'] == _class
    probs = rna_nn_adata.obs.loc[class_mask, 'RNA.Subclass.Prob']
    # Histogram of scores rounded to 3 decimals; entropy() normalises the counts itself.
    score_counts = probs.round(3).value_counts().sort_index()
    subclass_entropies[_class] = entropy(score_counts)
    print(f"Class: {_class}, Entropy: {subclass_entropies[_class]}")

In [None]:
# Bar chart of the per-subclass entropies, highest first.
df_rna_entropy = pd.DataFrame(
    index=subclass_entropies.keys(),
    data=subclass_entropies.values(),
    columns=['Entropy'],
).sort_values(by='Entropy', ascending=False)
fig, ax = plt.subplots(figsize=(6,4), dpi=200)
bars = sns.barplot(
    data=df_rna_entropy, x=df_rna_entropy.index, y='Entropy', ax=ax,
    edgecolor='black', linewidth=0.5, color='coral',
)
tick_labels = bars.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=45, horizontalalignment='right', fontsize=8)
ax.set_title("Entropy of RNA Subclass assignment probabilities")
ax.set_xlabel("RNA Subclass")
plt.tight_layout()
plt.show()

In [None]:
# Write the object, now carrying RNA.Subclass / RNA.Subclass.Prob, back to the
# file it was read from (rna_nn_path is overwritten in place).
rna_nn_adata.write_h5ad(rna_nn_path)

## Methylation Non-neurons

In [None]:
# Joint methylation object for non-neurons and its cluster -> subclass mapping
# table (tab-separated, first column used as the index).
mc_nn_adata = ad.read_h5ad(mc_nn_path)
df_map = pd.read_csv(mc_nn_ct_mapping, sep='\t', header=0, index_col=0)

In [None]:
# Every subclass present in the mapping as currently loaded (before the filters below)
all_subclass = df_map['Subclass'].unique().tolist()

# Reload the mapping, then report after each filter: number of clusters, number of
# subclasses, subclasses lost so far, and the fraction of MERSCOPE cells still covered.
df_map = pd.read_csv(mc_nn_ct_mapping, sep='\t', header=0, index_col=0)
n_merscope_cells = mc_nn_adata.obs.query("Modality=='MERSCOPE'").shape[0]
print(df_map.shape[0], df_map['Subclass'].nunique(), df_map['all_round1_leiden_cell_count'].sum() / n_merscope_cells)

# Require more than 20 cells from each side in a cluster
df_map = df_map.query("Subclass_cell_count > 20 & all_round1_leiden_cell_count > 20")
lost_subclasses = set(all_subclass) - set(df_map['Subclass'].unique().tolist())
print(df_map.shape[0], df_map['Subclass'].nunique(), lost_subclasses, df_map['all_round1_leiden_cell_count'].sum() / n_merscope_cells)

# Require the top subclass to make up more than half of the cluster
df_map = df_map.query("Subclass_proportion > 0.5")
lost_subclasses = set(all_subclass) - set(df_map['Subclass'].unique().tolist())
print(df_map.shape[0], df_map['Subclass'].nunique(), lost_subclasses, df_map['all_round1_leiden_cell_count'].sum() / n_merscope_cells)

In [None]:
# Cluster-level label: look each cell's leiden cluster up in the filtered mapping;
# clusters that did not survive the filters become "unknown".
cluster_to_subclass = df_map['Subclass'].to_dict()
leiden_ids = mc_nn_adata.obs['leiden'].astype(int)
mc_nn_adata.obs['MC.Subclass.ct'] = (
    leiden_ids.map(cluster_to_subclass).fillna("unknown").astype('category')
)
mc_nn_adata.obs['MC.Subclass.ct'].value_counts()

In [None]:
# Keep only the MERSCOPE cells; from here on mc_nn_adata no longer holds the
# other modality of the joint object.
mc_nn_adata = mc_nn_adata[mc_nn_adata.obs['Modality'] == "MERSCOPE"].copy()
mc_nn_adata

In [None]:
# Cell-level label: the inferred subclass, with calls below 0.6 probability set to "unknown".
inferred = mc_nn_adata.obs['infer_Subclass'].astype("category").cat.remove_unused_categories()
mc_nn_adata.obs['MC.Subclass'] = inferred.copy()
mc_nn_adata.obs['MC.Subclass.Prob'] = mc_nn_adata.obs['infer_Subclass_prob'].copy()
# "unknown" has to be a category before it can be assigned
mc_nn_adata.obs['MC.Subclass'] = mc_nn_adata.obs['MC.Subclass'].cat.add_categories("unknown")
low_confidence = mc_nn_adata.obs['MC.Subclass.Prob'] < 0.6
mc_nn_adata.obs.loc[low_confidence, 'MC.Subclass'] = "unknown"
mc_nn_adata.obs['MC.Subclass'].value_counts()

In [None]:
# Share of cells whose cluster-level and cell-level methylation labels are identical.
ct_labels = mc_nn_adata.obs['MC.Subclass.ct'].astype(str)
cell_labels = mc_nn_adata.obs['MC.Subclass'].astype(str)
n_agree = (ct_labels == cell_labels).sum()
print("Overall agreement b/w cluster 2 cluster and cell 2 cell : %.3f%%" % 
      (n_agree / mc_nn_adata.obs.shape[0] * 100))

In [None]:
# _plot_overlap_heatmap(mc_nn_adata, ref_col="MC.Subclass.ct", qry_col="MC.Subclass")
# _plot_overlap_heatmap(mc_nn_adata, ref_col="MC.Subclass", qry_col="MC.Subclass.ct")

In [None]:
# Clusters whose top subclass exceeds 80 % are trusted at cluster level
# (the neuron section uses 0.75).
keeper_clusters = df_map.query("Subclass_proportion>0.8").index.tolist()
# Working copy of just the columns needed to build the consensus label.
df_mc = mc_nn_adata.obs[['MC.Subclass', 'MC.Subclass.Prob', 'MC.Subclass.ct', 'leiden']].copy()
df_mc.head()

In [None]:
# Three per-cell "votes"; "U" means the rule makes no call for that cell.
# Built with vectorised where()/isin() instead of a row-wise apply.
df_mc['final_Subclass'] = "U"
cell_label = df_mc['MC.Subclass'].astype(object)
cluster_label = df_mc['MC.Subclass.ct'].astype(object)
# eql_col: cell-level label, only where it equals the cluster-level label
df_mc['eql_col'] = cell_label.where(cell_label == cluster_label, "U")
# keeper_clust: cluster-level label, only inside a keeper cluster
df_mc['keeper_clust'] = cluster_label.where(df_mc['leiden'].astype(int).isin(keeper_clusters), "U")
# ff: cell-level label, only when its probability exceeds 0.8
df_mc['ff'] = cell_label.where(df_mc['MC.Subclass.Prob'] > 0.8, "U")
# vc_ef = df_mc[['eql_col', 'ff', 'keeper_clust']].value_counts().reset_index()
# vc_ef
# Label counts under each rule, side by side (0 where a label is absent)
du = pd.concat((df_mc['eql_col'].value_counts().to_frame(name='eql_cols'),
           df_mc['keeper_clust'].value_counts().to_frame(name="keeper_clust"),
           df_mc['ff'].value_counts().to_frame(name='ff')),
        axis=1).fillna(0).astype(int)

In [None]:
# Combine the three votes into final_Subclass without a per-cell Python loop.
# np.select takes the first matching condition, which mirrors an if/elif chain:
#   both ff and keeper_clust unset -> eql_col
#   both agree                     -> that label
#   only one of them is set        -> the one that is set
#   otherwise (they disagree)      -> "unknown"
ff_unset = (df_mc['ff'] == "U").to_numpy()
keeper_unset = (df_mc['keeper_clust'] == "U").to_numpy()
votes_agree = (df_mc['ff'] == df_mc['keeper_clust']).to_numpy()
ff_vote = df_mc['ff'].to_numpy(dtype=object)
keeper_vote = df_mc['keeper_clust'].to_numpy(dtype=object)
eql_vote = df_mc['eql_col'].to_numpy(dtype=object)
df_mc['final_Subclass'] = np.select(
    [ff_unset & keeper_unset, votes_agree, ff_unset, keeper_unset],
    [eql_vote, ff_vote, keeper_vote, ff_vote],
    default="unknown",
)

In [None]:
# Any remaining "U" placeholder becomes "unknown"; then list labels that no cell ended up with.
df_mc['final_Subclass'] = df_mc['final_Subclass'].replace("U", "unknown")
final_annots = df_mc['final_Subclass'].unique()
assigned = set(final_annots)
missing_from_ct = set(df_mc['MC.Subclass.ct'].unique()) - assigned
missing_from_cell = set(df_mc['MC.Subclass'].unique()) - assigned
print("Missing final annotations: ",  missing_from_ct, missing_from_cell)
# How often each combination of votes produced each final label
vote_cols = ['eql_col', 'ff', 'keeper_clust', 'final_Subclass']
vc_ef = df_mc[vote_cols].value_counts().reset_index()
vc_ef

In [None]:
# Replace the cell-level label on the AnnData object with the consensus label.
final_labels = df_mc['final_Subclass'].astype("category")
mc_nn_adata.obs['MC.Subclass'] = final_labels.cat.remove_unused_categories().copy()
mc_probs = df_mc['MC.Subclass.Prob'].copy()
mc_nn_adata.obs['MC.Subclass.Prob'] = mc_probs
mc_nn_adata.obs['MC.Subclass'].value_counts()

In [None]:
### Entropy of the (rounded) probability distribution, one value per MC subclass
from scipy.stats import entropy

subclass_entropies = {}
for _class in mc_nn_adata.obs['MC.Subclass'].cat.categories:
    class_mask = mc_nn_adata.obs['MC.Subclass'] == _class
    probs = mc_nn_adata.obs.loc[class_mask, 'MC.Subclass.Prob']
    # Histogram of probabilities rounded to 3 decimals; entropy() normalises the counts itself.
    score_counts = probs.round(3).value_counts().sort_index()
    subclass_entropies[_class] = entropy(score_counts)
    print(f"Class: {_class}, Entropy: {subclass_entropies[_class]}")

In [None]:
# Bar chart of the per-subclass entropies, highest first.
# (The frame keeps the name df_rna_entropy although it holds the MC values here.)
df_rna_entropy = pd.DataFrame(
    index=subclass_entropies.keys(),
    data=subclass_entropies.values(),
    columns=['Entropy'],
).sort_values(by='Entropy', ascending=False)
fig, ax = plt.subplots(figsize=(6,4), dpi=200)
bars = sns.barplot(
    data=df_rna_entropy, x=df_rna_entropy.index, y='Entropy', ax=ax,
    edgecolor='black', linewidth=0.5, color='coral',
)
tick_labels = bars.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=45, horizontalalignment='right', fontsize=8)
ax.set_title("Entropy of MC Subclass assignment probabilities")
ax.set_xlabel("MC Subclass")
plt.tight_layout()
plt.show()

In [None]:
# Write the annotated object back to mc_nn_path, the file it was read from.
# NOTE(review): mc_nn_adata was narrowed to Modality == "MERSCOPE" in an earlier
# cell, so this replaces the joint file with the MERSCOPE-only subset; confirm
# that overwriting the input is intended.
mc_nn_adata.write_h5ad(mc_nn_path)

## All Non-neurons

In [None]:
# rna_nn_adata = ad.read_h5ad(rna_nn_path)
# mc_nn_adata = ad.read_h5ad(mc_nn_path)

In [None]:
# Remove MC columns if they are already present so the join below cannot clash on
# column names; errors='ignore' makes this a no-op when they are absent.
rna_nn_adata.obs.drop(columns=['MC.Subclass', 'MC.Subclass.Prob'], errors='ignore', inplace=True)

In [None]:
# Combined object: the RNA object plus the methylation consensus columns, joined on
# the obs index (cells absent from mc_nn_adata get NaN in the MC columns).
nn_adata = rna_nn_adata.copy()
nn_adata.obs = nn_adata.obs.join(mc_nn_adata.obs[['MC.Subclass', 'MC.Subclass.Prob']])
# Placeholder value; a later cell overwrites this column with the combined label.
nn_adata.obs['Combined.Subclass'] = "Unassigned"

In [None]:
# Share of cells with identical RNA and MC labels (a missing MC label compares as the string "nan").
rna_labels = nn_adata.obs['RNA.Subclass'].astype(str)
mc_labels = nn_adata.obs['MC.Subclass'].astype(str)
n_agree = (rna_labels == mc_labels).sum()
print("Overall agreement b/w two modalities : %.3f%%" % 
      (n_agree / nn_adata.obs.shape[0] * 100))

In [None]:
# Combined label: the RNA call wins unless it is "unknown", in which case the MC
# call is used. Vectorised instead of a row-wise apply over the whole obs table.
rna_label = nn_adata.obs['RNA.Subclass'].astype(object)
mc_label = nn_adata.obs['MC.Subclass'].astype(object)
nn_adata.obs['Combined.Subclass'] = rna_label.where(rna_label != "unknown", mc_label)
nn_adata.obs['Combined.Subclass'] = nn_adata.obs['Combined.Subclass'].astype("category").cat.remove_unused_categories().copy()

In [None]:
# _plot_overlap_heatmap(nn_adata, ref_col="RNA.Subclass", qry_col="Combined.Subclass")
# _plot_overlap_heatmap(nn_adata, ref_col="MC.Subclass", qry_col="Combined.Subclass")

In [None]:
# Save the combined non-neuron object.
# NOTE(review): nn_path is the same string as rna_nn_path, so this overwrites
# the RNA non-neuron file with the combined object; confirm that is intended.
nn_adata.write_h5ad(nn_path)

## Regional Neuronal Composition

In [None]:
from plottable import Table, ColumnDefinition
from plottable.formatters import decimal_to_percent

In [None]:
# Load the neuron and non-neuron AnnData objects from neu_path / nn_path.
# nn_adata is rebound here to whatever is stored on disk at nn_path.
neu_adata = ad.read_h5ad(neu_path)
nn_adata = ad.read_h5ad(nn_path)

In [None]:
# Register a light-grey colour for the "unknown" label in the AIT subclass
# palette of both objects.
for _adata in (neu_adata, nn_adata):
    _adata.uns['AIT_subclass_palette']['unknown'] = '#d3d3d3'

In [None]:
# Neuron counts per (brain region, combined subclass), in long format.
region_neuron_composition = (
    neu_adata.obs
    .groupby(['brain_region', 'Combined.Subclass'])
    .size()
    .to_frame(name="count")
    .reset_index()
)
# Wide layout for display: one row per subclass, one column per brain region.
reg_comp = (
    region_neuron_composition
    .pivot(index='brain_region', columns='Combined.Subclass', values='count')
    .fillna(0)
    .T
)

fig, ax = plt.subplots(figsize=(10,6), dpi=200)
tab = Table(
    reg_comp,
    textprops={'fontsize':6},
    col_label_divider=True,
    col_label_divider_kw={'color':'black', 'linewidth':0.5},
    odd_row_color='#f0f0f0',
    column_definitions=[ColumnDefinition(name="Combined.Subclass", width=1.5)],
)
tab.columns['Combined.Subclass'].set_linewidth(0)
# Colour every row by the subclass named in its first cell; labels missing
# from the palette fall back to mid-grey.
# NOTE(review): this reads 'm3c_subclass_palette' whereas the stacked bar plots
# of the same column use 'AIT_subclass_palette'; confirm which one is intended.
for _row in tab.rows.values():
    lab = _row.cells[0].text.get_text()
    _row.set_facecolor(neu_adata.uns['m3c_subclass_palette'].get(lab, '#888888'))

plt.show()
plt.close()

In [None]:
# Per-region neuron subclass fractions, excluding the "unknown" label.
# Rows are subclasses, columns are brain regions.
reg_comp = (
    region_neuron_composition
    .pivot(index='brain_region', columns='Combined.Subclass', values='count')
    .fillna(0)
    .T
    # errors='ignore': do not fail when no cell carries the "unknown" label.
    .drop("unknown", errors='ignore')
)
# Normalise each region (column) so that its subclass fractions sum to 1.
reg_comp = reg_comp.div(reg_comp.sum(axis=0), axis=1)

fig, ax = plt.subplots(figsize=(10,6), dpi=200)
tab = Table(
    reg_comp, 
    textprops={'fontsize':6},
    col_label_divider=True,
    col_label_divider_kw={'color':'black', 'linewidth':0.5},
    odd_row_color='#f0f0f0',
    column_definitions=(
        [
            ColumnDefinition(name="Combined.Subclass", width=1.5),
        ] + 
        [
            # Render every region column as a percentage.
            ColumnDefinition(name=_reg, formatter=decimal_to_percent, width=0.8) for _reg in reg_comp.columns.tolist()
        ]
    )
)
tab.columns['Combined.Subclass'].set_linewidth(0)
# Colour every row by the subclass named in its first cell; labels missing
# from the palette fall back to mid-grey.
# NOTE(review): this reads 'm3c_subclass_palette' whereas the stacked bar plots
# of the same column use 'AIT_subclass_palette'; confirm which one is intended.
for _r, _row in tab.rows.items(): 
    lab = _row.cells[0].text.get_text()
    col = neu_adata.uns['m3c_subclass_palette'].get(lab, '#888888')
    _row.set_facecolor(col)

plt.show()
plt.close()

In [None]:
# Non-neuron counts per (brain region, combined subclass), in long format.
region_nonneuron_composition = nn_adata.obs.groupby(['brain_region', 'Combined.Subclass']).size().to_frame(name="count").reset_index()
# Wide layout for display: one row per subclass, one column per brain region.
reg_comp = region_nonneuron_composition.pivot(index='brain_region', columns='Combined.Subclass', values='count').fillna(0).T

fig, ax = plt.subplots(figsize=(10,6), dpi=200)
tab = Table(
    reg_comp, 
    textprops={'fontsize':6},
    col_label_divider=True,
    col_label_divider_kw={'color':'black', 'linewidth':0.5},
    odd_row_color='#f0f0f0',
    column_definitions=[ColumnDefinition(name="Combined.Subclass", width=1.5)]
    )
tab.columns['Combined.Subclass'].set_linewidth(0)
# Colour every row by the subclass named in its first cell. The palette is
# taken from nn_adata (the object this table is built from), matching the
# non-neuron stacked bar plots; labels missing from it fall back to mid-grey.
for _r, _row in tab.rows.items(): 
    lab = _row.cells[0].text.get_text()
    col = nn_adata.uns['AIT_subclass_palette'].get(lab, '#888888')
    _row.set_facecolor(col)

# The figure is closed without being displayed; re-enable plt.show() to render it.
# plt.show()
plt.close()

In [None]:
# Per-region non-neuron subclass fractions based on the RNA call, excluding
# the "unknown" label.
# The counts are computed here, grouped by 'RNA.Subclass': the
# `region_nonneuron_composition` frame is grouped by 'Combined.Subclass' and
# has no 'RNA.Subclass' column, so pivoting it on that column raises KeyError.
region_nonneuron_rna_composition = (
    nn_adata.obs
    .groupby(['brain_region', 'RNA.Subclass'])
    .size()
    .to_frame(name="count")
    .reset_index()
)
reg_comp = (
    region_nonneuron_rna_composition
    .pivot(index='brain_region', columns='RNA.Subclass', values='count')
    .fillna(0)
    .T
    # errors='ignore': do not fail when no cell carries the "unknown" label.
    .drop("unknown", errors='ignore')
)
# Normalise each region (column) so that its subclass fractions sum to 1.
reg_comp = reg_comp.div(reg_comp.sum(axis=0), axis=1)

fig, ax = plt.subplots(figsize=(10,6), dpi=200)
tab = Table(
    reg_comp, 
    textprops={'fontsize':6},
    col_label_divider=True,
    col_label_divider_kw={'color':'black', 'linewidth':0.5},
    odd_row_color='#f0f0f0',
    column_definitions=(
        [
            ColumnDefinition(name="RNA.Subclass", width=1.5),
        ] + 
        [
            # Render every region column as a percentage.
            ColumnDefinition(name=_reg, formatter=decimal_to_percent, width=0.8) for _reg in reg_comp.columns.tolist()
        ]
    )
)
tab.columns['RNA.Subclass'].set_linewidth(0)
# Colour every row by the subclass named in its first cell. The palette is
# taken from nn_adata (the object this table is built from), matching the
# non-neuron stacked bar plots; labels missing from it fall back to mid-grey.
for _r, _row in tab.rows.items(): 
    lab = _row.cells[0].text.get_text()
    col = nn_adata.uns['AIT_subclass_palette'].get(lab, '#888888')
    _row.set_facecolor(col)

# The figure is closed without being displayed; re-enable plt.show() to render it.
# plt.show()
plt.close()

In [None]:
# --- Neurons ---------------------------------------------------------------
# Long-format counts per (brain region, combined subclass).
region_neuron_composition = neu_adata.obs.groupby(['brain_region', 'Combined.Subclass']).size().to_frame(name="count").reset_index()
# Stacked bar plot of the regional composition, all neurons.
fig, ax = plot_regional_composition_stacked(
    neu_adata, #[neu_adata.obs['RNA.Subclass'] != "unknown"], 
    region_col='brain_region', 
    subclass_col='Combined.Subclass',
    palette=neu_adata.uns['AIT_subclass_palette'],
    figsize=(12, 6),
    title="Regional Neuron Subclass Composition (Stacked)",
    show_percentages=True
)
plt.show()
# Same plot restricted to neurons whose combined label is not "unknown".
fig, ax = plot_regional_composition_stacked(
    neu_adata[neu_adata.obs['Combined.Subclass'] != "unknown"], 
    region_col='brain_region', 
    subclass_col='Combined.Subclass',
    palette=neu_adata.uns['AIT_subclass_palette'],
    figsize=(12, 6),
    title="Regional Neuron Subclass Composition (Stacked)",
    show_percentages=True
)
plt.show()

# --- Non-neurons -----------------------------------------------------------
# Stored under its own name: assigning the non-neuron counts to
# `region_neuron_composition` would silently replace the neuron table and
# break any neuron cell that is re-run afterwards.
region_nonneuron_composition = nn_adata.obs.groupby(['brain_region', 'Combined.Subclass']).size().to_frame(name="count").reset_index()
# Stacked bar plot of the regional composition, all non-neurons.
fig, ax = plot_regional_composition_stacked(
    nn_adata, #[nn_adata.obs['RNA.Subclass'] != "unknown"], 
    region_col='brain_region', 
    subclass_col='Combined.Subclass',
    palette=nn_adata.uns['AIT_subclass_palette'],
    figsize=(12, 6),
    title="Regional Nonneuron Subclass Composition (Stacked)",
    show_percentages=True
)
plt.show()
# Same plot using the RNA call only, restricted to cells whose RNA label is
# not "unknown".
fig, ax = plot_regional_composition_stacked(
    nn_adata[nn_adata.obs['RNA.Subclass'] != "unknown"], 
    region_col='brain_region', 
    subclass_col='RNA.Subclass',
    palette=nn_adata.uns['AIT_subclass_palette'],
    figsize=(12, 6),
    title="Regional Nonneuron Subclass Composition (Stacked)",
    show_percentages=True
)
plt.show()