In [None]:
import os
import sys
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import anndata as ad


import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import seaborn as sns
from spida.pl import plot_continuous, plot_categorical

In [None]:
# Global matplotlib defaults shared by every figure in this notebook.
plt.rcParams.update({
    # Canvas resolution and default size
    'figure.dpi': 300,
    'figure.figsize': (4, 4),
    # fonttype 42 selects TrueType embedding in PDF/PS output (see matplotlib rcParams docs)
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    # Typography
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'font.size': 16,
    # Axes / images
    'axes.facecolor': 'white',
    'image.interpolation': 'nearest',
    # Saving
    'savefig.dpi': 300,
    'savefig.transparent': True,
    'savefig.bbox': 'tight',
    'savefig.pad_inches': 0.01,
})

# Report Metrics

In [None]:
from plottable import Table, ColumnDefinition
from plottable.formatters import decimal_to_percent
from plottable.cmap import normed_cmap

In [None]:
# Open the combined AnnData file with backed='r' and display its summary.
adata_path = "/anvil/projects/x-mcb130189/aklein/BICAN/BG/data/BICAN_BG_CPS.h5ad"
adata = ad.read_h5ad(adata_path, backed='r')
adata

In [None]:
# Number of cells whose Subclass / Group annotation is anything other than "unknown".
_has_subclass = adata.obs['Subclass'] != "unknown"
_has_group = adata.obs['Group'] != "unknown"
(adata[_has_subclass].shape[0], adata[_has_group].shape[0])

In [None]:
# Colour palettes stored in adata.uns, used below to colour the label columns of the table.
br_palette, donor_palette, lab_palette = (
    adata.uns[_palette_key]
    for _palette_key in ('brain_region_corr_palette', 'donor_palette', 'replicate_palette')
)

In [None]:
# Per-cell metadata and QC metrics; the corrected region label is exposed as `brain_region`.
_obs_columns = [
    'brain_region_corr', 'donor', 'replicate', 'neuron_type',
    'nCount_RNA', 'nFeature_RNA', 'nCount_RNA_per_Volume', 'volume',
]
df_obs = adata.obs[_obs_columns].rename(columns={"brain_region_corr": "brain_region"})
df_obs.head()

In [None]:
# Median of each QC metric per (brain_region, donor, replicate, neuron_type) group.
_group_keys = ['brain_region', 'donor', 'replicate', 'neuron_type']


def _median_per_group(_metric):
    """Return the per-group median of `_metric` as a flat DataFrame."""
    return df_obs.groupby(_group_keys)[_metric].median().reset_index()


df_count = _median_per_group('nCount_RNA')
df_ft = _median_per_group('nFeature_RNA')
df_v = _median_per_group('volume')

In [None]:
# Join the three per-group median tables on their shared grouping keys.
_merge_keys = ['brain_region', 'donor', 'replicate', 'neuron_type']
_count_and_feature = df_count.merge(
    df_ft, how='inner', on=_merge_keys, suffixes=('_count', '_feature')
)
df_tog = _count_and_feature.merge(
    df_v, how='inner', on=_merge_keys, suffixes=('', '_volume')
)

In [None]:
# Wide layout: one row per (brain_region, donor, replicate), one column per metric x neuron_type.
toplot = df_tog.pivot(
    index=['brain_region', 'donor', 'replicate'],
    columns='neuron_type',
    values=['nCount_RNA', 'nFeature_RNA', 'volume'],
).reset_index()

# Remove the CaT rows of donor UWA7648 from the table.
_is_excluded = (toplot['brain_region'] == "CaT") & (toplot['donor'] == "UWA7648")
toplot = toplot.drop(toplot[_is_excluded].index)

# Flatten the two-level column index: "<metric>_<neuron_type>" for metric columns,
# the plain name for the key columns (whose second level is empty).
toplot.columns = [
    _metric if not _neuron_type else '_'.join((_metric, _neuron_type)).strip()
    for _metric, _neuron_type in toplot.columns.values
]
toplot.head()

In [None]:
# Write the flattened summary table to CSV, omitting the row index.
toplot.to_csv("/home/x-aklein2/projects/aklein/BICAN/BG/images/figures/qcf/summary_table.csv", index=False)

In [None]:
# neuron_type values as they appear in the column names, and their display titles.
types = ["Neuron", "Nonneuron", "unknown"]
titles = ["Neuron", "Non-neuron", "Unknown"]

# One entry per metric: (column prefix, column-group title, colormap name).
_metric_specs = (
    ("nCount_RNA_", "Median Transcript Count", "Greens"),
    ("nFeature_RNA_", "Median Unique Gene Count", "Blues"),
    ("volume_", "Median Volume (µm³)", "Oranges"),
)
prefix = [_spec[0] for _spec in _metric_specs]
prefix_to_title = {_spec[0]: _spec[1] for _spec in _metric_specs}
prefix_to_cmap = {_spec[0]: _spec[2] for _spec in _metric_specs}

In [None]:
# Column definitions for the metric columns: one inner list per metric prefix,
# one ColumnDefinition per neuron type within it.
coldefs = []
for _prefix in prefix:
    # All neuron-type columns of a metric share one colour scale, normalised over
    # the pooled values of that metric; build it once per metric.
    _metric_cmap = normed_cmap(
        toplot[[_prefix + _at for _at in types]].melt()['value'],
        cmap=prefix_to_cmap[_prefix],
    )
    # Only volume columns get a 2-decimal formatter. The choice is made here, per
    # metric: writing `lambda x: round(x, 2) if ... else None` would instead put the
    # conditional inside the lambda and look `_prefix` up only when the table is drawn.
    _formatter = (lambda x: round(x, 2)) if "volume" in _prefix else None
    coldefs.append([
        ColumnDefinition(
            name=_prefix + _t, title=_title,
            textprops={"ha": "center"},
            width=1.0,
            cmap=_metric_cmap,
            group=prefix_to_title[_prefix],
            formatter=_formatter,
            # Right border after the last neuron type separates the metric groups
            border='r' if _t == "unknown" else None,
        )
        for _t, _title in zip(types, titles)
    ])

In [None]:
# Label columns (region / donor / lab) followed by the three metric column groups.
_label_column_specs = (
    # (column name, header title, width, text palette)
    ("brain_region", "Brain Region", 0.5, br_palette),
    ("donor", "Donor", 0.6, donor_palette),
    ("replicate", "Lab", 0.5, lab_palette),
)
_label_columns = [
    ColumnDefinition(
        name=_name,
        title=_header,
        textprops={"ha": "center"},
        width=_width,
        text_cmap=_palette,
        border="both",
    )
    for _name, _header, _width, _palette in _label_column_specs
]
col_defs = _label_columns + coldefs[0] + coldefs[1] + coldefs[2]

In [None]:
fig, ax = plt.subplots(figsize=(35, 20))

# Render the summary table onto the current axes.
table = Table(
    toplot,
    index_col="brain_region",
    column_definitions=col_defs,
    row_dividers=True,
    footer_divider=True,
)

# Fill the background of the three label columns (table columns 0, 1, 2) with the
# palette colour of each row's value; values missing from a palette stay white.
_label_palettes = (
    ('brain_region', br_palette),
    ('donor', donor_palette),
    ('replicate', lab_palette),
)
for _col_idx, (_column, _palette) in enumerate(_label_palettes):
    for _row_idx, _label in enumerate(toplot[_column]):
        _cell = table.cells[_row_idx, _col_idx]
        _cell.rectangle_patch.set_facecolor(_palette.get(_label, "#FFFFFF"))

# Save both a raster and a vector copy.
_table_stem = "/home/x-aklein2/projects/aklein/BICAN/BG/images/figures/qcf/summary_table"
for _ext in ("png", "pdf"):
    plt.savefig(f"{_table_stem}.{_ext}", dpi=300, bbox_inches='tight')

plt.show()

# Filtering QC

In [None]:
# Re-read the same AnnData file, this time without the backed argument, and display its summary.
adata_path = "/anvil/projects/x-mcb130189/aklein/BICAN/BG/data/BICAN_BG_CPS.h5ad"
adata = ad.read_h5ad(adata_path)
adata

In [None]:
# Add the helper_scripts directory to the import path, then import the violin-plot QC helpers from it.
sys.path.append("/anvil/projects/x-mcb130189/aklein/BICAN/spida_dev/helper_scripts")
from qc_plots import _plot_violin_QC, plot_violin_QC

In [None]:
# Unique (brain_region, donor, replicate) combinations, sorted, plus their "a_b_c" labels.
_combo_columns = ['brain_region', 'donor', 'replicate']
plots = (
    adata.obs[_combo_columns]
    .drop_duplicates()
    .sort_values(by=_combo_columns)
    .values
)
order = plots[:, 0] + "_" + plots[:, 1] + "_" + plots[:, 2]
n_plots = len(plots)
n_plots

In [None]:
# `toplot` is the same object as adata.obs (no copy), so the new column is added to adata.obs itself.
toplot = adata.obs
_indexer_labels = (
    toplot['brain_region'].astype(str)
    + "_" + toplot['donor'].astype(str)
    + "_" + toplot['replicate'].astype(str)
)
toplot['indexer'] = _indexer_labels
toplot['indexer'] = toplot['indexer'].astype('category')

In [None]:
# One entry per QC metric: (obs column, plot title, stem of its "<stem>_max" / "<stem>_min" cutoff keys).
_qc_metric_specs = (
    ('nCount_RNA', 'Total RNA Count', 'n_count'),
    ('nFeature_RNA', 'Total RNA Features', 'n_gene'),
    ('nBlank', 'Blank Count', 'n_blank'),
    ('volume', 'Volume', 'volume'),
    ('nCount_RNA_per_Volume', 'RNA Count per Volume', 'n_count_per_volume'),
)
metrics_to_plot = [_spec[0] for _spec in _qc_metric_specs]
titles = [_spec[1] for _spec in _qc_metric_specs]
top_cut_names = [_spec[2] + '_max' for _spec in _qc_metric_specs]
bottom_cut_names = [_spec[2] + '_min' for _spec in _qc_metric_specs]

In [None]:
# Unique (replicate, brain_region) pairs; each pair has one pickled dictionary on disk.
uns_combos = (
    adata.obs[['replicate', 'brain_region']]
    .drop_duplicates()
    .sort_values(by=['brain_region', 'replicate'])
    .values
)
salk_uns_dicts = {}
ucsd_uns_dicts = {}

# Region labels that are spelled differently in the pickle file names.
_file_region_alias = {"MGM1": "MGM", "SUBTH": "STH"}
# Which accumulator each lab's dictionary is merged into; other labs are ignored.
_dicts_by_lab = {"salk": salk_uns_dicts, "ucsd": ucsd_uns_dicts}

for _uc in uns_combos:
    # _uc is (replicate, brain_region); the region entry is rewritten in place.
    _uc[1] = _file_region_alias.get(_uc[1], _uc[1])

    uns_path = f"/anvil/projects/x-mcb130189/aklein/BICAN/data/aggregated/BICAN_BG_{_uc[1]}_{_uc[0]}_CPSfilt_uns_dict.pkl"
    # NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
    with open(uns_path, "rb") as f:
        uns_dict = pickle.load(f)

    _lab_dicts = _dicts_by_lab.get(_uc[0])
    if _lab_dicts is not None:
        _lab_dicts.update(uns_dict)

In [None]:
from spida._constants import ren_to_exp_map
# Re-key the ucsd dictionary: the part of each key before the first "_" is translated
# through ren_to_exp_map; the remainder of the key is kept unchanged.
ucsd_uns_dict_rename = {}
for _old_key, _entry in ucsd_uns_dicts.items():
    _head, *_tail = _old_key.split("_")
    _new_key = "_".join([ren_to_exp_map[_head], *_tail])
    ucsd_uns_dict_rename[_new_key] = _entry

In [None]:
# Per-lab dictionaries keyed by lab name; passed as `cutoffs_dict` to the violin plots below.
uns_dict = {"salk" : salk_uns_dicts, "ucsd": ucsd_uns_dict_rename}

In [None]:
# Lookup from the raw brain_region label to its corrected label (brain_region_corr).
_region_pairs = adata.obs[['brain_region', 'brain_region_corr']].drop_duplicates()
br_to_brc_map = _region_pairs.set_index('brain_region')['brain_region_corr'].to_dict()
br_to_brc_map

In [None]:
# For every QC metric, write the violin plot twice: a non-rasterized PDF and a rasterized PNG.
_violin_dir = "/home/x-aklein2/projects/aklein/BICAN/BG/images/figures/qcf"
_violin_outputs = (("pdf", False), ("png", True))  # (file extension, rasterize)

for _metric, _title, _top_cut_name, _bottom_cut_name in zip(metrics_to_plot, titles, top_cut_names, bottom_cut_names):
    for _ext, _rasterize in _violin_outputs:
        _plot_violin_QC(
            adata,
            metric=_metric,
            title=_title,
            cutoffs_dict=uns_dict,
            top_cut_name=_top_cut_name,
            bottom_cut_name=_bottom_cut_name,
            save_fig=True,
            out_path=f"{_violin_dir}/{_metric}_violin_plot.{_ext}",
            tick_label_rename_dict=br_to_brc_map,
            rasterize=_rasterize,
            show=False,
            def_fontsize=16,
            title_fontsize=24,
            legend_fontsize=16,
        )

# Correlation QC

### Plotting function

In [None]:
import numpy as np
from scipy import stats

def plot_correlation(df_merged, ax, x_col='count_salk', y_col='count_ucsd', 
                    title="Correlation Plot", xlabel="X", ylabel="Y",
                    scatter_color='#2E86AB', line_color='#A23B72',
                    title_fontsize=20, def_fontsize=14, rasterize=False):
    """
    Plot a log-log scatter of two columns with a dashed log-space regression line.

    Only rows where both columns are strictly positive are plotted and used for
    the statistics. The regression is fitted on log10(x) vs log10(y) and its
    Pearson correlation is shown in the legend.

    Parameters:
    -----------
    df_merged : pandas.DataFrame
        DataFrame containing the data to plot
    ax : matplotlib.axes.Axes
        Axis object to plot on
    x_col : str, default 'count_salk'
        Column name for x-axis data
    y_col : str, default 'count_ucsd'
        Column name for y-axis data
    title : str, default "Correlation Plot"
        Plot title
    xlabel : str, default "X"
        X-axis label
    ylabel : str, default "Y"
        Y-axis label
    scatter_color : str, default '#2E86AB'
        Color for scatter points
    line_color : str, default '#A23B72'
        Color for correlation line
    title_fontsize : int, default 20
        Font size of the title
    def_fontsize : int, default 14
        Font size of the axis labels and the legend
    rasterize : bool, default False
        Passed as ``rasterized`` to the scatter points and the regression line

    Returns:
    --------
    float
        Pearson correlation coefficient of log10(x) and log10(y) over the
        plotted points. NaN when fewer than two points remain after filtering
        or when all remaining x values are identical (no line is drawn then).
    """
    
    # Log axes cannot show non-positive values: keep strictly positive pairs only
    mask = (df_merged[x_col] > 0) & (df_merged[y_col] > 0)
    x_data = df_merged.loc[mask, x_col]
    y_data = df_merged.loc[mask, y_col]
    
    ax.scatter(x_data, y_data, marker='.', alpha=0.7, s=8, 
              color=scatter_color, edgecolors='none',
              rasterized=rasterize)
    
    corr_coef = np.nan
    line_drawn = False
    
    if len(x_data) > 1:
        log_x = np.log10(np.asarray(x_data, dtype=float))
        log_y = np.log10(np.asarray(y_data, dtype=float))
        
        # A regression on x is undefined when every x is the same value
        # (recent SciPy versions raise ValueError), so skip the fit then.
        if log_x.min() != log_x.max():
            corr_coef = np.corrcoef(log_x, log_y)[0, 1]
            fit = stats.linregress(log_x, log_y)
            
            # Evaluate the fitted line across the observed x range, back in linear units
            x_line = np.logspace(log_x.min(), log_x.max(), 100)
            y_line = 10**(fit.slope * np.log10(x_line) + fit.intercept)
            
            ax.plot(x_line, y_line, '--', color=line_color, linewidth=1.5, 
                   alpha=0.9, label=f'r={corr_coef:.3f}', rasterized=rasterize)
            line_drawn = True
    
    # Set log scales
    ax.set_xscale("log")
    ax.set_yscale("log")
    
    # Remove all tick marks and labels
    ax.set_xticks([])
    ax.set_yticks([])
    ax.tick_params(left=False, bottom=False, top=False, right=False)
    
    # Set labels and title
    ax.set_title(title, fontsize=title_fontsize, pad=10)
    ax.set_xlabel(xlabel, fontsize=def_fontsize)
    ax.set_ylabel(ylabel, fontsize=def_fontsize)
    # The regression line is the only labelled artist; without it a legend
    # would be empty and matplotlib would warn.
    if line_drawn:
        ax.legend(fontsize=def_fontsize)
    
    # Clean up the plot appearance
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_color('#666666')
    ax.spines['bottom'].set_color('#666666')
    
    return corr_coef

## Compute and plot correlations

In [None]:
# root_path: zarr store directory containing one sub-directory per experiment.
root_path = Path("/home/x-aklein2/projects/aklein/BICAN/data/zarr_store")
# Re-open the AnnData file with backed='r' and display its summary.
adata_path = "/anvil/projects/x-mcb130189/aklein/BICAN/BG/data/BICAN_BG_CPS.h5ad"
adata = ad.read_h5ad(adata_path, backed='r')
adata

In [None]:
# One row per unique combination of the six listed obs columns (replaces the earlier per-cell df_obs).
df_obs = adata.obs[['experiment', 'region', 'brain_region', 'donor', 'replicate', 'brain_region_corr']].drop_duplicates()

In [None]:
# Number of df_obs rows per brain_region.
df_obs.groupby("brain_region").size()

In [None]:
# Build one per-gene transcript-count table for every row of df_obs
# (experiment / region / donor / lab combination) and collect them in store_list.
store_list = []
for i, _row in enumerate(df_obs.itertuples()):
    _exp = _row.experiment
    _reg = _row.region
    _br = _row.brain_region
    _brc = _row.brain_region_corr
    _donor = _row.donor
    _lab = _row.replicate
    print(i, _br, _donor, _lab)
    exp_path = root_path / _exp
    print(exp_path, exp_path.exists())

    _n_tables = 0
    # Region directories are matched on the last four characters of the donor ID.
    for _reg_path in exp_path.glob(f"region_*{_donor[-4:]}*"):
        dt_path = _reg_path / "points" / f"CPS_{_exp}_{_reg}_transcripts"
        if not dt_path.exists():
            continue
        print(dt_path)
        # Only the gene column is needed for counting, so read just that column.
        dt = pd.read_parquet(dt_path, columns=['gene'])
        # Name the columns explicitly ('gene', 'count') rather than relying on
        # the default names produced by value_counts().
        dt = dt['gene'].value_counts().rename_axis('gene').reset_index(name='count')
        dt['experiment'] = _exp
        dt['brain_region'] = _br
        dt['brain_region_corr'] = _brc
        dt['donor'] = _donor
        dt['replicate'] = _lab
        # Append only when a table was actually read for THIS row. Appending at the
        # outer-loop level would re-add the previous row's table (or fail with a
        # NameError on the first row) whenever no transcript table is found.
        store_list.append(dt)
        _n_tables += 1

    if _n_tables == 0:
        print(f"No transcript table found for {_exp} {_reg} ({_br} {_donor} {_lab}); skipping")


In [None]:
# Stack the per-dataset gene-count frames collected in store_list into one long table.
df_all = pd.concat(store_list)

In [None]:
# Regions shown in the correlation grids, in display order. The list is hard-coded
# (presumably to fix the panel order); a value derived from
# df_obs['brain_region_corr'] was previously computed here and immediately
# overwritten, so that dead assignment has been removed.
# NOTE(review): confirm every label below matches a value of brain_region_corr.
# A label with no matching rows in df_all yields blank (skipped) panels.
brain_region = ["CaH", "CaB", "CaT", "Pu", "NAC", "GP", "STH", "MGM1"]
donors = df_obs['donor'].unique().tolist()
labs = df_obs['replicate'].unique().tolist()

In [None]:
# Grid of salk-vs-ucsd gene-count correlations: one column per brain region, one row per donor.
# squeeze=False keeps `axes` two-dimensional even with a single region or donor.
fig, axes = plt.subplots(
    ncols=len(brain_region), nrows=len(donors),
    figsize=(3*len(brain_region), 3*len(donors)),
    constrained_layout=True, squeeze=False,
)
for i, _br in enumerate(brain_region): 
    for j, _donor in enumerate(donors): 
        _in_panel = (df_all['brain_region_corr'] == _br) & (df_all['donor'] == _donor)
        df1 = df_all[_in_panel & (df_all['replicate'] == 'salk')].set_index("gene")['count'].to_frame()
        df2 = df_all[_in_panel & (df_all['replicate'] == 'ucsd')].set_index("gene")['count'].to_frame()
        # Outer join keeps genes seen by only one lab; their missing count becomes 0.
        df_merged = df1.merge(df2, left_index=True, right_index=True, how='outer', suffixes=('_salk', '_ucsd')).fillna(0)
        # Keep one row per gene. De-duplicating on the count values instead would
        # also discard distinct genes that happen to share the same pair of counts.
        df_merged = df_merged[~df_merged.index.duplicated(keep='first')]
        if df_merged.shape[0] < 10:
            print(f"Skipping {_br} {_donor} due to insufficient data ({df_merged.shape[0]} genes)")
            axes[j, i].axis('off')
            continue

        # Use the correlation plot function
        corr_coef = plot_correlation(
            df_merged, 
            axes[j, i], 
            x_col='count_salk', 
            y_col='count_ucsd',
            title=f"{_br} {_donor}",
            xlabel="R1",
            ylabel="R2",
            scatter_color='#2E86AB',
            line_color='#f00e3b'
        )

plt.savefig("/home/x-aklein2/projects/aklein/BICAN/BG/images/figures/qcf/correlation_ucsd_salk_all_regions.pdf", bbox_inches='tight')
plt.savefig("/home/x-aklein2/projects/aklein/BICAN/BG/images/figures/qcf/correlation_ucsd_salk_all_regions.png", bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Transposed grid of salk-vs-ucsd gene-count correlations: one row per brain region, one column per donor.
# squeeze=False keeps `axes` two-dimensional even with a single region or donor.
fig, axes = plt.subplots(
    nrows=len(brain_region), ncols=len(donors),
    figsize=(3*len(donors), 3*len(brain_region)),
    constrained_layout=True, squeeze=False,
)
for i, _br in enumerate(brain_region): 
    for j, _donor in enumerate(donors): 
        _in_panel = (df_all['brain_region_corr'] == _br) & (df_all['donor'] == _donor)
        df1 = df_all[_in_panel & (df_all['replicate'] == 'salk')].set_index("gene")['count'].to_frame()
        df2 = df_all[_in_panel & (df_all['replicate'] == 'ucsd')].set_index("gene")['count'].to_frame()
        # Outer join keeps genes seen by only one lab; their missing count becomes 0.
        df_merged = df1.merge(df2, left_index=True, right_index=True, how='outer', suffixes=('_salk', '_ucsd')).fillna(0)
        # Keep one row per gene. De-duplicating on the count values instead would
        # also discard distinct genes that happen to share the same pair of counts.
        df_merged = df_merged[~df_merged.index.duplicated(keep='first')]
        if df_merged.shape[0] < 10:
            print(f"Skipping {_br} {_donor} due to insufficient data ({df_merged.shape[0]} genes)")
            axes[i, j].axis('off')
            continue

        # Use the correlation plot function
        corr_coef = plot_correlation(
            df_merged, 
            axes[i, j], 
            x_col='count_salk', 
            y_col='count_ucsd',
            title=f"{_br} {_donor}",
            xlabel="R1",
            ylabel="R2",
            scatter_color='#2E86AB',
            line_color='#f00e3b'
        )

plt.savefig("/home/x-aklein2/projects/aklein/BICAN/BG/images/figures/qcf/correlation_ucsd_salk_all_regions_T.pdf", bbox_inches='tight')
plt.savefig("/home/x-aklein2/projects/aklein/BICAN/BG/images/figures/qcf/correlation_ucsd_salk_all_regions_T.png", bbox_inches='tight')
plt.show()
plt.close()