In [None]:
import sys
sys.path.insert(0,'/home/cane/Documents/yoseflab/can/resolVI')
from scvi.external import RESOLVI

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import scvi
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Set the scvi-tools global seed.
scvi.settings.seed = 0
# Scanpy-wide figure defaults applied to the sc.pl.* calls in this notebook.
sc.set_figure_params(dpi=100, dpi_save=300, format='png', frameon=False, vector_friendly=True, fontsize=14, color_map='viridis', figsize=(8,8))
# Directory scanpy writes to when a plotting call is given save=...
sc.settings.figdir = 'figure3/'

In [None]:
sys.path.append('..')
import _utils

In [None]:
path = '/external_data/other/resolvi_final_other_files/'

## Subsetting and spatial

In [None]:
resolvi = RESOLVI.load('/external_data/other/resolvi_final_other_files/liver_nanostring/baysor_newest_bg_semisupervised/resolvae')

In [None]:
# One AnnData per segmentation, keyed by segmentation name; start with Baysor.
nanostring = {}
nanostring['baysor'] = sc.read_h5ad(f'{path}liver_nanostring/baysor_newest_bg/complete_adata.h5ad')
# Store the 'X_resolVI' embedding found in the semi-supervised run's file under its own obsm key.
# NOTE(review): assumes both files hold the same cells in the same order — confirm.
nanostring['baysor'].obsm['resolvi_semisupervised'] = sc.read_h5ad(f'{path}liver_nanostring/baysor_newest_bg_semisupervised/complete_adata.h5ad').obsm['X_resolVI']

In [None]:
# Same loading pattern as for Baysor, for the ProSeg segmentation ...
nanostring['proseg'] = sc.read_h5ad(f'{path}liver_nanostring/proseg_newest_bg/complete_adata.h5ad')
nanostring['proseg'].obsm['resolvi_semisupervised'] = sc.read_h5ad(f'{path}liver_nanostring/proseg_newest_bg_semisupervised/complete_adata.h5ad').obsm['X_resolVI']
# ... and for the 'original' segmentation.
# NOTE(review): assumes each pair of files holds the same cells in the same order — confirm.
nanostring['original'] = sc.read_h5ad(f'{path}liver_nanostring/original_newest_bg/complete_adata.h5ad')
nanostring['original'].obsm['resolvi_semisupervised'] = sc.read_h5ad(f'{path}liver_nanostring/original_newest_bg_semisupervised/complete_adata.h5ad').obsm['X_resolVI']

In [None]:
# Cells of the Baysor segmentation whose 'health' label is 'normal'.
# Take an explicit copy: the following cells normalise layers and add obs columns, and
# modifying an AnnData *view* triggers an implicit copy plus ImplicitModificationWarning.
sub = nanostring['baysor'][nanostring['baysor'].obs['health']=='normal'].copy()

In [None]:
sc.pp.normalize_total(sub, layers=['generated_expression', 'raw_counts', 'corrected_counts', 'generated_expression_mean'])

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import matplotlib.colors as mcolors
import scanpy as sc

def plot_umap_genes(adata, genes, colors, layer='generated_expression', plt_axis="off", extra=''):
    """Plot cells at their spatial coordinates, coloured by a mixture of gene expressions.

    Despite the name, the scatter uses ``adata.obsm['X_spatial']`` (y axis flipped),
    not a UMAP embedding. Each gene is scaled to [0, 1] between its minimum and its
    99.9 % quantile, multiplied by its RGB colour, and the per-gene colours are summed.
    Cells with zero expression for every gene are drawn fully transparent.

    Parameters
    ----------
    adata : AnnData
        Must provide ``layers[layer]`` and ``obsm['X_spatial']``.
    genes : sequence of str
        Genes to display; one colour bar is drawn per gene.
    colors : sequence of RGB triples
        One colour (values in [0, 1]) per gene, in the same order as ``genes``.
    layer : str
        Layer the expression values are read from.
    plt_axis : str
        ``"off"`` hides the axes of the scatter panel.
    extra : str
        Optional suffix inserted into the output file name (empty by default, which
        leaves the file name unchanged).

    Raises
    ------
    ValueError
        If ``genes`` and ``colors`` differ in length.

    The figure is written to ``figure3/coexpression_cholangiocytes_{layer}{extra}_new.pdf``
    and then shown.
    """
    if len(genes) != len(colors):
        raise ValueError(
            f"Expected one color per gene, got {len(genes)} genes and {len(colors)} colors."
        )

    # Extract and normalize the expression levels of the specified genes
    normalized_expressions = []
    expr_min_max = []
    for gene in genes:
        expr = adata[:, gene].layers[layer]
        # Sparse layers must be densified; dense layers are used as they are.
        expr = expr.toarray() if hasattr(expr, 'toarray') else np.asarray(expr)
        expr = np.asarray(expr).ravel()
        # The upper bound is the 99.9 % quantile (plus a small offset so that the
        # range is never zero), which keeps single outliers from flattening the scale.
        min_val, max_val = np.min(expr), np.quantile(expr, 0.999) + 1e-3
        normalized_expr = np.clip((expr - min_val) / (max_val - min_val), 0, 1)
        normalized_expressions.append(normalized_expr)
        expr_min_max.append((min_val, max_val))

    # Alpha is 1 if any gene has non-zero scaled expression in the cell, else 0
    alphas = np.any([expr > 0 for expr in normalized_expressions], axis=0) * 1.0
    rgb_colors = np.stack([
        normalized_expr.reshape(-1, 1) * np.asarray(color, dtype=float)
        for normalized_expr, color in zip(normalized_expressions, colors)
    ]).sum(axis=0)
    # Draw the brightest cells last so they end up on top
    order = np.argsort(rgb_colors.sum(axis=1))
    # Colours sharing a channel can sum above 1, which matplotlib rejects
    rgb_colors = np.clip(rgb_colors, 0, 1)
    rgb_colors = np.hstack([rgb_colors, alphas.reshape(-1, 1)])  # Append alpha as the fourth channel

    # Layout: one wide scatter panel followed by one narrow colour bar per gene
    fig = plt.figure(figsize=(22, 12))
    gs = gridspec.GridSpec(1, 1 + len(genes), width_ratios=[10] + [1] * len(genes), wspace=0.3)
    ax_main = fig.add_subplot(gs[0, 0])

    fig.patch.set_facecolor('black')  # Set the figure background to black
    ax_main.set_facecolor('black')  # Set the scatter plot background to black

    # Configure axes and ticks for visibility against a black background
    ax_main.tick_params(axis='both', colors='white')
    for side in ('top', 'bottom', 'left', 'right'):
        ax_main.spines[side].set_color('white')

    # Scatter plot of spatial coordinates colored by the gene expression mixture
    ax_main.scatter(adata.obsm['X_spatial'][:, 0][order], -adata.obsm['X_spatial'][:, 1][order], color=rgb_colors[order, :], s=1)

    # Optionally turn off the axis
    if plt_axis == "off":
        ax_main.axis('off')

    # Add one colour bar per gene, running from black to the gene's colour
    for i, (gene, color) in enumerate(zip(genes, colors)):
        ax_colorbar = fig.add_subplot(gs[0, 1 + i])
        cmap = mcolors.LinearSegmentedColormap.from_list("custom", [np.zeros(3), color], N=100)
        norm = mcolors.Normalize(vmin=expr_min_max[i][0], vmax=expr_min_max[i][1])
        cb = plt.colorbar(plt.cm.ScalarMappable(norm=norm, cmap=cmap), cax=ax_colorbar)
        # Label and ticks sit on the black figure background, so draw them in white
        cb.set_label(gene, color='white')
        cb.ax.tick_params(colors='white')
    plt.savefig(f'figure3/coexpression_cholangiocytes_{layer}{extra}_new.pdf', facecolor=fig.get_facecolor())
    plt.show()

plot_umap_genes(sub, ['ANXA4', 'EPCAM', 'FLT1'], colors=[[0, 0, 1], [1, 0, 0], [0, 1, 0]], plt_axis="off")

In [None]:
plot_umap_genes(sub, ['ANXA4', 'EPCAM', 'FLT1'], colors=[[0, 0, 1], [1, 0, 0], [0, 1, 0]], plt_axis="off", layer='raw_counts')

In [None]:
sc.pl.spatial(sub[sub.obs['health']=='normal'], color=['cluster'], spot_size=0.03, save='celltypes_spatial.pdf')

In [None]:
# Zonation score from raw counts: mean-scaled SAA1 minus mean-scaled GLUL.
# Each gene is divided by its OWN mean so that both terms are on a comparable scale
# (GLUL was previously divided by the SAA1 mean).
saa1_raw = sub[:, 'SAA1'].layers['raw_counts'].A
glul_raw = sub[:, 'GLUL'].layers['raw_counts'].A
# Flatten the (n_cells, 1) result to a 1-D obs column.
sub.obs['zonation_raw'] = (saa1_raw / np.mean(saa1_raw) - glul_raw / np.mean(glul_raw)).ravel()
sc.pl.spatial(sub[sub.obs['health']=='normal'], color=['zonation_raw'], spot_size=0.03, cmap='bwr', vmax=5, vmin=-5,
              save='celltype_spatial_raw_generated_zonation.pdf', sort_order=False)

In [None]:
# Zonation score from the generated expression: mean-scaled SAA1 minus mean-scaled GLUL.
# Each gene is divided by its OWN mean so that both terms are on a comparable scale
# (GLUL was previously divided by the SAA1 mean).
saa1_generated = sub[:, 'SAA1'].layers['generated_expression'].A
glul_generated = sub[:, 'GLUL'].layers['generated_expression'].A
# Flatten the (n_cells, 1) result to a 1-D obs column.
sub.obs['zonation'] = (
    saa1_generated / np.mean(saa1_generated) - glul_generated / np.mean(glul_generated)
).ravel()
sc.pl.spatial(sub[sub.obs['health']=='normal'], color=['zonation'], spot_size=0.03, cmap='bwr', vmax=5, vmin=-5,
              save='celltype_spatial_generated_zonation.pdf', sort_order=False)

In [None]:
# Per-cell sum over ANXA4, EPCAM and SOX9, once from the generated-expression layer and
# once from a count layer; both sums are stored in obs and plotted side by side.
# NOTE(review): the raw score reads layers['counts'] whereas other cells in this notebook read
# layers['raw_counts'] (which was normalised above) — confirm which layer is intended.
sub.obs['corrected_cholangiocyte'] = sub[:, ['ANXA4', 'EPCAM', 'SOX9']].layers['generated_expression'].sum(1)
sub.obs['raw_cholangiocyte'] = sub[:, ['ANXA4', 'EPCAM', 'SOX9']].layers['counts'].sum(1)
sc.pl.spatial(sub[sub.obs['health']=='normal'], color=['raw_cholangiocyte', 'corrected_cholangiocyte'], spot_size=0.03,
              save='celltype_spatial_raw_generated_cholangiocyte.pdf', vmax=3.)

In [None]:
sc.pl.dotplot(
    sub[sub.obs['health']=='normal'], var_names=['raw_cholangiocyte', 'corrected_cholangiocyte'], groupby='cluster', swap_axes=True, save='healthy_cholangiocyte_scores.pdf')

In [None]:
# Keep the cluster label only for the three listed cell types; every other cell gets None.
# NOTE(review): the spatial plot in the next cell also lists 'Portal.endothelial.cells' in
# `groups`, a label that is never kept here — confirm whether it should be included.
sub.obs['coarse_celltypes'] = [i if i in ['Cholangiocytes', 'Central.venous.LSECs', 'Periportal.LSECs'] else None for i in sub.obs['cluster']]

In [None]:
sc.pl.spatial(sub, color='coarse_celltypes', spot_size=0.03, groups=[
    'Cholangiocytes', 'Central.venous.LSECs', 'Periportal.LSECs', 'Portal.endothelial.cells'], save='lsec_cholangiocyte.pdf')

In [None]:
ax = sc.pl.scatter(
    sub[sub.obs['coarse_celltypes'].isin(['Cholangiocytes', 'Central.venous.LSECs', 'Periportal.LSECs'])],
    x='VWF', y='EPCAM', layers='generated_expression', use_raw=False, size=200, color='coarse_celltypes', show=False,
    groups=['Cholangiocytes', 'Central.venous.LSECs', 'Periportal.LSECs'])
ax.set_xlim(-0.3, 10)
ax.set_ylim(-1, 30)
plt.savefig('figure3/scatterepcam_vwf_generated_dp.pdf')
plt.show()

In [None]:
ax = sc.pl.scatter(
    sub[sub.obs['coarse_celltypes'].isin(['Cholangiocytes', 'Central.venous.LSECs', 'Periportal.LSECs'])],
    x='VWF', y='EPCAM', layers='raw_counts', use_raw=False, size=200, color='coarse_celltypes', show=False,
    groups=['Cholangiocytes', 'Central.venous.LSECs', 'Periportal.LSECs'])
ax.set_xlim(-0.3, 10)
ax.set_ylim(-1, 30)
plt.savefig('figure3/scatterepcam_vwf_raw_dp.pdf')
plt.show()

## UMAP all

In [None]:
import os

In [None]:
for key in nanostring:
    nanostring[key].obs['total_counts'] = nanostring[key].layers['raw_counts'].sum(1)

In [None]:
# For every segmentation: reuse the cached, filtered AnnData when it exists; otherwise filter
# cells, compute all UMAP embeddings via the project helper and write the cache file.
for key in nanostring:
    # The cache file is per segmentation, so its existence must be checked per key
    # (the check previously looked at the Baysor file for every key).
    filtered_file = f'{path}liver_nanostring/{key}_newest_bg/complete_adata_filtered.h5ad'
    if os.path.exists(filtered_file):
        nanostring[key] = sc.read_h5ad(filtered_file)
        continue
    print(key)
    # Drop cells with fewer than 10 detected genes before embedding.
    sc.pp.filter_cells(nanostring[key], min_genes=10)
    _utils.compute_umap_embedding(nanostring[key], representation_key="X_resolVI", n_comps=None, show=True, key='resolvi_latent', extra_save=key)
    _utils.compute_umap_embedding(nanostring[key], representation_key="resolvi_semisupervised", n_comps=None, show=True, key='resolvi_semisupervised', extra_save=key)
    _utils.compute_umap_embedding(nanostring[key], representation_key="raw_counts", show=True, key='raw_counts', extra_save=key)
    _utils.compute_umap_embedding(nanostring[key], representation_key="raw_counts", show=True, key='raw_counts_harmony', extra_save=key, batch_key='fov_batch')
    _utils.compute_umap_embedding(nanostring[key], representation_key="generated_expression", show=True, key='resolvi_generated', extra_save=key)
    _utils.compute_umap_embedding(nanostring[key], representation_key="corrected_counts", show=True, key='resolvi_corrected', n_neighbors=30, extra_save=key)
    nanostring[key].write_h5ad(filtered_file)

In [None]:
pd.options.display.max_rows=300

In [None]:
def plot_umap_embedding(adata, key, ax, color='cluster'):
    """Draw the embedding stored as ``obsm['X_umap_<key>']`` onto ``ax``.

    The chosen embedding is first copied to ``adata.obsm['X_umap']`` (overwriting
    whatever was stored there) so that ``sc.pl.umap`` picks it up; the plot is
    coloured by ``color`` and not shown immediately.
    """
    selected_embedding = adata.obsm[f'X_umap_{key}']
    adata.obsm['X_umap'] = selected_embedding
    sc.pl.umap(adata, ax=ax, color=color, show=False, frameon=False)

In [None]:
nanostring['original'].obs['health'] = nanostring['original'].obs['Run_Tissue_name'].map({'CancerousLiver': 'cancer', 'NormalLiver': 'normal'})

In [None]:
# Grid of UMAPs: one row per segmentation, one column per (embedding, colouring) pair.
fig, axs = plt.subplots(3, 8, figsize=(45, 25))

# Column layout, left to right: each embedding coloured by cluster, then by health.
umap_panels = [
    ('resolvi_latent', 'cluster'),
    ('resolvi_latent', 'health'),
    ('resolvi_semisupervised', 'cluster'),
    ('resolvi_semisupervised', 'health'),
    ('raw_counts', 'cluster'),
    ('raw_counts', 'health'),
    ('raw_counts_harmony', 'cluster'),
    ('raw_counts_harmony', 'health'),
]

for ind, key in enumerate(nanostring):
    print(key)
    for column, (embedding_key, color_by) in enumerate(umap_panels):
        plot_umap_embedding(nanostring[key], key=embedding_key, ax=axs[ind, column], color=color_by)
        # Legends are removed from columns 0-4; the three right-most panels keep theirs.
        if column <= 4:
            axs[ind, column].get_legend().remove()

# Tighten the layout, save, and display
plt.tight_layout()
plt.savefig('figure3/umap_comparison.pdf')
plt.show()


In [None]:
for key in nanostring:
    sc.pl.violin(nanostring[key], groupby='cluster', keys=['background_proportion'], rotation=90, save=f'per_ct_background_{key}.pdf')
    sc.pl.violin(nanostring[key], groupby='cluster', keys=['diffusion_proportion'], rotation=90, save=f'per_ct_diffusion_{key}.pdf')

In [None]:
from scib_metrics.benchmark._core import BatchCorrection

# Batch-correction metric selection handed to the scib-metrics Benchmarker below:
# every listed metric is switched on except PCR comparison.
batch_correction = BatchCorrection(
    silhouette_batch=True,
    ilisi_knn=True,
    kbet_per_label=True,
    graph_connectivity=True,
    pcr_comparison=False,
)

from contextlib import contextmanager
from scib_metrics.benchmark import Benchmarker

@contextmanager
def default_rcparams():
    """Temporarily reset matplotlib's rcParams to their defaults.

    The current rcParams are saved on entry and restored on exit. The restore runs
    in a ``finally`` clause so the notebook's plot settings come back even when the
    body of the ``with`` block raises.
    """
    saved_params = plt.rcParams.copy()  # Store current rcParams
    plt.rcdefaults()   # Reset all rcParams to their defaults
    try:
        yield
    finally:
        plt.rcParams.update(saved_params)   # Restore rcParams to their original values

# Run the scib-metrics benchmark for every segmentation and store the results table
# (as a figure in figure3/<key>/ and as a CSV in figure3/).
for key in nanostring:
    #if os.path.exists(f'figure3/scib_results_{key}.csv'):
    #    print(key)
    #    continue
    print(key)
    bm = Benchmarker(
        nanostring[key],
        batch_key="health",
        label_key="cluster",
        batch_correction_metrics=batch_correction,
        embedding_obsm_keys=["X_resolVI", "X_pca_resolvi_generated", "X_pca_raw_counts_harmony", "X_pca_raw_counts", "X_resolvi_transferred", "resolvi_semisupervised"],
        pre_integrated_embedding_obsm_key='X_pca_raw_counts',
        n_jobs=12,
    )
    bm.benchmark()
    # The per-segmentation output folder has to exist before the results table is saved into it.
    results_dir = f'figure3/{key}/'
    os.makedirs(results_dir, exist_ok=True)
    # The results table is drawn with matplotlib defaults rather than the scanpy figure settings.
    with default_rcparams():
        bm.plot_results_table(min_max_scale=False, save_dir=results_dir)

    bm.get_results(min_max_scale=False).to_csv(f'figure3/scib_results_{key}.csv')

## Tcell

In [None]:
nanostring = {}
nanostring['baysor'] = sc.read_h5ad(f'{path}liver_nanostring/baysor_newest_bg_semisupervised/complete_adata.h5ad')

In [None]:
key = 'baysor'
# Per-cell total of X (sum over the last axis).
nanostring[key].obs['total_counts'] = nanostring[key].X.sum(-1)
# Embeddings from the 'X_resolVI' representation and from the raw counts, computed by the
# project helper `_utils.compute_umap_embedding`.
# NOTE(review): presumably results are stored as obsm['X_umap_<key>'] (that is how they are
# read further down) — confirm against the helper.
_utils.compute_umap_embedding(nanostring[key], representation_key="X_resolVI", n_comps=None, show=True, key='resolvi_semisupervised', extra_save=key)
_utils.compute_umap_embedding(nanostring[key], representation_key="raw_counts", show=True, key='raw_counts', extra_save=key)

In [None]:
# `subset` is bound to the same object as nanostring[key] (no copy is made),
# so the modifications below are visible through both names.
subset = nanostring[key]
# Put a copy of the raw counts into X, then total-count normalise and log1p-transform X.
subset.X = subset.layers['raw_counts'].copy()
sc.pp.normalize_total(subset)
sc.pp.log1p(subset)

In [None]:
subset.obs['true_transcripts'] = subset.obs['true_proportion'] * subset.obs['n_transcripts']

In [None]:
# Remove the 'Portal.endothelial.cells' cluster. An explicit copy is taken because the
# next cell runs rank_genes_groups, which writes into the object; writing into an AnnData
# view triggers an implicit copy plus ImplicitModificationWarning.
subset = subset[subset.obs['cluster']!='Portal.endothelial.cells'].copy()

In [None]:
# Rank marker genes per cluster on all cells, then show the top 3 genes per cluster for
# all cells, for cells below 20 'true_transcripts' and for cells above 20.
# NOTE(review): both comparisons are strict, so cells with exactly 20 appear in neither
# of the two split plots — confirm this is intended.
sc.tl.rank_genes_groups(subset, groupby='cluster')
sc.pl.rank_genes_groups_dotplot(subset, n_genes=3) # categories_order
sc.pl.rank_genes_groups_dotplot(subset[subset.obs['true_transcripts'] < 20], n_genes=3, save='marker_genes_low_counts.pdf')
sc.pl.rank_genes_groups_dotplot(subset[subset.obs['true_transcripts'] > 20], n_genes=3, save='marker_genes_high_counts.pdf')

In [None]:
subset.obsm['X_umap'] = subset.obsm['X_umap_resolvi_semisupervised']
sc.pl.umap(subset[subset.obs['true_transcripts'] > 20], color=['true_transcripts', 'n_transcripts'], vmax=500, save='kept_baysor_counts.pdf')
sc.pl.umap(subset[subset.obs['true_transcripts'] < 20], color=['true_transcripts', 'n_transcripts'], vmax=50, save='filtered_out_baysor_counts.pdf')

In [None]:
# Keep only cells with more than 20 'true_transcripts'. An explicit copy is taken because
# the embedding helper and the following cells store results on the object; writing into
# an AnnData view triggers an implicit copy plus ImplicitModificationWarning.
subset = subset[subset.obs['true_transcripts'] > 20].copy()
# Recompute the embeddings on the filtered cells with the project helper.
_utils.compute_umap_embedding(subset, representation_key="X_resolVI", n_comps=None, show=True, key='resolvi_latent', extra_save=key)
_utils.compute_umap_embedding(subset, representation_key="raw_counts", show=True, key='raw_counts', extra_save=key)

In [None]:
sc.pp.neighbors(subset, use_rep='X_pca', method='rapids')
sc.tl.umap(subset, min_dist=0.3)
sc.pl.umap(subset, color=["cluster"], ncols=1)
subset.obsm['X_tsne'] = subset.obsm['X_umap']
subset.obsm['X_umap_pca'] = subset.obsm['X_umap']

In [None]:
sc.pp.neighbors(subset, use_rep='X_resolVI', method='rapids')
sc.tl.umap(subset, min_dist=0.3)
sc.pl.umap(subset, color=["cluster"], ncols=1)
subset.obsm['X_umap_resolvi'] = subset.obsm['X_umap']

In [None]:
subset = sc.read_h5ad(f'{path}liver_nanostring/baysor_filtered_lq_all_cells.h5ad')

In [None]:
subset2 = subset

In [None]:
# Neighbour graph and UMAP computed from the 'X_pca' representation.
# `subset2` was bound to the same object as `subset` in the previous cell (no copy), so the
# UMAP computed here also replaces subset.obsm['X_umap'].
sc.pp.neighbors(subset2, use_rep='X_pca', method='rapids')
sc.tl.umap(subset2, min_dist=0.3)
sc.pl.umap(subset2, color=["cluster", "health"], ncols=1, save='liver_nanostring_pca.pdf')
# Keep this PCA-based UMAP under its own key.
subset.obsm['X_umap_raw_counts'] = subset2.obsm['X_umap']

In [None]:
subset.obsm['X_umap'] = subset.obsm['X_umap_resolvi']

In [None]:
sc.pl.umap(subset, color=["cluster", "health"], ncols=1, save='umap_liver_nanostring.pdf')

In [None]:
sc.pl.spatial(subset[subset.obs['health']=='cancer'],
              color=['S100A6', 'TGFB1'], spot_size=0.03, vmin=0.01, layer='generated_expression',
              vmax=[20, 0.05])
sc.pl.spatial(subset[subset.obs['health']=='normal'],
              color=['S100A6', 'TGFB1'], spot_size=0.03, vmin=0.01, layer='generated_expression',
              vmax=[20, 0.05])

In [None]:
subset.write_h5ad(f'{path}liver_nanostring/baysor_filtered_lq_all_cells.h5ad')

In [None]:
# Neighbour graph (30 neighbours) on the resolVI representation, then Louvain clustering.
# NOTE(review): this cell reads obsm['X_resolvi'] while other cells read obsm['X_resolVI'];
# obsm keys are case-sensitive — confirm that the loaded file really contains this key.
sc.pp.neighbors(subset, use_rep='X_resolvi', n_neighbors=30, method='rapids')
sc.tl.louvain(subset, resolution=0.6, flavor='rapids')
sc.pl.umap(subset, color='louvain')

In [None]:
tumor_cells = subset[np.logical_and(subset.obs['louvain'].isin(['6', '4', '9', '5', '12']), subset.obs['health']=='cancer')]
sc.pl.umap(tumor_cells, color=['louvain'], ncols=10)

In [None]:
tumor_cells.write_h5ad(f'figure3/tumor_cells.h5ad')

In [None]:
import hotspot

In [None]:
tumor_cells = sc.read_h5ad(f'figure3/tumor_cells.h5ad')

In [None]:
# Keep genes whose summed generated expression over all tumour cells exceeds 50.
# The column sums are flattened to a 1-D array so the mask has the same shape for sparse
# and dense layers, and an explicit copy is taken because later cells add obs columns to
# (and delete uns entries from) this object, which should not happen on an AnnData view.
gene_totals = np.asarray(tumor_cells.layers['generated_expression'].sum(0)).ravel()
tumor_cells_filtered = tumor_cells[:, gene_totals > 50].copy()

In [None]:
import hotspot
# Hotspot analysis on the 'raw_counts' layer, using the spatial coordinates as the latent space.
hs2 = hotspot.Hotspot(
    tumor_cells_filtered,
    layer_key="raw_counts",
    model='danb',
    latent_obsm_key="X_spatial",
    umi_counts_obs_key="total_counts"
)
# Unweighted k-nearest-neighbour graph with 100 neighbours per cell.
hs2.create_knn_graph(weighted_graph=False, n_neighbors=100)
hs2_results = hs2.compute_autocorrelations()
# Only genes whose column C in the autocorrelation results exceeds 0.1 enter the pairwise step.
hs2_genes = hs2_results.loc[hs2_results.C > 0.1].index
local_correlations = hs2.compute_local_correlations(hs2_genes, jobs=10)

In [None]:
modules = hs2.create_modules(
    min_gene_threshold=10, core_only=True, fdr_threshold=1e-3
)
module_scores = hs2.calculate_module_scores()
hs2.plot_local_correlations(vmin=-100, vmax=100)
plt.show()

In [None]:
for i in module_scores.columns:
    tumor_cells_filtered.obs[f'hotspot_raw_{i}'] = module_scores[i]
sc.pl.spatial(tumor_cells_filtered, color=['hotspot_raw_1', f'hotspot_raw_2',  'hotspot_raw_3', 'cluster'], spot_size=0.03, vmax='p98')
sc.pl.umap(tumor_cells_filtered, color=['hotspot_raw_1', f'hotspot_raw_2',  'hotspot_raw_3', 'cluster'], vmax='p98')

In [None]:
import hotspot
hs = hotspot.Hotspot(
    tumor_cells_filtered,
    layer_key="generated_expression",
    model='danb',
    latent_obsm_key="X_spatial",
    umi_counts_obs_key="total_counts"
)
hs.create_knn_graph(weighted_graph=False, n_neighbors=100)
hs_results = hs.compute_autocorrelations()

In [None]:
hs_genes = hs_results.loc[hs_results.C > 0.15].index
local_correlations = hs.compute_local_correlations(hs_genes, jobs=10)

In [None]:
modules = hs.create_modules(
    min_gene_threshold=10, core_only=True, fdr_threshold=1e-3
)
module_scores = hs.calculate_module_scores()
hs.plot_local_correlations(vmin=-100, vmax=100)
plt.show()

In [None]:
for i in module_scores.columns:
    tumor_cells_filtered.obs[f'hotspot_{i}'] = module_scores[i]
sc.pl.spatial(tumor_cells_filtered, color=['hotspot_1', f'hotspot_2',  'cluster'], spot_size=0.03, vmax='p98')
sc.pl.umap(tumor_cells_filtered, color=['hotspot_1', f'hotspot_2', 'cluster'], vmax='p98', save='hotspot_modules_cancer.pdf')

In [None]:
del tumor_cells_filtered.uns['cluster_colors']

In [None]:
def _assign_tumor_niche(module_2_score, module_1_score):
    """Label one cell from its two Hotspot module scores (module 2 takes precedence)."""
    if module_2_score > 1.5:
        return 'anti-inflammatory'
    if module_1_score > 1:
        return 'pro-inflammatory'
    return 'other'


# Threshold the module scores into a per-cell niche label.
tumor_cells_filtered.obs['tumor_niches'] = [
    _assign_tumor_niche(score_2, score_1)
    for score_2, score_1 in zip(tumor_cells_filtered.obs['hotspot_2'], tumor_cells_filtered.obs['hotspot_1'])
]
# Same four panels in tissue space and on the UMAP.
sc.pl.spatial(
    tumor_cells_filtered,
    color=['hotspot_1', 'hotspot_2', 'tumor_niches', 'cluster'],
    spot_size=0.02,
    vmax='p98',
    save='hotspot_modules_cancer.pdf',
)
sc.pl.umap(
    tumor_cells_filtered,
    color=['hotspot_1', 'hotspot_2', 'tumor_niches', 'cluster'],
    vmax='p98',
    save='hotspot_modules_cancer.pdf',
)

In [None]:
# Copy the niche labels and module scores from the tumour-cell object to the full data set.
# Assignment aligns on the cell index, so cells missing from tumor_cells_filtered get NaN.
subset.obs['tumor_niches'] = tumor_cells_filtered.obs['tumor_niches']
subset.obs['tumor_hotspot_1'] = tumor_cells_filtered.obs['hotspot_1']
subset.obs['tumor_hotspot_2'] = tumor_cells_filtered.obs['hotspot_2']

# Label every cell without a niche as 'other_cells' and store the result.
# (The previous `astype(str).fillna(..., inplace=True)` operated on a temporary Series and
# left the column unchanged.) Casting to object first allows filling with a new label even
# when the column is categorical.
subset.obs['tumor_niches'] = (
    subset.obs['tumor_niches'].astype(object).fillna('other_cells').astype('category')
)

In [None]:
sc.pl.umap(subset, color=['tumor_hotspot_1', 'tumor_hotspot_2', 'tumor_niches'], vmax='p99', save='tumor_niches.pdf')

In [None]:
subset.write_h5ad(f'figure3/final_module_scores.h5ad')

In [None]:
# Work on a COPY of the raw counts: without it X and layers['raw_counts'] are the same
# object, and the normalisation / log1p below would overwrite the raw-count layer too.
subset.X = subset.layers['raw_counts'].copy()
sc.pp.normalize_total(subset)
sc.pp.log1p(subset)

In [None]:
sc.tl.rank_genes_groups(subset, groupby='tumor_niches', groups=['anti-inflammatory'], reference='pro-inflammatory')
sc.get.rank_genes_groups_df(subset, group='anti-inflammatory').head(100)

In [None]:
import decoupler as dc

In [None]:
subset.layers['raw_counts'] = subset.layers['counts'].copy()

In [None]:
# Differential expression between the two tumour niches with the loaded resolVI model,
# restricted to cells labelled 'anti-inflammatory' or 'pro-inflammatory'
# (group1 = anti-inflammatory, group2 = pro-inflammatory).
de_result_importance = resolvi.differential_expression(
    adata=subset[subset.obs['tumor_niches'].isin(['anti-inflammatory', 'pro-inflammatory'])], groupby='tumor_niches', group1='anti-inflammatory', group2='pro-inflammatory', weights='importance',
    pseudocounts=1e-3, delta=0.05, filter_outlier_cells=True, mode='change', test_mode='three'#, batch_correction=True, batchid1=batch_index, batchid2=batch_index,
)

In [None]:
de_result_importance.head(50)

In [None]:
# Volcano plot of the resolVI DE result: mean log fold change on x, 'proba_not_de' on y.
# NOTE(review): 'proba_not_de' is passed where decoupler takes the significance column, with
# sign_thr=0.1 as its cut-off — confirm this column is meant to be read like a p-value.
dc.plot_volcano_df(
    de_result_importance,
    x='lfc_mean',
    y='proba_not_de',
    sign_thr=0.1,
    lFCs_thr=0.4,
    top=30,
    figsize=(10, 10),
    save='figure3/tumor_niches_resolvi_de.pdf'
)
plt.show()

In [None]:
sc.tl.louvain(subset)
sc.pl.umap(subset, color=["louvain"], ncols=1, legend_loc='on data', size=1)

In [None]:
macrophage = subset[subset.obs['louvain'].isin(['2', '19'])]

In [None]:
sc.pp.neighbors(macrophage, use_rep='X_resolvi')
sc.tl.louvain(macrophage, resolution=0.6)
sc.tl.umap(macrophage)
sc.pl.umap(macrophage, color=['louvain', 'cluster'])

In [None]:
sc.tl.rank_genes_groups(macrophage, groupby='louvain', use_raw=False)
sc.pl.rank_genes_groups_dotplot(macrophage, layer='counts', n_genes=5, use_raw=False, standard_scale='var')

In [None]:
macrophage = macrophage[~macrophage.obs['louvain'].isin(['9', '6'])]

In [None]:
# Translate Louvain cluster ids into cell-type names. Ids '6' and '9' have no entry; those
# clusters were removed from `macrophage` in the previous cell.
# NOTE(review): the ids depend on the clustering run above — re-check the mapping whenever
# the clustering is recomputed.
macrophage.obs['celltypes'] = macrophage.obs['louvain'].astype('category').replace({
    '0': 'Macrophage', '1': 'Macrophage', '2': 'Macrophage', '3': 'SPP1 Macrophage', '4': 'Kupffer cell', '5': 'Pro-inflammatory', '7': 'Kupffer cell', '8': 'Neutrophil'})

In [None]:
sc.pp.neighbors(macrophage, use_rep='X_resolvi')
sc.tl.umap(macrophage)
sc.pl.umap(macrophage, color=['louvain', 'celltypes'])

In [None]:
macrophage.obs['celltypes'].value_counts()

In [None]:
sc.pl.umap(macrophage,
           color=['CD74', 'HLA-DPA1', 'MALAT1', 'CIITA', 'ZBTB16', 'CD5L', 'CD163', 'MARCO', 'MIF', 'ENO1', 'SPP1', 'S100A9', 'S100A8', 'S100A6', 'C1QB',
                 'IFI27', 'IFI44L', 'IFI6', 'IFIH1', 'IFIT1', 'IFIT3', 'IFITM1','IFITM3', 'TGFB1', 'TGFB2', 'TGFB3', 'TGFBI'],
           layer='raw_counts', vmax='p98', size=4)

In [None]:
subset.X = subset.layers['generated_expression'].copy()
sc.pp.normalize_total(subset)

In [None]:
macrophage.X = macrophage.layers['generated_expression'].copy()
sc.pp.normalize_total(macrophage)

In [None]:
sc.pl.umap(macrophage,
           color=['MALAT1', 'C1QB', 'CD74', 'HLA-DPA1', 'CD5L', 'CD163', 'MARCO', 'MIF', 'ENO1', 'SPP1', 'S100A9', 'S100A8',
                 'IFI27', 'IFI6', 'CXCL9', 'TGFB1'],
           vmax='p99', size=4, save='marker_macrophages.pdf')

In [None]:
sc.pl.spatial(macrophage[macrophage.obs['health']=='cancer'],
              color=['MALAT1', 'C1QB', 'CD74', 'HLA-DPA1', 'CD5L', 'CD163', 'MARCO', 'MIF', 'ENO1', 'SPP1', 'S100A9', 'S100A8',
                 'IFI27', 'IFI6', 'S100A6', 'TGFB1'], spot_size=0.09, vmin=0.01, save='marker_macrophages_cancer.pdf',
              vmax=[20, 8, 20, 10, 7, 4, 7, 1.5, 2, 25, 1, 2, 0.5, 2, 1, 0.2])
sc.pl.spatial(macrophage[macrophage.obs['health']=='normal'],
              color=['MALAT1', 'C1QB', 'CD74', 'HLA-DPA1', 'CD5L', 'CD163', 'MARCO', 'MIF', 'ENO1', 'SPP1', 'S100A9', 'S100A8',
                 'IFI27', 'IFI6', 'S100A6', 'TGFB1'], spot_size=0.09, vmin=0.01, save='marker_macrophages_healthy.pdf',
              vmax=[20, 8, 20, 10, 7, 4, 7, 1.5, 2, 25, 1, 2, 0.5, 2, 1, 0.2])

In [None]:
import pandas as pd
import numpy as np

def compute_log_ratio(adata, condition_key, condition1, condition2, celltype_key):
    """
    Log2-ratio of cell-type frequencies, condition1 over condition2.

    For each condition the cells are selected via ``adata.obs[condition_key]``, the cell
    types in ``adata.obs[celltype_key]`` are counted and divided by the number of selected
    cells. Cell types missing in one condition get frequency 0, and 1e-6 is added to
    both frequencies before taking the ratio so the logarithm stays finite.

    Parameters:
    -----------
    adata : AnnData
        Object whose ``obs`` holds the condition and cell-type columns.
    condition_key : str
        Column in adata.obs holding the condition.
    condition1 : str
        Condition in the numerator of the ratio.
    condition2 : str
        Condition in the denominator of the ratio.
    celltype_key : str
        Column in adata.obs holding the cell type.

    Returns:
    --------
    pd.DataFrame
        Columns "cell_type" and "log2_ratio", sorted by descending log2-ratio.
    """
    def _celltype_frequencies(condition):
        # Fraction of the cells of one condition that belongs to each cell type
        selected = adata[adata.obs[condition_key] == condition]
        return selected.obs[celltype_key].value_counts() / len(selected)

    numerator = _celltype_frequencies(condition1)
    denominator = _celltype_frequencies(condition2)

    # Put both series on the union of observed cell types; absent types count as 0
    celltypes = numerator.index.union(denominator.index)
    numerator = numerator.reindex(celltypes, fill_value=0)
    denominator = denominator.reindex(celltypes, fill_value=0)

    # The small constant keeps the ratio finite when a frequency is 0
    pseudo = 1e-6
    ratio = np.log2((numerator + pseudo) / (denominator + pseudo))

    table = pd.DataFrame({"cell_type": celltypes, "log2_ratio": ratio})
    return table.sort_values(by="log2_ratio", ascending=False)

# Example usage:
log_ratios = compute_log_ratio(macrophage, condition_key="health", condition1="cancer", condition2="normal", celltype_key="celltypes")
print(log_ratios)

In [None]:
normal = macrophage

In [None]:
# RGBA colour per macrophage-compartment cell type; the fourth value is the opacity.
custom_palette = {
    'Macrophage': (44/255, 160/255, 44/255, 0.4),       # green with 40% opacity
    'SPP1 Macrophage': (31/255, 119/255, 180/255, 0.4), # blue with 40% opacity
    'Kupffer cell': (255/255, 204/255, 0/255, 0.6),     # yellow with 60% opacity
    'Pro-inflammatory': (148/255, 0/255, 211/255, 0.8), # purple with 80% opacity
    'Neutrophil': (214/255, 39/255, 40/255, 1.0),       # red with 100% opacity
}

In [None]:
sc.pl.spatial(macrophage[macrophage.obs['health']=='normal'],
              color=['celltypes'], spot_size=0.1, vmin=0.3, vmax='p99', save='celltypes_macrophages_healthy.pdf', palette=custom_palette)
sc.pl.spatial(macrophage[macrophage.obs['health']=='cancer'],
              color=['celltypes'], spot_size=0.1, vmin=0.3, vmax='p99', save='celltypes_macrophages_cancer.pdf', palette=custom_palette)
sc.pl.umap(macrophage, color=['celltypes'], vmax='p99', size=4, save='celltypes_macrophages.pdf', palette=custom_palette)

In [None]:
# Add a small constant (1e-5) to the densified TGFB1 column of X.
# NOTE(review): the assignment goes through a temporary AnnData view; presumably it writes
# through to `macrophage` — confirm. The next cell reloads `macrophage` from disk, so the
# change only survives if it is part of the saved file.
macrophage[:, 'TGFB1'].X = macrophage[:, 'TGFB1'].X.A + 1e-5

In [None]:
macrophage = sc.read_h5ad(f'figure3/macrophage.h5ad')

In [None]:
sc.pl.umap(macrophage, color=['health'], save='macrophage_health.pdf')

In [None]:
sc.pl.dotplot(macrophage, use_raw=False, standard_scale='var', dendrogram=False, groupby='celltypes',
    var_names=['MALAT1', 'C1QB', 'CD74', 'HLA-DPA1', 'CD5L', 'CD163', 'MARCO', 'MIF', 'ENO1', 'SPP1', 'S100A9', 'S100A8', 'IFI27', 'IFI6', 'CXCL9', 'TGFB1'], save='markers_macrophages.pdf')

In [None]:
macrophage.write_h5ad(f'figure3/macrophage.h5ad')
tumor_cells.write_h5ad(f'figure3/tumor_cells.h5ad')

In [None]:
tcell = subset[np.logical_and(subset.obs['louvain'].isin(['12']), subset.obs['true_transcripts']>30)]
sc.pp.neighbors(tcell, use_rep='X_resolVI', method='rapids', n_neighbors=20)
sc.tl.umap(tcell)
sc.tl.louvain(tcell, resolution=0.3)
sc.pl.umap(tcell, color=['louvain', 'cluster', 'health'], legend_loc='on data')

In [None]:
tcell = sc.read_h5ad(f'figure3/tcell.h5ad')

In [None]:
tcell_sub = tcell[~tcell.obs['louvain'].isin(['1', '3'])]
sc.pp.neighbors(tcell_sub, use_rep='X_resolVI', method='rapids', n_neighbors=20)
sc.tl.umap(tcell_sub)
sc.tl.louvain(tcell_sub, resolution=0.7)
sc.pl.umap(tcell_sub, color=['louvain', 'cluster', 'health'], legend_loc='on data')

In [None]:
sc.pl.umap(tcell_sub, color=['TIGIT', 'TNFRSF1B', 'TNFRSF9', 'RGS1', 'S100A4', 'LAG3', 'LGALS1', 'LGALS3', 'RBPJ', 'PFN1', 'VIM', 'HAVCR2', 'ICOS', 'IFIT3', 'IFI44L', 'IL18R1',
                          'IL2RA', 'CTLA4', 'CXCL13', 'CXCR3', 'FOXP3', 'GZMH', 'GZMK', 'NKG7', 'PRF1', 'CX3CR1'],
           sort_order=True, layer='corrected_counts', vmax='p98', size=30)

In [None]:
tcell_sub.obs['celltypes_re'] = tcell_sub.obs['louvain'].astype('category').replace({
    '0': 'low_quality', '1': 'TREG', '2': 'Central Memory', '3': 'Gamma-Delta', '4': 'CXCL13+', '5': 'CD8_GZMK', '6': 'NK cell', '7': 'Central Memory'})

In [None]:
tcell_sub.X = tcell_sub.layers['generated_expression'].copy()
sc.pp.normalize_total(tcell_sub)
sc.pp.log1p(tcell_sub)
sc.pl.dotplot(tcell_sub, var_names=['CD3D',  'CD3E',  'CD3G',  'CD4',  'CD8A',  'CD8B', 'IL7R', 'CCR7', 'RGS1', 'FOXP3', 'CTLA4', 'IL2RA', 'CXCR3', 'TNFRSF1B', 'GZMK', 'TIGIT', 'LAG3', 'RBPJ', 'HAVCR2', 'ICOS',
                          'CXCL13', 'GZMH', 'NKG7', 'PRF1', 'CX3CR1'], groupby='celltypes_re', standard_scale='var', smallest_dot=8., dendrogram=True, save='tcell_generated.pdf')

In [None]:
tcell_sub.X = tcell_sub.layers['raw_counts'].copy()
sc.pp.normalize_total(tcell_sub)
sc.pp.log1p(tcell_sub)
sc.pl.dotplot(tcell_sub, var_names=['CD3D',  'CD3E',  'CD3G',  'CD4',  'CD8A',  'CD8B', 'IL7R', 'CCR7', 'RGS1', 'FOXP3', 'CTLA4', 'IL2RA', 'CXCR3', 'TNFRSF1B', 'GZMK', 'TIGIT', 'LAG3', 'RBPJ', 'HAVCR2', 'ICOS',
                          'CXCL13', 'GZMH', 'NKG7', 'PRF1', 'CX3CR1'], groupby='celltypes_re', standard_scale='var', smallest_dot=8., dendrogram=True, save='tcell_raw.pdf')

In [None]:
tcell_sub.X = tcell_sub.layers['corrected_counts'].copy()
sc.pp.normalize_total(tcell_sub)
sc.pp.log1p(tcell_sub)
sc.pl.dotplot(tcell_sub, var_names=['CD3D',  'CD3E',  'CD3G',  'CD4',  'CD8A',  'CD8B', 'IL7R', 'CCR7', 'RGS1', 'FOXP3', 'CTLA4', 'IL2RA', 'CXCR3', 'TNFRSF1B', 'GZMK', 'TIGIT', 'LAG3', 'RBPJ', 'HAVCR2', 'ICOS',
                          'CXCL13', 'GZMH', 'NKG7', 'PRF1', 'CX3CR1'], groupby='celltypes_re', standard_scale='var', smallest_dot=8., dendrogram=True, save='tcell_corrected.pdf')

In [None]:
sc.pl.umap(tcell_sub, color=['celltypes_re', 'health', 'cluster'], save='tcell_celltypes.pdf')

In [None]:
sc.pl.spatial(tcell_sub[tcell_sub.obs['health']=='normal'], groups=['TREG', 'CD8_GZMK', 'CXCL13+', 'Central Memory', 'Gamma-Delta', 'NK cell'],
              color=['celltypes_re'], spot_size=0.1, vmin=0.3, vmax='p99', save='celltypes_tcell_healthy.pdf')
sc.pl.spatial(tcell_sub[tcell_sub.obs['health']=='cancer'], groups=['TREG', 'CD8_GZMK', 'CXCL13+', 'Central Memory', 'Gamma-Delta', 'NK cell'],
              color=['celltypes_re'], spot_size=0.1, vmin=0.3, vmax='p99', save='celltypes_tcell_cancer.pdf')

In [None]:
sc.pl.spatial(tcell_sub[tcell_sub.obs['health']=='cancer'], groups=['TREG', 'CXCL13+', 'CD8_GZMK'],
              color=['FOXP3', 'IL7R', 'CXCL13', 'IL2RA', 'CXCR3', 'PRF1', 'GNLY', 'LAG3', 'CXCR5'], spot_size=0.1, layer='raw_counts', vmax='p99', vmin=0)

In [None]:
tcell.write_h5ad(f'figure3/tcell.h5ad')
subset.write_h5ad(f'figure3/nanostring_finished_baysor.h5ad')

## sc Reference

In [None]:
import json

In [None]:
# Rebind `subset` to the object stored under `path`; this replaces whatever
# `subset` referred to earlier in the notebook.
subset = sc.read_h5ad(f'{path}liver_nanostring/baysor_filtered_lq_all_cells.h5ad')

In [None]:
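# One-off download of the single-cell reference (kept commented out).
# curl's `-o <file>` names the output file; `-O` takes no argument.
# NOTE(review): the next cell reads 'sc_reference_liver.h5ad' — confirm the intended output path.
#!curl https://datasets.cellxgene.cziscience.com/0401c761-2112-4f10-ae7d-6d5e04b5e1a4.h5ad -o liver_nanostring/sc_reference.h5ad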

In [None]:
# Load the single-cell reference; the path is relative to the notebook's
# working directory.
sc_reference = sc.read('sc_reference_liver.h5ad')

In [None]:
# Use the matrix stored in `.raw` as X (presumably un-normalised counts — TODO confirm).
sc_reference.X = sc_reference.raw.X

In [None]:
# Remove the `.raw` attribute; its matrix was assigned to X in the cell above.
del sc_reference.raw

In [None]:
# Number of cells per `disease` label, inspected before filtering to 'normal'.
sc_reference.obs['disease'].value_counts()

In [None]:
# Keep only cells from normal tissue. Boolean indexing returns an AnnData
# *view*; the following cell adds an `.obs` column, so take an explicit copy
# here instead of relying on anndata's implicit view-to-actual conversion
# (which emits an ImplicitModificationWarning).
sc_reference = sc_reference[sc_reference.obs['disease'] == 'normal'].copy()
# Number of remaining cells per author-provided cell type.
sc_reference.obs['author_cell_type'].value_counts()

In [None]:
# Collapse the authors' fine-grained labels into coarse cell types. The table
# is written coarse -> author labels and inverted below. Author labels that are
# missing from the table map to NaN and are NOT removed by the filter.
coarse_to_author_labels = {
    'Hepatocyte': ['P-Hepato', 'C-Hepato', 'P-Hepato2', 'C-Hepato2', 'I-Hepato'],
    'Endothelial': ['cvLSEC', 'ppLSEC', 'cvEndo'],
    'Cholangiocyte': ['Chol', 'CholMucus'],
    'Fibroblast': ['Stellate', 'aStellate', 'Fibroblast', 'VSMC'],
    'Myeloid': ['Monocyte', 'Kupffer'],
    'Lympho': ['CD4T', 'lrNK'],
    'Bcell': ['AntiB'],
    'low quality': [
        'Hepato-Doublet',
        'cvLSEC-Doublet',
        'Stellate-Doublet',
        'Prolif',
        'Kupffer-Doublet',
        'Chol-Doublet',
        'Tcell-Doublet',
        'cvLSEC--Mac',
        'Chol--Stellate-Doublet',
        'Prolif-Mac',
        'Chol--Kupffer-Doublet',
    ],
}
author_to_coarse = {
    author_label: coarse_label
    for coarse_label, author_labels in coarse_to_author_labels.items()
    for author_label in author_labels
}
sc_reference.obs['coarse_ct'] = sc_reference.obs['author_cell_type'].astype(str).map(author_to_coarse)

# Drop every cell whose coarse label is 'low quality'.
is_low_quality = sc_reference.obs['coarse_ct'] == 'low quality'
sc_reference = sc_reference[~is_low_quality]

In [None]:
# Index the reference by the `feature_name` column and make both axes unique.
sc_reference.var_names = sc_reference.var['feature_name'].astype(str)
sc_reference.obs_names_make_unique()
sc_reference.var_names_make_unique()

# Restrict the reference to the genes it shares with `subset`
# (np.intersect1d returns the sorted unique intersection).
shared_genes = np.intersect1d(subset.var_names, sc_reference.var['feature_name'])
sc_reference = sc_reference[:, shared_genes].copy()

In [None]:
# Store a total-count-normalised copy of X in the 'counts' layer (X itself is
# left untouched) and expose it as a dense cells x genes DataFrame in `.obsm`.
sc_reference.layers['counts'] = sc_reference.X.copy()
sc.pp.normalize_total(sc_reference, layer='counts')
# `.toarray()` is used instead of the `.A` shorthand: it works for both scipy
# sparse matrices and sparse arrays.
sc_reference.obsm['counts'] = pd.DataFrame(
    sc_reference.layers['counts'].toarray(),
    columns=sc_reference.var_names,
    index=sc_reference.obs_names,
)

In [None]:
# Run the project helper over every gene of the reference using the 'counts'
# data; all figures are closed afterwards instead of being displayed.
_utils.double_positive_pmm(
    sc_reference,
    sc_reference.var_names,
    layer_key="counts",
    output_dir='figure3',
)
plt.close('all')

In [None]:
# Attach the coarse cell type to the per-cell table stored under
# `obsm['positive_pmm_counts']` (presumably written by
# `_utils.double_positive_pmm` — TODO confirm) and average every gene column
# within each cell type.
pmm_positive = sc_reference.obsm['positive_pmm_counts']
pmm_positive['celltype'] = sc_reference.obs['coarse_ct']
per_celltype_positive = pmm_positive.groupby('celltype').mean()

In [None]:
# Select genes that mark exactly one cell type: the per-cell-type mean must be
# above MARKER_HIGH in a single cell type and below MARKER_LOW in all others.
# The thresholds are named once so the selection test and the cell-type lookup
# cannot drift apart (the lookup previously used 0.1 while the test used 0.2).
MARKER_HIGH = 0.2
MARKER_LOW = 0.05

celltype_gene_dict = {}
for gene in per_celltype_positive.columns:
    fractions = per_celltype_positive[gene]
    is_high = fractions > MARKER_HIGH
    is_low = fractions < MARKER_LOW
    # Exactly one cell type is high and every other cell type is low.
    if is_high.sum() == 1 and is_low.sum() == len(fractions) - 1:
        celltype = fractions.index[is_high][0]
        celltype_gene_dict.setdefault(celltype, []).append(gene)

In [None]:
# Alias the derived markers as `marker_dict` (same dict object, not a copy)
# and display them.
marker_dict = celltype_gene_dict
marker_dict

In [None]:
import json

# Save the current marker dictionary as JSON next to the figure outputs.
with open('figure3/celltype_markers_sc_ref_extended.json', 'w') as marker_file:
    json.dump(marker_dict, marker_file)

In [None]:
import json

# Reload the marker dictionary from the JSON file written above.
with open('figure3/celltype_markers_sc_ref_extended.json', 'r') as marker_file:
    marker_dict = json.load(marker_file)

In [None]:
# Hard-coded marker genes per coarse cell type. This assignment replaces
# whatever `marker_dict` held before this cell ran.
marker_dict = {
    'Myeloid': [
        'ADGRE2', 'C1QA', 'C1QB', 'C1QC', 'CD163',
        'CD68', 'CD86', 'CLEC7A', 'CMKLR1', 'CSF1R',
        'CSF3R', 'FPR1', 'GPNMB', 'HCK', 'IL1R2',
        'LYZ', 'MARCO', 'MS4A4A', 'MSR1', 'TLR2',
    ],
    'Endothelial': [
        'ADGRL4', 'CD9', 'CLEC1A', 'FLT1', 'FZD4',
        'IL33', 'KDR', 'NPR1', 'RAMP3', 'TIE1',
    ],
    'Fibroblast': [
        'ANGPT1', 'BGN', 'BMP5', 'CACNA1C', 'CARMN',
        'CDH19', 'COL12A1', 'COL14A1', 'COL1A1', 'COL1A2',
        'COL3A1', 'COL6A3', 'EPHA3', 'HGF', 'IGFBP5',
        'MYL9', 'PDGFRA', 'RAMP1', 'VEGFC',
    ],
    'Cholangiocyte': [
        'CASR', 'CCL28', 'CD24', 'IL20RA', 'ITGA2',
        'ITGB8', 'KRT7', 'SPP1',
    ],
    'Lympho': ['CCL5', 'CD2', 'IL7R', 'ITK', 'KLRF1', 'PRF1'],
    'Bcell': [
        'CD27', 'IGHA1', 'IGHG1', 'IGHM', 'IGKC',
        'JCHAIN', 'MZB1', 'SELL', 'TNFRSF13B', 'WNT5B',
    ],
}

In [None]:
# Flatten the per-cell-type marker lists into one list, keeping dictionary
# order. A comprehension avoids the repeated list copying of `sum(lists, [])`.
marker_list = [gene for genes in marker_dict.values() for gene in genes]

In [None]:
# Total-count normalise the listed layers of `subset` in place.
sc.pp.normalize_total(subset, layers=['generated_expression', 'raw_counts', 'corrected_counts', 'generated_expression_mean'])

# Expose the marker-gene columns of three layers as dense DataFrames in
# `.obsm`. Each pair is (obsm key, source layer). `.toarray()` is used instead
# of the `.A` shorthand: it works for both scipy sparse matrices and sparse arrays.
for obsm_key, layer_name in (
    ('counts', 'raw_counts'),
    ('generated_expression', 'generated_expression'),
    ('corrected_counts', 'corrected_counts'),
):
    subset.obsm[obsm_key] = pd.DataFrame(
        subset[:, marker_list].layers[layer_name].toarray(),
        columns=marker_list,
        index=subset.obs_names,
    )

# Project helper, run separately on the normal and the cancer cells using the
# generated expression; each pair is (health value, file-name suffix).
for health_state, save_tag in (('normal', 'baysor_healthy'), ('cancer', 'baysor_cancer')):
    _utils.cosine_distance_celltype(
        subset[subset.obs['health'] == health_state],
        marker_dict,
        layer_key="generated_expression",
        output_dir='figure3',
        extra_save=save_tag,
        vmax=0.3,
    )
plt.show()

In [None]:
# Show up to 200 rows when displaying DataFrames.
pd.set_option('display.max_rows', 200)

In [None]:
# Show up to 200 columns when displaying DataFrames.
pd.set_option('display.max_columns', 200)

In [None]:
# Project helper on the marker genes, run separately for the normal and the
# cancer cells; each pair is (health value, output file tag).
for health_state, save_tag in (('normal', 'baysor_healthy'), ('cancer', 'baysor_cancer')):
    _utils.double_positive_pmm(
        subset[subset.obs['health'] == health_state],
        marker_list,
        marker_dict=marker_dict,
        layer_key="generated_expression",
        output_dir='figure3',
        file_save=save_tag,
    )
plt.show()

In [None]:
# Cells assigned to louvain cluster '7' (a view of `subset`).
# NOTE(review): the next cell rebinds `macrophage` from 'figure3/macrophage.h5ad',
# so this selection is discarded when the notebook runs top to bottom.
macrophage = subset[subset.obs['louvain']=='7']

In [None]:
# Load the saved macrophage object from the figure directory.
macrophage = sc.read_h5ad('figure3/macrophage.h5ad')

In [None]:
# Display the per-cell metadata table of the macrophage object.
macrophage.obs

In [None]:
# SPP1 vs CD68 (generated_expression layer) for cells of the 'Cholangiocytes'
# cluster, coloured by tissue condition. The original selection was a syntax
# error (`np.logical_and(` closed by `]`, with a single operand); a plain
# boolean mask expresses the only condition that was present.
ax = sc.pl.scatter(
    subset[subset.obs['cluster'] == 'Cholangiocytes'], x='SPP1', y='CD68',
    layers='generated_expression', use_raw=False, size=200, color='health', show=False)
ax.set_xlim(-5, 150)
ax.set_ylim(-1, 30)
plt.savefig('figure3/scatter_spp1_cd68_cholangiocytes.pdf')
plt.show()

In [None]:
# SPP1 vs CD68 (generated_expression layer) for the macrophage object,
# coloured by tissue condition; axis limits match the cholangiocyte panel.
ax = sc.pl.scatter(
    macrophage,
    x='SPP1',
    y='CD68',
    layers='generated_expression',
    use_raw=False,
    size=200,
    color='health',
    show=False,
)
ax.set_ylim(-1, 30)
ax.set_xlim(-5, 150)
plt.savefig('figure3/scatter_spp1_cd68_macrophages.pdf')
plt.show()

In [None]:
# SPP1 vs CD68 (generated_expression layer) in normal tissue, coloured by
# cluster and highlighting four selected clusters.
# NOTE(review): the output file name mentions EPCAM/VWF although the plot shows
# SPP1/CD68 — confirm the name is intended and does not overwrite another figure.
ax = sc.pl.scatter(
    subset[subset.obs['health'] == 'normal'],
    x='SPP1',
    y='CD68',
    layers='generated_expression',
    use_raw=False,
    size=200,
    color='cluster',
    show=False,
    groups=['Cholangiocytes', 'Non.inflammatory.macrophages', 'Inflammatory.macrophages', 'tumor_2'],
)
ax.set_ylim(-1, 15)
ax.set_xlim(-0.3, 40)
plt.savefig('figure3/scatterepcam_vwf_generated_dp.pdf')
plt.show()

In [None]:
# Same project helper as for the spatial data, applied to the single-cell
# reference using its 'counts' data and the marker genes.
_utils.double_positive_pmm(
    sc_reference,
    marker_list,
    marker_dict=marker_dict,
    layer_key="counts",
    output_dir='figure3',
    file_save='single_cell',
)
plt.show()