In [None]:
import os

import anndata as ad
import decoupler as dc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanorama
import scanpy as sc
import seaborn as sns  # used by the box/strip plots near the end of the notebook

from wrapper_functions import *

In [None]:
# Print the versions of scanpy and its dependencies into the notebook output.
sc.logging.print_versions()
# Set scanpy's logging verbosity to level 3.
sc.settings.verbosity = 3

In [None]:
# Load the autoreload extension in mode 2, so imported modules are reloaded
# before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
# All locations are derived from the directory the notebook is started in.
root_path = os.getcwd()
analysis_name = 'Test01'

# <cwd>/analyzed/<analysis_name>/ (trailing slash included)
basepath = "/".join([root_path, 'analyzed', analysis_name, ''])
# <cwd>/results : every input and output of this notebook lives below this folder
results_folder = os.path.join(root_path, 'results')

In [None]:
global_clustering_folder = os.path.join(results_folder, 'global_clustering') 
adata=sc.read(os.path.join(global_clustering_folder , 'clustering_results_harmony.h5ad'))

In [None]:
adata.var_names

In [None]:
adata.obs['Sample_Layer'] = adata.obs['readout_id'].astype(str) + '-' + adata.obs['leiden'].astype(str)

In [None]:
# Collect the per-sample QC-filtered AnnData files.
# `os.listdir` returns entries in arbitrary, filesystem-dependent order, so the names
# are sorted: the order of `adata_list` (and of everything concatenated from it) is
# then the same on every machine and every run.
qc_filtered_folder = os.path.join(results_folder, 'qc_filtered')
file_names = sorted(
    f for f in os.listdir(qc_filtered_folder)
    if os.path.isfile(os.path.join(qc_filtered_folder, f))
)

# Only `.h5ad` files are loaded; any other file in the folder is ignored.
# `read_h5ad` is called explicitly because the generic `anndata.read` alias is
# deprecated in recent anndata releases.
adata_list = [
    ad.read_h5ad(os.path.join(qc_filtered_folder, file_name))
    for file_name in file_names
    if file_name.endswith('.h5ad')
]

In [None]:
# One key per entry of each sample's `uns["spatial"]` mapping, collected in the
# order of `adata_list`; these become the values of the new `library_id` column.
library_keys = [
    library_key
    for sample in adata_list
    for library_key in sample.uns["spatial"]
]

# Merge all samples into one object. `join='outer'` keeps the union of variables,
# and `index_unique="-"` appends the library key to every observation name.
adata_concat = sc.concat(
    adata_list,
    join='outer',
    label="library_id",
    keys=library_keys,
    index_unique="-",
    uns_merge="unique",
)

In [None]:
# Store `batch` as a categorical column.
# NOTE(review): the concat call uses label="library_id", so a `batch` column must
# already be present in the per-sample obs tables — confirm.
adata_concat.obs['batch']=adata_concat.obs['batch'].astype('category')
# Snapshot of the concatenated data in `.raw`; the pseudo-bulk calls read it (use_raw=True).
adata_concat.raw = adata_concat.copy()


In [None]:
# Copy the Leiden labels over from the clustered object `adata`, looked up by observation name.
# NOTE(review): assumes every observation name of `adata_concat` also exists in `adata` — confirm.
adata_concat.obs['leiden']=adata[adata_concat.obs.index].obs['leiden'].copy()

In [None]:
# Get pseudo-bulk profile
# One profile per (readout_id, leiden) pair, built by summing (mode='sum') the values
# stored in `.raw`. Both thresholds are 0 here, i.e. no profile is filtered out yet;
# this unfiltered version is what the QC plot in the next cell shows.
pdata = dc.get_pseudobulk(
    adata_concat,
    sample_col='readout_id',
    groups_col='leiden',
    # layer='counts',
    mode='sum',
    min_cells=0,
    min_counts=0,
    use_raw=True
)
pdata

In [None]:
# QC overview of the unfiltered pseudo-bulk profiles, grouped by sample, cluster and batch.
dc.plot_psbulk_samples(pdata, groupby=['readout_id','leiden','batch'], figsize=(10, 8))

In [None]:
# Get filtered pseudo-bulk profile
# Same aggregation as above, now with min_cells=5 and min_counts=500 as quality
# thresholds. This REPLACES the unfiltered `pdata`; all later cells use the filtered one.
pdata = dc.get_pseudobulk(
    adata_concat,
    sample_col='readout_id',
    groups_col='leiden',
    # layer='counts',
    mode='sum',
    min_cells=5,
    min_counts=500,
    use_raw=True
)
pdata

In [None]:
# Work on a copy so the summed counts in `pdata` stay untouched for DESeq2.
pp_pdata = pdata.copy()
# Scale each profile to a total of 1e6, log1p-transform, standardise each gene
# (values clipped at 10) and compute the first 10 principal components.
sc.pp.normalize_total(pp_pdata, target_sum=1e6)
sc.pp.log1p(pp_pdata)
sc.pp.scale(pp_pdata, max_value=10)
sc.tl.pca(pp_pdata, n_comps=10)

In [None]:
# PCA of the pseudo-bulk profiles, coloured by sample.
sc.pl.pca(pp_pdata, color=['readout_id'], ncols=1, show=True, size=300)

In [None]:
# Same embedding, coloured by treatment.
sc.pl.pca(pp_pdata, color=['treatment_id'], ncols=1, show=True, size=300)

In [None]:
# Same embedding, coloured by Leiden cluster.
sc.pl.pca(pp_pdata, color=['leiden'], ncols=1, show=True, size=300)

In [None]:
# Associate the listed metadata columns with the principal components in `obsm['X_pca']`.
# The result is written into `pp_pdata.uns['pca_anova']` (inplace=True) and is read by
# the association plot in the next cell.
dc.get_metadata_associations(
    pp_pdata,
    obs_keys = ['readout_id', 'individual_id','treatment_id', 'leiden', 'psbulk_n_cells', 'psbulk_counts'], #metadata columns to associate to PCs
    obsm_key='X_pca',  
    uns_key='pca_anova', 
    inplace=True
)

In [None]:
# Plot of the PC/metadata associations stored under `uns['pca_anova']`, annotated
# with treatment and cluster.
plt.figure(figsize=(7,10))
ax, legend_axes = dc.plot_associations(
    pp_pdata,
    uns_key='pca_anova',
    obsm_key='X_pca',
    stat_col='p_adj',
    obs_annotation_cols = ['treatment_id', 'leiden'],
    # Panel titles as rendered in the figure ("Principal" was misspelled "Principle").
    titles=['Adjusted p-values from ANOVA', 'Principal component scores']
)
plt.show()

#### Are there genes only expressed in one condition? 

In [None]:
conditionT = pdata[pdata.obs['treatment_id'] == 'FAP_LTBR', :]
conditionC = pdata[pdata.obs['treatment_id'] == 'Untreated', :]

In [None]:
expr_conditionT = np.mean(conditionT.X, axis=0)
expr_conditionC = np.mean(conditionC.X, axis=0)

In [None]:
# Calculate the absolute difference in expression between the two conditions
expression_difference = np.abs(expr_conditionT - expr_conditionC)

In [None]:
# Identify genes that are exclusively expressed
exclusive_conditionT = (expr_conditionT > 0) & (expr_conditionC == 0)
exclusive_conditionC = (expr_conditionC > 0) & (expr_conditionT == 0)

In [None]:
# Create a DataFrame for sorting and filtering
genes_data = pd.DataFrame({
    'Gene': pdata.var_names,
    'Expr_Condition1': expr_conditionT,
    'Expr_Condition2': expr_conditionC,
    'Expression_Difference': expression_difference
})

In [None]:
# Filter and sort genes exclusively expressed in Condition 1
exclusive_genes_conditionT = genes_data[exclusive_conditionT]
exclusive_genes_conditionT_sorted = exclusive_genes_conditionT.sort_values(by='Expression_Difference', ascending=False)

In [None]:
exclusive_genes_conditionT_sorted.to_csv(results_folder+'/DGE_ALL_exclusive_genes_conditionT.tsv',sep='\t')

In [None]:
# Filter and sort genes exclusively expressed in Condition 2
exclusive_genes_conditionC = genes_data[exclusive_conditionC]
exclusive_genes_conditionC_sorted = exclusive_genes_conditionC.sort_values(by='Expression_Difference', ascending=False)

In [None]:
exclusive_genes_conditionC_sorted.to_csv(results_folder+'/DGE_ALL_exclusive_genes_conditionC.tsv',sep='\t')

In [None]:
exclusive_genes_conditionT_sorted.head()

In [None]:
exclusive_genes_conditionC_sorted.head()

In [None]:
# Per-library spatial overview of Chst4 / Ptchd1 expression next to the Leiden clusters.
for library in adata.obs["readout_id"].unique().tolist():
    # Named `adata_library`, not `ad`: `ad` is this notebook's alias for the anndata
    # module (`import anndata as ad`). Rebinding it here broke the `ad.read(...)`
    # cell whenever it was re-run after this loop.
    adata_library = adata[adata.obs["readout_id"] == library, :].copy()
    print(adata_library.obs["readout_id"].unique()[0])
    print(adata_library.obs["treatment_id"].unique()[0])
    sc.pl.spatial(
        adata_library,
        img_key="hires",
        library_id=library,
        color=["Chst4", 'Ptchd1', 'leiden'],
        size=1.5, color_map = 'RdBu_r', use_raw=False)

# Violin plots per sample/cluster combination. The second key was a repeated "Chst4";
# Ptchd1 is used instead to match the spatial panels above.
# NOTE(review): confirm Ptchd1 was the intended second gene.
sc.pl.violin(adata, keys = ["Chst4", "Ptchd1"], groupby='Sample_Layer', rotation=90, use_raw=False)

### T versus C across all data

In [None]:
set(list(pdata.var.index)).intersection(set(['Glycam1']))

In [None]:
exclude=['V43J19-319_A1_B09','V43J24-078_D1_A06', 'V43J11-302_A1_B08',
         'V42D20-025_A1_B16', 'V42D20-002_D1_A02', 'V42D20-025_D1_A05']

In [None]:
set(pdata.obs.readout_id)

In [None]:
adata_T_C =  pdata[~pdata.obs['readout_id'].isin(exclude)].copy() #[(pdata.obs['leiden'] == 'White Matter')].copy()

In [None]:
set(adata_T_C.obs.readout_id)

In [None]:
#set(adata_T_C.var.index).intersection(set(plotgoi))

In [None]:
#dc.plot_filter_by_expr(adata_T_C, group='treatment_id', min_count=3, min_total_count=20)
# Visualise the expression filter. The thresholds here must stay identical to the
# ones passed to `dc.filter_by_expr` in the next cell, otherwise the plot does not
# describe the filter that is actually applied.
dc.plot_filter_by_expr(adata_T_C, group='treatment_id', min_count=0.2, min_total_count=2)

In [None]:
#genes
# Genes that pass the expression filter, with samples grouped by treatment.
genes = dc.filter_by_expr(adata_T_C, group='treatment_id', min_count=0.2, min_total_count=2)


#set(genes).intersection(set(plotgoi))

In [None]:
# Filter by these genes
# The cell reassigns `adata_T_C` from itself, so it permanently narrows the object.
adata_T_C = adata_T_C[:, genes].copy()
adata_T_C

In [None]:
#set(list(pdata.var.index)).intersection(set(['M']))

### Contrast between conditions

In [None]:
# Import DESeq2
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

In [None]:
# Build DESeq2 object
# Single-factor design on `treatment_id`, with 'Untreated' as the reference level.
dds = DeseqDataSet(
    adata=adata_T_C,
    design_factors="treatment_id",
    ref_level=['treatment_id', 'Untreated'],
    refit_cooks=True,
    n_cpus=16,
)

In [None]:
# Fit the DESeq2 model.
dds.deseq2()

In [None]:
# Inspect the design matrix; its column names are the coefficient names used below.
dds.obsm['design_matrix']

In [None]:
# Extract contrast between treated vs control
# NOTE(review): factor and level are spelled with hyphens here ('treatment-id',
# 'FAP-LTBR') although the obs column is `treatment_id` and the level is 'FAP_LTBR'
# elsewhere in this notebook. Presumably the installed pydeseq2 version rewrites
# underscores to hyphens — confirm against the design-matrix columns shown above.
stat_res = DeseqStats(dds, contrast=["treatment-id", 'FAP-LTBR', "Untreated"], n_cpus=16)

In [None]:
# Compute Wald test
stat_res.summary()

In [None]:
# Shrink LFCs
# The coefficient name has to match a design-matrix column exactly.
stat_res.lfc_shrink(coeff='treatment-id_FAP-LTBR_vs_Untreated')

In [None]:
# Extract results
# `results_df` (one row per gene) is the input of every downstream enrichment step.
results_df = stat_res.results_df
results_df.sort_values('stat')

In [None]:
# Output folder for every table and figure of the pseudo-bulk analysis
pseudobulk_folder = os.path.join(results_folder, 'pseudo_bulkResults')

# Create the folder on first use and report which of the two cases applied
if os.path.exists(pseudobulk_folder):
    print(f"Folder '{pseudobulk_folder}' already exists.")
else:
    os.makedirs(pseudobulk_folder)
    print(f"Folder '{pseudobulk_folder}' created.")

# Full DESeq2 result table of the treated-vs-untreated contrast
deseq_csv = os.path.join(pseudobulk_folder, 'FAP-LTBR_vs_Untreated.csv')
results_df.to_csv(deseq_csv)

In [None]:
# Volcano plot of the DESeq2 results (top 20 genes labelled), saved as PDF.
dc.plot_volcano_df(results_df, x='log2FoldChange', y='padj', top=20, 
                  save=os.path.join(pseudobulk_folder , 'ALL-FAP-LTBR_vs_Untreated_volcano.pdf'))

In [None]:
# Genes of interest. The identical list is assigned again further down, before the
# gene-of-interest heatmap; keep the two copies in sync.
plotgoi=['Glycam1', 'Meox1', 'Selp',  'Ms4a1', 'Cd79a', 'Igkc', 'Bcl2l14',
         'Fcer2a',  'H2-Ab1', 'H2-DMb1', 'H2-DMa', 'Cd274', 'Cd3d', 'Cd8a', 'Cd4', 
         'Sell','Tcf7', 'Slamf7', 'Pdcd1', 'Ifng', 'Gzmb', 'Gzma', 'Cxcl9', 'Cxcl10', 
         'Cxcl13',   'Il6', 'Il1b']

In [None]:
#set(adata.var.index).intersection(set(plotgoi))

In [None]:
# Genes of interest that passed the expression filter.
set(genes).intersection(set(plotgoi))

In [None]:
# Genes of interest that have a DESeq2 result.
set(results_df.index).intersection(set(plotgoi))

In [None]:
# Genes found only in the treated condition.
set(exclusive_genes_conditionT_sorted.Gene)

In [None]:
# 1 x n_genes matrix of Wald statistics: the input format for the decoupler methods below.
# NOTE(review): the row label starts with 'WM_' although the output files are
# prefixed 'ALL-'; the label may be left over from a White-Matter-only run — confirm.
mat = results_df[['stat']].T.rename(index={'stat': 'WM_FAP-LTBR_vs_Untreated'})
mat

In [None]:
# Per-library spatial view of Hvcn1 expression next to the Leiden clusters.
for library in adata.obs["readout_id"].unique().tolist():
    # Named `adata_library`, not `ad`: `ad` is this notebook's alias for the anndata
    # module (`import anndata as ad`). Rebinding it here broke the `ad.read(...)`
    # cell whenever it was re-run after this loop.
    adata_library = adata[adata.obs["readout_id"] == library, :].copy()
    print(adata_library.obs["readout_id"].unique()[0])
    print(adata_library.obs["treatment_id"].unique()[0])
    sc.pl.spatial(
        adata_library,
        img_key="hires",
        library_id=library,
        color=["Hvcn1",'leiden'],
        size=1.5, color_map = 'RdBu_r', use_raw=False)

#sc.pl.violin(adata, keys = ["Hvcn1"], groupby='treatment_id', rotation=90, use_raw=False)

In [None]:
# Per-library spatial view of Glycam1 expression and the `HEV` annotation; one PDF per library.
for library in adata.obs["readout_id"].unique().tolist():
    # Named `adata_library`, not `ad`: `ad` is this notebook's alias for the anndata
    # module (`import anndata as ad`). Rebinding it here broke the `ad.read(...)`
    # cell whenever it was re-run after this loop.
    adata_library = adata[adata.obs["readout_id"] == library, :].copy()
    readout_label = adata_library.obs["readout_id"].unique()[0]
    print(readout_label)
    print(adata_library.obs["treatment_id"].unique()[0])
    sc.pl.spatial(
        adata_library,
        img_key="hires",
        library_id=library,
        color=["Glycam1",'HEV'],
        size=1.5, color_map = 'viridis', use_raw=False,
        save=readout_label+'MarkerOverview-Glycam1HEV.pdf')


In [None]:
# Per-library spatial view of Selp expression next to the Leiden clusters.
for library in adata.obs["readout_id"].unique().tolist():
    # Named `adata_library`, not `ad`: `ad` is this notebook's alias for the anndata
    # module (`import anndata as ad`). Rebinding it here broke the `ad.read(...)`
    # cell whenever it was re-run after this loop.
    adata_library = adata[adata.obs["readout_id"] == library, :].copy()
    print(adata_library.obs["readout_id"].unique()[0])
    print(adata_library.obs["treatment_id"].unique()[0])
    sc.pl.spatial(
        adata_library,
        img_key="hires",
        library_id=library,
        color=["Selp",'leiden'],
        size=1.5, color_map = 'RdBu_r',  use_raw=False)

# Selp expression across all observations, split by treatment.
sc.pl.violin(adata, keys = ["Selp"], groupby='treatment_id', rotation=90,  use_raw=False)

### Transcription factor activity inference

In [None]:
# Retrieve CollecTRI gene regulatory network
collectri = dc.get_collectri(organism='mouse', split_complexes=False)
collectri

In [None]:
# Infer transcription factor activities with ulm
# Input is `mat`, the single-row matrix of Wald statistics; the outputs are the
# activity scores and their p-values.
tf_acts, tf_pvals = dc.run_ulm(mat=mat, net=collectri)
tf_acts

In [None]:
tf_acts

In [None]:
#?dc.plot_barplot

In [None]:
# Export the TF activity scores.
tf_acts.to_csv(os.path.join(pseudobulk_folder , 'ALL-FAP-LTBR_vs_Untreated_TFact.tsv'),sep='\t')

In [None]:
# Bar plot of the top 25 TFs; `save` writes it to the given PDF path.
dc.plot_barplot(tf_acts, 'WM_FAP-LTBR_vs_Untreated', top=25, vertical=True, 
               save=os.path.join(pseudobulk_folder , 'ALL-FAP-LTBR_vs_Untreated_TFact.pdf'))

In [None]:
### Adjust plot for publication
# One row per TF with a single 'Activity' column. The table is named for what it
# holds (TF activities); it used to be called `pathway_acts_t`, the same name the
# pathway section assigns to its own, unrelated table.
tf_acts_t = tf_acts.transpose()
tf_acts_t.columns = ['Activity']

# Up to 25 TFs with the largest absolute activity, sorted so the most positive bar ends up on top.
top_25 = tf_acts_t.loc[tf_acts_t["Activity"].abs().nlargest(25).index]
top_25_sorted = top_25.sort_values(by="Activity", ascending=True)
fig, ax = plt.subplots(figsize=(5, 5))
# Red = positive activity, blue = zero or negative.
colors = ["red" if act > 0 else "blue" for act in top_25_sorted["Activity"]]
ax.barh(top_25_sorted.index, top_25_sorted["Activity"], color=colors)
ax.set_xlabel("Activity", fontsize=12)
ax.set_ylabel("TF", fontsize=12)
ax.set_title("Top 25 TFs by Activity", fontsize=10)
ax.axvline(0, color="black", linestyle="--", linewidth=0.8)  # Add a vertical line at Activity = 0
plt.tight_layout()
# NOTE(review): this is the same path the `dc.plot_barplot(..., save=...)` call for the
# TF activities writes to, so this figure replaces that file — confirm that is intended.
plt.savefig(os.path.join(pseudobulk_folder , 'ALL-FAP-LTBR_vs_Untreated_TFact.pdf'), format="pdf")
plt.show()

In [None]:
# Extract logFCs and pvals
# Both are reshaped to single-row frames with the same row label as `mat`, which is
# the layout `dc.plot_volcano` is called with below.
logFCs = results_df[['log2FoldChange']].T.rename(index={'log2FoldChange': 'WM_FAP-LTBR_vs_Untreated'})
pvals = results_df[['padj']].T.rename(index={'padj': 'WM_FAP-LTBR_vs_Untreated'})

# Plot
# Volcano plot for the CollecTRI source Irf1. The four cells below repeat the call for
# other TFs; only `name` changes.
dc.plot_volcano(logFCs, pvals, 'WM_FAP-LTBR_vs_Untreated', name='Irf1', net=collectri, top=10, sign_thr=0.05, lFCs_thr=0.5)

In [None]:
dc.plot_volcano(logFCs, pvals, 'WM_FAP-LTBR_vs_Untreated', name='Rfxap', net=collectri, top=10, sign_thr=0.05, lFCs_thr=0.5)

In [None]:
dc.plot_volcano(logFCs, pvals, 'WM_FAP-LTBR_vs_Untreated', name='Nfkb1', net=collectri, top=10, sign_thr=0.05, lFCs_thr=0.5)

In [None]:
dc.plot_volcano(logFCs, pvals, 'WM_FAP-LTBR_vs_Untreated', name='Rela', net=collectri, top=10, sign_thr=0.05, lFCs_thr=0.5)

In [None]:
dc.plot_volcano(logFCs, pvals, 'WM_FAP-LTBR_vs_Untreated', name='Srf', net=collectri, top=10, sign_thr=0.05, lFCs_thr=0.5)

### Pathway activity inference

In [None]:

# progeny = dc.get_progeny(organism='mouse', top=500)
# The PROGENy model is read from a local CSV instead of being downloaded.
# The path is relative to the notebook's working directory.
progeny = pd.read_csv("external_files/model_progeny500_mouse_decoupleR.csv")

In [None]:
# Infer pathway activities with mlm from the Wald statistics in `mat`.
pathway_acts, pathway_pvals = dc.run_mlm(mat=mat, net=progeny)

In [None]:
pathway_acts

In [None]:
# Export the pathway activity scores.
pathway_acts.to_csv(os.path.join(pseudobulk_folder , 'ALL-FAP-LTBR_vs_Untreated_Pathways.tsv'),sep='\t')

In [None]:
# Bar plot of the pathway activities; `save` writes it to the given PDF path.
dc.plot_barplot(pathway_acts, 'WM_FAP-LTBR_vs_Untreated', top=25, vertical=True, 
               save=os.path.join(pseudobulk_folder , 'ALL-FAP-LTBR_vs_Untreated_Pathways.pdf'))

In [None]:
### Adjust plot for publication
# Long format: one row per pathway, a single 'Activity' column
pathway_acts_t = pathway_acts.transpose()
pathway_acts_t.columns = ['Activity']

# Up to 25 pathways with the largest |activity|, ordered for a bottom-to-top bar chart
strongest = pathway_acts_t["Activity"].abs().nlargest(25).index
top_25 = pathway_acts_t.loc[strongest]
top_25_sorted = top_25.sort_values(by="Activity", ascending=True)

# Red marks positive activity, blue everything else
colors = top_25_sorted["Activity"].gt(0).map({True: "red", False: "blue"}).tolist()

fig, ax = plt.subplots(figsize=(5, 4))
ax.barh(top_25_sorted.index, top_25_sorted["Activity"], color=colors)
ax.axvline(0, color="black", linestyle="--", linewidth=0.8)  # reference line at Activity = 0
ax.set_title("Top 25 Pathways by Activity", fontsize=10)
ax.set_xlabel("Activity", fontsize=12)
ax.set_ylabel("Pathway", fontsize=12)
plt.tight_layout()

pathway_pdf = os.path.join(pseudobulk_folder, 'ALL-FAP-LTBR_vs_Untreated_Pathways.pdf')
plt.savefig(pathway_pdf, format="pdf")
plt.show()

In [None]:
# Target-gene plots for three PROGENy pathways, each saved as its own PDF.
# `source_name` must match a source in `progeny`; the Wald statistic is used as the gene-level score.
dc.plot_targets(results_df, stat='stat', source_name='JAK-STAT', net=progeny, top=15, 
               save=os.path.join(pseudobulk_folder , 'ALL-FAP-LTBR_vs_Untreated_Pathways-JAKSTAT.pdf'))

In [None]:
dc.plot_targets(results_df, stat='stat', source_name='NFkB', net=progeny, top=15, 
               save=os.path.join(pseudobulk_folder , 'ALL-FAP-LTBR_vs_Untreated_Pathways-NFKB.pdf'))

In [None]:
dc.plot_targets(results_df, stat='stat', source_name='TGFb', net=progeny, top=15, 
               save=os.path.join(pseudobulk_folder , 'ALL-FAP-LTBR_vs_Untreated_Pathways-TGFB.pdf'))

### Functional enrichment of biological terms

In [None]:
#msigdb = dc.get_resource('MSigDB', organism ='mouse')
# The gene sets are read from a local CSV instead of being downloaded. The path is
# relative to the notebook's working directory. Later cells use the columns
# `gene_symbol` and `gs_name`.
msigdb = pd.read_csv("external_files/msigdb_mouse_hallmark.csv")
msigdb

In [None]:
# Remove duplicated entries
msigdb = msigdb[~msigdb.duplicated(['gene_symbol', 'gs_name'])]

# Rename
msigdb.loc[:, 'geneset'] = [name.split('HALLMARK_')[1] for name in msigdb['gs_name']]

In [None]:
msigdb

In [None]:
#categories_1 = ['immunesigdb', 'cell_type_signatures']
#categories_2 = ['go_biological_process', 'reactome_pathways', 'hallmark']
#filtered_msigdb_1 = msigdb[msigdb['collection'].isin(categories_1)]
#filtered_msigdb_2 = msigdb[msigdb['collection'].isin(categories_2)]

In [None]:
#filtered_msigdb_1 = filtered_msigdb_1[~filtered_msigdb_1.duplicated(['genesymbol', 'geneset'])]
#filtered_msigdb_2 = filtered_msigdb_2[~filtered_msigdb_2.duplicated(['genesymbol', 'geneset'])]

In [None]:
# Infer enrichment with ora using significant deg
top_genes_TvsC = results_df[(results_df['padj'] < 0.05) & (results_df['stat'] > 0)]

# Run ora
enr_pvals = dc.get_ora_df(
    df=top_genes_TvsC,
    net=msigdb,
    source='geneset',
    target='gene_symbol',
    verbose= True
)

enr_pvals.head()

In [None]:
top_genes_TvsC

In [None]:
enr_pvals_filtered = enr_pvals[enr_pvals['FDR p-value'] < 0.001]

In [None]:
enr_pvals_filtered

In [None]:
dc.plot_dotplot(enr_pvals_filtered, x='Combined score', y = 'Term', s='Odds ratio', c = 'FDR p-value', scale = 0.3, 
                figsize=(5,3))

In [None]:
# Infer enrichment with ora using significant deg
top_genes_upCvsT = results_df[(results_df['padj'] < 0.05) & (results_df['stat'] < 0)]

# Run ora
enr_pvals = dc.get_ora_df(
    df=top_genes_upCvsT,
    net=msigdb,
    source='geneset',
    target='gene_symbol',
    verbose= True
)

enr_pvals.head()

In [None]:
top_genes_upCvsT

In [None]:
enr_pvals_filtered = enr_pvals[enr_pvals['FDR p-value'] < 0.05]

In [None]:
enr_pvals_filtered

In [None]:
dc.plot_dotplot(enr_pvals_filtered, x='Combined score', y = 'Term', s='Odds ratio', c = 'FDR p-value', 
                scale = 0.3, figsize=(4,2))

In [None]:
# Run GSEA (not ORA): all genes are ranked by their Wald statistic and tested
# against the Hallmark gene sets. This overwrites the `enr_pvals` of the ORA cells.
enr_pvals = dc.get_gsea_df(
    df=results_df,
    stat = 'stat',
    net=msigdb,
    source='geneset',
    target='gene_symbol',
    verbose= True
)

enr_pvals.sort_values('NES', ascending=False)

In [None]:
enr_pvals_filtered = enr_pvals[enr_pvals['FDR p-value'] < 0.01]

In [None]:
enr_pvals_filtered_color = enr_pvals_filtered.sort_values(by='NES', key=abs, ascending=True).head(25)

In [None]:
enr_pvals_filtered_color['color'] = np.where(enr_pvals_filtered_color['NES'] < 0, 'blue', 'red')

In [None]:
enr_pvals_filtered_color=enr_pvals_filtered_color.sort_values(by='NES', ascending=True)

In [None]:
enr_pvals_filtered_color

In [None]:
# Export the selected gene sets (including the helper `color` column).
enr_pvals_filtered_color.to_csv(os.path.join(pseudobulk_folder , 'ALL-FAP-LTBR_vs_Untreated_HallmarkGSEA.tsv'),sep='\t')

In [None]:
# Bar plot of NES per term, one colour per bar taken from the `color` column;
# `save` writes it to the given PDF path.
dc.plot_barplot_df(enr_pvals_filtered_color, x='NES', y= 'Term', figsize=(5, 4), 
                   color=enr_pvals_filtered_color.color.tolist(),  
                   save=os.path.join(pseudobulk_folder , 'ALL-FAP-LTBR_vs_Untreated_HallmarkGSEA.pdf'))

In [None]:
### alternative plot for publication

# Up to 25 terms with the largest |NES| from the already selected table, sorted from
# highest to lowest NES.
df_filtered = enr_pvals_filtered_color.loc[enr_pvals_filtered_color["NES"].abs().nlargest(25).index]
df_sorted = df_filtered.sort_values(by="NES", ascending=False)
fig, ax = plt.subplots(figsize=(7, 4))
# Bar colours come from the `color` column (blue = negative NES, red otherwise).
ax.barh(df_sorted["Term"], df_sorted["NES"], color=df_sorted["color"])
ax.set_xlabel("NES (Normalized Enrichment Score)", fontsize=10)
ax.set_ylabel("Term", fontsize=12)
ax.set_title("Top 25 Terms by NES (Absolute Value)", fontsize=10)
ax.invert_yaxis()  # Invert y-axis to have the largest NES at the top
plt.tight_layout()


# NOTE(review): same path as the `save=` argument of the `dc.plot_barplot_df` call for
# the Hallmark GSEA, so this figure replaces that file — confirm that is intended.
plt.savefig(os.path.join(pseudobulk_folder , 'ALL-FAP-LTBR_vs_Untreated_HallmarkGSEA.pdf'), format="pdf")
plt.show()

#### Custom signatures

In [None]:
import besca as bc

In [None]:
# Locate the signature files inside the installed besca package:
# `bescapath_full` is the package folder itself, `bescapath` its parent directory.
bescapath_full = os.path.dirname(bc.__file__)
bescapath = os.path.dirname(bescapath_full)

# Only the value "mouse" selects the ".mouse" signature files; any other value
# falls back to the files without a suffix.
species = "mouse"
conversion = None
sigsuffix = ".mouse" if species == "mouse" else ""

geneset_dir = bescapath + "/besca/datasets/genesets/"
## Provided with besca; change this for own gmt file
gmt_file_anno = f"{geneset_dir}CellNames_scseqCMs6_sigs{sigsuffix}.gmt"
## An extra set of signatures (less specific but informative) is also provided
gmt_file_anno_extra = f"{geneset_dir}CellNames_scseqCMs6_Extrasigs{sigsuffix}.gmt"

In [None]:
from itertools import repeat

# Read the two besca signature collections from their GMT files.
mymarkers = bc.tl.sig.read_GMT_sign(gmt_file_anno, directed=False)
mymarkers_extra = bc.tl.sig.read_GMT_sign(gmt_file_anno_extra, directed=False)


In [None]:
# Add a custom HEV endothelial signature. It is added BEFORE the filtering step
# below, so it goes through the same filtering as the besca signatures.
mymarkers['HEVEndothelial']=['Glycam1','Selp','Sele','Ackr1','Enpp6','Madcam1','Lipg','Enpp2','Cxcl1','Lifr','Serpina1b','Vwf','Syt15','Chst4','Fut7']

In [None]:
mymarkers['Bcell']

In [None]:

# optional conversion - if human-based signatures are read
# if species=='mouse':
#    for signature in mymarkers.keys():
#        mymarkers[signature] = [i for i in map(bc.tl.sig._helper._to_geneid, repeat(conversion), mymarkers[signature]) if i is not None]

# Both collections are filtered against `adata_concat` and reassigned in place.
mymarkers = bc.tl.sig.filter_siggenes(
    adata_concat, mymarkers
)  ### remove genes not present in dataset or empty signatures
mymarkers_extra = bc.tl.sig.filter_siggenes(adata_concat, mymarkers_extra)


In [None]:
genes=list()
sets=list()
for key in mymarkers:
    for gene in mymarkers[key]:
        genes.append(gene)
        sets.append(key)

In [None]:
mysigs=pd.DataFrame()

In [None]:
mysigs['gene_symbol']=genes
mysigs['geneset']=sets
mysigs['gs_name']=sets


In [None]:
mysigs.loc[mysigs.geneset.isin(['HEVEndothelial']),:]

In [None]:
# Run GSEA (not ORA): all genes are ranked by their Wald statistic and tested
# against the custom signature table `mysigs`. This overwrites `enr_pvals`.
enr_pvals = dc.get_gsea_df(
    df=results_df,
    stat = 'stat',
    net=mysigs,
    source='geneset',
    target='gene_symbol',
    verbose= True
)

enr_pvals.sort_values('NES', ascending=False)

In [None]:
enr_pvals

In [None]:
# Signatures with FDR below 0.05
enr_pvals_filtered = enr_pvals[enr_pvals['FDR p-value'] < 0.05]

# Top 25 signatures by |NES|, coloured by direction (blue = negative NES, red
# otherwise) and ordered by NES for plotting. `.assign` builds a new frame, so the
# colour column is never written into a slice of `enr_pvals` (SettingWithCopyWarning).
enr_pvals_filtered_color = (
    enr_pvals_filtered
    .sort_values(by='NES', key=abs, ascending=False)
    .head(25)
    .assign(color=lambda top: np.where(top['NES'] < 0, 'blue', 'red'))
    .sort_values(by='NES', ascending=True)
)

dc.plot_barplot_df(enr_pvals_filtered_color, x='NES', y= 'Term', figsize=(5, 3), 
                   color=enr_pvals_filtered_color.color.tolist())

In [None]:
#enr_pvals_filtered_color

In [None]:
# ORA of the custom signatures on the genes that are higher in FAP-LTBR
# (adjusted p < 0.1 and a positive Wald statistic)
significant_up = results_df['padj'].lt(0.1) & results_df['stat'].gt(0)
top_genes_TvsC = results_df.loc[significant_up]

# Run ora
enr_pvals = dc.get_ora_df(
    net=mysigs,
    source='geneset',
    target='gene_symbol',
    df=top_genes_TvsC,
    verbose=True,
)

enr_pvals.head()

top_genes_TvsC

# Keep only the signatures with FDR below 0.05
enr_pvals_filtered = enr_pvals.loc[enr_pvals['FDR p-value'].lt(0.05)]

enr_pvals_filtered

dc.plot_dotplot(
    enr_pvals_filtered,
    x='Combined score', y='Term',
    s='Odds ratio', c='FDR p-value',
    scale=0.3, figsize=(5,3),
)

In [None]:
# Infer enrichment with ora using significant deg
top_genes_upCvsT = results_df[(results_df['padj'] < 0.1) & (results_df['stat'] < 0)]

# Run ora
enr_pvals = dc.get_ora_df(
    df=top_genes_upCvsT,
    net=mysigs,
    source='geneset',
    target='gene_symbol',
    verbose= True
)

enr_pvals.head()

top_genes_upCvsT



In [None]:
enr_pvals_filtered = enr_pvals[enr_pvals['FDR p-value'] < 0.2]

enr_pvals_filtered

In [None]:


dc.plot_dotplot(enr_pvals_filtered, x='Combined score', y = 'Term', s='Odds ratio', c = 'FDR p-value', 
                scale = 0.3, figsize=(4,2))

In [None]:
# Read two further gene-set collections from local GMT files (paths are relative to
# the notebook's working directory). Going by the file names these are the MSigDB
# mouse GO biological process (m5.go.bp) and cell type signature (m8.all) collections.
gomarkers = bc.tl.sig.read_GMT_sign('external_files/m5.go.bp.v2023.2.Mm.symbols.gmt', directed=False)
cellmarkers = bc.tl.sig.read_GMT_sign('external_files/m8.all.v2023.2.Mm.symbols.gmt', directed=False)



In [None]:

gomarkers = bc.tl.sig.filter_siggenes(
    adata_concat, gomarkers
)  ### remove genes not present in dataset or empty signatures
cellmarkers = bc.tl.sig.filter_siggenes(adata_concat, cellmarkers)


# Long-format network table for the GO terms: one row per (gene, term) membership.
# The table is built directly from the dictionary instead of going through the
# notebook-level lists `genes` / `sets`; `genes` holds the `dc.filter_by_expr`
# result, and overwriting it changed what earlier cells return when re-run.
gosigs = pd.DataFrame(
    [(gene, term) for term in gomarkers for gene in gomarkers[term]],
    columns=['gene_symbol', 'geneset'],
)
# `gs_name` mirrors `geneset`, matching the columns the MSigDB table is used with.
gosigs['gs_name'] = gosigs['geneset']


In [None]:
genes=list()
sets=list()
for key in cellmarkers:
    for gene in cellmarkers[key]:
        genes.append(gene)
        sets.append(key)

cellsigs=pd.DataFrame()

cellsigs['gene_symbol']=genes
cellsigs['geneset']=sets
cellsigs['gs_name']=sets


In [None]:
gosigs

In [None]:
# Run ora
enr_pvals = dc.get_gsea_df(
    df=results_df,
    stat = 'stat',
    net=cellsigs,
    source='geneset',
    target='gene_symbol',
    verbose= True
)

enr_pvals.sort_values('NES', ascending=False)

enr_pvals

enr_pvals_filtered = enr_pvals[enr_pvals['FDR p-value'] < 0.01]

enr_pvals_filtered_color = enr_pvals_filtered.sort_values(by='NES', key=abs, ascending=False).head(25)

enr_pvals_filtered_color['color'] = np.where(enr_pvals_filtered_color['NES'] < 0, 'blue', 'red')

enr_pvals_filtered_color=enr_pvals_filtered_color.sort_values(by='NES', ascending=True)

enr_pvals_filtered_color


In [None]:

dc.plot_barplot_df(enr_pvals_filtered_color, x='NES', y= 'Term', figsize=(6, 8), 
                   color=enr_pvals_filtered_color.color.tolist())


In [None]:

# Infer enrichment with ora using significant deg
top_genes_TvsC = results_df[(results_df['padj'] < 0.01) & (results_df['stat'] > 0)]

# Run ora
enr_pvals = dc.get_ora_df(
    df=top_genes_TvsC,
    net=cellsigs,
    source='geneset',
    target='gene_symbol',
    verbose= True
)

enr_pvals.head()

top_genes_TvsC

enr_pvals_filtered = enr_pvals[enr_pvals['FDR p-value'] < 0.001]
enr_pvals_filtered = enr_pvals[enr_pvals['Combined score'] > 75]
enr_pvals_filtered



In [None]:
dc.plot_dotplot(enr_pvals_filtered, x='Combined score', y = 'Term', s='Odds ratio', c = 'FDR p-value', scale = 0.3, 
                figsize=(6,8))


In [None]:

# Infer enrichment with ora using significant deg
top_genes_upTvsC = results_df[(results_df['padj'] < 0.01) & (results_df['stat'] > 0)]

# Run ora
enr_pvals = dc.get_ora_df(
    df=top_genes_TvsC,
    net=gosigs,
    source='geneset',
    target='gene_symbol',
    verbose= True
)

enr_pvals.head()

top_genes_upCvsT

enr_pvals_filtered = enr_pvals[enr_pvals['FDR p-value'] < 0.001]
enr_pvals_filtered = enr_pvals[enr_pvals['Combined score'] > 300]

enr_pvals_filtered


In [None]:

dc.plot_dotplot(enr_pvals_filtered, x='Combined score', y = 'Term', s='Odds ratio', c = 'FDR p-value', 
                scale = 0.3, figsize=(6,8))

In [None]:
### Top 50 induced genes: significant (padj <= 0.01), ranked by log2 fold change
top50 = (
    results_df[results_df['padj'] <= 0.01]
    .sort_values('log2FoldChange', ascending=False)
    .head(50)
)
#results_df.sort_values('stat')
top50.head(50)

In [None]:
# Individuals to show, in display order.
indcat=['A2','A3','A4','A5','A8','B6','B7','B8','B16','B19']
# Subset of `adata` restricted to those individuals (no `.copy()` here, unlike the
# later `adata_sub` assignments).
adata_sub=adata[adata.obs.individual_id.isin(indcat)]

# Heatmap of the top 50 induced genes per individual, each gene scaled to [0, 1]
# (standard_scale='var') and the colour scale capped at 0.6.
# NOTE(review): unlike the gene-of-interest matrixplot further down, this call does
# not pass use_raw=False — confirm which data representation is intended.
sc.pl.matrixplot(adata_sub, var_names=list(top50.index), 
                                                  groupby='individual_id', standard_scale='var', vmax=0.6, 
                 categories_order=indcat, save='ALL-Top50induced.pdf')


In [None]:
adata.obs["readout_id"].unique().tolist()

In [None]:
# Per-library spatial panels for a set of marker genes; one PDF per library.
for library in adata.obs["readout_id"].unique().tolist():
    # Named `adata_library`, not `ad`: `ad` is this notebook's alias for the anndata
    # module (`import anndata as ad`). Rebinding it here broke the `ad.read(...)`
    # cell whenever it was re-run after this loop.
    adata_library = adata[adata.obs["readout_id"] == library, :].copy()
    readout_label = adata_library.obs["readout_id"].unique()[0]
    print(readout_label)
    print(adata_library.obs["treatment_id"].unique()[0])
    sc.pl.spatial(
        adata_library,
        img_key="hires",
        library_id=library,
        color=["Glycam1",'Meox1','Madcam1','Cxcl13','Chst4',
               'Cd274','Cd3e','Cd4','Cd8a','Tcf7','Slamf7', 'Fcer2a','Ms4a1',
               'Cxcl9','Cxcl10'],
        size=1.5,  use_raw=False, save=readout_label+'ALL-MultipleMarkerexpression.pdf')

#sc.pl.violin(adata, keys = ["Glycam1"], groupby='treatment_id', rotation=90,  use_raw=False)

In [None]:
# Individuals to show, in display order.
indcat=['A2','A3','A4','A5','A8','B6','B7','B8','B16','B19']
#adata_sub=adata[adata.obs.individual_id.isin(indcat)]
# Here the subset is defined by dropping the excluded readouts, rather than by
# selecting individuals as in the commented-out line.
adata_sub=adata[~adata.obs['readout_id'].isin(exclude)].copy()

In [None]:
# Genes of interest; the same list as the `plotgoi` assigned before the TF section.
plotgoi=['Glycam1', 'Meox1', 'Selp',  'Ms4a1', 'Cd79a', 'Igkc', 'Bcl2l14',
         'Fcer2a',  'H2-Ab1', 'H2-DMb1', 'H2-DMa', 'Cd274', 'Cd3d', 'Cd8a', 'Cd4', 
         'Sell','Tcf7', 'Slamf7', 'Pdcd1', 'Ifng', 'Gzmb', 'Gzma', 'Cxcl9', 'Cxcl10', 
         'Cxcl13',   'Il6', 'Il1b']

In [None]:
# Heatmap of the genes of interest per individual, each gene scaled to [0, 1]
# (standard_scale='var') and the colour scale capped at 0.6.
# NOTE(review): `categories_order=indcat` presumably requires the individuals in
# `adata_sub` to match `indcat` — confirm after the readout-based subsetting above.
sc.pl.matrixplot(adata_sub, var_names=plotgoi, use_raw=False,groupby='individual_id', 
                 standard_scale='var', vmax=0.6, 
                 categories_order=indcat, 
                 save= 'ALL-FAP-LTBR_vs_Untreated_GOIHeatmap.pdf')


In [None]:
results_df

In [None]:
results_df_oi=results_df.loc[list(set(plotgoi).intersection(results_df.index)),:]

In [None]:
#['Ms4a1', 'Bcl2l14', 'Fcer2a', 'Ifng', 'Cxcl9']
subplotgoi=['Glycam1', 'Meox1', 'Selp',  'Cd79a', 'Igkc', 
          'H2-Ab1', 'H2-DMb1', 'H2-DMa', 'Cd274', 'Cd3d', 'Cd8a', 'Cd4', 
         'Sell','Tcf7', 'Slamf7', 'Pdcd1', 'Gzmb', 'Gzma', 'Cxcl10', 
           'Cxcl13',   'Il6', 'Il1b']

In [None]:
df=results_df_oi.loc[subplotgoi,:]

# Add new rows with specific index names
new_rows = pd.DataFrame({
    "baseMean": [np.nan, np.nan,np.nan, np.nan, np.nan],  # Values for 'baseMean'
    "log2FoldChange": [np.nan, np.nan,np.nan, np.nan, np.nan],  # Values for 'log2FoldChange'
    "lfcSE": [np.nan, np.nan,np.nan, np.nan, np.nan],  # Values for 'lfcSE'
    "stat": [np.nan, np.nan,np.nan, np.nan, np.nan],  # Values for 'stat'
    "pvalue": [np.nan, np.nan,np.nan, np.nan, np.nan],  # Values for 'pvalue'
    "padj": [np.nan, np.nan,np.nan, np.nan, np.nan],  # Values for 'padj'
}, index=['Ms4a1', 'Bcl2l14', 'Fcer2a', 'Ifng', 'Cxcl9'])  # Specify custom index names

df = pd.concat([df, new_rows])

# Check the updated DataFrame
#print(df)

In [None]:
df=df.loc[plotgoi,:]

In [None]:
# Bar chart of the log2 fold change of every gene of interest, with significance stars
fig, ax = plt.subplots(figsize=(10, 6))

x = np.arange(len(df))  # one bar position per row of `df`
bars = ax.bar(x, df['log2FoldChange'], color='grey', edgecolor='black')

# '**' for padj < 0.01, '*' for padj < 0.05, nothing otherwise (NaN included).
# The label sits 0.1 beyond the end of the bar: above positive bars, below the rest.
for position, (padj, log2fc) in enumerate(zip(df['padj'], df['log2FoldChange'])):
    if padj < 0.01:
        stars = '**'
    elif padj < 0.05:
        stars = '*'
    else:
        continue
    points_up = log2fc > 0
    ax.text(
        position,
        log2fc + 0.1 if points_up else log2fc - 0.1,
        stars,
        ha='center',
        va='bottom' if points_up else 'top',
        fontsize=16,
        color='black',
    )

# Axis cosmetics
ax.axhline(0, color='black', linewidth=0.8, linestyle='--')  # baseline at y=0
ax.set_xticks(x)
#ax.set_xticklabels([f"Gene {i+1}" for i in x], rotation=45, ha='right')
ax.set_xticklabels(df.index, rotation=45, ha='right')
ax.set_ylabel("log2(Fold Change)")
ax.set_title("log2(Fold Change) with Significance Stars")

plt.tight_layout()

fc_plot_pdf = os.path.join(pseudobulk_folder, "FCplot_ALL-FAP-LTBR_vs_Untreated_GOIHeatmap.pdf")
plt.savefig(fc_plot_pdf, format="pdf", dpi=300)
plt.show()

### Plot regions

In [None]:
# From here on `adata` is a DIFFERENT object: it is re-read from the file with the
# deconvolution / bbknn results and replaces the Harmony object loaded at the top.
adata=sc.read(os.path.join(global_clustering_folder , 'clustering_results_harmony_deconv_bbknn.h5ad'))

In [None]:
# Drop the excluded readouts, then start two working copies of the `annov2`
# annotation: one to be collapsed into "Stroma", one into "Tumor".
adata_sub=adata[~adata.obs['readout_id'].isin(exclude)].copy()
adata_sub.obs['annoStroma']=adata_sub.obs['annov2'].copy()
adata_sub.obs['annoTumor']=adata_sub.obs['annov2'].copy()

In [None]:
# `annov2` labels that are collapsed into the single label "Stroma" in `annoStroma`.
# 'Tumor_Fibro' and 'Tumor_Fibro_Necro' appear in both this list and the `tumor` list
# below; the two lists are applied to different columns.
stroma=['Epi_HEV',
 'Fibro_Immune_HEV',
 'Macrophage_Mmp9',
 'Muscle_Vessel','Tumor_Fibro',
 'Tumor_Fibro_Necro']

# Labels are replaced one at a time; 'Stroma_Necro' is handled separately after the loop.
for i in stroma:
    adata_sub.obs["annoStroma"]=adata_sub.obs["annoStroma"].replace(i, "Stroma")
adata_sub.obs["annoStroma"]=adata_sub.obs["annoStroma"].replace("Stroma_Necro", "Stroma")

#### For tumor
# `annov2` labels that are collapsed into the single label "Tumor" in `annoTumor`.
tumor=[ 'Tumor','Tumor_Cytotox','Tumor_Fibro',
'Tumor_Fibro_Necro','Tumor_Necro']

for i in tumor:
    adata_sub.obs["annoTumor"]=adata_sub.obs["annoTumor"].replace(i, "Tumor")    

In [None]:
adata_sub.obs["annoTumor"]

In [None]:
# Refine the collapsed labels on a copy of the obs table.
tmp=adata_sub.obs.copy()
# The new upper-case labels have to be registered as categories before they can be assigned.
tmp['annoTumor']=tmp['annoTumor'].cat.add_categories(['STROMA','TUMOR','FIBRO'])
tmp['annoStroma']=tmp['annoStroma'].cat.add_categories(['STROMA','TUMOR','FIBRO'])
# Order matters for the next two lines:
#   'Stroma' with Malignantcat == 'neg'             -> 'STROMA'
#   'Stroma' or 'STROMA' with Fibrocat == 'pos'     -> 'FIBRO' (overrides the line above)
tmp.loc[tmp['Malignantcat'].isin([ 'neg'])&tmp['annoStroma'].isin([ 'Stroma']),'annoStroma']='STROMA'
tmp.loc[tmp['Fibrocat'].isin([ 'pos'])&tmp['annoStroma'].isin([ 'Stroma','STROMA']),'annoStroma']='FIBRO'
#   'Tumor' with Malignantcat == 'pos'              -> 'TUMOR'
tmp.loc[tmp['Malignantcat'].isin([ 'pos'])&tmp['annoTumor'].isin([ 'Tumor']),'annoTumor']='TUMOR'

# Combined label: the `annoStroma` value followed directly by the `annoTumor` value
# (no separator), stored as plain strings.
tmp['annoTumorStroma']=tmp['annoStroma'].astype('str')+tmp['annoTumor'].astype('str')
#tmp['annoTumorStroma'].value_counts()

In [None]:
tmp['annoTumorStroma'].value_counts()

In [None]:
# Map every combined "<annoStroma><annoTumor>" string to one of four region labels.
# These assignments overwrite the `tumor` / `stroma` lists defined a few cells earlier.
tumor=['StromaTUMOR','Tumor_NecroTUMOR','TumorTUMOR','Tumor_CytotoxTUMOR']
# NOTE(review): 'FIBROTUMOR' is the last entry of `stroma` and also the only entry of
# `both`. `both` is applied first in the next cell, so the entry here never matches.
stroma=['FIBROTumor','FIBROFibro_Immune_HEV','FIBROMacrophage_Mmp9','FIBROMuscle_Vessel','FIBROEpi_HEV',
        'STROMAEpi_HEV','STROMATumor','STROMAFibro_Immune_HEV','STROMAMuscle_Vessel',
        'STROMAMacrophage_Mmp9','FIBROTUMOR']
other=['StromaEpi_HEV','StromaTumor','Tumor_CytotoxTumor','StromaFibro_Immune_HEV','StromaMuscle_Vessel',
       'StromaMacrophage_Mmp9','TumorTumor','Tumor_NecroTumor']
both=['FIBROTUMOR']

In [None]:
# Apply the mapping. The order is significant: once a value has been replaced, it
# is one of the final labels and no longer matches any entry of the later lists.
for i in both:
    tmp["annoTumorStroma"]=tmp["annoTumorStroma"].replace(i, "TumorStroma")
for i in tumor:
    tmp["annoTumorStroma"]=tmp["annoTumorStroma"].replace(i, "Tumor")
for i in stroma:
    tmp["annoTumorStroma"]=tmp["annoTumorStroma"].replace(i, "Stroma")
for i in other:
    tmp["annoTumorStroma"]=tmp["annoTumorStroma"].replace(i, "NotDetermined")

# Only the final region label is copied back to the AnnData object.
adata_sub.obs['annoTumorStroma']=tmp["annoTumorStroma"].copy()

In [None]:
sc.pl.umap(adata_sub, color='annoTumorStroma')

In [None]:
for i, library in enumerate(
   adata_sub.obs["readout_id"].unique().tolist()
):
    ad = adata_sub[adata_sub.obs["readout_id"] == library, :].copy()
    print(ad.obs["readout_id"].unique()[0])
    print(ad.obs["treatment_id"].unique()[0])
    sc.pl.spatial(
        ad,
        img_key="hires",
        library_id=library,
        color=["annoTumorStroma",'annov2','PatAnno'],
        size=1.5, color_map = 'viridis', use_raw=False, ncols=2,
    save= ad.obs["readout_id"].unique()[0]+'-ALL-Annotations.pdf')

In [None]:
list((adata_sub.obs.columns))

In [None]:
# Mean Glycam1 per sample in tumor vs. stroma regions, treated (FAP_LTBR) samples only.
# NOTE(review): assumes `Glycam1` is available as a column of `adata_sub.obs` — confirm.
df = pd.DataFrame(adata_sub.obs.loc[:,['sample_id','treatment_id','annoTumorStroma','Glycam1']])
df=df.loc[df['treatment_id']=='FAP_LTBR',:]
df=df.loc[df['annoTumorStroma'].isin(['Tumor','Stroma']),:]

# Sample/region combinations without any observation have a NaN mean and are dropped.
sample_means = df.groupby(["sample_id", "annoTumorStroma"])["Glycam1"].mean().reset_index().dropna()
#sample_means

# `annoTumorStroma` is written to `adata_sub.obs` as plain strings, and the `.cat`
# accessor only exists on categorical columns. The explicit cast makes this cell
# work whether or not the column has been converted to categorical in the meantime;
# it is a no-op on a column that already is categorical.
sample_means["annoTumorStroma"] = (
    sample_means["annoTumorStroma"].astype("category").cat.remove_unused_categories()
)
plt.figure(figsize=(2.5, 4))
sns.boxplot(data=sample_means, x="annoTumorStroma", y="Glycam1", 
            palette={"Tumor": "green", "Stroma": "orange"})
# Individual per-sample means on top of the boxes.
sns.stripplot(data=sample_means, x="annoTumorStroma", y="Glycam1", color="black", jitter=True)
plt.xlabel("Region", fontsize=10)
plt.ylabel("logcp10k", fontsize=10)
plt.title("Mean Glycam1 per Sample by Region", fontsize=10)

plt.tight_layout()
plt.savefig(os.path.join(pseudobulk_folder , 'Boxplot-ALL-FAP-LTBR_Region_Glycam1_persample.pdf'), format="pdf")
plt.show()

In [None]:
# Per-sample mean of the 'Glycam1' obs column, compared between treatments.
df = adata_sub.obs.loc[:, ['sample_id', 'treatment_id', 'Glycam1']]

per_sample = df.groupby(["sample_id", "treatment_id"])["Glycam1"].mean()
# Groups without a defined mean are dropped.
sample_means = per_sample.reset_index().dropna()

fig, ax = plt.subplots(figsize=(2.5, 4))
sns.boxplot(
    data=sample_means,
    x="treatment_id",
    y="Glycam1",
    palette={"Untreated": "grey", "FAP_LTBR": "salmon"},
    ax=ax,
)
# One black dot per sample on top of the boxes.
sns.stripplot(
    data=sample_means,
    x="treatment_id",
    y="Glycam1",
    color="black",
    jitter=True,
    ax=ax,
)
ax.set_xlabel("Treatment", fontsize=10)
ax.set_ylabel("logcp10k", fontsize=10)
ax.set_title("Mean Glycam1 per Sample by Treatment", fontsize=10)
fig.tight_layout()
fig.savefig(os.path.join(pseudobulk_folder , 'Boxplot-ALL-FAP-LTBR_vs_Untreated_Glycam1_persample.pdf'), format="pdf")
plt.show()

In [None]:
# Marker genes used for the 'HEVEndothelial' signature score.
mymarkers['HEVEndothelial'] = [
    'Glycam1',
    'Selp',
    'Sele',
    'Ackr1',
    'Enpp6',
    'Madcam1',
    'Lipg',
    'Enpp2',
    'Cxcl1',
    'Lifr',
    'Serpina1b',
    'Vwf',
    'Syt15',
    'Chst4',
    'Fut7',
]

In [None]:
# Score adata_sub for the HEV-endothelial marker set; the score is read back
# from obs['HEVEndothelial'] by the plotting cells that follow.
# NOTE(review): use_raw is left at scanpy's default here, while the Apoptosis
# score later in this notebook passes use_raw=False. Confirm both signatures
# are meant to be computed on the same expression matrix.
sc.tl.score_genes(
    adata_sub,
    gene_list=mymarkers['HEVEndothelial'],
    score_name='HEVEndothelial',
)

In [None]:
# Per-sample mean of the HEVEndothelial signature score, compared between treatments.
df = adata_sub.obs.loc[:, ['sample_id', 'treatment_id', 'HEVEndothelial']]

sample_means = (
    df.groupby(["sample_id", "treatment_id"])["HEVEndothelial"]
    .mean()
    .reset_index()
    .dropna()
)

# Arguments shared by the box layer and the per-sample dot layer.
plot_args = dict(data=sample_means, x="treatment_id", y="HEVEndothelial")

plt.figure(figsize=(2.5, 4))
sns.boxplot(palette={"Untreated": "grey", "FAP_LTBR": "salmon"}, **plot_args)
sns.stripplot(color="black", jitter=True, **plot_args)
plt.xlabel("Treatment", fontsize=10)
plt.ylabel("Signature Score", fontsize=10)
plt.title("Mean HEVEndothelial per Sample by Treatment", fontsize=10)
plt.tight_layout()
plt.savefig(
    os.path.join(pseudobulk_folder , 'Boxplot-ALL-FAP-LTBR_vs_Untreated_HEVEndothelial_persample.pdf'),
    format="pdf",
)
plt.show()

In [None]:
# Per-sample mean of the HEVEndothelial signature score, Tumor vs. Stroma
# regions, restricted to FAP_LTBR-treated samples.
df = adata_sub.obs.loc[:, ['sample_id', 'treatment_id', 'annoTumorStroma', 'HEVEndothelial']]
df = df.loc[df['treatment_id'] == 'FAP_LTBR', :]
df = df.loc[df['annoTumorStroma'].isin(['Tumor', 'Stroma']), :]

# observed=True: annoTumorStroma is categorical (it exposes `.cat` below), so
# the deprecated observed=False default would emit a row for every category
# combination, including those without data, and newer pandas warns about it.
# dropna() is kept to discard any group whose mean is still NaN.
sample_means = (
    df.groupby(["sample_id", "annoTumorStroma"], observed=True)["HEVEndothelial"]
    .mean()
    .reset_index()
    .dropna()
)

# Drop the filtered-out region categories so they do not show up on the x axis.
sample_means["annoTumorStroma"] = sample_means["annoTumorStroma"].cat.remove_unused_categories()

fig, ax = plt.subplots(figsize=(2.5, 4))
sns.boxplot(data=sample_means, x="annoTumorStroma", y="HEVEndothelial",
            palette={"Tumor": "green", "Stroma": "orange"}, ax=ax)
# One black dot per sample on top of the boxes.
sns.stripplot(data=sample_means, x="annoTumorStroma", y="HEVEndothelial",
              color="black", jitter=True, ax=ax)
ax.set_xlabel("Region", fontsize=10)
ax.set_ylabel("Signature Score", fontsize=10)
ax.set_title("Mean HEVEndothelial per Sample by Region", fontsize=10)
fig.tight_layout()
fig.savefig(os.path.join(pseudobulk_folder , 'Boxplot-ALL-FAP-LTBR_Region_HEVEndothelial_persample.pdf'), format="pdf")
plt.show()

In [None]:
# Working copy limited to observations annotated as Tumor or Stroma.
# Subset first and copy once: copying the whole object, slicing it to a view and
# then assigning into the view's .obs duplicated data that was discarded right
# away and left the view-to-copy conversion implicit.
in_region = adata_sub.obs['annoTumorStroma'].isin(['Tumor', 'Stroma'])
adata_tmp = adata_sub[in_region].copy()

# Remove the obs column named 'Glycam1' from the working copy only.
# NOTE(review): presumably so that the 'Glycam1' key passed to sc.pl.violin
# resolves to the gene rather than to this obs column; confirm.
adata_tmp.obs = adata_tmp.obs.drop(columns='Glycam1')

# Forget the region categories that were filtered out so they are not plotted.
adata_tmp.obs["annoTumorStroma"] = adata_tmp.obs["annoTumorStroma"].cat.remove_unused_categories()

In [None]:
# Violin overview of Glycam1 and the HEVEndothelial score per region class.
# The figure size is changed through matplotlib's global rcParams, so it also
# applies to figures created by later cells that do not set their own size.
from matplotlib import rcParams

rcParams.update({"figure.figsize": (4, 4)})
sc.pl.violin(
    adata_tmp,
    keys=['Glycam1', 'HEVEndothelial'],
    groupby='annoTumorStroma',
    save='Overview-Glycam1HEV-acrossSamplesRegions.pdf',
)

In [None]:
# Write the region annotations and the HEVEndothelial score to a tab-separated
# file in the current working directory.
annotation_cols = ['annoTumorStroma', 'annoStroma', 'annoTumor', 'HEVEndothelial']
adata_sub.obs[annotation_cols].to_csv('Additional_annotation.tsv', sep='\t')

In [None]:
# Gene set used for the 'Apoptosis' signature score.
apoptosis_genes_mouse_minimal = [
    "Fas",
    "Casp8",
    "Tnfsf10",
    "Tnfrsf10b",
    "Tnf",
    "Fasl",
    "Fadd",
]


In [None]:
# Score adata_sub for the apoptosis gene set without using .raw; the score is
# read back from obs['Apoptosis'] by the plotting cells that follow.
sc.tl.score_genes(adata_sub, apoptosis_genes_mouse_minimal, score_name="Apoptosis", use_raw=False)

In [None]:
# Draw the Apoptosis score on the tissue image, one figure per library
# (readout_id), and save each figure as '<readout_id>-ALL-Apoptosis.pdf'.
#
# The per-library subset is deliberately not called `ad`: that name is the
# `anndata` module alias imported at the top of the notebook (`import anndata
# as ad`), and rebinding it here would break any later or re-run `ad.read(...)`.
for library in adata_sub.obs["readout_id"].unique().tolist():
    adata_library = adata_sub[adata_sub.obs["readout_id"] == library, :].copy()
    # Log which sample and treatment the following figure belongs to.
    print(adata_library.obs["readout_id"].unique()[0])
    print(adata_library.obs["treatment_id"].unique()[0])
    sc.pl.spatial(
        adata_library,
        img_key="hires",
        library_id=library,
        color=["Apoptosis"],
        vmax=0.7,  # same upper colour limit for every library
        size=1.5,
        color_map='viridis',
        use_raw=False,
        save=adata_library.obs["readout_id"].unique()[0] + '-ALL-Apoptosis.pdf',
    )

In [None]:
# Per-sample mean of the Apoptosis signature score, compared between treatments.
df = adata_sub.obs.loc[:, ['sample_id', 'treatment_id', 'Apoptosis']]

# Average the score within each (sample, treatment) pair; pairs without a
# defined mean are dropped.
grouped = df.groupby(["sample_id", "treatment_id"])["Apoptosis"]
sample_means = grouped.mean().reset_index().dropna()

# Boxes per treatment with one black dot per sample on top.
fig, ax = plt.subplots(figsize=(2.5, 4))
sns.boxplot(
    data=sample_means,
    x="treatment_id",
    y="Apoptosis",
    palette={"Untreated": "grey", "FAP_LTBR": "salmon"},
    ax=ax,
)
sns.stripplot(
    data=sample_means,
    x="treatment_id",
    y="Apoptosis",
    color="black",
    jitter=True,
    ax=ax,
)

# Axis labels and title.
ax.set_xlabel("Treatment", fontsize=10)
ax.set_ylabel("Signature Score", fontsize=10)
ax.set_title("Mean Apoptosis per Sample by Treatment", fontsize=10)

# Save to the pseudobulk folder, then display.
fig.tight_layout()
fig.savefig(os.path.join(pseudobulk_folder , 'Boxplot-ALL-FAP-LTBR_vs_Untreated_Apoptosis_persample.pdf'), format="pdf")
plt.show()

In [None]:
# Per-sample mean of the Apoptosis signature score, Tumor vs. Stroma regions,
# restricted to FAP_LTBR-treated samples.
df = adata_sub.obs.loc[:, ['sample_id', 'treatment_id', 'annoTumorStroma', 'Apoptosis']]
df = df.loc[df['treatment_id'] == 'FAP_LTBR', :]
df = df.loc[df['annoTumorStroma'].isin(['Tumor', 'Stroma']), :]

# Average the score within each (sample, region) pair.
# observed=True: annoTumorStroma is categorical (it exposes `.cat` below), so
# the deprecated observed=False default would emit a row for every category
# combination, including those without data, and newer pandas warns about it.
# dropna() is kept to discard any group whose mean is still NaN.
sample_means = (
    df.groupby(["sample_id", "annoTumorStroma"], observed=True)["Apoptosis"]
    .mean()
    .reset_index()
    .dropna()
)

# Drop the filtered-out region categories so they do not show up on the x axis.
sample_means["annoTumorStroma"] = sample_means["annoTumorStroma"].cat.remove_unused_categories()

# Boxes per region with one black dot per sample on top.
fig, ax = plt.subplots(figsize=(2.5, 4))
sns.boxplot(data=sample_means, x="annoTumorStroma", y="Apoptosis",
            palette={"Tumor": "green", "Stroma": "orange"}, ax=ax)
sns.stripplot(data=sample_means, x="annoTumorStroma", y="Apoptosis",
              color="black", jitter=True, ax=ax)

# Axis labels and title. The title names the score as it is stored
# ('Apoptosis'), consistent with the per-treatment plot of the same score.
ax.set_xlabel("Region", fontsize=10)
ax.set_ylabel("Signature Score", fontsize=10)
ax.set_title("Mean Apoptosis per Sample by Region", fontsize=10)

# Save to the pseudobulk folder, then display.
fig.tight_layout()
fig.savefig(os.path.join(pseudobulk_folder , 'Boxplot-ALL-FAP-LTBR_Region_Apoptosis_persample.pdf'), format="pdf")
plt.show()

In [None]:
# Export this notebook to HTML with the nbconvert command-line tool.
# NOTE(review): nbconvert reads the .ipynb file from disk, so save the notebook
# before running this cell if the current outputs should be included.
! jupyter nbconvert --to html 07_Pseudobulk_analysis.ipynb