In [None]:
# Date: 20.3.2023
# Author: Antti Kiviaho
#
# Notebook for analysing and visualizing visium data after copy number variation, single cell mapping and clustering
# analyses. This is the main results notebook

In [None]:
import os
import warnings
from itertools import cycle, islice

os.chdir('/lustre/scratch/kiviaho/prostate_spatial/')

import scanpy as sc
import numpy as np
import squidpy as sq
import pandas as pd
import anndata as ad

#from cell2location.utils import select_slide
#from cell2location.plt import plot_spatial

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns  # used by the enrichment heatmap / scatterplot cells
from scripts.utils import load_from_pickle, get_sample_ids

# Notebook-wide scanpy figure defaults: 4.7 x 4.7 figures without frames, saved as PDF at dpi_save=1200.
# Several later cells call sc.set_figure_params again with their own values.
sc.set_figure_params(figsize=(4.7,4.7),dpi_save=1200,frameon=False,format='pdf')

In [None]:
# Load the aggregated Visium AnnData with cell2location results and copy the mapping
# results into obs columns of both the aggregated object and the individual slides.

adata_vis = sc.read_h5ad('./c2l-results/cell2location_map_20230322/visium_adata_with_c2l_mapping_20230322.h5ad')

# Prefix the cluster labels with the sample id so that labels are unique across samples.
# Explicit column assignment is used; attribute-style assignment only works for existing columns.
adata_vis.obs['joint_leiden_clusters'] = pd.Categorical(
    adata_vis.obs['sample_id'].astype(str) + '_' + adata_vis.obs['joint_leiden_clusters'].astype(str)
)

adata_slides = load_from_pickle('./data/individual_sections_normalized_clustered.pickle')
samples = get_sample_ids()

# Copy obsm (cell2location results) to adata object obs
factor_names = adata_vis.uns['mod']['factor_names']
adata_vis.obs[factor_names] = adata_vis.obsm['q05_cell_abundance_w_sf']

# Copy obsm (cell2location results) to individual slides
for sample in samples:
    vis_subset = adata_vis[adata_vis.obs['sample_id'] == sample]
    slide_adata = adata_slides[sample]

    # Index.equals returns False when the number of spots differs, whereas an
    # elementwise `==` raises on a length mismatch.
    if not vis_subset.obs_names.equals(slide_adata.obs_names):
        # Keep the best-effort behaviour (skip the slide) but make the skip visible
        warnings.warn('obs_names of sample ' + str(sample) + ' do not match the aggregated object; '
                      'cell2location results were not copied to this slide')
        continue

    slide_adata.obsm = vis_subset.obsm.copy()

    # add 5% quantile, representing confident cell abundance, 'at least this amount is present',
    # to adata.obs with nice names for plotting
    slide_adata.uns['mod'] = vis_subset.uns['mod'].copy()
    slide_adata.obs[slide_adata.uns['mod']['factor_names']] = slide_adata.obsm['q05_cell_abundance_w_sf']


## SINGLE-CELL REFERENCE PLOTTING

In [None]:
# Load the single-cell reference AnnData stored next to the cell2location mapping results.
# The absolute path points inside the working directory set with os.chdir at the top of the notebook.
adata_ref = sc.read_h5ad('/lustre/scratch/kiviaho/prostate_spatial/c2l-results/cell2location_map_20230322/sc_reference_signatures_20230322.h5ad')


In [None]:
# Number of reference observations contributed by each dataset
adata_ref.obs['dataset'].value_counts()

In [None]:
# Stacked bar plot of reference observations per dataset, split by detailed cell type
y_variable = 'detailed_celltypes' # phenotype
dataset_order = ['hirz_2023','chen_2021','cheng_2022','dong_2020','chen_2022','song_2022']

# .copy() so that the helper column is not written into a slice of adata_ref.obs
meta = adata_ref.obs[[y_variable,'dataset']].copy()
meta['count'] = 1

# observed=False keeps every category combination for categorical columns, as before.
# The deprecated `axis=0` argument of groupby is dropped.
grouped_meta = (
    meta.groupby(['dataset',y_variable],observed=False)['count'].sum()
    .reset_index()
    .pivot(index='dataset',columns=y_variable,values='count')
    .loc[dataset_order]
)

fig,ax = plt.subplots()
grouped_meta.plot.bar(stacked=True,grid=False,ax=ax,figsize=(3,5),legend=False,width=0.8)
plt.axis('off')
fig.savefig('./figures/single-cell-barplot-celltypes.pdf',dpi=600)

In [None]:
# Stacked bar plot of reference observations per dataset, split by phenotype
y_variable = 'phenotype' # phenotype
dataset_order = ['hirz_2023','chen_2021','cheng_2022','dong_2020','chen_2022','song_2022']

# .copy() so that the helper column is not written into a slice of adata_ref.obs
meta = adata_ref.obs[[y_variable,'dataset']].copy()
meta['count'] = 1

grouped_meta = (
    meta.groupby(['dataset',y_variable],observed=False)['count'].sum()
    .reset_index()
    .pivot(index='dataset',columns=y_variable,values='count')
    .loc[dataset_order]
)

fig,ax = plt.subplots()

# In a stacked bar plot colours are assigned per column (one per phenotype), so the
# cycled colour list is sized by the number of columns, not the number of datasets.
my_colors = list(islice(cycle(['r', 'b', 'g']), grouped_meta.shape[1]))

grouped_meta.plot.bar(stacked=True,grid=False,ax=ax,figsize=(3,5),width=0.8,color=my_colors)
# Figure is shown with axes and legend; uncomment to export a bare version
#plt.axis('off')
#plt.savefig('./figures/single-cell-barplot-phenotypes.pdf',dpi=600)

In [None]:

# Reference UMAP coloured by detailed cell type; no legend, figure written by scanpy's `save`
sc.pl.umap(
    adata_ref,
    color='detailed_celltypes',
    size=10,
    legend_loc=None,
    save='single-cell-dataset-celltypes.pdf',
)

In [None]:
# Larger canvas for the reference UMAP coloured by source dataset
sc.set_figure_params(figsize=(8, 8))
sc.pl.umap(
    adata_ref,
    color='dataset',
    size=10,
)

In [None]:
# Reference UMAP coloured by the `sample` obs column
sc.set_figure_params(figsize=(8, 8))
sc.pl.umap(
    adata_ref,
    color='sample',
    size=10,
)

In [None]:
# Gene list used for the ribosomal score, kept in its original order
# (RPS* entries followed by FAU, then RPLP*/RPL* entries with UBA52)
rps_genes = [
    'RPSA','RPS2','RPS3','RPS3A','RPS4X','RPS4Y1','RPS4Y2','RPS5','RPS6','RPS7',
    'RPS8','RPS9','RPS10','RPS11','RPS12','RPS13','RPS14','RPS15','RPS15A','RPS16',
    'RPS17','RPS18','RPS19','RPS20','RPS21','RPS23','RPS24','RPS25','RPS26','RPS27',
    'RPS27A','RPS27L','RPS28','RPS29','FAU',
]
rpl_genes = [
    'RPLP0','RPLP1','RPLP2','RPL3','RPL3L','RPL4','RPL5','RPL6','RPL7','RPL7A',
    'RPL7L1','RPL8','RPL9','RPL10','RPL10A','RPL10L','RPL11','RPL12','RPL13','RPL13A',
    'RPL14','RPL15','RPL17','RPL18','RPL18A','RPL19','RPL21','RPL22','RPL22L1','RPL23',
    'RPL23A','RPL24','RPL26','RPL26L1','RPL27','RPL27A','RPL28','RPL29','RPL30','RPL31',
    'RPL32','RPL34','RPL35','RPL35A','RPL36','RPL36A','RPL36AL','RPL37','RPL37A','RPL38',
    'RPL39','RPL39L','UBA52','RPL41',
]
ribosomal_markers = rps_genes + rpl_genes

# Score the reference with the ribosomal gene list and show the score on the UMAP
sc.tl.score_genes(adata_ref, ribosomal_markers, score_name='ribosomal_score')
sc.pl.umap(adata_ref, color=['ribosomal_score'], size=10)

## Visium mapping plots

In [None]:
sc.set_figure_params(figsize=(4.7,4.7),dpi_save=1200,frameon=False,format='pdf')

# Select the slide explicitly. This cell used to rely on `sample` left over from the
# loading loop (its last element), which only works when cells run in that exact order.
sample = samples[-1]

# Compute and plot the UMAP of a single slide, coloured by its clusters
sc.tl.umap(adata_slides[sample])
sc.pl.umap(adata_slides[sample],color='joint_leiden_clusters',size=30,legend_loc=None,
           save='_sample_'+sample+'.pdf')


In [None]:
sc.set_figure_params(figsize=(4.7,4.7),dpi_save=1200,frameon=False,format='pdf')

# Select the slide explicitly instead of relying on a `sample` variable leaked from an
# earlier loop; samples[-1] is the value that loop leaves behind on a top-to-bottom run.
sample = samples[-1]

# Clusters of a single slide drawn in spatial coordinates
sc.pl.spatial(adata_slides[sample],color='joint_leiden_clusters',size=1.3,legend_loc=None,
           save='_sample_'+sample+'.pdf')

In [None]:
# Plots cell type abundance violin plots (one panel per cell type) grouped by cluster
plt.rcParams.update({'axes.titlesize': 'small'})

n_types = 0
n_types_end = 6
obs_name = 'joint_leiden_clusters'

# Select the slide explicitly instead of relying on a `sample` variable leaked from an
# earlier loop; samples[-1] is the value that loop leaves behind on a top-to-bottom run.
sample = samples[-1]
slide = adata_slides[sample]

# Subset to only plot the cell types with highest mean prevalence
# NOTE(review): assumes the abundance columns start at obs position 16 — the offset is positional, confirm
cell_types = slide.obs.iloc[:,16:].mean(axis=0).sort_values(ascending=False)[n_types:n_types_end].index

# Panel count follows the selection, so changing n_types / n_types_end needs no further edits
n_panels = len(cell_types)

fig = plt.figure(figsize=(6,14))
gs = fig.add_gridspec(n_panels,1)

for i, cell_type in enumerate(cell_types):

    ax_plotting = fig.add_subplot(gs[i,0])

    sc.pl.violin(slide,groupby=obs_name,keys=cell_type,jitter=False,rotation=45,inner='box',
                ax=ax_plotting,show=False)

    ax_plotting.yaxis.label.set_size(12)

    if i < n_panels - 1:  # keep x tick labels only on the bottom panel
        ax_plotting.tick_params(axis='x', which='both', bottom=False, labelbottom=False)


In [None]:
# Cropping coordinates to get a standard crop
coords = {'PC_7875OIK':(2000,19500,2000,19500),
          'PC_4980':(500,21000,500,21000),
          'PC_01_14451_OIK':(1400,19000,5500,23000),
          'PC_02_10136_VAS':(3700,22000,2200,20000),
          'PC_02_05601_OIK':(1500,20500,3200,21000)}

# Plots and saves the abundance of selected cell types on one Visium section
sample = 'PC_02_05601_OIK'
cell_types = ['Luminal','Luminal_Tumor','Fibroblast','T_cell']
colormap = 'magma'

slide = adata_slides[sample]

sc.set_figure_params(dpi=600,figsize=(5,5))

# Histology only: spots are drawn with zero size and alpha, so just the tissue image shows
fig, ax = plt.subplots(1,1)
sc.pl.spatial(slide, cmap='viridis',
            color=cell_types[0],
            size=0,alpha_img=1, alpha=0,
            # fixed colour scale 0-4 (a quantile string such as 'p99.2' is an alternative)
            vmin=0, vmax=4, show=False,colorbar_loc=None,ax = ax,crop_coord=coords[sample]
            )
plt.axis('off')
fig.savefig('./figures/'+sample+'_histology_shown.pdf',dpi=600)
# Close instead of clf(): clf() keeps the 600-dpi figure registered with pyplot
plt.close(fig)

# One abundance panel per cell type, without colorbar
for celltype in cell_types:

    fig, ax = plt.subplots(1,1)
    sc.pl.spatial(slide, cmap=colormap,
                color=celltype,
                size=1.4,alpha_img=0.8,
                vmin=0, vmax=4, show=False,colorbar_loc=None,ax = ax,crop_coord=coords[sample]
                )
    plt.axis('off')
    fig.savefig('./figures/'+sample+'_abundance_on_section_'+celltype+'.pdf',dpi=600)
    plt.close(fig)


sc.set_figure_params(dpi=600,figsize=(5,5))

# Same panel as the first cell type, this time with the colorbar
fig, ax = plt.subplots(1,1)
sc.pl.spatial(slide, cmap=colormap,
            color=cell_types[0],
            size=1.4,alpha_img=1,
            vmin=0, vmax=4, show=False,ax = ax,crop_coord=coords[sample]
            )
plt.axis('off')
fig.savefig('./figures/'+sample+'_with_colorbar.pdf',dpi=600)
plt.close(fig)

In [None]:
# Find the optimal cropping coordinates: low-dpi preview of one cropped section
coords = {
    'PC_7875OIK': (2000,19500,2000,19500),
    'PC_4980': (500,21000,500,21000),
    'PC_01_14451_OIK': (1400,19000,5500,23000),
    'PC_02_10136_VAS': (3700,22000,2200,20000),
    'PC_02_05601_OIK': (1500,20500,3200,21000),
}

# Section, cell types and colormap used for the preview
sample = 'PC_02_05601_OIK'
cell_types = ['Luminal','Luminal_Tumor','Fibroblast','T_cell']
colormap = 'magma'

slide = adata_slides[sample]

sc.set_figure_params(dpi=100, figsize=(5,5))

# Only the first cell type is drawn, on a fixed 0-4 colour scale
fig, ax = plt.subplots(1, 1)
sc.pl.spatial(
    slide,
    cmap=colormap,
    color=cell_types[0],
    size=1.4,
    alpha_img=1,
    vmin=0,
    vmax=4,
    show=False,
    ax=ax,
    crop_coord=coords[sample],
)
plt.axis('off')

In [None]:
# Cropping coordinates to get a standard crop
coords = {'PC_7875OIK':(2000,19500,2000,19500),
          'PC_4980':(500,21000,500,21000),
          'PC_01_14451_OIK':(1400,19000,5500,23000),
          'PC_02_10136_VAS':(3700,22000,2200,20000)}

# Plots and saves the abundance of selected cell types on one Visium section
sample = 'PC_02_10136_VAS'
cell_types = ['Luminal','Luminal_Tumor','Fibroblast','T_cell']
colormap = 'magma'

slide = adata_slides[sample]

sc.set_figure_params(dpi=600,figsize=(5,5))

# Histology only: spots are drawn with zero size and alpha, so just the tissue image shows
fig, ax = plt.subplots(1,1)
sc.pl.spatial(slide, cmap='viridis',
            color=cell_types[0],
            size=0,alpha_img=1, alpha=0,
            # fixed colour scale 0-4 (a quantile string such as 'p99.2' is an alternative)
            vmin=0, vmax=4, show=False,colorbar_loc=None,ax = ax,crop_coord=coords[sample]
            )
plt.axis('off')
fig.savefig('./figures/'+sample+'_histology_shown.pdf',dpi=600)
# Close instead of clf(): clf() keeps the 600-dpi figure registered with pyplot
plt.close(fig)

# One abundance panel per cell type, without colorbar
for celltype in cell_types:

    fig, ax = plt.subplots(1,1)
    sc.pl.spatial(slide, cmap=colormap,
                color=celltype,
                size=1.4,alpha_img=0.8,
                vmin=0, vmax=4, show=False,colorbar_loc=None,ax = ax,crop_coord=coords[sample]
                )
    plt.axis('off')
    fig.savefig('./figures/'+sample+'_abundance_on_section_'+celltype+'.pdf',dpi=600)
    plt.close(fig)


sc.set_figure_params(dpi=600,figsize=(5,5))

# Same panel as the first cell type, this time with the colorbar
fig, ax = plt.subplots(1,1)
sc.pl.spatial(slide, cmap=colormap,
            color=cell_types[0],
            size=1.4,alpha_img=1,
            vmin=0, vmax=4, show=False,ax = ax,crop_coord=coords[sample]
            )
plt.axis('off')
fig.savefig('./figures/'+sample+'_with_colorbar.pdf',dpi=600)
plt.close(fig)

In [None]:
# Plots and saves the top n_types cell types with highest mean prevalence on every Visium slide
n_types = 12

for sample in samples:
    slide = adata_slides[sample]
    # Subset to only plot the cell types with highest mean prevalence
    # NOTE(review): assumes the abundance columns start at obs position 16 — the offset is positional, confirm
    cell_types = slide.obs.iloc[:,16:].mean(axis=0).sort_values(ascending=False)[:n_types]

    # plot in spatial coordinates
    with mpl.rc_context({'axes.facecolor':  'black',
                        'figure.figsize': [4.5, 5]}):

        sc.pl.spatial(slide, cmap='magma',
                    # one panel per selected cell type, four panels per row
                    color=cell_types.index,
                    ncols=4, size=1.3,alpha_img=0.8,
                    # limit color scale at 99.2% quantile of cell abundance
                    vmin=0, vmax='p99.2', show=False
                    )
        fig = plt.gcf()
        fig.savefig('./plots/c2l_mapping_results_20230322/'+sample+'_c2l_mapping_top12_abundant.png',dpi=200)
        # Close the multi-panel figure; clf() would leave one open figure per sample
        plt.close(fig)

In [None]:
## Side by side violin plots of abundant cell types + spatial

plt.rcParams.update({'axes.titlesize': 'small'})

n_types = 6
obs_name = 'joint_leiden_clusters'

for s in samples:

    slide = adata_slides[s]

    # Subset to only plot the cell types with highest mean prevalence
    # NOTE(review): assumes the abundance columns start at obs position 16 — the offset is positional, confirm
    cell_types = slide.obs.iloc[:,16:].mean(axis=0).sort_values(ascending=False)[:n_types].index
    n_panels = len(cell_types)

    fig = plt.figure(figsize=(12,12))

    # Left column: one violin panel per cell type; right two columns: spatial cluster plot
    gs = fig.add_gridspec(n_panels,3)
    ax_spatial = fig.add_subplot(gs[:, 1:3])

    for i, cell_type in enumerate(cell_types):

        ax_plotting = fig.add_subplot(gs[i,0])

        sc.pl.violin(slide,groupby=obs_name,keys=cell_type,jitter=False,rotation=45,inner='box',
                    ax=ax_plotting,show=False)

        ax_plotting.yaxis.label.set_size(12)

        if i < n_panels - 1:  # keep x tick labels only on the bottom panel
            ax_plotting.tick_params(axis='x', which='both', bottom=False, labelbottom=False)

    # show=False so the figure is not shown before it is saved, matching the gene-score
    # violin cell further down
    sc.pl.spatial(slide,color=obs_name,size=1.3,ax=ax_spatial,show=False)

    fig.savefig('./plots/c2l_cell_type_prevalence_violin_plots/'+s+'_cell_type_prevalences_per_cluster.png')
    # Close the figure; clf() would leave one open figure per sample
    plt.close(fig)



# Build an AnnData whose data matrix is taken from the obs columns of adata_vis from position 19 onwards
# NOTE(review): presumably these are the cell2location abundance columns — the offset 19 is positional, confirm
cell2loc_adata = ad.AnnData(adata_vis.obs.iloc[:,19:])
cell2loc_adata.obs['sample'] = adata_vis.obs['sample_id']
# NOTE(review): the data loading cell already prefixes joint_leiden_clusters with sample_id, so this
# prepends the sample id a second time when that cell has been run — confirm this is intended
cell2loc_adata.obs['clusters'] = adata_vis.obs['sample_id'].astype('str') + '_' +  adata_vis.obs['joint_leiden_clusters'].astype('str')

## Scoring gene markers on spatial data

In [None]:

# Gene signatures scored on the spatial data. Genes that are absent from an AnnData's
# var_names are ignored by sc.tl.score_genes, so misspelled symbols silently drop out.
signature_dict = {
# Full signature from supplementary table 3 of https://doi.org/10.1038/s41467-021-25624-1
'Persist' : ['CDC20','MKI67','CCNB2','CENPF','DLGAP5','HMMR','PLK1','CCNB1','PTTG1','CENPE','UBE2S','NCAPD2','TUBA1C','DYNLL1','CD81','CKAP5','ODC1'],

# Top 50 genes from the first tab of supplementary table 1 in https://doi.org/10.1016/j.ccell.2019.06.005
'Balanis_NE' : ['PIK3R3','TARDBP','TMBIM4','DPYSL5','INSM1','PIK3R2','TIMM10B','PSMA1','DLL3','ASCL1','FOXG1','ZIC2','SOX11','ZIC5','LHX2','DLK1',
              'PI4K2B','SEZ6','ACTL6B','PCSK1','CACNA1A','PAPD7','CPLX2','XKR7','ZNF286A','ST18','INA','CELF3','TAGLN3','PAGR1','NRSN1','ONECUT2',
              'ELAVL3','POU4F1','CDK5R2','SCG3','DCX','BARX1','FAM57B','C17orf100','UNC13A','CHRNB2','ST8SIA3','NEUROD1','MAST1','ELAVL4','KIF1A','ACSL6','SPATA13','KIF18B'],

# From supplementary table 1 of https://doi.org/10.1016/j.clgc.2020.08.004
'TP53_1' :['ABR','ACTL6A','ADAM17','AK2','ALDH3A2','ANKRA2','ANLN','ASF1B','ASPM','ATP9B','AURKA','BBS4','BEND3','BRIX1','BUB1','C16orf71','C17orf53','C17orf59','C9orf117',
          'CCDC150','CCDC40','CCNA2','CCNB2','CCNG1','CDC123','CDC20','CDC25C','CDC45','CDCA4','CDCA5','CDCA8','CDKN1A','CDKN3','CDON','CDR2L','CENPA','CENPF','CENPI','CENPL',
          'CES2','CIT','CKAP2L','CPSF3','CSNK2A1','CYB5D1','CYB5D2','DCAF13','DCUN1D5','DDB2','DEPDC1B','DERL1','DGAT2','DLGAP5','DNMT3B','DPH2','DSN1','DVL3','E2F2','E2F3','ECT2'
          ,'EDA2R','EEF2','ERCC6L','ESPL1','EXO1','EZH2','FAM49B','FAM72B','FANCB','FANCF','FDXR','FMO5','GINS1','GINS4','GNAI3','GREB1','GTF3C2','GTPBP4','HHAT','HJURP','INCENP',
          'IQGAP3','ISY1','ITGB1BP1','ITGBL1','KCNK6','KIAA1524','KIF18B','KIF20A','KIF23','KIF2C','KIF4A','KIF4B','KIFC1','KPNA2','LSG1','MAP2K4','MCM10','MCM4','MDM2','MED11','MELK',
          'MLF1','MPDU1','MRPL13','MRPL3','MSH2','MTBP','MYBL2','NAA50','NCAPG2','NCAPG','NCAPH','NCBP2','NEIL3','NFATC3','NUF2','NUP85','PARP2','PDCD10','PHLDA3','PKMYT1','PLK1','POLQ',
          'POLR2K','PRPF38A','PSMA7','PSMC2','PSMD12','PTTG1','RAB6B','RABEP1','RACGAP1','RAD21','RAD51','RAD54B','RAD54L','RAE1','RANGRF','RBM17','RBM28','RFC4','RPH3AL','RPS27L','RSPH1',
          'SEC22A','SGOL1','SMC1B','SMC4','SMG6','SMYD2','SNRPB2','SNX33','SPAG5','SPATA18','SPDEF','SPRY3','SPTBN4','SRR','STIP1','SYTL1','TACC3','TARDBP','TCEB1','TK1','TMEM132A','TNFSF12',
          'TOPBP1','TPX2','TRAIP','TRIP13','TROAP','TTC9C','TTK','UBE2C','VAMP2','VPS39','VRK1','VWA3A','WDHD1','WDR53','WDR62','WDR78','XPO1','YEATS2','ZNF18'],


# These below are all from supplementary table 16 of https://doi.org/10.1126/science.abe1505
'TCGA_androgen' : ['PTGER4','FKBP5','KLK2','CENPN','MAF','ACSL3','HERC3','ZBTB10','EAF2','ABCC4','C1orf116','PMEPA1','MED28','MPHOSPH9','TMPRSS2','KLK3','NKX3-1','NNMT','ADAM7','ELL2'],
'Beltran_NE' : ['ASXL3','CAND2','ETV5','GPX2','JAKMIP2','KIAA0408','SOGA3','TRIM9','BRINP1','C7orf76','GNAO1','KCNB2','KCND2','LRRC16B','MAP10','NRSN1','PCSK1','PROX1','RGS7','SCG3','SEC11C','SEZ6','ST8SIA3','SVOP','SYT11','AURKA','DNMT1','EZH2','MYCN'],
'WNT_hallmark' : ['ADAM17','AXIN1','AXIN2','CCND2','CSNK1E','CTNNB1','CUL1','DKK1','DKK4','DLL1','DVL2','FRAT1','FZD1','FZD8','GNAI1','HDAC11','HDAC2','HDAC5','HEY1','HEY2','JAG1','JAG2','KAT2A','LEF1',
                'MAML1','MYC','NCOR2','NCSTN','NKD1','NOTCH1','NOTCH4','NUMB','PPARD','PSEN2','PTCH1','RBPJ','SKP2','TCF7','TP53','WNT1','WNT5B','WNT6'],
# Symbols corrected from misspellings: LATS2 (was LAST2), NT5E (was NTSE), ARHGEF17 (was ANHGEF17)
'Wang_YAP_TAZ' : ['CYR61','CTGF','AMOTL2','ANKRD1','IGFBP3','F3','FJX1','NUAK2','LATS2','CRIM1','GADD45A','TGFB2','PTPN14','NT5E','FOXF2','AXL','DOCK5','ASAP1','RBMS3','MYOF','ARHGEF17','CCDC80','AJUBA']
}

In [None]:
# Score every signature on the aggregated Visium object first and then on each
# individual slide; the resulting obs column is named '<signature>_score'
for signature_name, signature_genes in signature_dict.items():
    score_column = signature_name+'_score'
    scoring_targets = [adata_vis] + [adata_slides[sample_id] for sample_id in samples]
    for target in scoring_targets:
        sc.tl.score_genes(adata=target,gene_list=signature_genes,score_name=score_column)

In [None]:
# Pseudobulk violin plots of gene set scores: one figure per signature, grouped by sample
sc.set_figure_params(figsize=(12,6))
for k in signature_dict.keys():
    fig,ax = plt.subplots(1,1)
    sc.pl.violin(adata_vis,groupby='sample_id',keys=k+'_score',legend_loc=None,inner='box',jitter=False,ax=ax,show=False,rotation=60)
    fig.tight_layout()
    fig.savefig('./plots/gene_score_violin_plots/'+k+'_violin_plots_pseudobulk.png',dpi=120)
    # Close the figure; clf() would leave one open figure per signature
    plt.close(fig)

In [None]:
## Side by side violin plots of gene set scores + spatial

plt.rcParams.update({'axes.titlesize': 'small'})

obs_name = 'joint_leiden_clusters'
# One violin panel per signature; derived from the dict so adding a signature needs no layout edits
n_signatures = len(signature_dict)

for s in samples:

    adata_subset = adata_slides[s]

    fig = plt.figure(figsize=(16,16))

    # Left column: violin panels; right two columns: spatial cluster plot
    gs = fig.add_gridspec(n_signatures,3)
    ax_spatial = fig.add_subplot(gs[:, 1:3])

    for i,k in enumerate(signature_dict.keys()):

        ax_plotting = fig.add_subplot(gs[i,0])

        # Scores are (re)computed here so the cell does not depend on the scoring cell having run
        sc.tl.score_genes(adata=adata_subset,gene_list=signature_dict[k],score_name=k+'_score')
        sc.pl.violin(adata_subset,groupby=obs_name,keys=k+'_score',legend_loc=None,inner='box',jitter=False,ax=ax_plotting,show=False)

        ax_plotting.yaxis.label.set_size(12)

        if i < n_signatures - 1:  # keep x tick labels only on the bottom panel
            ax_plotting.tick_params(axis='x', which='both', bottom=False, labelbottom=False)

    sc.pl.spatial(adata_subset,color=obs_name,size=1.3,ax=ax_spatial,show=False)

    fig.savefig('./plots/gene_score_violin_plots/'+s+'_gene_set_scores.png',dpi=120)
    # Close the figure; clf() would leave one open 16x16 figure per sample
    plt.close(fig)



## Main differential expression test

In [None]:
# NOTE(review): positional slice of the sample list — presumably the PC samples; confirm against get_sample_ids
PC_samples = samples[3:13]
col_name = 'joint_leiden_clusters'

# Wilcoxon cluster-vs-rest test per slide; keep genes with adjusted p < 0.05 and logFC >= 1.
# Per-sample tables are collected in a list and concatenated once instead of growing a
# DataFrame inside the loop.
de_frames = []
for sample in PC_samples:
    slide = adata_slides[sample]
    groups = list(slide.obs[col_name].cat.categories)

    sc.tl.rank_genes_groups(slide, col_name, method='wilcoxon') # groups=['1','2'],reference='0',
    res = sc.get.rank_genes_groups_df(slide,groups)
    res = res[(res['pvals_adj']<0.05) & (res['logfoldchanges']>=1)].sort_values('logfoldchanges',ascending=False).reset_index(drop=True)

    # Make cluster labels unique across samples
    res['group'] = sample + '_' + res['group'].astype(str)
    de_frames.append(res)

# pd.concat raises on an empty list, so fall back to an empty frame as before
de_res = pd.concat(de_frames,axis=0) if de_frames else pd.DataFrame()

    

In [None]:
# Drop genes that were found as markers in only one cluster.
# The threshold (more than 1 occurrence) can be adjusted if needed.
marker_occurrences = de_res['names'].value_counts()
recurrent_markers = marker_occurrences[marker_occurrences > 1]
de_genes_for_use = list(recurrent_markers.keys())

len(de_genes_for_use)

In [None]:
# Subset PC samples and calculate scaled expression (Z-scores)
adata_pc = adata_vis[adata_vis.obs['sample_id'].isin(PC_samples)].copy()
# With layer='counts', sc.pp.scale writes the scaled values into layers['counts'] of the
# returned copy and leaves its .X untouched, so the Z-scores are read from that layer
# (reading .X stored the unscaled matrix under 'scaled').
adata_pc.layers['scaled'] = sc.pp.scale(adata_pc, copy=True,layer='counts').layers['counts']


# Subset the adata object to the recurrent DE genes and keep only genes that are present
adata_pc = adata_pc[:,adata_pc.var_names.isin(de_genes_for_use)]
de_genes_for_use = [g for g in de_genes_for_use if g in adata_pc.var_names]
adata_pc

## Differential expression analyses

In [None]:
def enrich_results(adata,cluster):
    """Run a GO:BP enrichment query on the marker genes of one cluster.

    Reads the rank_genes_groups results stored on `adata` for `cluster`, keeps genes
    with adjusted p-value < 0.05 and log fold change >= 1 (ordered by decreasing log
    fold change) and passes their names to sc.queries.enrich.

    Returns whatever sc.queries.enrich returns for that gene list.
    """
    ranked = sc.get.rank_genes_groups_df(adata,group=cluster)
    is_marker = (ranked['pvals_adj']<0.05) & (ranked['logfoldchanges']>=1)
    marker_names = ranked[is_marker].sort_values('logfoldchanges',ascending=False)['names']
    return sc.queries.enrich(list(marker_names),gprofiler_kwargs={'sources':['GO:BP']})

In [None]:
# Differential expression between selected clusters of one slide
sample = 'PC_02_05601_OIK'
groups = ['0','1','2','3','4','5','6','7']

slide = adata_slides[sample].copy()
# With layer='counts', sc.pp.scale writes the scaled values into layers['counts'] of the
# returned copy and leaves its .X untouched, so the Z-scores are read from that layer.
slide.layers['scaled'] = sc.pp.scale(slide, copy=True,layer='counts').layers['counts']
# .copy() turns the subset view into a real AnnData before rank_genes_groups writes into .uns
slide = slide[slide.obs['joint_leiden_clusters'].isin(groups)].copy()

sc.tl.rank_genes_groups(slide, "joint_leiden_clusters", method='wilcoxon') # groups=['1','2'],reference='0',

In [None]:
# Top 5 significant markers (adjusted p < 0.05, logFC >= 1) per cluster, by decreasing logFC.
# The filtered/sorted table is now assigned; it used to be computed and discarded,
# so the first five unfiltered rows were kept instead.
top_hits = []
for group in groups:
    res = sc.get.rank_genes_groups_df(slide,group)
    res = res[(res['pvals_adj']<0.05) & (res['logfoldchanges']>=1)].sort_values('logfoldchanges',ascending=False)
    top_hits.append(res.assign(group=group).head(5))

# Single concat instead of growing the frame in the loop; empty fallback as before
de_res = pd.concat(top_hits,axis=0) if top_hits else pd.DataFrame()


In [None]:
# Up to 50 significant markers (adjusted p < 0.05, logFC >= 1) per cluster, by decreasing logFC.
# The filter result is now assigned (it used to be discarded), and head(50) replaces
# the label-based .loc[:50], which is end-inclusive and returned 51 rows.
de_markers = {}
for group in groups:
    res = sc.get.rank_genes_groups_df(slide,group)
    res = res[(res['pvals_adj']<0.05) & (res['logfoldchanges']>=1)].sort_values('logfoldchanges',ascending=False)

    markers = res['names'].head(50).tolist()
    if markers:  # clusters without any significant marker are left out of the heatmap groups
        de_markers[group] = markers


In [None]:
# Report every cluster marker that also belongs to one of the gene signatures
for cluster, cluster_markers in de_markers.items():
    for gene in cluster_markers:
        for signature_name, signature_genes in signature_dict.items():
            if gene in signature_genes:
                print(signature_name+' gene '+gene+' in cluster ' + cluster)

In [None]:
# Marker heatmap on the 'scaled' layer (genes as rows), colour range clipped to [-2, 2]
sc.pl.heatmap(
    slide,
    de_markers,
    cmap='RdBu_r',
    groupby='joint_leiden_clusters',
    layer='scaled',
    swap_axes=True,
    dendrogram=True,
    vmin=-2,
    vmax=2,
    figsize=(10,10),
)


In [None]:
# Same marker heatmap at 8x8, written to disk through scanpy's `save` argument
sc.pl.heatmap(
    slide,
    de_markers,
    cmap='RdBu_r',
    groupby='joint_leiden_clusters',
    layer='scaled',
    swap_axes=True,
    dendrogram=True,
    vmin=-2,
    vmax=2,
    figsize=(8,8),
    save='_'+sample+'_expression.pdf',
)


In [None]:
# Enrichment results for every cluster, tagged with the cluster label.
# Tables are collected in a list and concatenated once instead of growing a frame in the loop.
enrich_frames = []
for group in groups:
    res = enrich_results(slide,group)
    res['group'] = group
    enrich_frames.append(res)

# pd.concat raises on an empty list, so fall back to an empty frame as before
enrich_res = pd.concat(enrich_frames,axis=0) if enrich_frames else pd.DataFrame()


In [None]:
# Heatmap plotting of enrichment results
# NOTE(review): the previous comment named PC_02_10136 — confirm which sample `enrich_res` was computed for

tokeep = ['mesenchyme morphogenesis','vasculature development','response to stress','acyl-CoA biosynthetic process']
x_coords ={'0':5,'1':10,'2':15,'5':20}
y_coords ={'mesenchyme morphogenesis':5,'response to stress':10,'acyl-CoA biosynthetic process':15,'vasculature development':20}

# Format the results for plotting; .copy() so new columns are not written into a slice of enrich_res
res_for_plotting = enrich_res.loc[enrich_res['name'].isin(tokeep),
                                  ['name','p_value','intersection_size','group']].copy()
res_for_plotting['x'] = res_for_plotting['group'].map(x_coords)
res_for_plotting['y'] = res_for_plotting['name'].map(y_coords)
# -log10(p), matching the column name; np.log (natural log) was used before.
# NOTE(review): vmax below and hue_norm in the scatterplot cell were chosen with the old values — re-check
res_for_plotting['minus_log10_p_val'] = -np.log10(res_for_plotting['p_value'])
res_for_plotting = res_for_plotting.reset_index(drop=True)

df_heatmap = pd.pivot(res_for_plotting,index='name',values=['minus_log10_p_val'],columns='group')
# Order terms by name to get a diagonal (same order as y_coords); reindex does not fail
# when a term is missing, the row is simply filled with 0
df_heatmap = df_heatmap.reindex(list(y_coords)).fillna(0)

sns.set(rc={'figure.figsize':(4,2),"figure.dpi": 300})

# Version without axes and colorbar
fig,ax = plt.subplots(1,1)
sns.heatmap(df_heatmap.T,vmax=20, cmap='Blues', linewidths=1.5, linecolor='white', cbar=False, ax=ax)
ax.axis('off')
fig.savefig('figures/enrichment_heatmap_no_axis.pdf',bbox_inches='tight',dpi=300)

# Version with axes and colorbar
fig,ax = plt.subplots(1,1)
sns.heatmap(df_heatmap.T,vmax=20, cmap='Blues', ax=ax)
fig.savefig('figures/enrichment_heatmap_with_axis.pdf',bbox_inches='tight',dpi=300)


In [None]:
# Dot plot of the enrichment results: colour encodes the p-value column, dot size the intersection size
sns.set(rc={'figure.figsize': (4,2), "figure.dpi": 300})
fig, ax = plt.subplots(1, 1)

ax = sns.scatterplot(
    data=res_for_plotting,
    x="x",
    y="y",
    hue='minus_log10_p_val',
    size="intersection_size",
    palette='viridis',
    legend=None,
    sizes=(50, 500),
    hue_norm=(5, 60),
)
#sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
ax.grid(False)
ax.axis('off')
# y limits are given high-to-low, which flips the y axis
ax.set_ylim([25,3])
ax.set_xlim([3,25])
plt.tight_layout()
plt.savefig('figures/enrichment_scatterplot_no_axis.pdf',bbox_inches='tight',dpi=300)


## Plotting accurately cropped visium slides

In [None]:
# Cropping coordinates to get a standard crop
coords = {
    'PC_7875OIK': (2000,19500,2000,19500),
    'PC_4980': (500,21000,500,21000),
    'PC_01_14451_OIK': (1400,19000,5500,23000),
    'PC_02_10136_VAS': (3700,22000,2200,20000),
}

# Preview of the clusters on one cropped section
sample = 'PC_02_10136_VAS'
slide = adata_slides[sample]

sc.set_figure_params(dpi=100, figsize=(6,6))

# plot in spatial coordinates
fig, ax = plt.subplots(1, 1)
sc.pl.spatial(
    slide,
    color='joint_leiden_clusters',
    size=1.4,
    alpha_img=0.8,
    show=False,
    ax=ax,
    crop_coord=coords[sample],
)
plt.axis('off')
plt.show()

In [None]:
# Cropping coordinates to get a standard crop
coords = {'PC_7875OIK':(2000,19500,2000,19500),
          'PC_4980':(500,21000,500,21000),
          'PC_01_14451_OIK':(1400,19000,5500,23000),
          'PC_02_10136_VAS':(3700,22000,2200,20000),
          'PC_02_05601_OIK':(1500,20500,3200,21000),
          'CRPC-278':(1500,20500,3200,21000),
          'BPH_665':(2800,21500,2200,20500),
          'PC_03_01669_TUTKV':(0,18000,3000,21000),
          'PC-03-6712':(0,18000,3000,21000)
          }

# Saves the histology image and the cluster overlay of one cropped section
sample = 'PC-03-6712'
slide = adata_slides[sample]

sc.set_figure_params(dpi=100,figsize=(3,3))

# Histology only: spots are drawn with zero size and alpha, so just the tissue image shows
fig, ax = plt.subplots(1,1)
sc.pl.spatial(slide,
            color='joint_leiden_clusters',
            size=0,alpha_img=1, alpha=0,
            show=False,ax = ax,crop_coord=coords[sample]
            )

plt.axis('off')
fig.savefig('./figures/'+sample+'_histology_shown.pdf',dpi=800)
# Close instead of clf(): clf() keeps the figure registered with pyplot
plt.close(fig)

# Clusters drawn on top of the slightly faded tissue image
fig, ax = plt.subplots(1,1)
sc.pl.spatial(slide,
            color='joint_leiden_clusters',
            size=1.4,alpha_img=0.8,
            show=False,ax = ax,crop_coord=coords[sample]
            )
plt.axis('off')
fig.savefig('./figures/'+sample+'_clustering.pdf',dpi=800)
plt.close(fig)
