In [None]:
import os
os.chdir('/lustre/scratch/kiviaho/prostate_spatial')

import numpy as np
import anndata as ad
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from scripts.utils import load_from_pickle, save_to_pickle
from scipy.stats import chi2_contingency
import warnings
warnings.filterwarnings('ignore')
sc.set_figure_params(figsize=(6,6),dpi=80)
import upsetplot as ups

## Scoring integrated single-cell datasets

In [None]:
# Marker genes per broad cell type, used further down as the gene groups of a
# scanpy dotplot over the VI clusters.
# NOTE(review): many genes are listed more than once within a cell type
# (e.g. 'KLK3', 'DCN', 'RGS5', 'LYZ'); the lists look like several marker
# sets concatenated without de-duplication — confirm whether that is intended.
# NOTE(review): 'IF127', 'IF130' and 'MK167' are presumably typos of the gene
# symbols IFI27, IFI30 and MKI67 — confirm. They are currently among the names
# removed by drop_missing_markers before plotting, so they never get plotted.
# NOTE(review): 'TPSB2'/'TPSAB1' appear under both 'Mast' and 'B cell', and
# 'S100A8' under both 'Myeloid' and 'Neuronal' — verify these assignments.
refined_markers = {'Epithelial':['KRT5','KRT7','KRT19','MMP7','EPCAM','PRAC1','HOXB13','LRRC26','SPDEF','NKX3-1',
                                 'KLK3','FOLH1','SCHLAP1','PLA2G2A','MSMB','CETN2','ZMYND10','EPCAM','PCA3','KLK3',
                                 'KRT18','KRT8','ACPP','MSMB','S100P','SCGB3A1','SCGB1A1','SLC4A4','CLDN3','SAA1',
                                 'SAA2','KRT17','S100A2','ACPP','KLK3','MSMB','KLK2','NEFH','AZGP1','RDH11','PLA2G2A',
                                 'TMPRSS2','NKX3-1','AR','KRT19','KRT18','KRT8','TP63','KRT14','KRT5','DPP4'],
                    'Endothelial':['ACKR1','CCL14','PECAM1','IFI27','CLEC14A','ENPP2','DEPP1','RAMP2','VWF',
                                   'IF127','RNASE1','ACKR1','TM4SF1','VWF','SELE','IFI27','FLT1','SPARCL1',
                                   'SDPR','PTPRB','DARC','PLVAP','A2M','PECAM1','VWF','ENG'],
                    'Fibroblast':['LUM','DCN','IGF1','DCN','LUM','MGP','DCN','PTGDS','APOD',
                                  'STC1','AREG','DCN','FBLN1','COL1A2','IGF1','C7','IGFBP5','CCDC80','CFD','LTBP4','SFRP1','ACTA2'],
                    'SMC':['RGS5','ACTA2','TAGLN','BGN','RGS5','MT1M','ACTA2','ACTG2','DES','RGS5','NDUFA4L2','RERGL','MYH11','RGS5',
                           'ACTA2','TAGLN','MYL9','MYLK','C11orf96','MCAM','CALD1','LMOD1'],
                    'Mast':['KIT','TPSB2','TPSAB1','CPA3','TPSAB1','CPA3','CPA3','TPSAB1','KIT',
                            'VWA5A','IL1RL1','CTSG','SLC18A2','ACSL4','MS4A2','GATA2','CMA1','MS4A2','TPSAB1','TPSB2'],
                    'T cell':['CD3D','CD3E','CD3G','CD8A','CD8B','IL7R','CD2','CCL5','IFNG','CD8B','CD8A','SMC4','MK167',
                              'CTLA4','IL2RA','FOXP3','CD3D','CCL5','CD69','IL7R','TRBC2','CCL5','IFNG','CD8A','CXCR4',
                              'PTPRC','ETS1','CCL4','CD69','PDCD1','CTLA4','CD8A','SELL','PTPRC','BTLA','IL2RA','IL7R',
                              'CCR7','CD28','CD27','SLAMF1','CD7','CD2','CD3G','CD3E','CD3D'],
                        'B cell':['CD79A','MS4A1','MS4A1','CD79B','IGJ','MZB1','XBP1','TPSB2',
                                  'TPSAB1','IGKC','IGHA1','IGJ','IGHA2','AC096579.7','MZB1',
                                  'IGHG3','SLAMF7','IGHG4','IGHG1','MS4A1','IGHM','CXCR5','CD79A','CD22','BANK1','LY9','CCR7','IRF8','CD83'],
                        'Myeloid':['CD4','C1QA','C1QB','AIF1','CD68','LYZ','IL1B','FCN1','S100A12','C1QC','C1QA','S100A9','S100A8','LYZ',
                                   'HLA-DPB1','HLA-DRA','IL1B','HLA-DRA','HLA-DPA1','HLA-DPB1','HLA-DRB1','CD74','IL8','HLA-DQA1','IF130',
                                   'LYZ','LYZ','FCGR3A','CSF1R','CD68','CD163','CD14','UCHL1','HAVCR2','CD4'],
                        'Neuronal':['PLP1','MPZ','S100A8','MT1H'],
                        'Dendritic':['IRF7','IRF4','FCER1A','CD1C']  
                                 }

In [None]:
def plot_stacked_bar(data,sum_variable='phenotype',plot_variable='VI_clusters',filter_kw='',plot_legend=False):
    """Plot the composition of each ``plot_variable`` group as a normalized stacked bar chart.

    Each category of ``plot_variable`` becomes one horizontal bar whose segments
    show the fraction of its observations falling into each category of
    ``sum_variable`` (every bar sums to 1).

    Parameters
    ----------
    data
        Object exposing an ``obs`` DataFrame containing both columns.
    sum_variable : str
        ``obs`` column whose values become the stacked bar segments.
    plot_variable : str
        ``obs`` column whose values become the individual bars.
    filter_kw : str
        If non-empty, only bars whose label contains this substring are kept.
    plot_legend : bool
        If true, draw the legend to the right of the axes; otherwise no legend.

    Returns
    -------
    pandas.DataFrame
        The row-normalized table that was plotted (rows: ``plot_variable``
        values, columns: ``sum_variable`` values), in plotting order.
    """
    obs_pair = data.obs[[sum_variable, plot_variable]]

    # Count plot_variable values within each sum_variable group, pivot the
    # groups into columns, then normalize every row to fractions.
    counts = obs_pair.groupby(sum_variable)[plot_variable].value_counts()
    plot_data = counts.unstack(sum_variable)
    plot_data = plot_data.div(plot_data.sum(axis=1), axis=0)

    # Order the bars by their largest single fraction, in descending order.
    cat_order = plot_data.max(axis=1).sort_values(ascending=False).index
    plot_data = plot_data.reindex(cat_order)

    if filter_kw != '':
        # str() lets the substring filter work for non-string labels as well.
        keep = [name for name in plot_data.index if filter_kw in str(name)]
        plot_data = plot_data.loc[keep]
        # Only a categorical index carries unused categories to prune; a plain
        # index has no remove_unused_categories method.
        if isinstance(plot_data.index, pd.CategoricalIndex):
            plot_data.index = plot_data.index.remove_unused_categories()

    # `sort_columns` is intentionally not passed: it was removed from
    # DataFrame.plot in pandas 2.0 and its default (False) was what we used.
    if plot_legend:
        plot_data.plot.barh(stacked=True,figsize=(12,8),grid=False).legend(loc='center left',bbox_to_anchor=(1.0, 0.5))
    else:
        plot_data.plot.barh(stacked=True,figsize=(12,8),grid=False,legend=False)
    return plot_data

def drop_missing_markers(marker_list,missing_vals):
    """Remove every occurrence of the given markers from each marker list.

    Parameters
    ----------
    marker_list : dict
        Maps a group name to a list of marker names. The lists are modified
        in place.
    missing_vals : iterable
        Marker names to remove. Repeating a name has no additional effect.

    Returns
    -------
    dict
        The same ``marker_list`` object that was passed in.
    """
    missing = set(missing_vals)
    for markers in marker_list.values():
        # list.remove would drop only the first match, leaving duplicated
        # markers behind; rebuild the list contents in place instead so that
        # all occurrences go and existing references to the list stay valid.
        markers[:] = [marker for marker in markers if marker not in missing]
    return marker_list

## All cell types in the integration

In [None]:
# Load the object stored in 'all-scvi-integrated-7-sc-datasets.pickle' and
# display its summary as the cell output.
# NOTE(review): load_from_pickle presumably unpickles the file — unpickling can
# execute arbitrary code, so only load files from a trusted source.
adata = load_from_pickle('all-scvi-integrated-7-sc-datasets.pickle')
adata

In [None]:
# Collapse the phenotype-specific epithelial labels in 'broad_celltypes' into a
# single 'Epithelial' label (per the original note, the phenotype information
# is available elsewhere anyway).
for phenotype_label in ('normal_Epithelial', 'PCa_Epithelial', 'CRPC_Epithelial'):
    has_label = adata.obs['broad_celltypes'] == phenotype_label
    adata.obs.loc[has_label, 'broad_celltypes'] = 'Epithelial'

In [None]:
# Compute the dendrogram of the VI clusters on the 'X_scVI' representation;
# the dotplot below is drawn with dendrogram=True.
sc.tl.dendrogram(adata, groupby='VI_clusters', use_rep='X_scVI')

# Marker names to strip from refined_markers before plotting (presumably genes
# that are absent from adata — verify).
# NOTE(review): several names are repeated; the list is kept exactly as given.
markers_to_drop = [
    'AC096579.7', 'ACKR1', 'C11orf96', 'CD1C', 'CMA1', 'CTSG', 'DARC', 'DEPP1',
    'FCER1A', 'IF127', 'IF130', 'IGHA1', 'IGHA2', 'IGHG1', 'IGHG3', 'IGHG4',
    'IGHM', 'IGJ', 'IGKC', 'IL8', 'MK167', 'MS4A1', 'PECAM1', 'SCHLAP1', 'SDPR',
    'TPSB2', 'TRBC2', 'ACKR1', 'IGJ', 'MS4A1', 'PECAM1', 'TPSB2', 'MS4A1',
]
refined_markers = drop_missing_markers(refined_markers, markers_to_drop)

# Dotplot of the remaining markers per VI cluster, genes on the rows
# (swap_axes=True), color scale capped at vmax=4.
sc.pl.dotplot(
    adata,
    refined_markers,
    groupby='VI_clusters',
    dendrogram=True,
    log=False,
    swap_axes=True,
    vmax=4,
)

In [None]:
# Purity of each VI cluster with respect to the broad cell type annotation:
# the fraction of the cluster's cells that carry its most common label.
purity_scores = list()
for clust in adata.obs['VI_clusters'].cat.categories:
    # Slice obs directly; building an AnnData view per cluster is not needed
    # just to count labels.
    in_cluster = adata.obs['VI_clusters'] == clust
    counts = adata.obs.loc[in_cluster, 'broad_celltypes'].value_counts()
    if counts.sum() == 0:
        # Category without any cells: purity is undefined, skip it.
        continue
    # value_counts sorts in descending order, so the first entry is the
    # dominant cell type. Use .iloc for positional access: counts[0] on a
    # label-indexed Series is deprecated and fails in recent pandas.
    purity = counts.iloc[0]/counts.sum()
    purity_scores.append(purity)

print('')
print('The overall mean purity for clusters is {0:.1%}'.format(np.mean(purity_scores)))
print('The overall median is {0:.1%}'.format(np.median(purity_scores)))
print('The lower quantile is {0:.1%}'.format(np.quantile(purity_scores,0.25)))
print('The upper quantile is {0:.1%}'.format(np.quantile(purity_scores,0.75)))


In [None]:
# NOTE(review): consider moving this import to the import cell at the top.
import seaborn as sns

# Distribution of the per-cluster purity scores: a violin for the overall
# shape, overlaid with one point per cluster.
fig, ax = plt.subplots()
sns.violinplot(y=purity_scores, ax=ax)
sns.stripplot(y=purity_scores, color='black', jitter=True, size=5, ax=ax)

# Descriptive labels instead of the template placeholders.
ax.set_title('Cell type purity of VI clusters')
ax.set_xlabel('VI clusters (one point per cluster)')
ax.set_ylabel('Fraction of cells from the dominant broad cell type')

plt.show()


In [None]:
# UMAP of all cells colored by the 'broad_celltypes' annotation.
sc.pl.umap(adata,color='broad_celltypes',s=5)

In [None]:
# UMAPs of all cells colored by 'sample' and by 'VI_clusters'.
# legend_loc=None presumably suppresses the legends — TODO confirm.
sc.pl.umap(adata,color=['sample','VI_clusters'],s=5,legend_loc=None)

In [None]:
# Flag VI clusters dominated by a single sample: a cluster is marked for
# removal when more than this fraction of its cells comes from one sample.
max_sample_purity = 0.8

to_be_dropped_by_sample_purity = list()
for clust in adata.obs['VI_clusters'].cat.categories:
    # Slice obs directly; an AnnData view per cluster is not needed for counting.
    in_cluster = adata.obs['VI_clusters'] == clust
    counts = adata.obs.loc[in_cluster, 'sample'].value_counts()
    if counts.sum() == 0:
        # Category without any cells: nothing to evaluate.
        continue
    # value_counts sorts in descending order, so position 0 is the top sample.
    # Use .iloc for positional access: counts[0] on a label-indexed Series is
    # deprecated and fails in recent pandas.
    purity = counts.iloc[0]/counts.sum()
    if purity > max_sample_purity:
        # f-string formatting also works when labels are not strings.
        print(f'Cluster {clust} purity for sample {counts.index[0]}: {purity:.1%}')
        to_be_dropped_by_sample_purity.append(clust)


In [None]:
# Keep only the cells whose VI cluster was not flagged as dominated by a
# single sample.
subset_adata = adata[~adata.obs['VI_clusters'].isin(to_be_dropped_by_sample_purity)]

In [None]:
# Same UMAP views as for the full data, now on the filtered subset: first
# colored by broad cell type, then by sample and VI cluster.
sc.pl.umap(subset_adata,color=['broad_celltypes'],s=5)
sc.pl.umap(subset_adata,color=['sample','VI_clusters'],s=5,legend_loc=None)