# Rating patterns for objective selection

### Table of Contents

* [Relevant libraries, functions and colour dictionaries](#chapter1)
* [Creating annotations](#chapter2)
* [Reading in data](#chapter3)
* [Wilcoxon test](#chapter4)

#### Relevant libraries, functions and colour dictionaries <a class="anchor" id="chapter1"></a>

In [None]:
# Import relevant libraries
import numpy as np
import scanpy as sc
import os
import pandas as pd
import seaborn as sb
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sns
from collections import OrderedDict
from matplotlib import cm
import anndata as ann
import scanpy.external as sce
from datetime import datetime
import NaiveDE
import SpatialDE
from matplotlib_venn import venn3
%matplotlib inline
import glob
from sklearn.metrics import mean_squared_error as mse
from scipy import stats
import math
import re

# Set current directory.
# The relative paths used further down (e.g. "../results/current/") are resolved against this
# folder. NOTE: this is a machine-specific absolute path and has to be adapted on another machine.
os.chdir("/Users/mendenlab/work/spatial_granuloma/scripts")

# assign the rigth colours to the right annotation
def _set_colors(adata, obs_name, colors):
    """Set palette with specific colors for specific categories

    Parameters
    ----------
    adata : annData
    obs_name : column to plot
    colors : OrderedDict(): colors named by categories

    Returns
    -------

    """
    if len(colors.values())>0:
        palette = []
        unique_colors = np.unique(adata.obs[obs_name])
        for key in adata.obs[obs_name].cat.categories.tolist():
            if key in colors.keys():
                palette.append(colors[key])
    return palette

def softmax(x):
    """Return the softmax of the scores in x, normalised along axis 0.

    The largest value in x is subtracted before exponentiating, which keeps
    the exponentials from overflowing.
    """
    shifted_scores = x - np.max(x)
    exponentials = np.exp(shifted_scores)
    normaliser = exponentials.sum(axis=0)
    return exponentials / normaliser

# Colour of each spot type (manual annotation), in plotting order
spot_colors = OrderedDict([
    ("EPIDERMIS", 'blue'),
    ("DERMIS", '#E0EEE0'),
    ("INTERFACE", 'deepskyblue'),
    ("VESSEL", 'darkgreen'),
    ("HAIR FOLLICLE", "#543005"),
    ("SWEAT GLAND", 'y'),
    ("SEBACEOUS GLAND", 'mistyrose'),
    ("MUSCLE", 'darkcyan'),
    ("GA", 'firebrick'),
    ("GNL", 'orchid'),
    ("GSS", 'blueviolet'),
    ("GSC", 'mediumvioletred'),
    ("UNDETERMINED", 'black'),
])


# Colour of each skin layer: blues for the epidermis layers, greens for the dermis depths
dermis_colors = OrderedDict([
    ("UNDETERMINED", 'black'),
    ("upper EPIDERMIS", 'blue'),
    ("middle EPIDERMIS", 'dodgerblue'),
    ("basal EPIDERMIS", 'skyblue'),
    ("DERdepth1", '#006837'),
    ("DERdepth2", '#238443'),
    ("DERdepth3", '#41AB5D'),
    ("DERdepth4", '#78C679'),
    ("DERdepth5", '#ADDD8E'),
    ("DERdepth6", '#D9F0A3'),
    ("DERdepth7", '#F7FCB9'),
])

# Colour of each Leiden cluster label (resolution 1.3); several clusters share a colour
leiden_r13_colours = OrderedDict([
    ("0", 'darkolivegreen'),
    ("1", "#D9F0A3"),
    ("2", '#238443'),
    ("3", 'firebrick'),
    ("4", '#78C679'),
    ("5", '#78C679'),
    ("6", '#41AB5D'),
    ("7", '#006837'),
    ("8", '#ADDD8E'),
    ("9", "#238443"),
    ("10", '#78C679'),
    ("11", 'blue'),
    ("12", 'orchid'),
    ("13", '#F46D43'),
    ("14", 'dodgerblue'),
    ("15", 'deepskyblue'),
    ("16", '#cfafaf'),
    ("17", 'yellow'),
    ("18", 'darkcyan'),
    ("19", '#006837'),
])

In [None]:
# Plot patterns in a specific order with their respective categories (e.g., spot_type, leiden_r13)
# -------------------------------------------------------------------------------------------------------

# Samples whose panels are drawn with both axes inverted.
_FLIPPED_SAMPLES = ['P17851_1002', 'P18554_1006', 'P18554_1002']

# Sample prefix -> shortened label used in the titles of the pattern panels.
# NOTE(review): P18554_1005 to P18554_1008 are relabelled "P1_...", whereas the entries above
# number every pair of samples separately (P1 to P4). This looks like a possible copy-paste
# slip; the labels are kept as they were - confirm the intended ones.
_TITLE_LABELS = (
    ('P17851_1001', 'P1_1001'),
    ('P17851_1002', 'P1_1002'),
    ('P17851_1003', 'P2_1003'),
    ('P17851_1004', 'P2_1004'),
    ('P18554_1001', 'P3_1001'),
    ('P18554_1002', 'P3_1002'),
    ('P18554_1003', 'P4_1003'),
    ('P18554_1004', 'P4_1004'),
    ('P18554_1005', 'P1_1005'),
    ('P18554_1006', 'P1_1006'),
    ('P18554_1007', 'P1_1007'),
    ('P18554_1008', 'P1_1008'),
)


def _shorten_sample_label(sample_SPECIMEN_pattern):
    """Return the pattern name with its sample prefix replaced by the short label."""
    title = str(sample_SPECIMEN_pattern)
    for prefix, label in _TITLE_LABELS:
        title = title.replace(prefix, label)
    return title


def _draw_spot_panel(ax, coordinates, colours, flip, framed):
    """Scatter the spots of one sample on ``ax``; optionally invert the axes and add the purple frame."""
    if flip:
        ax.invert_xaxis()
        ax.invert_yaxis()
    # array_row is scaled by -1.5, the same scaling used by every spatial scatter in this notebook
    ax.scatter(coordinates['array_col'], -1.5 * coordinates['array_row'], c=colours, s=80)
    ax.axis('equal')
    if framed:
        ax.spines[:].set_color('purple')
        ax.spines[:].set_linewidth(2)
        # hide all ticks and tick labels
        ax.tick_params(axis='both', which='both', bottom=False, top=False, left=False,
                       labelleft=False, labelbottom=False)


def plot_in_order_annotations(anndata, patternsdf, patternlist, list_of_annotations, col_number = 8):
    """Plot patterns in the given order, with spot type, Leiden clusters and extra annotations below.

    The figure is organised in bands of panels, one band per kind of plot:
    pattern intensity, ``spot_type`` (coloured with ``spot_colors``),
    ``leiden_r1.3_patient`` (coloured with ``leiden_r13_colours``) and then one
    band per entry of ``list_of_annotations``. If there are more patterns than
    ``col_number``, every band wraps over several rows.

    Parameters
    ----------
    anndata : AnnData
        Only ``anndata.obs`` is read; it must contain the columns
        'sample_SPECIMEN', 'sample', 'array_col', 'array_row', 'spot_type',
        'leiden_r1.3_patient' and every column named in ``list_of_annotations``.
    patternsdf : pandas.DataFrame
        Must contain 'sample_SPECIMEN', 'array_col', 'array_row' and one column
        per pattern number, named as a string ('0', '1', ...).
    patternlist : list of str
        Names of the form '<sample_SPECIMEN>_pattern<number>', in plotting order,
        e.g. ['P17851_1002_91253-A_pattern6', 'P17851_1003_45703-A_pattern7'].
    list_of_annotations : list of str
        ``anndata.obs`` columns to plot in additional bands (may be empty).
    col_number : int, default 8
        Number of panels per row.

    Raises
    ------
    ValueError
        If ``patternlist`` is empty or ``col_number`` is smaller than 1.
    """
    patterns = list(patternlist)
    annotations = list(list_of_annotations)
    number_of_columns = int(col_number)
    if number_of_columns < 1:
        raise ValueError("col_number must be at least 1")
    if not patterns:
        raise ValueError("patternlist is empty: there is nothing to plot")

    # if there are too many patterns we divide every band in several rows
    rows_per_band = math.ceil(len(patterns) / number_of_columns)
    number_of_rows = (len(annotations) + 3) * rows_per_band
    panels_per_band = rows_per_band * number_of_columns

    # A bare figure: plt.subplots() would add one extra axes spanning the whole figure.
    fig = plt.figure(facecolor='w', edgecolor='k',
                     figsize=(5 * number_of_columns, int(4 * number_of_rows)))
    obs = anndata.obs  # only the per-spot table is needed, so the AnnData itself is never copied

    for i, sample_SPECIMEN_pattern in enumerate(patterns):
        name_parts = sample_SPECIMEN_pattern.split('_pattern')
        sample_SPECIMEN, pattern_number = name_parts[0], name_parts[1]

        # Subset the data to the spots of this sample_SPECIMEN
        sample_obs = obs[obs['sample_SPECIMEN'] == sample_SPECIMEN]
        sample_patterns = patternsdf[patternsdf['sample_SPECIMEN'] == sample_SPECIMEN]
        flip = bool(sample_obs['sample'].isin(_FLIPPED_SAMPLES).any())

        # Band 0: pattern intensity (no colour bar is shown)
        ax = fig.add_subplot(number_of_rows, number_of_columns, i + 1)
        _draw_spot_panel(ax, sample_patterns, sample_patterns[str(pattern_number)], flip, framed=True)
        ax.set_title(_shorten_sample_label(sample_SPECIMEN_pattern), fontsize = 15)

        # Band 1: spot type
        ax = fig.add_subplot(number_of_rows, number_of_columns, i + 1 + panels_per_band)
        _draw_spot_panel(ax, sample_obs, sample_obs['spot_type'].map(spot_colors), flip, framed=True)

        # Band 2: Leiden clusters (r = 1.3)
        ax = fig.add_subplot(number_of_rows, number_of_columns, i + 1 + 2 * panels_per_band)
        _draw_spot_panel(ax, sample_obs, sample_obs['leiden_r1.3_patient'].map(leiden_r13_colours),
                         flip, framed=True)

        # Bands 3+: one band per requested annotation (default axes styling, with a title).
        # NOTE(review): these titles use the full pattern name, not the shortened label.
        for j, annotation in enumerate(annotations):
            ax = fig.add_subplot(number_of_rows, number_of_columns,
                                 i + 1 + (3 + j) * panels_per_band)
            _draw_spot_panel(ax, sample_obs, sample_obs[annotation], flip, framed=False)
            ax.set_title(sample_SPECIMEN_pattern + '\n coloured by ' + annotation)

In [None]:
# Load the clustered AnnData object
adata_path = "../results/current/"
adata = sc.read(os.path.join(adata_path, "final/Granuloma_QC_clustering.h5"))

# Turn spot_type and skin_layer into ordered categoricals ("factors") with explicit levels.
# Less common annotations are listed LAST so they are not overwritten.

# Spot type: anatomical annotations
spot_type_levels = [
    'UNDETERMINED', 'DERMIS', "EPIDERMIS", 'INTERFACE', 'HAIR FOLLICLE',
    'VESSEL', 'MUSCLE', 'SEBACEOUS GLAND', 'SWEAT GLAND', 'GA', 'GNL', 'GSS', 'GSC',
]
adata.obs['spot_type'] = pd.Categorical(adata.obs['spot_type'],
                                        categories = spot_type_levels,
                                        ordered = True)

# Skin layer
skin_layer_levels = [
    'UNDETERMINED',
    'upper EPIDERMIS', 'middle EPIDERMIS', 'basal EPIDERMIS',
    'DERdepth1', 'DERdepth2', 'DERdepth3', 'DERdepth4',
    'DERdepth5', 'DERdepth6', 'DERdepth7',
]
adata.obs['skin_layer'] = pd.Categorical(adata.obs['skin_layer'],
                                         categories = skin_layer_levels,
                                         ordered = True)

In [None]:
# Switch the working directory to the drive so that relative file names are read from / saved to it
output_directory = "/Volumes/Drive/spatial_granuloma/output/SpatialDE/Intensity_ratios/"
os.chdir(output_directory)
print(os.getcwd())

#### Creating annotations <a class="anchor" id="chapter2"></a>

In [None]:
# Use the Leiden clusters (r = 1.3) as ground truth and collapse them into binary
# 'annotation groups' (1 = spot belongs to the group, 0 = it does not), next to two
# groups derived from the manual annotations.

#sns.countplot(x='leiden_r1.3_patient', data=adata.obs, hue = 'Border_spot') # Border spots generally map to cluster 3
#sns.countplot(x='leiden_r1.3_patient', data=adata.obs, hue = 'GRANULOMA')

leiden_labels = adata.obs['leiden_r1.3_patient']

# Leiden cluster labels that make up each group
leiden_groups = [
    ('leiden_core_granuloma', ['3']),
    ('leiden_border_granuloma', ['12', '13']),
    ('leiden_epidermis', ['11', '14', '15']),
    ('leiden_granuloma', ['3', '12', '13']),
]
for group_name, cluster_labels in leiden_groups:
    adata.obs[group_name] = np.where(leiden_labels.isin(cluster_labels), 1, 0)

# Groups based on the manual annotation
is_manual_granuloma = adata.obs['GRANULOMA'] == 1
adata.obs['manual_granuloma'] = np.where(is_manual_granuloma, 1, 0)
is_epidermis_or_interface = adata.obs['spot_type'].isin(['EPIDERMIS', 'INTERFACE'])
adata.obs['epidermis_interface'] = np.where(is_epidermis_or_interface, 1, 0)

# To check if it worked
#sns.countplot(x='leiden_r1.3_patient', data=adata.obs, hue = 'leiden_granuloma')

In [None]:
# Show the six annotation groups on one sample_SPECIMEN, one panel per group (3 rows x 2 columns)
adata_subset = adata[(adata.obs['sample_SPECIMEN']== 'P18554_1001_50107-A')]

fig = plt.figure(facecolor = 'w', edgecolor = 'k', figsize = (12, 12))

# (panel title, adata.obs column), in panel order
annotation_panels = [
    ('Leiden granuloma', 'leiden_granuloma'),
    ('Manual granuloma', 'manual_granuloma'),
    ('Leiden epidermis', 'leiden_epidermis'),
    ('Manual epidermis-interface', 'epidermis_interface'),
    ('Leiden granuloma core', 'leiden_core_granuloma'),
    ('Leiden border', 'leiden_border_granuloma'),
]

for panel_number, (panel_title, obs_column) in enumerate(annotation_panels, start = 1):
    plt.subplot(3, 2, panel_number)
    plt.title(panel_title)
    plt.scatter(adata_subset.obs['array_col'],
                -1.5*adata_subset.obs['array_row'],
                c = adata_subset.obs[obs_column])
    plt.colorbar(ticks=[])

#### Reading in data <a class="anchor" id="chapter3"></a>

In [None]:
# How 'patterns_samples_merged.csv' was produced (only needed once, kept for reference):
# all_sample_info_df = pd.read_csv('all_sample_info_df.csv', index_col=0, header=0)
# all_patterns_df = pd.read_csv('all_patterns_df.csv', header=0, usecols = range(2, 12)) # the first two columns are just row numbers (indexes)
# all_df = pd.read_csv('all_df.csv', index_col=0, header=0)
# freqtable_all = pd.read_csv('freqtable_all.csv', index_col=0, header=0)
# df_all = pd.concat([all_sample_info_df, all_patterns_df.reindex(all_sample_info_df.index)], axis = 1)
# df_all = df_all.rename(columns={'Unnamed: 0': 'spot_index'})
# df_all.head()
# df_all.to_csv('patterns_samples_merged.csv')  # saved for next time

# Load the merged sample-info + patterns table
df_all = pd.read_csv('patterns_samples_merged.csv', index_col = 0)

# Names of the pattern columns: '0' to '7'
patterns_columns = [str(pattern_number) for pattern_number in range(8)]

In [None]:
# Visual check that sample info and pattern info were merged correctly:
# plot one pattern of one sample_SPECIMEN at the spot coordinates.
sample_to_check = 'P18554_1001_50107-A'  # define here the sample_SPECIMEN you want to display
pattern_to_check = '1'                   # define here the pattern you want to display

adata_subset = df_all[df_all['sample_SPECIMEN'] == sample_to_check]
x_coordinates = adata_subset['array_col']
y_coordinates = -1.5*adata_subset['array_row']
plt.scatter(x_coordinates, y_coordinates, c = adata_subset[pattern_to_check])
plt.colorbar(ticks=[]);

In [None]:
# df_all has one row per spot, with the intensity of each pattern in the columns '0' to '7'
# (see patterns_columns). Display the first rows as a quick look at the table.
df_all.head()

In [None]:
# Distribution of the intensities of one pattern in one sample_SPECIMEN
one_sample = df_all[df_all['sample_SPECIMEN'] == 'P18554_1001_50107-A']
#sns.histplot(data = one_sample, x='0')
sns.histplot(data = one_sample, x='1')

# Absolute minimum and maximum across all patterns (column-wise extremes, then the extreme of those)
overall_minimum = df_all[patterns_columns].min().min()
overall_maximum = df_all[patterns_columns].max().max()
print('the minimum across all patterns is: ' + str(overall_minimum))
print('the maximum across all patterns is: ' + str(overall_maximum))

#### Wilcoxon test <a class="anchor" id="chapter4"></a>

## Wilcoxon test

In [None]:
# # Merging adata.obs and df_all
# We only need to do this once, you can skip to the next cell to just load the files

# sample_SPECIMEN_list = ['P18554_1008_72859-B', 'P18554_1003_95096-A',
#        'P17851_1002_91253-A', 'P18554_1004_95096-A',
#        'P18554_1008_72859-A', 'P18554_1005_82301-B',
#        'P18554_1007_72859-B', 'P18554_1005_82301-A',
#        'P18554_1007_72859-A', 'P18554_1006_82301-A',
#        'P17851_1003_45703-A', 'P18554_1002_50107-A',
#        'P18554_1006_82301-B', 'P18554_1001_50107-A',
#        'P17851_1004_45703-A', 'P17851_1001_91253-A',
#        'P17851_1002_91253-B']

# #sample_SPECIMEN_list = ['P18554_1001_50107-A']

# # ---------------------------------------------------------------------------
# adata_df_all_merged = []
# for sample_SPECIMEN in sample_SPECIMEN_list:
#     patterns_info = df_all[df_all['sample_SPECIMEN'] == sample_SPECIMEN] # subset the patterns dataframe for 1 sample_SPECIMEN
#     adata_sub = adata.obs[adata.obs['sample_SPECIMEN'] == sample_SPECIMEN].copy() # subset the adata.obs
#     adata_sub['spot_index'] = adata_sub.index
#     adata_sub.reset_index(inplace = True)
# #     print(adata_sub.columns)
# #     print(patterns_info.shape)
# #     print(adata_sub.shape)
#     superdf = patterns_info.merge(adata_sub, how = 'outer', right_on = ['spot_index', 'sample_SPECIMEN', 'array_col', 'array_row', 'n_counts'], left_on = ['spot_index', 'sample_SPECIMEN', 'array_col', 'array_row', 'n_counts']) # merge them together in one super df
#     #print(superdf.columns)
#     adata_df_all_merged.append(superdf) 
# adata_df_all_merged = pd.concat(adata_df_all_merged)

# print(adata_df_all_merged.shape)
# adata_df_all_merged.head()

# adata_df_all_merged.to_csv('adata_df_all_merged.csv')

In [None]:
# Build the long-format intensity table: one row per spot x annotation x pattern
spot_columns = ['spot_index', 'array_row', 'array_col', 'n_counts', 'sample_SPECIMEN']
pattern_value_columns = ['0', '1', '2', '3', '4', '5', '6', '7']
annotation_columns = ['leiden_core_granuloma', 'leiden_border_granuloma', 'leiden_epidermis',
                      'leiden_granuloma', 'manual_granuloma', 'epidermis_interface']

merged_table = pd.read_csv('adata_df_all_merged.csv', header = 0, index_col = 0)
merged_table = merged_table.reset_index(drop = True)

# Keep only the columns that are needed
intensity_df = merged_table[spot_columns + pattern_value_columns + annotation_columns]

# First melt: condense all annotation columns into 'annotation' / 'annotation_value'
intensity_df = pd.melt(intensity_df,
                       id_vars = spot_columns + pattern_value_columns,
                       value_vars = annotation_columns,
                       value_name = 'annotation_value',
                       var_name = 'annotation')

# Second melt: condense all pattern columns into 'pattern' / 'intensity'
intensity_df = pd.melt(intensity_df,
                       id_vars = spot_columns + ['annotation', 'annotation_value'],
                       value_vars = patterns_columns,
                       value_name = 'intensity',
                       var_name = 'pattern')

# Add sample_SPECIMEN_pattern, e.g. 'P18554_1001_50107-A_pattern1'
intensity_df['sample_SPECIMEN_pattern'] = intensity_df['sample_SPECIMEN'] + '_pattern' + intensity_df['pattern']
intensity_df.head()

In [None]:
# Summary statistics of the intensity for every annotation and annotation value (0 / 1)
intensity_df.groupby(['annotation', 'annotation_value'])['intensity'].describe()

In [None]:
# Box plots of the intensities inside (1) and outside (0) every annotation group,
# for the nice pattern P18554_1001_50107-A_pattern1
nice_pattern = 'P18554_1001_50107-A_pattern1'
nice_pattern_rows = intensity_df[intensity_df['sample_SPECIMEN_pattern'] == nice_pattern]
annotation_order = ['manual_granuloma', 'leiden_granuloma', 'epidermis_interface',
                    'leiden_epidermis', 'leiden_border_granuloma', 'leiden_core_granuloma']

g = sns.catplot(data = nice_pattern_rows, kind = "box",
                x = "sample_SPECIMEN_pattern", y = "intensity",
                hue = "annotation_value",
                col = "annotation", col_order = annotation_order, col_wrap = 2,
                height = 5, aspect = 1);

In [None]:
# Line plot of the intensity of every pattern, split by annotation value, one panel per annotation
annotation_order = ['manual_granuloma', 'leiden_granuloma', 'epidermis_interface',
                    'leiden_epidermis', 'leiden_border_granuloma', 'leiden_core_granuloma']

g = sns.relplot(data = intensity_df, kind = "line",
                x = "sample_SPECIMEN_pattern", y = "intensity",
                hue = "annotation_value",
                col = "annotation", col_order = annotation_order, col_wrap = 2,
                height = 5, aspect = 1.2);
g.set_xticklabels(rotation=90, fontsize = 10)

In [None]:
# Testing Wilcoxon test for one sample pattern 

# df1 = intensity_df[(intensity_df['annotation_value'] == 0) & 
#              (intensity_df['sample_SPECIMEN_pattern'] == 'P18554_1001_50107-A_pattern1') &
#             (intensity_df['annotation'] == 'leiden_core_granuloma')]
# df2 = intensity_df[(intensity_df['annotation_value'] == 1) & 
#              (intensity_df['sample_SPECIMEN_pattern'] == 'P18554_1001_50107-A_pattern1') &
#             (intensity_df['annotation'] == 'leiden_core_granuloma')]

# statistic, pvalue = stats.ranksums(df1['intensity'], df2['intensity'])
# print('core')
# print(statistic)
# print(pvalue)


# df3 = intensity_df[(intensity_df['annotation_value'] == 0) & 
#              (intensity_df['sample_SPECIMEN_pattern'] == 'P18554_1001_50107-A_pattern1') &
#             (intensity_df['annotation'] == 'leiden_epidermis')]
# df4 = intensity_df[(intensity_df['annotation_value'] == 1) & 
#              (intensity_df['sample_SPECIMEN_pattern'] == 'P18554_1001_50107-A_pattern1') &
#             (intensity_df['annotation'] == 'leiden_epidermis')]

# statistic4, pvalue4 = stats.ranksums(df3['intensity'], df4['intensity'])
# print('')
# print('epidermis')
# print(statistic4)
# print(pvalue4)


# df5 = intensity_df[(intensity_df['annotation_value'] == 0) & 
#              (intensity_df['sample_SPECIMEN_pattern'] == 'P18554_1001_50107-A_pattern1') &
#             (intensity_df['annotation'] == 'leiden_nongranuloma')]
# df6 = intensity_df[(intensity_df['annotation_value'] == 1) & 
#              (intensity_df['sample_SPECIMEN_pattern'] == 'P18554_1001_50107-A_pattern1') &
#             (intensity_df['annotation'] == 'leiden_nongranuloma')]

# statistic4, pvalue4 = stats.ranksums(df5['intensity'], df6['intensity'])
# print('')
# print('nongran')
# print(statistic4)
# print(pvalue4)

In [None]:
# Wilcoxon rank-sum test for every (sample_SPECIMEN_pattern, annotation) pair.
# scipy.stats.ranksums is the rank-sum test for two INDEPENDENT samples (not the paired
# signed-rank test): it tests the null hypothesis that the two sets of measurements are drawn
# from the same distribution. Here the two samples are the intensities of the spots inside
# (annotation_value == 1) and outside (annotation_value == 0) the annotation group.
# The group with annotation_value == 1 is passed first, so a positive statistic means that
# the intensities tend to be higher inside the group.

sample_SPECIMEN_pattern_list = list(intensity_df['sample_SPECIMEN_pattern'].unique())
annotation_list = list(intensity_df['annotation'].unique())

# One record per test keeps the pattern, the annotation and the result together
wilcox_records = []

for sample_SPECIMEN_pattern in sample_SPECIMEN_pattern_list:
    # subset once per pattern instead of once per (pattern, annotation) pair
    pattern_rows = intensity_df[intensity_df['sample_SPECIMEN_pattern'] == sample_SPECIMEN_pattern]
    for annotation in annotation_list:
        intensity_df_sub = pattern_rows[pattern_rows['annotation'] == annotation]
        # obtain the two groups for annotation_value = 0 and annotation_value = 1
        intensity_0 = intensity_df_sub[intensity_df_sub['annotation_value'] == 0]
        intensity_1 = intensity_df_sub[intensity_df_sub['annotation_value'] == 1]

        # rank-sum test for the subset
        statistic, pvalue = stats.ranksums(intensity_1['intensity'], intensity_0['intensity'])

        wilcox_records.append({'sample_SPECIMEN_pattern': sample_SPECIMEN_pattern,
                               'annotation': annotation,
                               'statistic': statistic,
                               'pvalue': pvalue})

wilcox_df = pd.DataFrame(wilcox_records,
                         columns = ['sample_SPECIMEN_pattern', 'annotation', 'statistic', 'pvalue'])

# Add -log10(pvalue) carrying the sign of the statistic
wilcox_df['signed_pvalue'] = np.sign(wilcox_df['statistic']) * (-np.log10(wilcox_df['pvalue']))

#wilcox_df.to_csv('wilcox_df.csv') # Uncomment this if you want to save the wilcoxon results dataframe
wilcox_df.head()

In [None]:
# Show the Wilcoxon results of patterns 0 to 3 of P18554_1001_50107-A
patterns_to_show = ['P18554_1001_50107-A_pattern0', 'P18554_1001_50107-A_pattern1',
                    'P18554_1001_50107-A_pattern2', 'P18554_1001_50107-A_pattern3']
is_pattern_to_show = wilcox_df['sample_SPECIMEN_pattern'].isin(patterns_to_show)
wilcox_df[is_pattern_to_show]

In [None]:
# Signed -log10(p-value) of the Wilcoxon test for every pattern, one panel per annotation

# To show all annotations in a single plot instead, replace col by hue and drop col_wrap / col_order
annotation_order = ['manual_granuloma', 'leiden_granuloma', 'epidermis_interface',
                    'leiden_epidermis', 'leiden_border_granuloma', 'leiden_core_granuloma']

g = sns.relplot(data = wilcox_df, kind = 'scatter',
                x = 'sample_SPECIMEN_pattern', y = 'signed_pvalue',
                col = 'annotation', col_order = annotation_order, col_wrap = 2,
                height = 5, aspect = 1.8, alpha = 0.8);
# (g.set_axis_labels("Patterns", "Average intensity per group")
#   .set_titles("Annotation: {col_name}"))
g.set_xticklabels(rotation=90, fontsize = 10)

In [None]:
# One dataframe per annotation, sorted by signed_pvalue in ascending order
# (sort_values default), so .tail(10) below selects the ten highest signed scores.
# NOTE(review): wilcox_df_nongranuloma holds the 'leiden_granuloma' rows, despite its name.
# NOTE(review): 'manual_granuloma' is the only annotation that is not plotted here.
wilcox_df_core = wilcox_df[wilcox_df['annotation'] == 'leiden_core_granuloma'].sort_values(by = 'signed_pvalue')
wilcox_df_border = wilcox_df[wilcox_df['annotation'] == 'leiden_border_granuloma'].sort_values(by = 'signed_pvalue')
wilcox_df_leidenepidermis = wilcox_df[wilcox_df['annotation'] == 'leiden_epidermis'].sort_values(by = 'signed_pvalue')
wilcox_df_nongranuloma = wilcox_df[wilcox_df['annotation'] == 'leiden_granuloma'].sort_values(by = 'signed_pvalue')
wilcox_df_epidermis = wilcox_df[wilcox_df['annotation'] == 'epidermis_interface'].sort_values(by = 'signed_pvalue')

# Let's visualise it!
# NOTE(review): plot_in_order_withcategory is not defined in the part of the notebook visible
# here (only plot_in_order_annotations is) - confirm it is defined before this cell is run.
print('Top 10 core granuloma')
plot_in_order_withcategory(anndata = adata, patternsdf = df_all, patternlist = list(wilcox_df_core['sample_SPECIMEN_pattern'].tail(10)), category = 'leiden_core_granuloma')
print('Top 10 border granuloma')
plot_in_order_withcategory(anndata = adata, patternsdf = df_all, patternlist = list(wilcox_df_border['sample_SPECIMEN_pattern'].tail(10)), category = 'leiden_border_granuloma')
print('Top 10 granuloma')
plot_in_order_withcategory(anndata = adata, patternsdf = df_all, patternlist = list(wilcox_df_nongranuloma['sample_SPECIMEN_pattern'].tail(10)), category = 'leiden_granuloma')
print('Top 10 Leiden epidermis')
plot_in_order_withcategory(anndata = adata, patternsdf = df_all, patternlist = list(wilcox_df_leidenepidermis['sample_SPECIMEN_pattern'].tail(10)), category = 'leiden_epidermis')
print('Top 10 manual epidermis')
plot_in_order_withcategory(anndata = adata, patternsdf = df_all, patternlist = list(wilcox_df_epidermis['sample_SPECIMEN_pattern'].tail(10)), category = 'epidermis_interface')


In [None]:
# Core granuloma patterns with a positive signed_pvalue, plotted in three groups of 10.
# The rows are sorted by signed_pvalue in ascending order, so the slices [:10], [10:20] and
# [20:30] are the 30 patterns with the SMALLEST positive scores.
# NOTE(review): the original comment announced the "top 30" patterns with a score "< 0", which
# matches neither the "> 0" filter nor the ascending sort - confirm which selection is intended
# (the previous cell uses .tail(10) for its "Top 10").
# NOTE(review): plot_in_order_withcategory is not defined in the part of the notebook visible here.
wilcoxtopcore = wilcox_df[(wilcox_df['annotation'] == 'leiden_core_granuloma') & (wilcox_df['signed_pvalue'] > 0)].sort_values(by = 'signed_pvalue')
# This list is neither stored nor displayed (it is not the last expression of the cell)
list(wilcoxtopcore['sample_SPECIMEN_pattern'])
plot_in_order_withcategory(anndata = adata, patternsdf = df_all, patternlist = list(wilcoxtopcore['sample_SPECIMEN_pattern'])[:10], category = 'leiden_core_granuloma')
plot_in_order_withcategory(anndata = adata, patternsdf = df_all, patternlist = list(wilcoxtopcore['sample_SPECIMEN_pattern'])[10:20], category = 'leiden_core_granuloma')
plot_in_order_withcategory(anndata = adata, patternsdf = df_all, patternlist = list(wilcoxtopcore['sample_SPECIMEN_pattern'])[20:30], category = 'leiden_core_granuloma')

In [None]:
# Find the patterns for which the border and core scores differ the most

# One row per pattern, one column per annotation, filled with the signed score
w_pivot = wilcox_df.pivot(index="sample_SPECIMEN_pattern", columns='annotation', values="signed_pvalue")
w_pivot['borderminuscore'] = w_pivot['leiden_border_granuloma'] - w_pivot['leiden_core_granuloma']

ranked_by_difference = w_pivot.sort_values(by = 'borderminuscore')
top10 = ranked_by_difference.head(10).index   # lowest differences: score for core >> border
last10 = ranked_by_difference.tail(10).index  # highest differences: score for border >> core
w_pivot

In [None]:
# Patterns where the core score exceeds the border score the most (top10),
# plotted with the core and border granuloma annotations.
plot_in_order_annotations(
    anndata=adata,
    patternsdf=df_all,
    patternlist=list(top10),
    list_of_annotations=['leiden_core_granuloma', 'leiden_border_granuloma'],
)

# Patterns where the border score exceeds the core score the most (last10),
# plotted with the same two annotations.
plot_in_order_annotations(
    anndata=adata,
    patternsdf=df_all,
    patternlist=list(last10),
    list_of_annotations=['leiden_core_granuloma', 'leiden_border_granuloma'],
)

### Plot k-means clustering patterns

In [None]:
# Hard-coded pattern IDs for four clusters (this section of the notebook is
# headed as k-means clustering). Each ID reads "<sample>_<specimen>_pattern<k>".

# Granuloma cluster (17 patterns)
cluster4 = [
    "P17851_1001_91253-A_pattern6",
    "P17851_1002_91253-A_pattern5",
    "P17851_1002_91253-B_pattern1",
    "P17851_1003_45703-A_pattern6",
    "P17851_1004_45703-A_pattern7",
    "P18554_1001_50107-A_pattern1",
    "P18554_1002_50107-A_pattern5",
    "P18554_1003_95096-A_pattern6",
    "P18554_1004_95096-A_pattern7",
    "P18554_1005_82301-A_pattern4",
    "P18554_1005_82301-B_pattern1",
    "P18554_1006_82301-A_pattern5",
    "P18554_1006_82301-B_pattern7",
    "P18554_1007_72859-A_pattern3",
    "P18554_1007_72859-B_pattern5",
    "P18554_1008_72859-A_pattern4",
    "P18554_1008_72859-B_pattern3",
]

# Granuloma cluster, small (8 patterns)
cluster2 = [
    "P17851_1002_91253-A_pattern1",
    "P18554_1001_50107-A_pattern7",
    "P18554_1002_50107-A_pattern4",
    "P18554_1004_95096-A_pattern2",
    "P18554_1005_82301-A_pattern0",
    "P18554_1007_72859-B_pattern6",
    "P18554_1008_72859-A_pattern3",
    "P18554_1008_72859-B_pattern1",
]

# Inner epidermis cluster (11 patterns)
cluster5 = [
    "P17851_1001_91253-A_pattern0",
    "P17851_1002_91253-A_pattern4",
    "P17851_1002_91253-B_pattern4",
    "P17851_1003_45703-A_pattern2",
    "P17851_1004_45703-A_pattern4",
    "P18554_1001_50107-A_pattern6",
    "P18554_1003_95096-A_pattern3",
    "P18554_1004_95096-A_pattern0",
    "P18554_1007_72859-B_pattern7",
    "P18554_1008_72859-A_pattern6",
    "P18554_1008_72859-B_pattern5",
]

# Outer epidermis cluster (18 patterns)
cluster3 = [
    "P17851_1001_91253-A_pattern1",
    "P17851_1002_91253-A_pattern2",
    "P17851_1002_91253-B_pattern5",
    "P17851_1002_91253-B_pattern7",
    "P17851_1003_45703-A_pattern1",
    "P17851_1004_45703-A_pattern5",
    "P18554_1001_50107-A_pattern2",
    "P18554_1002_50107-A_pattern6",
    "P18554_1002_50107-A_pattern7",
    "P18554_1003_95096-A_pattern5",
    "P18554_1004_95096-A_pattern1",
    "P18554_1005_82301-A_pattern3",
    "P18554_1005_82301-B_pattern0",
    "P18554_1005_82301-B_pattern3",
    "P18554_1006_82301-B_pattern4",
    "P18554_1007_72859-A_pattern0",
    "P18554_1007_72859-B_pattern0",
    "P18554_1008_72859-B_pattern7",
]

In [None]:
# Plot the cluster4 patterns with the GRANULOMA and EPIDERMIS annotations.
# col_number presumably sets the number of grid columns — confirm against
# plot_in_order_annotations.
plot_in_order_annotations(
    anndata=adata,
    patternsdf=df_all,
    patternlist=cluster4,
    list_of_annotations=['GRANULOMA', 'EPIDERMIS'],
    col_number=6,
)

In [None]:
# Plot the cluster2 patterns with the GRANULOMA and EPIDERMIS annotations.
plot_in_order_annotations(
    anndata=adata,
    patternsdf=df_all,
    patternlist=cluster2,
    list_of_annotations=['GRANULOMA', 'EPIDERMIS'],
)

In [None]:
# Plot the cluster5 patterns with the GRANULOMA and EPIDERMIS annotations.
plot_in_order_annotations(
    anndata=adata,
    patternsdf=df_all,
    patternlist=cluster5,
    list_of_annotations=['GRANULOMA', 'EPIDERMIS'],
)

In [None]:
# Plot the cluster3 patterns with the GRANULOMA and EPIDERMIS annotations.
plot_in_order_annotations(
    anndata=adata,
    patternsdf=df_all,
    patternlist=cluster3,
    list_of_annotations=['GRANULOMA', 'EPIDERMIS'],
)

In [None]:
# Further k-means (k = 2) split of the granuloma cluster.
# NOTE(review): despite the "cluster2_" prefix, these 7 + 10 IDs are exactly
# the 17 members of cluster4 (the list labelled "Granuloma cluster" in this
# notebook), not of cluster2 — the prefix may refer to k = 2; confirm.

cluster2_sub1 = [
    "P17851_1002_91253-A_pattern5",
    "P18554_1001_50107-A_pattern1",
    "P18554_1004_95096-A_pattern7",
    "P18554_1005_82301-A_pattern4",
    "P18554_1007_72859-B_pattern5",
    "P18554_1008_72859-A_pattern4",
    "P18554_1008_72859-B_pattern3",
]
cluster2_sub2 = [
    "P17851_1001_91253-A_pattern6",
    "P17851_1002_91253-B_pattern1",
    "P17851_1003_45703-A_pattern6",
    "P17851_1004_45703-A_pattern7",
    "P18554_1002_50107-A_pattern5",
    "P18554_1003_95096-A_pattern6",
    "P18554_1005_82301-B_pattern1",
    "P18554_1006_82301-A_pattern5",
    "P18554_1006_82301-B_pattern7",
    "P18554_1007_72859-A_pattern3",
]

In [None]:
# Plot the first sub-cluster with the GRANULOMA and EPIDERMIS annotations.
plot_in_order_annotations(
    anndata=adata,
    patternsdf=df_all,
    patternlist=cluster2_sub1,
    list_of_annotations=['GRANULOMA', 'EPIDERMIS'],
)

In [None]:
# Plot the second sub-cluster with the GRANULOMA and EPIDERMIS annotations.
plot_in_order_annotations(
    anndata=adata,
    patternsdf=df_all,
    patternlist=cluster2_sub2,
    list_of_annotations=['GRANULOMA', 'EPIDERMIS'],
)

In [None]:
# Further k-means (k = 2) split of granuloma clusters 2 and 4.
# NOTE(review): every ID in cluster24_sub1 also appears in cluster4 and every
# ID in cluster24_sub2 also appears in cluster2, but only 16 of the 25 IDs from
# those two clusters are listed here — confirm the omission is intended.
cluster24_sub1 = [
    "P17851_1002_91253-A_pattern5",
    "P17851_1002_91253-B_pattern1",
    "P17851_1004_45703-A_pattern7",
    "P18554_1001_50107-A_pattern1",
    "P18554_1002_50107-A_pattern5",
    "P18554_1004_95096-A_pattern7",
    "P18554_1005_82301-B_pattern1",
    "P18554_1006_82301-A_pattern5",
    "P18554_1006_82301-B_pattern7",
    "P18554_1007_72859-A_pattern3",
    "P18554_1007_72859-B_pattern5",
    "P18554_1008_72859-B_pattern3",
]
cluster24_sub2 = [
    "P18554_1002_50107-A_pattern4",
    "P18554_1004_95096-A_pattern2",
    "P18554_1005_82301-A_pattern0",
    "P18554_1007_72859-B_pattern6",
]

In [None]:
# Plot cluster24_sub1 with the GRANULOMA and EPIDERMIS annotations.
plot_in_order_annotations(
    anndata=adata,
    patternsdf=df_all,
    patternlist=cluster24_sub1,
    list_of_annotations=['GRANULOMA', 'EPIDERMIS'],
)

In [None]:
# Plot cluster24_sub2 with the GRANULOMA and EPIDERMIS annotations.
plot_in_order_annotations(
    anndata=adata,
    patternsdf=df_all,
    patternlist=cluster24_sub2,
    list_of_annotations=['GRANULOMA', 'EPIDERMIS'],
)

In [None]:
# Plot the cluster4 patterns again, this time passing an empty annotation list
# (presumably no annotation panels are drawn — confirm against
# plot_in_order_annotations).
plot_in_order_annotations(
    anndata=adata,
    patternsdf=df_all,
    patternlist=cluster4,
    list_of_annotations=[],
    col_number=9,
)

In [None]:
# Plot the cluster2 patterns again, this time passing an empty annotation list
# (presumably no annotation panels are drawn — confirm against
# plot_in_order_annotations).
plot_in_order_annotations(
    anndata=adata,
    patternsdf=df_all,
    patternlist=cluster2,
    list_of_annotations=[],
    col_number=9,
)

In [None]:
# Plot the cluster5 patterns again, this time passing an empty annotation list
# (presumably no annotation panels are drawn — confirm against
# plot_in_order_annotations).
plot_in_order_annotations(
    anndata=adata,
    patternsdf=df_all,
    patternlist=cluster5,
    list_of_annotations=[],
    col_number=8,
)

In [None]:
# Plot the cluster3 patterns again, this time passing an empty annotation list
# (presumably no annotation panels are drawn — confirm against
# plot_in_order_annotations).
plot_in_order_annotations(
    anndata=adata,
    patternsdf=df_all,
    patternlist=cluster3,
    list_of_annotations=[],
    col_number=9,
)