# Creating new adata with unique rownames and new columns

#### Relevant libraries and colour dictionaries <a class="anchor" id="chapter1"></a>

In [1]:
# Import relevant libraries
import numpy as np
import scanpy as sc
import os
import pandas as pd
import seaborn as sb
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sns
from collections import OrderedDict
from matplotlib import cm
import anndata as ann
import scanpy.external as sce
from datetime import datetime
import NaiveDE
import SpatialDE
from matplotlib_venn import venn3
%matplotlib inline
import glob
from sklearn.metrics import mean_squared_error as mse
from scipy import stats

# Set current directory
# The relative adata_path used when loading the data ("../results/current/") is resolved against this folder.
# NOTE(review): absolute, machine-specific path; the notebook only runs as-is where this folder exists.
os.chdir("/Users/mendenlab/work/spatial_granuloma/scripts")

In [21]:
# Import adata
adata_path = "../results/current/"
adata = sc.read(os.path.join(adata_path, "final/Granuloma_QC_clustering.h5"))

# Turn spot_type and skin_layer into ordered categoricals ("factors" with fixed levels).
# Less common annotations are listed LAST so they are not overwritten.
# NOTE(review): values that are not listed as a level become NaN in pd.Categorical;
# confirm every spot_type / skin_layer value present in the data is listed below.

# Spot type: anatomical annotations
spot_type_levels = ['UNDETERMINED', 'DERMIS', "EPIDERMIS", 'INTERFACE', 'HAIR FOLLICLE',
                    'VESSEL', 'MUSCLE', 'SEBACEOUS GLAND', 'SWEAT GLAND', 'GA', 'GNL', 'GSS', 'GSC']

# Skin layer
skin_layer_levels = ['UNDETERMINED',
                     'upper EPIDERMIS', 'middle EPIDERMIS', 'basal EPIDERMIS',
                     'DERdepth1', 'DERdepth2', 'DERdepth3', 'DERdepth4',
                     'DERdepth5', 'DERdepth6', 'DERdepth7']

for obs_column, levels in (('spot_type', spot_type_levels),
                           ('skin_layer', skin_layer_levels)):
    adata.obs[obs_column] = pd.Categorical(adata.obs[obs_column],
                                           categories = levels,
                                           ordered = True)

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


In [22]:
# Display the AnnData summary (dimensions and the available obs columns) as a check after loading
adata

AnnData object with n_obs × n_vars = 15777 × 15107
    obs: 'sample', 'project', 'slide', 'in_tissue', 'array_row', 'array_col', 'ANNOTATOR', 'DISEASE', 'disease', 'SAMPLE', 'LESIONAL', 'NON LESIONAL', 'upper EPIDERMIS', 'middle EPIDERMIS', 'basal EPIDERMIS', 'DERMIS', 'DERdepth1', 'DERdepth2', 'DERdepth3', 'DERdepth4', 'DERdepth5', 'DERdepth6', 'DERdepth7', 'INTERFACE', 'VESSEL', 'HAIR FOLLICLE', 'SWEAT GLAND', 'SEBACEOUS GLAND', 'MUSCLE', 'FAT TISSUE', 'KERATINOCYTE', 'ENDOTHELIAL', 'GRANULOMA', 'BIOBANK', 'SPECIMEN', 'GA', 'GNL', 'GSC', 'GSS', 'EPIDERMIS', 'initial_size_spliced', 'initial_size_unspliced', 'initial_size', 'library_id', 'batch', 'specimen', 'patient', 'biopsy_type', 'cell_type', 'tissue_type', 'skin_layer', 'spot_type', 'n_counts', 'log_counts', 'n_genes', 'mt_frac', 'size_factors', 'leiden_r5', 'leiden_r3', 'leiden_r1.5', 'leiden_r1.3', 'leiden_r1.0', 'leiden_r0.8', 'leiden_r0.5', 'leiden_r0.3', 'leiden_r5_patient', 'leiden_r3_patient', 'leiden_r1.5_patient', 'leiden

In [23]:
# Give every spot a unique identifier:
#   'indexes'        -> the original spot index, moved into a column
#   'shorter_id'     -> 1-based group number of the spot's 'sample'
#   'unique_indexes' -> '<indexes>_<shorter_id>'
obs_table = adata.obs.rename_axis('indexes').reset_index()
#adata.obs
sample_number = obs_table.groupby(['sample']).ngroup() + 1
obs_table['shorter_id'] = sample_number
obs_table['unique_indexes'] = obs_table['indexes'] + '_' + sample_number.astype(str)
adata.obs = obs_table

# Preview only: the result of set_index is displayed but not assigned, so adata.obs keeps
# 'unique_indexes' as a regular column (later cells read it as a column).
adata.obs.set_index('unique_indexes')

Unnamed: 0_level_0,indexes,sample,project,slide,in_tissue,array_row,array_col,ANNOTATOR,DISEASE,disease,...,leiden_r3_patient,leiden_r1.5_patient,leiden_r1.3_patient,leiden_r1.0_patient,leiden_r0.8_patient,leiden_r0.5_patient,leiden_r0.3_patient,sample_SPECIMEN,Border_spot,shorter_id
unique_indexes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACACCAATAACTGC_1,AAACACCAATAACTGC,P17851_1001,P17851,V10J29-080-V1,1,59,19,S,granuloma annulare,granuloma annulare,...,12,1,4,0,2,0,0,P17851_1001_91253-C,0,1
AAACAGTGTTCCTGGG_1,AAACAGTGTTCCTGGG,P17851_1001,P17851,V10J29-080-V1,1,73,43,S,granuloma annulare,granuloma annulare,...,26,14,14,10,9,5,5,P17851_1001_91253-C,0,1
AAACATGGTGAGAGGA_1,AAACATGGTGAGAGGA,P17851_1001,P17851,V10J29-080-V1,1,62,0,S,granuloma annulare,granuloma annulare,...,2,15,1,2,0,1,2,P17851_1001_91253-C,0,1
AAACCGTTCGTCCAGG_1,AAACCGTTCGTCCAGG,P17851_1001,P17851,V10J29-080-V1,1,52,42,S,granuloma annulare,granuloma annulare,...,12,1,4,0,2,0,0,P17851_1001_91253-C,0,1
AAACGAAGAACATACC_1,AAACGAAGAACATACC,P17851_1001,P17851,V10J29-080-V1,1,6,64,S,granuloma annulare,granuloma annulare,...,21,6,15,9,8,7,5,P17851_1001_91253-A,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTATCACACAGAAT_12,TTGTATCACACAGAAT,P18554_1008,P18554,V10M09-087-V4,1,12,74,S,sarcoidosis,sarcoidosis suspected,...,1,0,0,1,0,1,2,P18554_1008_72859-B,0,12
TTGTGAACCTAATCCG_12,TTGTGAACCTAATCCG,P18554_1008,P18554,V10M09-087-V4,1,56,90,S,sarcoidosis,sarcoidosis suspected,...,10,14,14,10,9,5,5,P18554_1008_72859-A,1,12
TTGTGGCCCTGACAGT_12,TTGTGGCCCTGACAGT,P18554_1008,P18554,V10M09-087-V4,1,18,60,S,sarcoidosis,sarcoidosis suspected,...,15,3,3,5,4,2,3,P18554_1008_72859-B,1,12
TTGTTAGCAAATTCGA_12,TTGTTAGCAAATTCGA,P18554_1008,P18554,V10M09-087-V4,1,22,42,S,sarcoidosis,sarcoidosis suspected,...,26,14,14,10,9,5,0,P18554_1008_72859-C,0,12


In [24]:
# One row per sample: which short numeric id ('shorter_id') each sample received
adata.obs.loc[:, ['sample', 'shorter_id']].drop_duplicates()

Unnamed: 0,sample,shorter_id
0,P17851_1001,1
1941,P17851_1002,2
3761,P17851_1003,3
4480,P17851_1004,4
5240,P18554_1001,5
7185,P18554_1002,6
8836,P18554_1003,7
10079,P18554_1004,8
11214,P18554_1005,9
12362,P18554_1006,10


#### Creating annotations <a class="anchor" id="chapter2"></a>

In [None]:
# Outer border of granuloma spots: manual_border_2
#
# For every lesional sample_SPECIMEN, collect the coordinates returned by index_spots()
# for each granuloma spot and flag the spots of that specimen lying on those coordinates
# with 1 in the border column (0 for every other spot).
#
# NOTE(review): index_spots() and flatten() are not defined in the visible part of this
# notebook; they must already exist in the kernel before this cell is run.

# Parameters
# -------------------------------------------------------------------------------------------------------
number_of_neighbours = 2 # Number of neighbours for the border
# The annotation name follows the parameter, so it no longer has to be renamed by hand
border_column = 'manual_border_' + str(number_of_neighbours)
granuloma_spot_types = ['GA', 'GSC', 'GSS', 'GNL'] # spot types counted as granuloma
specimen_without_granuloma = 'P17851_1001_91253-B' # this one does not have any granuloma spots

adata.obs[border_column] = 0 # 0 by default, then replace by 1s

# Only the observation table is needed below, so work on it directly instead of
# copying the whole AnnData object once (or twice) per specimen.
obs = adata.obs

# We will only run this for sample_SPECIMENs that have lesional spots, and exclude the 0 specimens.
# `== False` is kept on purpose: rows where SPECIMEN is missing compare as not-equal and are dropped too.
is_candidate = (obs['LESIONAL'] == 1) & (obs['SPECIMEN'].str.contains('-0') == False)
sample_SPECIMEN_list = list(obs.loc[is_candidate, 'sample_SPECIMEN'].unique())
# Guarded removal: list.remove() raises ValueError when the entry is absent
if specimen_without_granuloma in sample_SPECIMEN_list:
    sample_SPECIMEN_list.remove(specimen_without_granuloma)

# Loop
# -------------------------------------------------------------------------------------------------------
all_border_indexes = []
for sample_SPECIMEN in sample_SPECIMEN_list:
    # All spots of this specimen, and the subset annotated as granuloma
    obs_specimen = obs[obs['sample_SPECIMEN'] == sample_SPECIMEN]
    obs_granuloma = obs_specimen[obs_specimen['spot_type'].isin(granuloma_spot_types)]

    # Save coordinates of granuloma spots of a sample in list of tuples [(x1, y1), (x2, y2), ...]
    print('Extracting coordinates of granuloma spots...')
    granuloma_spots_coordinates = list(zip(obs_granuloma['array_col'].values,
                                           obs_granuloma['array_row'].values))

    # Get coordinates of neighbouring spots, take all unique values
    neighbour_coordinates = [index_spots(g_coord, number_of_neighbours)
                             for g_coord in granuloma_spots_coordinates]
    border_coordinates = set(flatten(neighbour_coordinates))

    # Flag the spots of this specimen that sit on one of those coordinates.
    # A single pass with set membership replaces one full-table scan per coordinate.
    specimen_coordinates = zip(obs_specimen['array_col'].values,
                               obs_specimen['array_row'].values)
    on_border = np.array([coord in border_coordinates for coord in specimen_coordinates],
                         dtype = bool)
    all_border_indexes.extend(obs_specimen['unique_indexes'][on_border])

# Value 1 if it is a border spot (its unique index was collected above) and 0 otherwise
adata.obs[border_column] = np.where(adata.obs['unique_indexes'].isin(all_border_indexes), 1, 0)

In [None]:
# Set leiden r = 1.3 clusters as ground truth -
# We will group all leiden clusters in 4 clusters: core granuloma, border, and non-granuloma, epidermis pattern ('annotation groups').

#sns.countplot(x='leiden_r1.3_patient', data=adata.obs, hue = 'Border_spot') # Border spots generally map to cluster 3
#sns.countplot(x='leiden_r1.3_patient', data=adata.obs, hue = 'GRANULOMA')
# NOTE(review): the remark above says border spots map to cluster 3, while cluster '3' is
# labelled as granuloma core below - confirm which is intended.

leiden_cluster = adata.obs['leiden_r1.3_patient']
core_clusters = ['3']
border_clusters = ['12', '13']
epidermis_clusters = ['11', '14', '15']

adata.obs['leiden_core_granuloma'] = np.where(leiden_cluster.isin(core_clusters), 1, 0)
adata.obs['leiden_border_granuloma'] = np.where(leiden_cluster.isin(border_clusters), 1, 0)
adata.obs['leiden_epidermis'] = np.where(leiden_cluster.isin(epidermis_clusters), 1, 0)
# Everything that is neither core nor border (this includes the epidermis clusters)
adata.obs['leiden_nongranuloma'] = np.where(~leiden_cluster.isin(core_clusters + border_clusters), 1, 0)

# Exclusively dermis spots for lesional and non-lesional samples, excluding necrobiosis lipoidica.
spot_type = adata.obs['spot_type']
dermis_not_nl = (spot_type == 'DERMIS') & (adata.obs['DISEASE'] != 'necrobiosis lipoidica')
outside_border = adata.obs['manual_border_2'] != 1

is_dermis_lesional = (dermis_not_nl &
                      (adata.obs['LESIONAL'] == 1) &
                      (adata.obs['GRANULOMA'] == 0))
is_dermis_nonlesional = dermis_not_nl & (adata.obs['LESIONAL'] == 0)

adata.obs['dermis_lesional'] = np.where(is_dermis_lesional, 1, 0)
adata.obs['dermis_nonlesional'] = np.where(is_dermis_nonlesional, 1, 0)

# Same two groups, without the spots flagged in manual_border_2
adata.obs['dermis_lesional_noborder'] = np.where(is_dermis_lesional & outside_border, 1, 0)
adata.obs['dermis_nonlesional_noborder'] = np.where(is_dermis_nonlesional & outside_border, 1, 0)

adata.obs['epidermis_interface'] = np.where(spot_type.isin(['EPIDERMIS', 'INTERFACE']), 1, 0)

# To check if it worked
#sns.countplot(x='leiden_r1.3_patient', data=adata.obs, hue = 'leiden_nongranuloma')

In [None]:
# Overview figure for one specimen: every annotation drawn on the spot grid (4 x 3 panels).
adata_subset = adata[(adata.obs['sample_SPECIMEN']== 'P18554_1001_50107-A')]
#adata_subset = adata[(adata.obs['sample_SPECIMEN']== 'P18554_1004_95096-B')]
#adata_subset = adata[(adata.obs['sample_SPECIMEN']== 'P18554_1002_50107-C')]
fig = plt.figure(facecolor = 'w', edgecolor = 'k', figsize = (25, 20))

# (panel title, obs column, colour dictionary or None when the column values are used directly)
# NOTE(review): spot_colors, dermis_colors and leiden_r13_colours are not defined in the
# visible part of this notebook; they must already exist in the kernel.
overview_panels = [
    ('Manual granuloma', 'GRANULOMA', None),
    ('Leiden granuloma core', 'leiden_core_granuloma', None),
    ('Leiden border', 'leiden_border_granuloma', None),
    ('Leiden epidermis', 'leiden_epidermis', None),
    ('epidermis_interface', 'epidermis_interface', None),
    ('dermis_lesional', 'dermis_lesional', None),
    ('dermis_nonlesional', 'dermis_nonlesional', None),
    ('dermis_nonlesional_noborder', 'dermis_nonlesional_noborder', None),
    ('dermis_lesional_noborder', 'dermis_lesional_noborder', None),
    ('Spot type', 'spot_type', spot_colors),
    ('Skin layer', 'skin_layer', dermis_colors),
    ('Leiden clusters', 'leiden_r1.3_patient', leiden_r13_colours),
]

for panel_number, (panel_title, obs_column, colour_lookup) in enumerate(overview_panels, start = 1):
    point_colours = adata_subset.obs[obs_column]
    if colour_lookup is not None:
        point_colours = point_colours.map(colour_lookup)

    plt.subplot(4, 3, panel_number)
    plt.title(panel_title)
    # Rows are negated (and stretched by 1.5) so the panel is drawn with row 0 at the top
    plt.scatter(adata_subset.obs['array_col'],
                -1.5*adata_subset.obs['array_row'],
                c = point_colours)
    plt.colorbar(ticks=[])

In [None]:
# Label dermis spots for the lesional vs non-lesional comparison ('other' for every remaining spot).
# The following cell assigns this same column again with the same expression.
lesional_dermis = adata.obs['dermis_lesional_noborder'] == 1
nonlesional_dermis = adata.obs['dermis_nonlesional_noborder'] == 1
nonlesional_or_other = np.where(nonlesional_dermis, 'dermis_nonlesional', 'other')
adata.obs['dermis_lesvsnonles'] = np.where(lesional_dermis, 'dermis_lesional', nonlesional_or_other)

In [None]:
# Combine them in a single column
# Create columns for each deg comparison we want to do

def _label_two_groups(first_mask, first_label, second_mask, second_label):
    """Return an array with first_label where first_mask holds, otherwise second_label
    where second_mask holds, otherwise 'other'. The first mask wins when both are true."""
    return np.where(first_mask, first_label,
                    np.where(second_mask, second_label, 'other'))

# Masks shared by several comparisons (none of the columns read here is written in this cell)
obs = adata.obs
healthy_dermis = obs['dermis_nonlesional_noborder'] == 1
manual_granuloma = obs['GRANULOMA'] == 1
leiden_core = obs['leiden_core_granuloma'] == 1
leiden_border = obs['leiden_border_granuloma'] == 1
is_ga = obs['DISEASE'] == "granuloma annulare"
is_sa = obs['DISEASE'] == "sarcoidosis"
is_nl = obs['DISEASE'] == "necrobiosis lipoidica"

# Dermis lesional vs non-lesional
adata.obs['dermis_lesvsnonles'] = _label_two_groups(obs['dermis_lesional_noborder'] == 1, 'dermis_lesional',
                                                    healthy_dermis, 'dermis_nonlesional')

# GA: border vs core
adata.obs['ga_corevsborder'] = _label_two_groups(is_ga & leiden_core, 'ga_core',
                                                 is_ga & leiden_border, 'ga_border')

# SA: border vs core
adata.obs['sa_corevsborder'] = _label_two_groups(is_sa & leiden_core, 'sa_core',
                                                 is_sa & leiden_border, 'sa_border')

# GA: granuloma vs dermis
adata.obs['ga_gvsd'] = _label_two_groups(is_ga & manual_granuloma, 'ga_granuloma',
                                         healthy_dermis, 'healthydermis')

# SA: granuloma vs dermis
adata.obs['sa_gvsd'] = _label_two_groups(is_sa & manual_granuloma, 'sa_granuloma',
                                         healthy_dermis, 'healthydermis')

# NL: granuloma vs dermis
adata.obs['nl_gvsd'] = _label_two_groups(is_nl & manual_granuloma, 'nl_granuloma',
                                         healthy_dermis, 'healthydermis')

# Granuloma manual: granuloma vs dermis
adata.obs['manual_gvsd'] = _label_two_groups(manual_granuloma, 'granuloma',
                                             healthy_dermis, 'healthydermis')

# Granuloma Leiden: granuloma vs dermis
adata.obs['leiden_gvsd'] = _label_two_groups(leiden_core | leiden_border, 'leiden_granuloma',
                                             healthy_dermis, 'healthydermis')

# Epidermis vs dermis (with interface)
adata.obs['epivsdermis'] = _label_two_groups(obs['epidermis_interface'] == 1, 'epidermis',
                                             healthy_dermis, 'healthydermis')

# Epidermis vs dermis (without interface)
adata.obs['epivsdermis_withoutinterface'] = _label_two_groups(obs['spot_type'] == 'EPIDERMIS', 'epidermis',
                                                              healthy_dermis, 'healthydermis')

In [None]:
# Visual check of 'epivsdermis_withoutinterface' on two example specimens.

# One fixed colour per label. Building the map from .unique() separately for each specimen
# made the colour of a label depend on the order in which labels first appear in that
# specimen, so the two figures could not be compared.
epivsdermis_labels = ['epidermis', 'healthydermis', 'other']
color_map = dict(zip(epivsdermis_labels, sns.color_palette("Set2", 3)))

example_specimens = ['P18554_1001_50107-A', 'P18554_1002_50107-C']
#other specimen that can be inspected: 'P18554_1004_95096-B'

for example_specimen in example_specimens:
    adata_subset = adata[(adata.obs['sample_SPECIMEN']== example_specimen)]
    spots = adata_subset.obs

    fig = plt.figure(facecolor = 'w', edgecolor = 'k', figsize = (25, 20))
    plt.subplot(4, 3, 1)
    # The title names what is actually drawn (it used to read 'Manual granuloma')
    plt.title('epivsdermis_withoutinterface: ' + example_specimen)

    # One scatter call per label so that the legend can name the colours; a colorbar
    # is meaningless here because the colours are given explicitly, not through a colormap.
    for label, label_colour in color_map.items():
        label_spots = spots[spots['epivsdermis_withoutinterface'] == label]
        plt.scatter(label_spots['array_col'],
                    -1.5*label_spots['array_row'],
                    color = label_colour,
                    label = label)
    plt.legend()

## Save it!

In [None]:
# Save only if they are ready! Careful because you might overwrite previous files!!!
# To save, uncomment the following lines

# Save anndata observations and variables as .csv
# pd.DataFrame(adata.X).to_csv("/Volumes/Drive/spatial_granuloma/output/DEG/normalised_counts.csv")
# adata.obs.to_csv("/Volumes/Drive/spatial_granuloma/output/DEG/normalised_counts_obs.csv")
# adata.var.to_csv("/Volumes/Drive/spatial_granuloma/output/DEG/normalised_counts_var.csv")

# Save as .h5 object
# os.chdir("/Volumes/Drive/spatial_granuloma/output/DEG/") # Set working directory so it saves it in the drive
# sc.write(os.path.join('adata_deg.h5'), adata)
# os.chdir("/Users/mendenlab/work/spatial_granuloma/results/current/final/") # Set working directory so a second copy is saved in the local results folder
# sc.write(os.path.join('adata_deg.h5'), adata)