# SCAFOLD Function
SCAFOLD identifies local structures ('clusters') in spatial data based on user-defined parameters.

#### Inputs: 
1. adata = AnnData object containing x,y coordinates of cells under adata.obsm['spatial'] and cell type annotations under adata.obs['ct']
2. k = number of nearest neighbors for knn
3. r = distance cutoff between two cells; neighbor pairs at a distance of r or more are not connected
4. valuelist = list of cell types of interest

#### Output: 
Adds cluster assignments to adata.obs['cluster']

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import pickle
from anndata import AnnData
import warnings
import hotspot
import squidpy as sq
from scipy.cluster.hierarchy import DisjointSet
import copy
warnings.filterwarnings("ignore")

In [2]:
def runScafold(adata, k, r, valuelist):
    """Run SCAFOLD on ``adata`` and store the result in ``adata.obs['cluster']``.

    Builds the filtered neighbour graph with ``makeneighborgraph`` and hands
    it to ``joinsets``, which writes the cluster assignment. Returns ``None``.

    Parameters
    ----------
    adata
        Object forwarded unchanged to both helpers.
    k, r, valuelist
        Forwarded unchanged to ``makeneighborgraph``.
    """
    joinsets(adata, makeneighborgraph(adata, k, r, valuelist))
    
def makeneighborgraph(adata, k, r, valuelist):
    """Build the radius-filtered kNN adjacency restricted to selected cell types.

    Parameters
    ----------
    adata
        Object passed to ``sc.pp.neighbors`` with ``use_rep='spatial'``; cell
        types are read from ``adata.obs['ct']``. The neighbour graph is stored
        on it under the ``'knn'`` key (distances are read back from
        ``adata.obsp['knn_distances']``).
    k
        Number of nearest neighbours; ``k + 1`` is passed as ``n_neighbors``
        (presumably because scanpy counts the cell itself -- confirm against
        the scanpy documentation).
    r
        Distance cutoff. Edges whose distance is ``>= r`` are discarded.
    valuelist
        List of cell types of interest.

    Returns
    -------
    dict
        Maps the positional index of every cell whose type is in
        ``valuelist`` to the set of positional indices of its remaining
        neighbours that are also of a type in ``valuelist``.
    """
    sc.pp.neighbors(adata, n_neighbors=k + 1, use_rep='spatial', key_added='knn')

    # Work on a private CSR copy so the stored distance matrix is left intact.
    g = adata.obsp['knn_distances'].tocsr(copy=True)
    g.data[g.data >= r] = 0  # filter edges on radius cutoff
    g.eliminate_zeros()      # every entry still stored is a surviving edge

    # Boolean mask over cells (by position) whose type is of interest. This
    # avoids integer lookups on the label-indexed ``obs['ct']`` Series.
    is_kept = adata.obs['ct'].isin(valuelist).to_numpy()

    indptr, indices = g.indptr, g.indices
    filtered_neighs = dict()
    for idx in np.flatnonzero(is_kept).tolist():
        row = indices[indptr[idx]:indptr[idx + 1]]  # column ids of row idx
        filtered_neighs[idx] = set(row[is_kept[row]].tolist())

    return filtered_neighs

def joinsets(adata, filtered_neighs): # call union find
    """Merge neighbouring cells into clusters and write ``adata.obs['cluster']``.

    Parameters
    ----------
    adata
        Object exposing ``shape`` (``shape[0]`` is the number of cells) and a
        DataFrame-like ``obs``; a ``'cluster'`` column is written to ``obs``.
    filtered_neighs
        Dict mapping a cell's positional index to the set of positional
        indices of its neighbours. Every neighbour must itself be a key.

    Returns
    -------
    None
        ``adata.obs['cluster']`` holds, for every cell that is a key of
        ``filtered_neighs``, the union-find root of its connected component;
        every other cell keeps its own positional index as its label.
    """
    nodes = set(filtered_neighs)
    edges = {
        (cell, neigh)
        for cell, neighs in filtered_neighs.items()
        for neigh in neighs
    }

    disjoint_set = DisjointSet(nodes)
    for a, b in edges:
        disjoint_set.merge(a, b)

    # Start with every cell labelled by its own position, then overwrite the
    # members of each component with that component's root. A direct array
    # write replaces a DataFrame.replace call with one dict entry per cell.
    labels = np.arange(adata.shape[0], dtype=np.int64)
    for members in disjoint_set.subsets():
        members = list(members)
        labels[members] = disjoint_set[members[0]]

    adata.obs['cluster'] = labels



In [3]:
# Load the combined, annotated object from disk.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open('/data1/greenbab/users/zhangb2/xenium_files/adatas_megacorrected_TLS_annotated_LOCK.pickle', 'rb') as fh:
    adata_comb = pickle.load(fh)

In [4]:
# Display the coordinate array stored under obsm['spatial'] (the representation
# makeneighborgraph passes to sc.pp.neighbors via use_rep='spatial').
adata_comb.obsm['spatial']

array([[ 1735.67320709,  5907.53197209],
       [ 8052.32246051,  5706.30418283],
       [ 8289.39414416, 11536.8386676 ],
       ...,
       [ 2274.22886566, 13574.94622184],
       [  818.37917797,  2576.73437019],
       [ 1369.77385112,  4876.46429951]])

In [5]:
# Copy the 'ct_general' annotations into obs['ct'], the column that
# makeneighborgraph reads cell types from.
adata_comb.obs['ct'] = adata_comb.obs['ct_general']
adata_comb.obs['ct']  # display the resulting column

aaaaffnp-1-0               Megakaryocyte
aaaaojnj-1-0                   Erythroid
aaaapmda-1-0     Maturing/Mature Myeloid
aaabbcdp-1-0                      T Cell
aaabkmdn-1-0                   Erythroid
                          ...           
oihnhiki-1-19    Maturing/Mature Myeloid
oiifcaee-1-19                Plasma Cell
oijcbbpj-1-19    Maturing/Mature Myeloid
oikcgdcb-1-19    Maturing/Mature Myeloid
oikdhjmf-1-19    Maturing/Mature Myeloid
Name: ct, Length: 1852727, dtype: category
Categories (17, object): ['Adipocyte', 'B Cell', 'Early Myeloid', 'Endothelial', ..., 'Plasma Cell', 'T Cell', 'VSMC', 'pDC']

In [6]:
# Run SCAFOLD separately on each sample and collect the per-cell
# ('sample', 'cluster') columns; cluster ids are only unique within a sample.
frames = []
for samp in adata_comb.obs['sample'].unique():
    print(samp)
    # Take an explicit copy: runScafold writes to the object, and writing to
    # an AnnData view would otherwise trigger an implicit copy.
    adatasamp = adata_comb[adata_comb.obs['sample'] == samp].copy()
    runScafold(adatasamp, 10, 20, ['T Cell', 'B Cell', 'pDC', 'NK']) # to find lymphoid aggregates
    frames.append(adatasamp.obs.loc[:, ['sample', 'cluster']])

# Concatenate once instead of re-copying the growing frame every iteration.
df = pd.concat(frames) if frames else None

0
1
2
3
4
5
6
8
9
10
11
13
14
15
16
17
18
19


In [7]:
# add cluster annotations to combined adata
# Prefix each per-sample cluster id with its sample id ("<sample>_<cluster>").
# Iterate the two columns in lockstep rather than looking up df['sample'][i]
# with an integer on a label-indexed Series.
df['cluster_new'] = [str(s) + '_' + str(c) for s, c in zip(df['sample'], df['cluster'])]
di = dict(zip(df.index, df['cluster_new']))
# Look labels up by cell name so the result follows adata_comb's row order;
# a cell missing from df raises KeyError.
adata_comb.obs['cluster'] = [di[x] for x in adata_comb.obs.index]
