In [1]:
import os
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Install a blanket "ignore" filter: every Python warning raised after this
# point is suppressed for the rest of the session (including deprecation and
# implicit-modification warnings from pandas / anndata / scanpy).
warnings.filterwarnings("ignore")

# Lowest scanpy logging level (NOTE(review): per scanpy docs 0 means errors only — confirm).
sc.settings.verbosity = 0
# Figure resolution: dpi=300 for figures rendered in the notebook, dpi_save=500 for figures written to disk.
sc.settings.set_figure_params(dpi=300, dpi_save=500)

In [2]:
# Directory holding the processed CODEX CRC inputs, relative to the notebook's
# working directory. The trailing slash is required: other cells build file
# paths with plain string concatenation (data_dir + filename).
data_dir = '../../../Data/Spatial/Proteomics/CODEX_CRC_Schurch2020/processed/'

In [3]:
# Load the table of cells with cluster, neighborhood and marker columns.
# The first CSV row is the header and the first CSV column becomes the index.
clusters_csv = data_dir + 'CRC_clusters_neighborhoods_markers.csv'
df = pd.read_csv(
    clusters_csv,
    header=0,
    index_col=0,
)
df

Unnamed: 0,CellID,ClusterID,EventID,File Name,Region,TMA_AB,TMA_12,Index in File,groups,patients,...,CD68+Ki67+,CD68+PD-1+,CD8+ICOS+,CD8+Ki67+,CD8+PD-1+,Treg-ICOS+,Treg-Ki67+,Treg-PD-1+,neighborhood number final,neighborhood name
0,0,10668,0,reg001_A,reg001,A,1,0,1,1,...,0,0,0,0,0,0,0,0,9.0,Granulocyte enriched
1,1,10668,4,reg001_A,reg001,A,1,4,1,1,...,0,0,0,0,0,0,0,0,4.0,Macrophage enriched
2,2,10668,5,reg001_A,reg001,A,1,5,1,1,...,0,0,0,0,0,0,0,0,3.0,Immune-infiltrated stroma
3,3,10668,6,reg001_A,reg001,A,1,6,1,1,...,0,0,0,0,0,0,0,0,3.0,Immune-infiltrated stroma
4,4,10668,30,reg001_A,reg001,A,1,30,1,1,...,0,0,0,0,0,0,0,0,4.0,Macrophage enriched
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258380,258380,10664,216709,reg057_B,reg057,B,2,1002,1,29,...,0,0,0,0,0,0,0,0,5.0,Follicle
258381,258381,10664,222124,reg059_A,reg059,A,1,1272,2,30,...,0,0,0,0,0,0,0,0,3.0,Immune-infiltrated stroma
258382,258382,10664,234850,reg062_A,reg062,A,1,735,2,31,...,0,0,0,0,0,0,0,0,1.0,T cell enriched
258383,258383,10664,249806,reg067_A,reg067,A,1,174,1,34,...,0,0,0,0,0,0,0,0,6.0,Tumor boundary


In [None]:
# Build one AnnData object per region file ('File Name' value) from the table `df`.
#
# Inputs (defined in earlier cells): `df`, `data_dir`.
# Outputs: `fnames`          - sorted unique 'File Name' values
#          `adata_list`      - one AnnData per region, 'dirt' cells removed
#          `file_name_list`  - '<fname>_patient<p>_group<g>' label per region
#          `file_names`      - labels read from file_name_list.txt when it exists,
#                              otherwise a copy of `file_name_list`
# Raises ValueError when a region does not map to exactly one patient / group.
fnames = sorted(df['File Name'].unique())

adata_list = []
file_name_list = []

# The marker-intensity columns form two contiguous runs in `df`. They are the
# same for every region, so resolve them once instead of on every iteration.
cols1 = df.columns[df.columns.get_loc('CD44 - stroma:Cyc_2_ch_2'):df.columns.get_loc('CD138 - plasma cells:Cyc_21_ch_3') + 1]
cols2 = df.columns[df.columns.get_loc('CDX2 - intestinal epithelia:Cyc_2_ch_4'):df.columns.get_loc('MMP12 - matrix metalloproteinase:Cyc_21_ch_4') + 1]
cols = list(cols1) + list(cols2)

# Every non-marker column becomes per-cell metadata (obs).
remaining_cols = df.columns.drop(cols)

# Split the table by region in one pass rather than re-scanning all rows with
# a boolean mask for each file name.
regions = df.groupby('File Name', sort=False)

for fname in fnames:

    df_filtered = regions.get_group(fname)

    data_matrix = df_filtered[cols].values
    spatial_coor = df_filtered[['X:X', 'Y:Y']].values

    # All obs columns are stored as strings.
    adata = ad.AnnData(
        X=data_matrix,
        obs=df_filtered[remaining_cols].astype(str),
        var=pd.DataFrame(index=cols),
    )
    adata.obsm['spatial'] = spatial_coor

    # Drop cells annotated as 'dirt'. Subsetting returns a view of the parent
    # object; .copy() materializes it so the unfiltered parent can be freed and
    # the list holds independent objects.
    adata = adata[adata.obs['ClusterName'] != 'dirt', :].copy()

    # Each region must belong to exactly one patient and one group.
    patient_ids = adata.obs['patients'].unique()
    group_ids = adata.obs['groups'].unique()
    if len(patient_ids) != 1:
        raise ValueError(f'More than 1 patient in {fname} !')
    if len(group_ids) != 1:
        raise ValueError(f'More than 1 group for {fname} !')
    patient = patient_ids[0]
    group = group_ids[0]

    print(fname + f' patient: {patient} group: {group} shape: {adata.shape}')
    # adata.write_h5ad(data_dir + fname + f'_patient{patient}_group{group}.h5ad')

    adata_list.append(adata)
    file_name_list.append(fname + f'_patient{patient}_group{group}')

# with open(data_dir + "file_name_list.txt", "w") as f:
#     for file_name in file_name_list:
#         f.write(file_name + '\n')

# The writer above is disabled, so the names file may not exist on a fresh
# checkout. Read it when present; otherwise fall back to the labels just built
# so the cell still runs from a clean kernel.
file_names_path = data_dir + "file_name_list.txt"
if os.path.exists(file_names_path):
    with open(file_names_path, "r") as f:
        file_names = [line.strip() for line in f]
else:
    file_names = list(file_name_list)

reg001_A patient: 1 group: 1 shape: (1107, 56)
reg001_B patient: 1 group: 1 shape: (349, 56)
reg002_A patient: 1 group: 1 shape: (1373, 56)
reg002_B patient: 1 group: 1 shape: (2623, 56)
reg003_A patient: 2 group: 2 shape: (1264, 56)
reg003_B patient: 2 group: 2 shape: (1231, 56)
reg004_A patient: 2 group: 2 shape: (1475, 56)
reg004_B patient: 2 group: 2 shape: (1892, 56)
reg005_A patient: 3 group: 2 shape: (2552, 56)
reg005_B patient: 3 group: 2 shape: (1657, 56)
reg006_A patient: 3 group: 2 shape: (1376, 56)
reg006_B patient: 3 group: 2 shape: (2086, 56)
reg007_A patient: 4 group: 2 shape: (3008, 56)
reg007_B patient: 4 group: 2 shape: (2343, 56)
reg008_A patient: 4 group: 2 shape: (1397, 56)
reg008_B patient: 4 group: 2 shape: (3322, 56)
reg009_A patient: 5 group: 2 shape: (2428, 56)
reg009_B patient: 5 group: 2 shape: (3394, 56)
reg010_A patient: 5 group: 2 shape: (2077, 56)
reg010_B patient: 5 group: 2 shape: (1250, 56)
reg011_A patient: 6 group: 1 shape: (2746, 56)
reg011_B patie

In [27]:
# Stack the per-region AnnData objects into one. `keys` pairs each object with
# its '<fname>_patient<p>_group<g>' label, and `label='slice_name'` asks
# anndata to record that label for every cell in obs['slice_name'].
adata_concat = ad.concat(adata_list, keys=file_name_list, label='slice_name')

In [13]:
# Number of cells per 'ClusterName' label across all concatenated regions,
# most frequent first (cells labelled 'dirt' were removed before concatenation).
adata_concat.obs['ClusterName'].value_counts()

ClusterName
tumor cells                   47602
CD68+CD163+ macrophages       39596
smooth muscle                 27817
granulocytes                  22144
stroma                        20139
CD8+ T cells                  16675
CD4+ T cells CD45RO+          16661
B cells                       13043
vasculature                   11725
plasma cells                   8510
undefined                      6524
immune cells                   3127
Tregs                          2791
CD4+ T cells                   2303
immune cells / vasculature     2153
CD68+ macrophages              2108
adipocytes                     1811
tumor cells / immune cells     1797
CD11b+CD68+ macrophages        1500
CD11b+ monocytes                815
nerves                          659
CD11c+ DCs                      400
lymphatics                      328
NK cells                        323
CD3+ T cells                    189
CD68+ macrophages GzmB+         183
CD4+ T cells GATA3+              67
CD163+ macrophag

In [None]:
# Hard-coded list of the 56 marker column names, each written as
# '<marker> - <annotation>:Cyc_<cycle>_ch_<channel>'.
# The first 49 entries run from 'CD44 ...' to 'CD138 ...' and the last 7 from
# 'CDX2 ...' to 'MMP12 ...' — the same endpoints as the two column ranges used
# to build the expression matrix.
# NOTE(review): presumably this must stay identical, in order, to the AnnData
# var names — confirm, or derive it from the data instead of duplicating it.
proteins = ['CD44 - stroma:Cyc_2_ch_2',
            'FOXP3 - regulatory T cells:Cyc_2_ch_3',
            'CD8 - cytotoxic T cells:Cyc_3_ch_2',
            'p53 - tumor suppressor:Cyc_3_ch_3',
            'GATA3 - Th2 helper T cells:Cyc_3_ch_4',
            'CD45 - hematopoietic cells:Cyc_4_ch_2',
            'T-bet - Th1 cells:Cyc_4_ch_3',
            'beta-catenin - Wnt signaling:Cyc_4_ch_4',
            'HLA-DR - MHC-II:Cyc_5_ch_2',
            'PD-L1 - checkpoint:Cyc_5_ch_3',
            'Ki67 - proliferation:Cyc_5_ch_4',
            'CD45RA - naive T cells:Cyc_6_ch_2',
            'CD4 - T helper cells:Cyc_6_ch_3',
            'CD21 - DCs:Cyc_6_ch_4',
            'MUC-1 - epithelia:Cyc_7_ch_2',
            'CD30 - costimulator:Cyc_7_ch_3',
            'CD2 - T cells:Cyc_7_ch_4',
            'Vimentin - cytoplasm:Cyc_8_ch_2',
            'CD20 - B cells:Cyc_8_ch_3',
            'LAG-3 - checkpoint:Cyc_8_ch_4',
            'Na-K-ATPase - membranes:Cyc_9_ch_2',
            'CD5 - T cells:Cyc_9_ch_3',
            'IDO-1 - metabolism:Cyc_9_ch_4',
            'Cytokeratin - epithelia:Cyc_10_ch_2',
            'CD11b - macrophages:Cyc_10_ch_3',
            'CD56 - NK cells:Cyc_10_ch_4',
            'aSMA - smooth muscle:Cyc_11_ch_2',
            'BCL-2 - apoptosis:Cyc_11_ch_3',
            'CD25 - IL-2 Ra:Cyc_11_ch_4',
            'CD11c - DCs:Cyc_12_ch_3',
            'PD-1 - checkpoint:Cyc_12_ch_4',
            'Granzyme B - cytotoxicity:Cyc_13_ch_2',
            'EGFR - signaling:Cyc_13_ch_3',
            'VISTA - costimulator:Cyc_13_ch_4',
            'CD15 - granulocytes:Cyc_14_ch_2',
            'ICOS - costimulator:Cyc_14_ch_4',
            'Synaptophysin - neuroendocrine:Cyc_15_ch_3',
            'GFAP - nerves:Cyc_16_ch_2',
            'CD7 - T cells:Cyc_16_ch_3',
            'CD3 - T cells:Cyc_16_ch_4',
            'Chromogranin A - neuroendocrine:Cyc_17_ch_2',
            'CD163 - macrophages:Cyc_17_ch_3',
            'CD45RO - memory cells:Cyc_18_ch_3',
            'CD68 - macrophages:Cyc_18_ch_4',
            'CD31 - vasculature:Cyc_19_ch_3',
            'Podoplanin - lymphatics:Cyc_19_ch_4',
            'CD34 - vasculature:Cyc_20_ch_3',
            'CD38 - multifunctional:Cyc_20_ch_4',
            'CD138 - plasma cells:Cyc_21_ch_3',
            'CDX2 - intestinal epithelia:Cyc_2_ch_4',
            'Collagen IV - bas. memb.:Cyc_12_ch_2',
            'CD194 - CCR4 chemokine R:Cyc_14_ch_3',
            'MMP9 - matrix metalloproteinase:Cyc_15_ch_2',
            'CD71 - transferrin R:Cyc_15_ch_4',
            'CD57 - NK cells:Cyc_17_ch_4',
            'MMP12 - matrix metalloproteinase:Cyc_21_ch_4']