In [None]:
import numpy as np
import pandas as pd
import os
import anndata

### Preprocess IMC data from 119 TNBC patients in the NeoTRIP trial (reference: https://www.nature.com/articles/s41586-023-06498-3)

In [None]:
# Folder layout on the shared NTPublic drive: inputs are read from data/derived and
# the metadata tables built by this notebook are written to intermediate_files/metadata.
base_dir = '/Volumes/Shared/Noah Greenwald/NTPublic'
data_dir = os.path.join(base_dir, 'data')
save_dir = os.path.join(base_dir, 'intermediate_files', 'metadata')

# clinical.csv supplies the per-biopsy rows (PatientID, BiopsyPhase, ...) and IDs.csv the
# per-image rows (ImageID, PatientID, BiopsyPhase, ImageNumber) used by the cells below.
derived_dir = os.path.join(data_dir, 'derived')
clinical_df = pd.read_csv(os.path.join(derived_dir, 'clinical.csv'))
ids_df = pd.read_csv(os.path.join(derived_dir, 'IDs.csv'))

In [None]:
# Timepoint-level table: one row per row of clinical_df, with harmonized column names.
timepoint_metadata = clinical_df.rename(columns={'PatientID': 'Patient_ID', 'BiopsyPhase': 'Timepoint'})

# Sequential identifiers T0, T1, ... assigned in the row order of clinical_df.
tissue_ids = ['T' + str(row_number) for row_number in range(len(clinical_df))]
timepoint_metadata['Tissue_ID'] = tissue_ids

# Every row is flagged as having image data; the column is used as a filter further down.
timepoint_metadata['MIBI_data_generated'] = True

In [None]:
# Image-level table: rename the ID columns, attach the timepoint-level columns
# (including Tissue_ID) by patient + timepoint, and drop the unused ImageNumber column.
core_metadata = (
    ids_df
    .rename(columns={'ImageID': 'fov', 'PatientID': 'Patient_ID', 'BiopsyPhase': 'Timepoint'})
    .merge(timepoint_metadata, on=['Timepoint', 'Patient_ID'])
    .drop(columns=['ImageNumber'])
)
core_metadata

In [None]:
# Patient-level table: remove the two per-biopsy columns so that rows belonging to the
# same patient become identical, then keep one copy of each distinct row.
patient_metadata = (
    clinical_df
    .drop(columns=['BiopsyPhase', 'isPerProtocol'])
    .rename(columns={'PatientID': 'Patient_ID'})
    .drop_duplicates()
)
patient_metadata

In [None]:
# create comparison pairs
# Keep rows from the three named timepoints that also have image data, then reshape to
# one row per patient with one column per timepoint holding that timepoint's Tissue_ID.
paired_timepoints = ['Baseline', 'On-treatment', 'Post-treatment']
keep_rows = timepoint_metadata.Timepoint.isin(paired_timepoints) & timepoint_metadata.MIBI_data_generated
subset_metadata = timepoint_metadata.loc[keep_rows, :]
metadata_wide = pd.pivot(
    subset_metadata,
    index='Patient_ID',
    columns='Timepoint',
    values='Tissue_ID',
)

In [None]:
# Collect every distinct combination of timepoints observed for a single patient,
# in order of first appearance, to see which comparisons the cohort supports.
comparison_lists = timepoint_metadata.groupby(['Patient_ID'])['Timepoint'].unique().reset_index()['Timepoint']

unique_comparisons = []
for patient_timepoints in comparison_lists:
    combination = tuple(patient_timepoints)
    if combination in unique_comparisons:
        continue
    unique_comparisons.append(combination)

print(unique_comparisons)

In [None]:
comparison_pairs = [['Baseline', 'On-treatment'], ['On-treatment', 'Post-treatment'], ['Baseline', 'Post-treatment']]

# For each pair of timepoints, flag the patients that have a Tissue_ID at both timepoints.
# The flag is stored in patient_metadata under the column '<first>__<second>'.
for first_timepoint, second_timepoint in comparison_pairs:
    # rows with a missing Tissue_ID at either timepoint are discarded
    current_wide = metadata_wide[[first_timepoint, second_timepoint]].dropna(axis=0)

    flag_column = f'{first_timepoint}__{second_timepoint}'
    patient_metadata[flag_column] = patient_metadata.Patient_ID.isin(current_wide.index)

In [None]:
# create harmonized metadata
# One row per fov, carrying the selected timepoint-level and patient-level columns.
timepoint_metadata_ids = ['Tissue_ID', 'MIBI_data_generated', 'Patient_ID', 'Timepoint', 'isPerProtocol']
patient_metadata_ids = ['Patient_ID', 'pCR', 'Arm', 'Baseline__On-treatment', 'On-treatment__Post-treatment', 'Baseline__Post-treatment']

harmonized_metadata = core_metadata[['fov', 'Tissue_ID']]
assert harmonized_metadata.Tissue_ID.notnull().all(), 'core_metadata contains fovs without a Tissue_ID'
n_fovs = len(harmonized_metadata)

harmonized_metadata = pd.merge(harmonized_metadata, timepoint_metadata.loc[:, timepoint_metadata_ids], on='Tissue_ID', how='left')
# Tissue_ID is the left-hand join key, so it can never turn null in a left merge; a fov whose
# Tissue_ID found no match shows up as nulls in the columns taken from timepoint_metadata instead.
assert harmonized_metadata.Patient_ID.notnull().all(), 'some fovs did not match any row of timepoint_metadata'
assert len(harmonized_metadata) == n_fovs, 'timepoint merge changed the number of fov rows'

harmonized_metadata = pd.merge(harmonized_metadata, patient_metadata.loc[:, patient_metadata_ids], on='Patient_ID', how='inner')
# An inner merge silently drops fovs of unknown patients and duplicates fovs of patients that
# appear more than once in patient_metadata; both cases change the row count.
assert len(harmonized_metadata) == n_fovs, 'patient merge dropped or duplicated fov rows'

harmonized_metadata

In [None]:
# save csvs for all timepoints
# DataFrame.to_csv does not create missing folders, so make sure the output folder exists.
os.makedirs(save_dir, exist_ok=True)

# output file name -> table written under that name (all without the index column)
metadata_outputs = {
    'harmonized_metadata.csv': harmonized_metadata,
    'NTPublic_data_per_core.csv': core_metadata,
    'NTPublic_data_per_timepoint.csv': timepoint_metadata,
    'NTPublic_data_per_patient.csv': patient_metadata,
}
for file_name, table in metadata_outputs.items():
    table.to_csv(os.path.join(save_dir, file_name), index=False)

### Create the preprocessed AnnData object

In [None]:
# Build the AnnData object: X holds the per-cell marker values, obs the per-cell annotations.
# NOTE(review): both paths are absolute and machine-specific; adjust them before running elsewhere.
base_dir = '/Volumes/Shared/Noah Greenwald/NTPublic'
save_dir = '/Users/jolene/Documents/Angelo_lab/quiche/data/Zenodo'
cell_table = pd.read_csv(os.path.join(base_dir, 'data', 'derived_ark', 'final_cell_table.csv'))
# NOTE(review): this notebook writes harmonized_metadata.csv to <base_dir>/intermediate_files/metadata,
# but it is read from <base_dir>/analysis_files here -- confirm both locations hold the same file.
metadata = pd.read_csv(os.path.join(base_dir, 'analysis_files', 'harmonized_metadata.csv'))

# validate='many_to_one' raises if a fov occurs more than once in metadata, which would
# otherwise silently duplicate every cell of that fov.
cell_table = pd.merge(cell_table, metadata.loc[:, ['fov', 'isPerProtocol']], on = 'fov', validate = 'many_to_one')
cell_table = cell_table[np.isin(cell_table['cellAnnotation'], ['TME', 'invasive'])]
cell_table = cell_table[np.isin(cell_table['isPerProtocol'], [True])]
markers = ['H3', 'CD163', 'CD20', 'PD-L1 (SP142)', 'CD56', 'Helios',
       'CD8', 'OX40', 'CD11c', 'CD3', 'GATA3', 'SMA', 'TOX', 'T-bet', 'PD-1',
       'IDO', 'AR', 'FOXP3', 'PD-L1 (73-10)', 'ICOS', 'Ki67', 'CD4', 'CK5/14',
       'TCF1', 'PDGFRB', 'CD31', 'GZMB', 'PDPN', 'HLA-ABC', 'c-PARP', 'panCK',
       'CD79a', 'DNA1', 'CK8/18', 'DNA2', 'Carboplatin', 'Vimentin',
       'Calponin', 'Caveolin-1', 'CD15', 'MPO', 'HLA-DR', 'CD68', 'pH2AX',
       'CD45', 'CA9']

# every non-marker column becomes a per-cell annotation
cell_table_metadata = cell_table.loc[:, cell_table.columns[~np.isin(cell_table.columns, markers)]].copy()

adata = anndata.AnnData(cell_table.loc[:, markers])
cell_obs = pd.merge(cell_table_metadata, metadata.loc[:, ['fov', 'Tissue_ID', 'Patient_ID', 'pCR', 'Arm']], on = ['Patient_ID', 'fov'], validate = 'many_to_one')
# obs is attached to X purely by row position, so the merge must keep every cell, in order.
assert len(cell_obs) == adata.n_obs, 'metadata merge dropped or duplicated cells; obs would not line up with X'
assert np.array_equal(cell_obs['fov'].to_numpy(), cell_table['fov'].to_numpy()), 'metadata merge reordered cells; obs would not line up with X'
adata.obs = cell_obs

# subset data to include only baseline samples treated with Chemotherapy
# .copy() turns the filtered view into an independent object before obsm/obs are modified below
adata = adata[(adata.obs['Arm'] == 'C') & (adata.obs['BiopsyPhase'] == 'Baseline')].copy()
# coordinates are stored as (centroid-1, centroid-0) -- presumably (x, y); confirm against the segmentation output
adata.obsm['spatial'] = np.array(adata.obs[['centroid-1', 'centroid-0']])
# NOTE(review): 'Mac' is not listed in cell_ordering below, so cells relabelled from 'M2_Mac'
# are removed by the final cell_cluster filter -- confirm this is intended.
adata.obs['cell_cluster'] = adata.obs['cell_cluster'].replace({'M2_Mac': 'Mac',
                                                                   'Epithelial_1': 'Cancer_4',
                                                                   'Epithelial_2': 'Cancer_4',
                                                                   'Epithelial_3': 'Cancer_4',
                                                                   'Endothelial':'Endothelium'})

phenotypic_markers = ['CK5/14', 'CK8/18', 'panCK', 'AR','CD45', 'CD3', 'CD4', 'CD8', 'FOXP3', 'CD20','CD79a', 'CD56', 'CD68', 'CD163', 'CD11c', 'HLA-DR',  'CD15', 'MPO', 'Calponin', 'SMA', 'Vimentin', 'PDGFRB','PDPN', 'CD31']
functional_markers = ['PD-L1 (SP142)', 'PD-L1 (73-10)', 'IDO', 'PD-1', 'OX40', 'ICOS', 'CA9', 'c-PARP', 'Ki67', 'pH2AX', 'Helios', 'GATA3', 'T-bet', 'TCF1', 'TOX', 'GZMB', 'HLA-ABC']

var_names = phenotypic_markers+functional_markers

cell_ordering = ['Cancer_4', 'CD4T', 'CD8T', 'Treg', 'B', 'Plasma',
                 'NK', 'CD163_Mac', 'APC','DC', 'Neutrophil',
                 'Fibroblast', 'PDPN', 'Endothelium']

# keep only the phenotypic/functional markers (H3, DNA1, DNA2, Carboplatin and Caveolin-1 are
# dropped); the columns stay in the order of `markers`, not of `var_names`
adata = adata[:, np.isin(adata.var_names, var_names)].copy()
adata = adata[np.isin(adata.obs['pCR'], ['RD', 'pCR'])]
# final .copy() so that a standalone object, not a view, is written to disk
adata = adata[np.isin(adata.obs['cell_cluster'], cell_ordering)].copy()
adata.write_h5ad(os.path.join(save_dir, 'nt_preprocessed.h5ad'))