In [1]:
import os
# Directory the kernel was started in; data paths further down are built from it.
wd = os.getcwd()
print('working directory: ' + wd)

working directory: /mnt/disks/store/ist_benchmarking


In [2]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import glob
import matplotlib.colors as colors

from st_utils import df_2_gdf
from st_utils import get_gdf_core
from st_utils import get_gene_type
from st_utils import transcript_loader
from st_utils import correct_tissue_names, correct_platform_panel
from st_utils import name_parser


from constants import SCALING_FACTOR_DICT
from constants import CORE_RADIUS_DICT
from constants import POINTS_SRC_DICT
from constants import UNIQUE_GENES_DICT
from constants import SAMPLES

# Widen pandas' display limits so long/wide summary tables are shown without truncation.
for _option, _value in (
    ('display.max_rows', 999),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

# Render inline figures at high resolution.
plt.rcParams.update({'figure.dpi': 250})

# Suppress every Python warning unless the interpreter was launched with explicit -W options.
# NOTE(review): this also hides deprecation and chained-assignment warnings raised by the
# analysis code below.
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

from collections import Counter


# Core ids used by the FDR functions below to restrict the analysis (via `isin`).
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
common_cores_path = f'{wd}/data/common_cores.pkl'
with open(common_cores_path, 'rb') as f:
    matching_cores = pickle.load(f)



In [3]:
def _offset_and_filter_cores(df, sample, common_cores):
    """Return a copy of ``df`` with slide-specific core offsets applied, restricted to ``common_cores``.

    The ``core`` column is cast to int, shifted by +300 when ``sample`` contains
    'tumor2' and by +200 when it contains 'normal', filtered with ``isin(common_cores)``
    and finally converted to ``str``. The input frame is not modified.
    """
    core_offset = (300 if 'tumor2' in sample else 0) + (200 if 'normal' in sample else 0)
    shifted = df.copy()
    shifted['core'] = shifted['core'].astype('int') + core_offset
    # Explicit copy: the string conversion below must not write into a filtered slice.
    shifted = shifted.loc[shifted['core'].isin(common_cores)].copy()
    shifted['core'] = shifted['core'].astype('str')
    return shifted


def xenium_fdr(sample, neg_metric, common_cores=matching_cores):
    """Compute per-core count metrics and a false-discovery rate (in percent) for a Xenium sample.

    Parameters
    ----------
    sample : str
        Sample name; parsed with ``name_parser`` to locate the TMA sample sheet and used
        to build the cell-level parquet path and to index ``UNIQUE_GENES_DICT``.
    neg_metric : str
        Key into ``UNIQUE_GENES_DICT[sample]`` selecting the negative-control target
        count used for normalisation (the notebook passes 'blank' or 'neg_control_probe').
    common_cores : collection of int, optional
        Core ids (after offsetting) to keep. Defaults to the module-level ``matching_cores``.

    Returns
    -------
    pandas.DataFrame
        One row per core, sorted by ``core`` (string sort), with the summed count columns,
        ``cell_count``, ``transcripts_per_cell``, ``tissue_type`` and ``fdr``.

    Notes
    -----
    ``fdr = control_probe_counts / total_counts
            * (UNIQUE_GENES_DICT[sample]['gene'] / UNIQUE_GENES_DICT[sample][neg_metric]) * 100``.
    The dictionary values are presumably the number of distinct targets of each type on
    the panel -- confirm against ``constants``.

    NOTE(review): the numerator is always ``control_probe_counts``, whatever ``neg_metric``
    is. When ``neg_metric='blank'`` one of the codeword count columns may be the intended
    numerator -- confirm before relying on the blank-based values.
    """
    _, _, tma = name_parser(sample)

    # Core information: including tissue_type, tumor ...
    csv_sample_info = f'{wd}/data/Sample_Info_{tma.upper()}_TMA - Sheet1.csv'
    df_meta = correct_tissue_names(sample, pd.read_csv(csv_sample_info))
    df_meta = _offset_and_filter_cores(df_meta, sample, common_cores)

    # Cell-level data
    df_cell = pd.read_parquet(
        f'{wd}/data/cell_level_csv/{sample}_cell_level.parquet.gzip', engine='pyarrow')
    df_cell = correct_tissue_names(sample, df_cell)
    df_cell = _offset_and_filter_cores(df_cell, sample, common_cores)

    # Aggregate per core: number of cells plus summed count columns.
    df_metric = (
        df_cell
        .groupby(by=['core'], as_index=False)
        .agg({
            'cell_id': 'count',
            'transcript_counts': 'sum',
            'control_probe_counts': 'sum',
            'control_codeword_counts': 'sum',
            'unassigned_codeword_counts': 'sum',
            'total_counts': 'sum',
        })
        .rename(columns={'cell_id': 'cell_count'})
    )

    # Vectorised ratios: same arithmetic as a row-wise apply, but also valid on an empty frame.
    df_metric['transcripts_per_cell'] = (
        df_metric['total_counts'] / df_metric['cell_count']
    ).round(1)

    # Attach tissue_type from the sample sheet; cores missing from the sheet are dropped.
    df_metric = pd.merge(df_metric, df_meta[['core', 'tissue_type']], on='core', how='inner')

    # Normalise the control fraction by the ratio of gene targets to negative-control targets.
    panel_scale = UNIQUE_GENES_DICT[sample]['gene'] / UNIQUE_GENES_DICT[sample][neg_metric]
    df_metric['fdr'] = (
        df_metric['control_probe_counts'] / df_metric['total_counts'] * panel_scale * 100
    )

    return df_metric.sort_values(by=['core'])


def merscope_fdr(sample, neg_metric, common_cores=matching_cores):
    """Compute a per-tissue-type false-discovery rate (in percent) for a MERSCOPE sample.

    Transcripts are assigned to TMA cores by a spatial join against circular core
    geometries, labelled with ``get_gene_type`` and counted per tissue type.

    Parameters
    ----------
    sample : str
        Sample name; used to locate the sample sheet, the ``*points.csv`` file and the
        transcript folder, and to index the per-sample constant dictionaries.
    neg_metric : str
        Gene-type label counted as negative control (the notebook passes 'blank'); also
        the key into ``UNIQUE_GENES_DICT[sample]`` used for normalisation.
    common_cores : collection, optional
        Accepted for signature parity with the other platform functions; not used here
        (no core filtering is applied for MERSCOPE).

    Returns
    -------
    pandas.DataFrame
        Columns ``tissue_type`` and ``fdr``; one row per tissue type, in order of first
        appearance in the joined transcripts.

    Raises
    ------
    FileNotFoundError
        If no ``*points.csv`` file exists in the sample folder.

    Notes
    -----
    ``fdr = neg / (neg + gene)
            * (UNIQUE_GENES_DICT[sample]['gene'] / UNIQUE_GENES_DICT[sample][neg_metric]) * 100``
    where ``neg`` and ``gene`` are transcript counts of the respective gene types.
    """
    _, _, tma = name_parser(sample)

    # Core information: including tissue_type, tumor ...
    csv_sample_info = f'{wd}/data/Sample_Info_{tma.upper()}_TMA - Sheet1.csv'

    # Core centroid data, x/y in pixels.
    point_files = glob.glob(f'{wd}/data/{sample}/*points.csv')
    if not point_files:
        raise FileNotFoundError(f'no *points.csv found in {wd}/data/{sample}')
    csv_points = point_files[0]

    # 1 pixel = scaling_factor um
    scaling_factor = SCALING_FACTOR_DICT[sample]
    radius_um = CORE_RADIUS_DICT[sample]

    # Core table with geometry (buffer around each core centroid).
    gdf_core = get_gdf_core(csv_points, csv_sample_info, scaling_factor, radius_um,
                            points_src=POINTS_SRC_DICT[sample])
    gdf_core = correct_tissue_names(sample, gdf_core)
    gdf_core['core'] = gdf_core['core'].apply(lambda x: str(x).zfill(3))

    # Transcript data; keep rows whose numeric cell-id prefix is positive
    # (presumably drops transcripts not assigned to a cell -- confirm).
    df_t = transcript_loader(f'{wd}/data/{sample}')
    df_t['calc_id'] = df_t['cell_id'].apply(lambda x: x.split('_region')[0]).astype(float)
    df_t = df_t.loc[df_t['calc_id'] > 0]

    gdf_t = df_2_gdf(df_t, 'global_x', 'global_y', crs="EPSG:4326", drop_xy=False)

    # Right join keeps every transcript; those outside all cores get NaN core and are dropped.
    # `predicate=` replaces the `op=` keyword, which is deprecated and removed in geopandas 1.0.
    gdf_join = (
        gdf_core[['core', 'tissue_type', 'geometry']]
        .sjoin(gdf_t, how='right', predicate='intersects')
        .drop(columns='index_left')
        .dropna(subset=['core'])
    )
    gdf_join['gene_type'] = gdf_join['gene'].map(get_gene_type)

    # Loop-invariant normalisation factor.
    panel_scale = UNIQUE_GENES_DICT[sample]['gene'] / UNIQUE_GENES_DICT[sample][neg_metric]

    records = []
    for tt in gdf_join['tissue_type'].unique():
        gene_count = dict(Counter(gdf_join.loc[gdf_join['tissue_type'] == tt, 'gene_type']))
        print (tt, gene_count)
        n_neg = gene_count.get(neg_metric, 0)
        n_gene = gene_count.get('gene', 0)
        fdr = np.true_divide(n_neg, n_neg + n_gene) * panel_scale * 100
        records.append({'tissue_type': tt, 'fdr': fdr})

    print (f'df_t len: {len(df_t)}, df_join len:{len(gdf_join)}')

    return pd.DataFrame(records, columns=['tissue_type', 'fdr'])


def cosmx_fdr(sample, neg_metric, common_cores=matching_cores):
    """Compute a per-tissue-type false-discovery rate (in percent) for a CosMx sample.

    Transcripts are linked to cores through their ``cell_id`` (left merge with the
    cell-level table), restricted to ``common_cores``, labelled with ``get_gene_type``
    and counted per tissue type.

    Parameters
    ----------
    sample : str
        Sample name; used to build the cell-level parquet path and the transcript
        folder, and to index ``UNIQUE_GENES_DICT``.
    neg_metric : str
        Gene-type label counted as negative control (the notebook passes 'sys_control'
        or 'neg_control_probe'); also the key into ``UNIQUE_GENES_DICT[sample]``.
    common_cores : collection of int, optional
        Core ids (after offsetting) to keep. Defaults to the module-level ``matching_cores``.

    Returns
    -------
    pandas.DataFrame
        Columns ``tissue_type`` and ``fdr``; one row per tissue type, in order of first
        appearance in the filtered transcripts.

    Notes
    -----
    ``fdr = neg / (neg + gene)
            * (UNIQUE_GENES_DICT[sample]['gene'] / UNIQUE_GENES_DICT[sample][neg_metric]) * 100``.
    """
    # Cell-level data
    df_cell = pd.read_parquet(
        f'{wd}/data/cell_level_csv/{sample}_cell_level.parquet.gzip', engine='pyarrow')
    df_cell = correct_tissue_names(sample, df_cell)
    df_cell['core'] = df_cell['core'].astype('int')
    df_t = transcript_loader(f'{wd}/data/{sample}')

    # Left merge: transcripts whose cell_id is absent from df_cell get NaN core/tissue_type.
    df = pd.merge(df_t, df_cell[['core', 'tissue_type', 'cell_id']], on='cell_id', how='left')

    # Same slide-specific core offsets as used for the other platforms.
    core_offset = (300 if 'tumor2' in sample else 0) + (200 if 'normal' in sample else 0)
    df['core'] += core_offset
    df = df.loc[df['core'].isin(common_cores)]
    df = df.dropna(subset=['core'])
    print (f'ct/spot/join:{len(df_cell)}/{len(df_t)}/{len(df)}')

    # The left merge turns `core` into float when any transcript is unmatched, so cast
    # back to int before zero-padding; otherwise 12 would become '12.0' instead of '012'.
    df['core'] = df['core'].astype(int).astype(str).str.zfill(3)
    df['gene_type'] = df['gene'].map(get_gene_type)

    # Loop-invariant normalisation factor.
    panel_scale = UNIQUE_GENES_DICT[sample]['gene'] / UNIQUE_GENES_DICT[sample][neg_metric]

    records = []
    for tt in df['tissue_type'].unique():
        gene_count = dict(Counter(df.loc[df['tissue_type'] == tt, 'gene_type']))
        print (tt, gene_count)
        n_neg = gene_count.get(neg_metric, 0)
        n_gene = gene_count.get('gene', 0)
        fdr = np.true_divide(n_neg, n_neg + n_gene) * panel_scale * 100
        records.append({'tissue_type': tt, 'fdr': fdr})

    print (f'df_t len: {len(df_t)}, df_join len:{len(df)}')

    return pd.DataFrame(records, columns=['tissue_type', 'fdr'])



# Two-stop colormap running from blue (#0072B2) to vermillion (#D55E00), with gamma 2.0.
cmap = colors.LinearSegmentedColormap.from_list(
    'nameofcolormap',
    ['#0072B2', '#D55E00'],
    gamma=2.0,
)



Reference: https://pages.10xgenomics.com/tch-2023-04-tech-lit-ra_g-p_xenium-performance-data-lp.html

# FDR Figure 1: Xenium (blank) vs. MERSCOPE (blank) vs. CosMx (sys control)

In [4]:
# Figure 1 inputs: FDR per sample using each platform's blank/system-control targets
# (MERSCOPE: 'blank', CosMx: 'sys_control', Xenium: 'blank').
metric_by_sample = {}
samples = []

for SAMPLE in SAMPLES:

    # SAMPLES entries are '<prefix>/<sample>' paths; keep the sample part.
    sample = SAMPLE.split('/')[1]
    print (f'sample: {sample}')

    if 'merscope' in sample:
        neg_metric = 'blank'
        df_metric = merscope_fdr(sample, neg_metric)
    elif 'cosmx' in sample:
        neg_metric = 'sys_control'
        df_metric = cosmx_fdr(sample, neg_metric)
    elif 'xenium' in sample:
        neg_metric = 'blank'
        df_metric = xenium_fdr(sample, neg_metric)
    else:
        # Without this guard the previous sample's df_metric would be stored under
        # this sample's name (or a NameError raised on the first iteration).
        print (f'skipping {sample}: unrecognised platform')
        continue

    metric_by_sample[sample] = df_metric
    samples.append(sample)

    

sample: xenium_breast_htma
sample: xenium_breast_normal
sample: xenium_panhuman_htma
sample: xenium_panhuman_normal
sample: xenium_lung_htma
sample: xenium_lung_normal
sample: merscope_breast_htma
data/merscope_breast_htma/region_0/
data/merscope_breast_htma/region_1/
data/merscope_breast_htma/region_2/
data/merscope_breast_htma/region_3/
data/merscope_breast_htma/region_4/
data/merscope_breast_htma/region_5/
data/merscope_breast_htma/region_6/
data/merscope_breast_htma/region_7/
data/merscope_breast_htma/region_8/
17743227
HNSCC {'gene': 1766270, 'blank': 12233}
Mel {'gene': 1040225, 'blank': 5311}
BrC {'gene': 2151252, 'blank': 10567}
NSCLC {'gene': 2310812, 'blank': 15687}
OvC {'gene': 2258259, 'blank': 6667}
CRC {'gene': 1932167, 'blank': 11549}
BlC {'gene': 1696510, 'blank': 10316}
df_t len: 13823088, df_join len:13227825
sample: merscope_breast_normal
data/merscope_breast_normal/region_0/
data/merscope_breast_normal/region_1/
data/merscope_breast_normal/region_2/
data/merscope_br

In [5]:
# Assemble the per-tissue FDR table for Figure 1 and write it to data/fdr/blank.csv.
os.makedirs(f'{wd}/data/fdr', exist_ok=True)

dfs = {}

print (samples)

for sample in samples:
    df_metric = metric_by_sample[sample]
    # Drop rows labelled exactly 'marker' (case-insensitive). Copy so that the column
    # assignments below never write into the frame cached in metric_by_sample.
    # NOTE(review): labels such as 'Marker normal liver' are not removed by this
    # exact-match filter -- confirm that is intended.
    df_metric_no_marker = df_metric.loc[df_metric['tissue_type'].str.lower() != 'marker'].copy()

    if 'xenium' in sample:
        # Xenium metrics are per core; average them to one value per tissue type.
        df_metric_no_marker = df_metric_no_marker.groupby(by=['tissue_type'], as_index=False).agg({
            'fdr': 'mean',
            })

    # Sample names end in '<platform>_<panel>_<tma>'; derive the labels once per sample.
    name_parts = sample.split('_')
    modality = name_parts[-3].upper()
    panel = name_parts[-2].capitalize()

    df_metric_no_marker['fdr'] = df_metric_no_marker['fdr'].map(lambda value: round(value, 2))
    df_metric_no_marker['sample'] = sample
    df_metric_no_marker['Platform_Panel'] = f'{modality}_{panel}'
    df_metric_no_marker['modality'] = modality
    df_metric_no_marker['panel'] = panel
    dfs[sample] = df_metric_no_marker

for sample in samples:
    print (sample)

# Single concat instead of growing the frame inside the loop.
df_combo = pd.concat([dfs[sample] for sample in samples]) if samples else pd.DataFrame()
df_combo = correct_platform_panel(df_combo, 'Platform_Panel')
df_combo.to_csv(f'{wd}/data/fdr/blank.csv', index=False)

['xenium_breast_htma', 'xenium_breast_normal', 'xenium_panhuman_htma', 'xenium_panhuman_normal', 'xenium_lung_htma', 'xenium_lung_normal', 'merscope_breast_htma', 'merscope_breast_normal', 'merscope_lung_htma', 'merscope_lung_normal', 'cosmx_multitissue_htma', 'cosmx_multitissue_normal', '2024_xenium_breast_htma', '2024_xenium_breast_tumor2', '2024_merscope_breast_htma', '2024_merscope_breast_tumor2', '2024_cosmx_multitissue_htma', '2024_cosmx_multitissue_tumor2']
xenium_breast_htma
xenium_breast_normal
xenium_panhuman_htma
xenium_panhuman_normal
xenium_lung_htma
xenium_lung_normal
merscope_breast_htma
merscope_breast_normal
merscope_lung_htma
merscope_lung_normal
cosmx_multitissue_htma
cosmx_multitissue_normal
2024_xenium_breast_htma
2024_xenium_breast_tumor2
2024_merscope_breast_htma
2024_merscope_breast_tumor2
2024_cosmx_multitissue_htma
2024_cosmx_multitissue_tumor2


# FDR Figure 2: Xenium (negative control probe) vs. CosMx (negative control probe)

In [6]:
# Figure 2 inputs: FDR per sample based on negative control probes.
# Only Xenium and CosMx samples are processed; all other samples are just listed.
metric_by_sample = {}
samples = []

fdr_by_platform = (('xenium', xenium_fdr), ('cosmx', cosmx_fdr))

for SAMPLE in SAMPLES:

    sample = SAMPLE.split('/')[1]
    print (f'sample: {sample}')

    # First platform keyword contained in the sample name decides which function runs.
    compute_fdr = next((fn for keyword, fn in fdr_by_platform if keyword in sample), None)
    if compute_fdr is None:
        continue

    neg_metric = 'neg_control_probe'
    df_metric = compute_fdr(sample, neg_metric)

    metric_by_sample[sample] = df_metric
    samples.append(sample)

sample: xenium_breast_htma
sample: xenium_breast_normal
sample: xenium_panhuman_htma
sample: xenium_panhuman_normal
sample: xenium_lung_htma
sample: xenium_lung_normal
sample: merscope_breast_htma
sample: merscope_breast_normal
sample: merscope_lung_htma
sample: merscope_lung_normal
sample: cosmx_multitissue_htma
23520085
ct/spot/join:240154/23520085/7091176
NSCLC {'neg_control_probe': 2888, 'gene': 1548531, 'sys_control': 80070}
OvC {'neg_control_probe': 2372, 'gene': 1949535, 'sys_control': 30274}
HNSCC {'neg_control_probe': 1381, 'gene': 685731, 'sys_control': 18923}
BrC {'neg_control_probe': 1134, 'gene': 1577999, 'sys_control': 15227}
Mel {'neg_control_probe': 745, 'gene': 475666, 'sys_control': 19121}
CRC {'neg_control_probe': 700, 'gene': 672439, 'sys_control': 8440}
df_t len: 23520085, df_join len:7091176
sample: cosmx_multitissue_normal
8878571
ct/spot/join:178424/8878571/7002444
Spleen {'neg_control_probe': 7222, 'gene': 1149617, 'sys_control': 183129}
Prostate {'neg_control_

In [7]:
# Assemble the per-tissue FDR table for Figure 2 and write it to data/fdr/negative.csv.
# Create the output folder here as well so this cell does not rely on an earlier cell.
os.makedirs(f'{wd}/data/fdr', exist_ok=True)

dfs = {}

print (samples)

for sample in samples:
    df_metric = metric_by_sample[sample]
    # Drop rows labelled exactly 'marker' (case-insensitive). Copy so that the column
    # assignments below never write into the frame cached in metric_by_sample.
    df_metric_no_marker = df_metric.loc[df_metric['tissue_type'].str.lower() != 'marker'].copy()

    if 'xenium' in sample:
        # Xenium metrics are per core; average them to one value per tissue type.
        df_metric_no_marker = df_metric_no_marker.groupby(by=['tissue_type'], as_index=False).agg({
            'fdr': 'mean',
            })

    # Sample names end in '<platform>_<panel>_<tma>'; derive the labels once per sample.
    name_parts = sample.split('_')
    modality = name_parts[-3].upper()
    panel = name_parts[-2].capitalize()

    df_metric_no_marker['fdr'] = df_metric_no_marker['fdr'].map(lambda value: round(value, 2))
    df_metric_no_marker['sample'] = sample
    df_metric_no_marker['Platform_Panel'] = f'{modality}_{panel}'
    df_metric_no_marker['modality'] = modality
    df_metric_no_marker['panel'] = panel
    dfs[sample] = df_metric_no_marker

for sample in samples:
    print (sample)

# Single concat instead of growing the frame inside the loop.
df_combo = pd.concat([dfs[sample] for sample in samples]) if samples else pd.DataFrame()

df_combo = correct_platform_panel(df_combo, 'Platform_Panel')
df_combo.to_csv(f'{wd}/data/fdr/negative.csv', index=False)

['xenium_breast_htma', 'xenium_breast_normal', 'xenium_panhuman_htma', 'xenium_panhuman_normal', 'xenium_lung_htma', 'xenium_lung_normal', 'cosmx_multitissue_htma', 'cosmx_multitissue_normal', '2024_xenium_breast_htma', '2024_xenium_breast_tumor2', '2024_cosmx_multitissue_htma', '2024_cosmx_multitissue_tumor2']
xenium_breast_htma
xenium_breast_normal
xenium_panhuman_htma
xenium_panhuman_normal
xenium_lung_htma
xenium_lung_normal
cosmx_multitissue_htma
cosmx_multitissue_normal
2024_xenium_breast_htma
2024_xenium_breast_tumor2
2024_cosmx_multitissue_htma
2024_cosmx_multitissue_tumor2


In [8]:
# Preview the first rows of the combined table currently held in df_combo.
df_combo.head()

Unnamed: 0,tissue_type,fdr,sample,Platform_Panel,modality,panel
0,BrC,0.12,xenium_breast_htma,"Xenium,breast",XENIUM,Breast
1,CRC,0.2,xenium_breast_htma,"Xenium,breast",XENIUM,Breast
2,HNSCC,0.24,xenium_breast_htma,"Xenium,breast",XENIUM,Breast
3,Mel,0.37,xenium_breast_htma,"Xenium,breast",XENIUM,Breast
4,NSCLC,0.16,xenium_breast_htma,"Xenium,breast",XENIUM,Breast


In [7]:
# Reload the blank-based FDR table from disk. This rebinds df_combo, so any table
# previously held under that name is no longer available in memory.
df_combo = pd.read_csv(f'{wd}/data/fdr/blank.csv')
df_combo.head()

Unnamed: 0,tissue_type,fdr,sample,Platform_Panel,modality,panel
0,BrC,0.01,xenium_breast_htma,"Xenium,breast",XENIUM,Breast
1,CRC,0.02,xenium_breast_htma,"Xenium,breast",XENIUM,Breast
2,HNSCC,0.02,xenium_breast_htma,"Xenium,breast",XENIUM,Breast
3,Mel,0.04,xenium_breast_htma,"Xenium,breast",XENIUM,Breast
4,NSCLC,0.02,xenium_breast_htma,"Xenium,breast",XENIUM,Breast


In [8]:
# Summary statistics of the FDR values (percent) over all rows of df_combo.
df_combo['fdr'].describe()

count    200.000000
mean       7.004900
std       12.742925
min        0.000000
25%        0.030000
50%        2.860000
75%        6.850000
max       71.540000
Name: fdr, dtype: float64

In [9]:
# Distinct tissue-type labels present in df_combo.
# NOTE(review): labels starting with 'Marker ' can appear here because the table-building
# cells only exclude rows whose label is exactly 'marker' -- confirm that is intended.
df_combo['tissue_type'].unique()

array(['BrC', 'CRC', 'HNSCC', 'Mel', 'NSCLC', 'OvC', 'Bladder', 'Breast',
       'Colon', 'Heart', 'Kidney', 'Liver', 'Lung', 'Lymph node', 'Ovary',
       'Prostate', 'Skin', 'Spleen', 'Thyroid', 'Tonsil', 'BlC',
       'Pancreas', 'Renal pelvis', 'Breast invasive',
       'Breast non-invasive DCIS_2', 'Breast non-invasive DCIS_3',
       'Kidney cancer', 'Liver cancer', 'Lymphoma Hodgkin',
       'Lymphoma LN B cell', 'Lymphoma non-Hodgkin', 'Pancreas cancer',
       'Prostate cancer', 'Renal cancer', 'SCC', 'Testes cancer',
       'Thyroid cancer', 'Marker normal liver', 'Marker normal spleen'],
      dtype=object)