In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import glob
import matplotlib.colors as colors
from collections import Counter

from st_utils import df_2_gdf
from st_utils import get_gdf_core
from st_utils import get_gene_type
from st_utils import transcript_loader
from st_utils import correct_tissue_names, correct_platform_panel
from st_utils import name_parser

from constants import SCALING_FACTOR_DICT
from constants import CORE_RADIUS_DICT
from constants import POINTS_SRC_DICT
from constants import UNIQUE_GENES_DICT
from constants import SAMPLES
from constants import matching_cores_2024 as matching_cores

# Working directory; later cells build their data paths from it.
wd = os.getcwd()
print(f'working directory: {wd}')
plt.rcParams['figure.dpi'] = 250

# Cohort selector: '2024' keeps sample names containing '2024', '2023' keeps
# the ones that do not, and any other value keeps every sample.
year = '2024'

if year == '2024':
    samples = [name for name in SAMPLES if '2024' in name]
elif year == '2023':
    samples = [name for name in SAMPLES if '2024' not in name]
else:
    samples = list(SAMPLES)
print(samples)

['2024_xenium_breast_htma', '2024_xenium_breast_tumor2', '2024_merscope_breast_htma', '2024_merscope_breast_tumor2', '2024_cosmx_multitissue_htma', '2024_cosmx_multitissue_tumor2']


In [7]:
def xenium_fdr(sample, neg_metric, common_cores=matching_cores):
    """Compute a per-core false discovery rate (FDR, in percent) for a Xenium sample.

    Reads the sample-info CSV and the cell-level parquet for ``sample``, shifts
    core ids by +300 for 'tumor2' samples and +200 for 'normal' samples, keeps
    only cores listed in ``common_cores``, sums the count columns per core and
    computes::

        fdr = control_probe_counts / total_counts
              * (UNIQUE_GENES_DICT[sample]['gene'] / UNIQUE_GENES_DICT[sample][neg_metric])
              * 100

    Parameters
    ----------
    sample : str
        Sample name; used to build file paths and as key into UNIQUE_GENES_DICT.
    neg_metric : str
        Key of UNIQUE_GENES_DICT[sample] whose value scales the FDR.
    common_cores : collection of int
        Core ids (after the offset) to keep. Defaults to ``matching_cores``.

    Returns
    -------
    pandas.DataFrame
        One row per core, sorted by the (string) ``core`` column, holding the
        summed count columns, ``cell_count``, ``transcripts_per_cell``,
        ``tissue_type`` and ``fdr``.
    """
    # Offset applied to the raw core ids of this sample before filtering.
    core_offset = (300 if 'tumor2' in sample else 0) + (200 if 'normal' in sample else 0)

    # Core information: including tissue_type, tumor ...
    csv_sample_info = f"{wd}/data/Sample_Info_{name_parser(sample)['tma'].upper()}_TMA - Sheet1.csv"

    df_meta = pd.read_csv(csv_sample_info)
    df_meta = correct_tissue_names(sample, df_meta)
    df_meta['core'] = df_meta['core'].astype('int') + core_offset
    # .copy() so the column assignment below acts on an independent frame,
    # not on a slice of the unfiltered table.
    df_meta = df_meta.loc[df_meta['core'].isin(common_cores)].copy()
    df_meta['core'] = df_meta['core'].astype('str')

    # Get cell-level data
    df_cell = pd.read_parquet(f'{wd}/data/cell_level_csv/{sample}_cell_level.parquet.gzip', engine='pyarrow')
    df_cell = correct_tissue_names(sample, df_cell)
    df_cell['core'] = df_cell['core'].astype('int') + core_offset
    df_cell = df_cell.loc[df_cell['core'].isin(common_cores)].copy()
    df_cell['core'] = df_cell['core'].astype('str')

    # Per-core totals: number of cells plus the sum of every count column.
    df_metric = df_cell.groupby(by=['core'], as_index=False).agg({
        'cell_id': 'count',
        'transcript_counts': 'sum',
        'control_probe_counts': 'sum',
        'control_codeword_counts': 'sum',
        'unassigned_codeword_counts': 'sum',
        'total_counts': 'sum',
    })
    df_metric = df_metric.rename(columns={'cell_id': 'cell_count'})

    # Column arithmetic instead of a row-wise DataFrame.apply; the element-wise
    # round keeps the same rounding as the original per-row round().
    df_metric['transcripts_per_cell'] = (
        df_metric['total_counts'] / df_metric['cell_count']
    ).apply(lambda value: round(value, 1))

    # get tissue_type
    df_metric = pd.merge(df_metric, df_meta[['core', 'tissue_type']], on='core', how='inner')

    # Calculate FDR
    # NOTE(review): the numerator is always control_probe_counts, even when
    # neg_metric is 'blank' -- confirm whether a different count column
    # (e.g. unassigned_codeword_counts) is intended for that case.
    gene_to_control_ratio = UNIQUE_GENES_DICT[sample]['gene'] / UNIQUE_GENES_DICT[sample][neg_metric]
    df_metric['fdr'] = (
        df_metric['control_probe_counts'] / df_metric['total_counts']
    ) * gene_to_control_ratio * 100

    df_metric = df_metric.sort_values(by=['core'])

    return df_metric


def merscope_fdr(sample, neg_metric,common_cores=matching_cores):
    """Compute a per-tissue-type false discovery rate (FDR, in percent) for a MERSCOPE sample.

    Transcripts are spatially joined to the core geometries returned by
    ``get_gdf_core``; transcripts that fall in no core are dropped. For every
    tissue type the transcripts are counted by ``get_gene_type`` and::

        fdr = n[neg_metric] / (n[neg_metric] + n['gene'])
              * (UNIQUE_GENES_DICT[sample]['gene'] / UNIQUE_GENES_DICT[sample][neg_metric])
              * 100

    Parameters
    ----------
    sample : str
        Sample name; used to build file paths and as key into the constant dicts.
    neg_metric : str
        Gene-type label counted as the negative control, and key of
        UNIQUE_GENES_DICT[sample] used for scaling.
    common_cores : collection of int
        Accepted for signature parity with the other platform functions; it is
        not applied here (the core filter was already disabled in this function).

    Returns
    -------
    pandas.DataFrame
        Columns ``tissue_type`` and ``fdr``, one row per tissue type found in
        the joined transcripts.

    Raises
    ------
    FileNotFoundError
        If no ``*points.csv`` file exists under ``data/{sample}``.

    Notes
    -----
    The sample-info and cell-level tables that were previously loaded here were
    never used in the computation, so they are no longer read.
    """
    # Core information: including tissue_type, tumor ...
    csv_sample_info = f"{wd}/data/Sample_Info_{name_parser(sample)['tma'].upper()}_TMA - Sheet1.csv"

    # Get gdf_core
    # core centroid data x/y in pixel
    points_pattern = f'data/{sample}/*points.csv'
    points_files = glob.glob(points_pattern)
    if not points_files:
        raise FileNotFoundError(f'no core points file matching {points_pattern}')
    csv_points = points_files[0]

    # 1 pixel = scaling_factor um
    scaling_factor = SCALING_FACTOR_DICT[sample]
    radius_um = CORE_RADIUS_DICT[sample]

    # Get the gdf of core information with geometry (buffer from the core centroid)
    gdf_core = get_gdf_core(csv_points, csv_sample_info, scaling_factor, radius_um, points_src=POINTS_SRC_DICT[sample])
    gdf_core = correct_tissue_names(sample, gdf_core)
    gdf_core['core'] = gdf_core['core'].apply(lambda x: str(x).zfill(3))

    # Get transcript data; keep rows whose numeric cell-id prefix is positive
    df_t = transcript_loader(f'data/{sample}')
    df_t['calc_id'] = df_t['cell_id'].apply(lambda x: x.split('_region')[0]).astype(float)
    df_t = df_t.loc[df_t['calc_id'] > 0]

    gdf_t = df_2_gdf(df_t, 'global_x', 'global_y', crs="EPSG:4326", drop_xy=False)

    # `predicate` replaces the deprecated `op` keyword of sjoin.
    # Right join keeps every transcript; dropna removes those outside all cores.
    gdf_join = gdf_core[
        ['core', 'tissue_type', 'geometry']
        ].sjoin(gdf_t, how='right', predicate='intersects'
                ).drop(columns='index_left').dropna(subset=['core'])

    gdf_join['gene_type'] = gdf_join['gene'].apply(get_gene_type)

    # Loop-invariant scaling factor.
    gene_to_control_ratio = UNIQUE_GENES_DICT[sample]['gene'] / UNIQUE_GENES_DICT[sample][neg_metric]

    tts = []
    fdrs = []

    for tt in gdf_join.tissue_type.unique():
        gene_count = dict(Counter(gdf_join.loc[gdf_join['tissue_type'] == tt, 'gene_type']))
        print(tt, gene_count)
        n_negative = gene_count.get(neg_metric, 0)
        # Named `fdr` rather than `merscope_fdr` so the function name is not shadowed.
        fdr = np.true_divide(n_negative, n_negative + gene_count.get('gene', 0)) * gene_to_control_ratio * 100
        tts.append(tt)
        fdrs.append(fdr)

    print(f'df_t len: {len(df_t)}, df_join len:{len(gdf_join)}')

    df_metric = pd.DataFrame({'tissue_type': tts, 'fdr': fdrs})

    return df_metric


def cosmx_fdr(sample, neg_metric,common_cores=matching_cores):
    """Compute a per-tissue-type false discovery rate (FDR, in percent) for a CosMx sample.

    Transcripts are left-joined to the cell-level table on ``cell_id`` to pick
    up ``core`` and ``tissue_type``. Core ids are shifted by +300 for 'tumor2'
    samples and +200 for 'normal' samples, and only transcripts whose core is
    in ``common_cores`` are kept. For every tissue type the transcripts are
    counted by ``get_gene_type`` and::

        fdr = n[neg_metric] / (n[neg_metric] + n['gene'])
              * (UNIQUE_GENES_DICT[sample]['gene'] / UNIQUE_GENES_DICT[sample][neg_metric])
              * 100

    Parameters
    ----------
    sample : str
        Sample name; used to build file paths and as key into UNIQUE_GENES_DICT.
    neg_metric : str
        Gene-type label counted as the negative control, and key of
        UNIQUE_GENES_DICT[sample] used for scaling.
    common_cores : collection of int
        Core ids (after the offset) to keep. Defaults to ``matching_cores``.

    Returns
    -------
    pandas.DataFrame
        Columns ``tissue_type`` and ``fdr``, one row per tissue type among the
        retained transcripts.
    """
    # Get cell-level data
    df_cell = pd.read_parquet(f'{wd}/data/cell_level_csv/{sample}_cell_level.parquet.gzip', engine='pyarrow')
    df_cell = correct_tissue_names(sample, df_cell)
    df_cell['core'] = df_cell['core'].astype('int')
    df_t = transcript_loader(f'{wd}/data/{sample}')

    df = pd.merge(df_t, df_cell[['core', 'tissue_type', 'cell_id']], on='cell_id', how='left')

    df['core'] += (300 if 'tumor2' in sample else 0) + (200 if 'normal' in sample else 0)
    df = df.loc[df['core'].isin(common_cores)]
    df = df.dropna(subset=['core'])
    print(f'ct/spot/join:{len(df_cell)}/{len(df_t)}/{len(df)}')

    # The zero-padded string form of `core` was never read after this point
    # (and would have rendered as e.g. '301.0' once the left merge turned the
    # column into floats), so it is no longer computed.

    # Kept as a separate Series (same index as df) instead of writing a new
    # column into the filtered frame.
    gene_type = df['gene'].apply(get_gene_type)

    # Loop-invariant scaling factor.
    gene_to_control_ratio = UNIQUE_GENES_DICT[sample]['gene'] / UNIQUE_GENES_DICT[sample][neg_metric]

    tts = []
    fdrs = []

    for tt in df.tissue_type.unique():
        gene_count = dict(Counter(gene_type[df['tissue_type'] == tt]))
        print(tt, gene_count)
        n_negative = gene_count.get(neg_metric, 0)
        # Named `fdr` rather than `cosmx_fdr` so the function name is not shadowed.
        fdr = np.true_divide(n_negative, n_negative + gene_count.get('gene', 0)) * gene_to_control_ratio * 100
        tts.append(tt)
        fdrs.append(fdr)

    print(f'df_t len: {len(df_t)}, df_join len:{len(df)}')

    df_metric = pd.DataFrame({'tissue_type': tts, 'fdr': fdrs})

    return df_metric



# Two-colour gradient from '#0072B2' to '#D55E00', built with gamma=2.0.
cmap = colors.LinearSegmentedColormap.from_list(
    'nameofcolormap',
    ['#0072B2', '#D55E00'],
    gamma=2.0,
)



Reference: https://pages.10xgenomics.com/tch-2023-04-tech-lit-ra_g-p_xenium-performance-data-lp.html

# FDR Figure 1: Xenium (blank) vs. MERSCOPE (blank) vs. CosMx (sys control)

In [8]:
# Output folder for the FDR tables (no error if it already exists).
os.makedirs(f'{wd}/data/fdr', exist_ok=True)

# Per-sample FDR table, computed with the negative control used for Figure 1.
metric_by_sample = {}

for sample in samples:
    print(f'sample: {sample}')

    if 'merscope' in sample:
        neg_metric = 'blank'
        df_metric = merscope_fdr(sample, neg_metric)
    elif 'cosmx' in sample:
        neg_metric = 'sys_control'
        df_metric = cosmx_fdr(sample, neg_metric)
    elif 'xenium' in sample:
        neg_metric = 'blank'
        df_metric = xenium_fdr(sample, neg_metric)
    else:
        # Without this branch an unrecognised sample would silently reuse the
        # previous sample's df_metric.
        raise ValueError(f'cannot determine platform from sample name: {sample}')
    metric_by_sample[sample] = df_metric

dfs = {}
for sample in samples:
    df_metric = metric_by_sample[sample]
    # NOTE(review): this drops only rows labelled exactly 'marker'
    # (case-insensitive); labels such as 'Marker normal liver' are kept --
    # confirm that is intended.
    # .copy() so the column assignments below act on an independent frame.
    df_metric_no_marker = df_metric.loc[df_metric['tissue_type'].str.lower() != 'marker'].copy()
    if 'xenium' in sample:
        # xenium_fdr returns one row per core; average to one row per tissue type.
        df_metric_no_marker = df_metric_no_marker.groupby(by=['tissue_type'], as_index=False).agg({
            'fdr': 'mean',
            })

    parsed = name_parser(sample)
    df_metric_no_marker['fdr'] = df_metric_no_marker['fdr'].apply(lambda x: round(x, 2))
    df_metric_no_marker['sample'] = sample
    df_metric_no_marker['Platform_Panel'] = f"{parsed['platform'].upper()}_{parsed['panel'].capitalize()}"
    df_metric_no_marker['modality'] = parsed['platform'].upper()
    df_metric_no_marker['panel'] = parsed['panel'].capitalize()
    dfs[sample] = df_metric_no_marker

for sample in samples:
    print(sample)

# Concatenate once instead of growing the frame inside the loop.
frames = [dfs[sample] for sample in samples]
df_combo = pd.concat(frames) if frames else pd.DataFrame()
df_combo = correct_platform_panel(df_combo, 'Platform_Panel')
df_combo.to_csv(f'{wd}/data/fdr/blank_{year}.csv', index=False)

sample: 2024_xenium_breast_htma
sample: 2024_xenium_breast_tumor2
sample: 2024_merscope_breast_htma
data/2024_merscope_breast_htma/region_0/
data/2024_merscope_breast_htma/region_1/
data/2024_merscope_breast_htma/region_2/
2569582
NSCLC {'gene': 270805, 'blank': 2441}
OvC {'gene': 948959, 'blank': 2994}
Tonsil {'gene': 20074, 'blank': 440}
Lymph node {'gene': 36230, 'blank': 201}
Marker {'gene': 290, 'blank': 9}
HNSCC {'gene': 74289, 'blank': 792}
BrC {'gene': 323259, 'blank': 1624}
Mel {'gene': 106305, 'blank': 1256}
CRC {'gene': 130730, 'blank': 753}
df_t len: 2048003, df_join len:1921451
sample: 2024_merscope_breast_tumor2
data/2024_merscope_breast_tumor2/region_0/
data/2024_merscope_breast_tumor2/region_1/
544237
Mel {'gene': 6553, 'blank': 50}
Marker normal liver {'gene': 6381, 'blank': 184}
Kidney cancer {'gene': 758, 'blank': 3}
Breast non-invasive DCIS_2 {'gene': 549, 'blank': 34}
Lymphoma LN B cell {'gene': 45438, 'blank': 337}
Marker normal spleen {'gene': 142, 'blank': 4}
Br

# FDR Figure 2: Xenium (negative control probe) vs. CosMx (negative control probe)

In [9]:
# Output folder for the FDR tables; created here as well so this cell does not
# rely on the Figure 1 cell having been run.
os.makedirs(f'{wd}/data/fdr', exist_ok=True)

metric_by_sample = {}

# MERSCOPE is left out of this comparison. Note that this rebinds the
# notebook-level `samples`, so cells run afterwards no longer see MERSCOPE.
samples = [sample for sample in samples if 'merscope' not in sample]

# Both platforms use the same negative-control key in this figure.
neg_metric = 'neg_control_probe'

for sample in samples:
    print(f'sample: {sample}')
    if 'xenium' in sample:
        df_metric = xenium_fdr(sample, neg_metric)
    elif 'cosmx' in sample:
        df_metric = cosmx_fdr(sample, neg_metric)
    else:
        # Previously such a sample was skipped here and failed later with a
        # KeyError when its metrics were looked up.
        raise ValueError(f'cannot determine platform from sample name: {sample}')
    metric_by_sample[sample] = df_metric

dfs = {}
for sample in samples:
    df_metric = metric_by_sample[sample]
    # NOTE(review): this drops only rows labelled exactly 'marker'
    # (case-insensitive); longer marker labels would be kept -- confirm.
    # .copy() so the column assignments below act on an independent frame.
    df_metric_no_marker = df_metric.loc[df_metric['tissue_type'].str.lower() != 'marker'].copy()

    if 'xenium' in sample:
        # xenium_fdr returns one row per core; average to one row per tissue type.
        df_metric_no_marker = df_metric_no_marker.groupby(by=['tissue_type'], as_index=False).agg({
            'fdr': 'mean',
            })

    parsed = name_parser(sample)
    df_metric_no_marker['fdr'] = df_metric_no_marker['fdr'].apply(lambda x: round(x, 2))
    df_metric_no_marker['sample'] = sample
    df_metric_no_marker['Platform_Panel'] = f"{parsed['platform'].upper()}_{parsed['panel'].capitalize()}"
    df_metric_no_marker['modality'] = parsed['platform'].upper()
    df_metric_no_marker['panel'] = parsed['panel'].capitalize()
    dfs[sample] = df_metric_no_marker

for sample in samples:
    print(sample)

# Concatenate once instead of growing the frame inside the loop.
frames = [dfs[sample] for sample in samples]
df_combo = pd.concat(frames) if frames else pd.DataFrame()

df_combo = correct_platform_panel(df_combo, 'Platform_Panel')
df_combo.to_csv(f'{wd}/data/fdr/negative_{year}.csv', index=False)

sample: 2024_xenium_breast_htma
sample: 2024_xenium_breast_tumor2
sample: 2024_cosmx_multitissue_htma
99240555
ct/spot/join:258986/99240555/53551552
NSCLC {'neg_control_probe': 10166, 'gene': 8833503, 'sys_control': 169874}
HNSCC {'neg_control_probe': 6863, 'gene': 4465755, 'sys_control': 77568}
Mel {'neg_control_probe': 2879, 'gene': 2817378, 'sys_control': 29703}
CRC {'neg_control_probe': 3889, 'gene': 3761376, 'sys_control': 47130}
OvC {'neg_control_probe': 14832, 'gene': 15490302, 'sys_control': 155698}
BrC {'neg_control_probe': 6317, 'gene': 8754315, 'sys_control': 83578}
Marker {'neg_control_probe': 759, 'gene': 205059, 'sys_control': 8863}
Tonsil {'neg_control_probe': 6211, 'gene': 6332717, 'sys_control': 68939}
Lymph node {'neg_control_probe': 1825, 'gene': 2178779, 'sys_control': 17274}
df_t len: 99240555, df_join len:53551552
sample: 2024_cosmx_multitissue_tumor2
81250185
ct/spot/join:249682/81250185/68997583
CRC {'neg_control_probe': 5289, 'gene': 4407995, 'sys_control': 690

In [10]:
# Display the combined FDR table assembled in the cell above.
df_combo

Unnamed: 0,tissue_type,fdr,sample,Platform_Panel,modality,panel
0,BrC,0.09,2024_xenium_breast_htma,"Xenium,breast",XENIUM,Breast
1,CRC,0.14,2024_xenium_breast_htma,"Xenium,breast",XENIUM,Breast
2,HNSCC,0.27,2024_xenium_breast_htma,"Xenium,breast",XENIUM,Breast
3,Lymph node,0.06,2024_xenium_breast_htma,"Xenium,breast",XENIUM,Breast
4,Mel,0.25,2024_xenium_breast_htma,"Xenium,breast",XENIUM,Breast
5,NSCLC,0.08,2024_xenium_breast_htma,"Xenium,breast",XENIUM,Breast
6,OvC,0.05,2024_xenium_breast_htma,"Xenium,breast",XENIUM,Breast
7,Tonsil,0.1,2024_xenium_breast_htma,"Xenium,breast",XENIUM,Breast
0,BlC,0.18,2024_xenium_breast_tumor2,"Xenium,breast",XENIUM,Breast
1,Breast invasive,0.29,2024_xenium_breast_tumor2,"Xenium,breast",XENIUM,Breast
