In [8]:
import pandas as pd
import sys
import os
import math
import pandas as pd
from scipy.stats import ttest_ind
import plotly.express as px
import plotly.io as pio

# Put the project's src/ directory on the import path so that the shared
# `constants` module can be imported from this notebook.
sys.path.append('../src/')
from constants import SAMPLES, matching_cores_2024

# Every data/figure path used below is built relative to the current working directory.
wd = os.getcwd()
print(f'working directory: {wd}')

def log2_fold_change(a, b):
    """
    Calculate the log2 fold change between two expression values a and b.

    Parameters:
    a (float): Expression value in condition A (control or reference)
    b (float): Expression value in condition B (treatment or experimental)

    Returns:
    float: The log2 fold change, i.e. log2(b / a)

    Raises:
    ValueError: If a is zero, or if the ratio b / a is zero or negative
        (b is zero, or a and b have opposite signs), because the logarithm
        is undefined for such ratios.
    """
    if a == 0:
        raise ValueError("Expression value 'a' should not be zero to avoid division by zero.")
    fold_change = b / a
    # math.log2 would fail here with an opaque "math domain error"; report the
    # offending inputs instead. NaN ratios are deliberately let through so that
    # they keep propagating as NaN.
    if fold_change <= 0:
        raise ValueError(
            f"Fold change b / a must be positive to take its log2 (got a={a!r}, b={b!r})."
        )
    return math.log2(fold_change)


# Gene symbols grouped under two labels, 'co' and 'not_co'.
gene_list_dict = dict(
    co=['CD274', 'PDCD1', 'PDCD1LG2', 'CCL5', 'CXCL12', 'CD68', 'CTLA4'],
    not_co=['ADIPOQ', 'KIT'],
)


# Make sure the figure output directory exists. `exist_ok=True` turns this into
# a no-op when the directory is already there and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(f'{wd}/figures/protein_validation', exist_ok=True)

# Core IDs kept as floats (1.0, 2.0, ...), written here as integers and converted.
# Presumably the cores annotated as PD-L1 high; the list is not referenced by
# the cells visible in this notebook - confirm before relying on it.
pd_l1_high_cores = list(map(float, (
    1, 2, 3, 6, 7, 8, 37, 39, 41, 42,
    43, 44, 45, 46, 47, 48, 49, 50, 51,
    52, 55, 56, 57, 58, 60, 62, 64, 66,
    69, 70, 71, 79, 82, 83, 84, 85, 91,
    92, 93, 94, 95, 96, 101, 102, 111, 112,
    113, 114, 115, 116, 117, 118, 119, 120,
    121, 122, 127, 128, 129, 133, 134, 138,
    139, 140, 144, 148, 149, 150, 151, 152,
    153, 155, 156, 159, 160, 161, 164, 165,
    168, 169, 170,
)))

working directory: /Users/whuan/dev/ist_benchmarking


In [2]:

# Core-level metadata: one row per TMA core with its tissue type, patient
# number and PD-L1 status, plus a combined label built from the last three.
df_core = pd.read_csv(f'{wd}/data/Sample_Info_HTMA_TMA - Sheet1.csv')[['core', 'tissue_type', 'patient_number', 'PD-L1_status']]
df_core['unique_patient_number'] = df_core.apply(lambda row: f"{row['tissue_type']}_{row['patient_number']}_{row['PD-L1_status']}", axis=1)

gene_list = ['CD274']

# Collect one frame per sample and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop is quadratic, and concatenating onto
# an empty DataFrame emits a FutureWarning in recent pandas releases.
sample_frames = []
for sample in SAMPLES:
    # Only samples whose name contains both '2024' and 'htma' are used.
    if '2024' in sample and 'htma' in sample:
        print(sample)
        df = pd.read_csv(f'{wd}/data/gene_level_csv/gene_level_csv_{sample}.csv', engine='pyarrow')
        df_selected = df.loc[df["gene"].isin(gene_list)]
        # Left join: keep every gene-level row, attach core metadata where available.
        df_join = pd.merge(df_selected, df_core, on=['core','tissue_type'], how='left')
        df_join['sample'] = sample
        df_join['core'] = df_join['core'].astype('int')
        df_join = df_join.loc[df_join['core'].isin(matching_cores_2024)]
        sample_frames.append(df_join)

if not sample_frames:
    raise ValueError("No sample name in SAMPLES contains both '2024' and 'htma'; nothing to load.")
df_all = pd.concat(sample_frames)

# The platform is the third-from-last '_'-separated token of the sample name,
# upper-cased and then mapped to its display spelling.
df_all['platform'] = df_all['sample'].apply(lambda x: x.split('_')[-3].upper())
df_all['platform'] = df_all['platform'].replace({'XENIUM':'Xenium','MERSCOPE':'MERSCOPE','COSMX':'CosMx'})
df_all = df_all[['count', 'PD-L1_status', 'platform']]

2024_xenium_breast_htma
2024_merscope_breast_htma
2024_cosmx_multitissue_htma


In [3]:
analysis_columns = ['count', 'PD-L1_status', 'platform']
df = df_all[analysis_columns]

# Per platform and PD-L1 status: how many rows there are and their mean count.
grouped_by_status = df.groupby(by=['platform', 'PD-L1_status'])
df_stats = grouped_by_status.agg(
    Num_of_Cores=pd.NamedAgg(column='count', aggfunc='size'),
    Average_CD274_Expression=pd.NamedAgg(column='count', aggfunc='mean'),
).reset_index()

# Show the summary table.
display(df_stats)


Unnamed: 0,platform,PD-L1_status,Num_of_Cores,Average_CD274_Expression
0,CosMx,high,38,199.921053
1,CosMx,low,27,183.518519
2,MERSCOPE,high,32,21.28125
3,MERSCOPE,low,25,28.96
4,Xenium,high,38,357.421053
5,Xenium,low,27,284.111111


In [4]:

# Keep only the rows whose PD-L1_status is present (not NaN).
has_status = df['PD-L1_status'].notna()
df_filtered = df[has_status]

# Define the function to calculate p-values for each platform
def calculate_p_values(df, alpha=0.05):
    """
    Compare 'high' vs 'low' PD-L1 counts within each platform with Welch's t-test.

    Parameters:
    df (pd.DataFrame): Must contain the columns 'platform', 'PD-L1_status'
        and 'count'.
    alpha (float): Significance threshold applied to the p-value (default 0.05).

    Returns:
    pd.DataFrame: One row per platform that has both 'high' and 'low' rows,
        with columns 'platform', 'p_value' and 'significant'. Platforms
        missing either group are skipped. The columns are present even when
        no platform qualifies.
    """
    result_columns = ['platform', 'p_value', 'significant']
    results = []
    # sort=False keeps platforms in order of first appearance.
    for platform, df_platform in df.groupby('platform', sort=False):
        status = df_platform['PD-L1_status']
        high_counts = df_platform.loc[status == 'high', 'count']
        low_counts = df_platform.loc[status == 'low', 'count']

        # Both groups are needed for a comparison.
        if high_counts.empty or low_counts.empty:
            continue

        # equal_var=False selects Welch's t-test (no equal-variance assumption).
        _, p_val = ttest_ind(high_counts, low_counts, equal_var=False)

        results.append({
            'platform': platform,
            'p_value': p_val,
            # A NaN p-value compares False, i.e. it is reported as not significant.
            'significant': bool(p_val < alpha),
        })

    return pd.DataFrame(results, columns=result_columns)

# Run the per-platform high-vs-low test on the rows that have a PD-L1 status.
df_results = calculate_p_values(df=df_filtered)

# Print a heading, then render the results table.
print("Results DataFrame:")
display(df_results)


Results DataFrame:


Unnamed: 0,platform,p_value,significant
0,Xenium,0.50835,False
1,MERSCOPE,0.413921,False
2,CosMx,0.732083,False


In [5]:

# Figure-wide settings reused for fonts, the canvas size and the exported image.
fontsize = 16
width = 700
height = 700

# Box plot of `count` split by PD-L1 status, one facet row per platform.
# points='all' overlays every individual observation next to its box.
fig = px.box(
    df_all,
    x="PD-L1_status",
    y="count",
    color='PD-L1_status',
    width=width,
    height=height,
    facet_row='platform',
    category_orders={"PD-L1_status": ['high', 'low']},
    points='all'
)

# Customize box plot quartile method
fig.update_traces(quartilemethod="exclusive")

# Customize axes for all facets
fig.update_xaxes(
    title_text="",
    tickfont=dict(size=fontsize),
    showline=True,
    linewidth=1,
    linecolor='black'
)
# matches=None unlinks the y axes so each facet row can have its own range;
# the [0, 5000] range set here is replaced per row just below.
fig.update_yaxes(
    matches=None,
    title_text="",
    title_font=dict(size=fontsize),
    showline=True,
    linewidth=1,
    linecolor='black',
    range=[0, 5000]
)

# Hand-tuned y range for each facet row.
# NOTE(review): confirm which platform each row index addresses - the facet
# row numbering may not follow the top-to-bottom order seen in the figure.
fig.update_yaxes(row=1, range=[0, 1000])
fig.update_yaxes(row=2, range=[0, 150])
fig.update_yaxes(row=3, range=[0, 2500])

# Customize legend and layout
fig.update_layout(
    legend_title_text="PD-L1 Status",
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1,
        xanchor="center",
        x=0.5
    ),
    font=dict(size=fontsize, color='black'),
    plot_bgcolor="rgba(0, 0, 0, 0)",
    showlegend=True,
)

# Restyle every annotation (font, x position, rotation) and strip the
# 'platform=' prefix from the facet labels so only the platform name remains.
for annotation in fig.layout.annotations:
    annotation.font.size = fontsize
    annotation.font.color = 'black'
    annotation.x = 0.95
    annotation.textangle = 90
    for platform in df_all['platform'].unique():
        if annotation.text == f'platform={platform}':
            annotation.text = platform

# Add bracket and p-value text for each row in the facet
# NOTE(review): `platform` is not used inside this loop; only the index `i`
# selects the y axis, and every row receives the same bracket and label.
for i, platform in enumerate(df_all['platform'].unique(), start=1):
    # Calculate dynamic y position based on row-specific range
    y_max = fig.layout[f'yaxis{i}'].range[1]  # Get max range for current row
    y_pos = y_max * 0.7  # Place the bracket at 70% of the row's upper y limit
    
    # Horizontal bar of the bracket.
    fig.add_shape(
        type="line",
        x0=0.15, x1=0.85, y0=y_pos, y1=y_pos,  # Adjusted x0 and x1 to widen bracket
        line=dict(color="black", width=1.5),
        xref="x", yref=f'y{i}'  # Target specific row in the facet
    )
    # Left tick of the bracket, extending down by 5% of the row's upper y limit.
    fig.add_shape(
        type="line",
        x0=0.15, x1=0.15, y0=y_pos, y1=y_pos - (y_max * 0.05),
        line=dict(color="black", width=1.5),
        xref="x", yref=f'y{i}'
    )
    # Right tick of the bracket, same length as the left one.
    fig.add_shape(
        type="line",
        x0=0.85, x1=0.85, y0=y_pos, y1=y_pos - (y_max * 0.05),
        line=dict(color="black", width=1.5),
        xref="x", yref=f'y{i}'
    )

    # Add the annotation text above the bracket
    # NOTE(review): this label is a fixed string, not computed from the t-test
    # results in `df_results`; re-check it whenever the underlying data change.
    fig.add_annotation(
        x=0.5, y=y_pos + (y_max * 0.07), text="p>0.1, Not sig.",
        showarrow=False,
        font=dict(size=fontsize, color="black"),
        xref="x", yref=f'y{i}'
    )

# Show and save the figure
# NOTE(review): the output file is named bar_plot.png although the figure is a box plot.
fig.show()
pio.write_image(fig, f'{wd}/figures/protein_validation/bar_plot.png', scale=4, width=width, height=height)
# pio.write_image(fig, f'{wd}/figures/protein_validation/bar_plot.eps', scale=4, width=width, height=height)


# Protein expression vs. iST expression

In [6]:
# Reload the core metadata and keep only the columns needed for the join below.
core_info_columns = ['core', 'tissue_type', 'patient_number', 'PD-L1_status']
df_core = pd.read_csv(f'{wd}/data/Sample_Info_HTMA_TMA - Sheet1.csv')[core_info_columns]


def _patient_label(row):
    """Join tissue type, patient number and PD-L1 status of one row with underscores."""
    return f"{row['tissue_type']}_{row['patient_number']}_{row['PD-L1_status']}"


df_core['unique_patient_number'] = df_core.apply(_patient_label, axis=1)

def get_protein_expression(core, tech, base_dir='/Users/whuan/dev/OPP/codex_pipeline', marker='PD-L1'):
    """
    Return the mean value of one marker column in a core's single-cell quantification CSV.

    Parameters:
    core: Core identifier, inserted verbatim into the file name.
    tech (str): Protein imaging technology; one of 'codex', 'cycif', 'hmif'.
    base_dir (str): Root directory holding the per-technology output folders.
        The default is the original hard-coded location; pass another root to
        run the notebook on a different machine.
    marker (str): Column of the quantification CSV to average (default 'PD-L1').

    Returns:
    float: Mean of the `marker` column over all rows of the file.

    Raises:
    KeyError: If `tech` is not one of the supported technologies.
    """
    # Per technology: (output sub-folder, numeric suffix used in its file names).
    layout = {
        'codex': ('2022_tma_1', 1230),
        'cycif': ('cycif_tma_010', 600),
        'hmif': ('tma_mac', 760),
    }
    if tech not in layout:
        raise KeyError(f"Unknown tech {tech!r}; expected one of {sorted(layout)}")

    run_dir, suffix = layout[tech]
    csv_path = f'{base_dir}/{run_dir}/quantification/single_cell_quantification_whole-cell_{core}_{suffix}.csv'

    df = pd.read_csv(csv_path, engine='pyarrow')
    protein_expression = df[marker].mean()
    return protein_expression


gene = 'CD274'

# Display spelling for the platform token embedded in each sample name.
platform_names = {'XENIUM':'Xenium','MERSCOPE':'MERSCOPE','COSMX':'CosMx'}
# Cores left out of the protein comparison, on top of the matching_cores_2024 filter.
excluded_cores = list(range(100, 111)) + [42,83,97,113,114,115,116,117,118,120]

# (core, tech) -> mean protein value. The protein files do not depend on the
# iST sample, so each one is read once instead of once per sample.
protein_cache = {}

# One frame per sample, concatenated once after the loop (avoids repeatedly
# concatenating onto an initially empty DataFrame).
sample_frames = []
for sample in [
    '2024_xenium_breast_htma',
    '2024_merscope_breast_htma',
    '2024_cosmx_multitissue_htma',
    ]:
    print(sample)
    df = pd.read_csv(f'{wd}/data/gene_level_csv/gene_level_csv_{sample}.csv', engine='pyarrow')
    df_selected = df.loc[df["gene"]==gene]
    df_join = pd.merge(df_selected, df_core, on=['core','tissue_type'], how='left')
    df_join['sample'] = sample
    df_join['core'] = df_join['core'].astype('int')

    # Apply all row filters up front so that protein files are only read for
    # rows that end up in the result; .copy() makes the column assignments
    # below operate on an independent frame rather than on a slice.
    keep = (
        df_join['core'].isin(matching_cores_2024)
        & ~df_join['core'].isin(excluded_cores)
        & df_join['PD-L1_status'].isin(['high','low'])
    )
    df_join = df_join.loc[keep].copy()

    # Platform = third-from-last '_'-separated token of the sample name.
    platform_token = sample.split('_')[-3].upper()
    df_join['platform'] = platform_names.get(platform_token, platform_token)

    for tech in ['codex','cycif','hmif']:
        tech_values = []
        for core in df_join['core']:
            cache_key = (core, tech)
            if cache_key not in protein_cache:
                protein_cache[cache_key] = get_protein_expression(core, tech)
            tech_values.append(protein_cache[cache_key])
        df_join[f'{tech}_expression'] = tech_values

    sample_frames.append(df_join)

df_p_all = pd.concat(sample_frames)


# Reshape to long format: each input row is repeated once per protein
# technology, with the technology in 'protein_platform' and its value in
# 'protein_expression'.
id_columns = [
    'core', 'gene', 'tissue_type', 'count', 'code_type', 'patient_number',
    'PD-L1_status', 'unique_patient_number', 'sample', 'platform',
]
protein_columns = ['codex_expression', 'cycif_expression', 'hmif_expression']
df_melted = pd.melt(
    df_p_all,
    id_vars=id_columns,
    value_vars=protein_columns,
    var_name='protein_platform',
    value_name='protein_expression',
)

# Turn e.g. 'codex_expression' into 'CODEX'.
stripped_names = df_melted['protein_platform'].str.replace('_expression', '')
df_melted['protein_platform'] = stripped_names.str.upper()

# Preview the first rows of the long-format table.
display(df_melted.head())

2024_xenium_breast_htma
2024_merscope_breast_htma
2024_cosmx_multitissue_htma


Unnamed: 0,core,gene,tissue_type,count,code_type,patient_number,PD-L1_status,unique_patient_number,sample,platform,protein_platform,protein_expression
0,125,CD274,NSCLC,57,gene,4.0,low,NSCLC_4.0_low,2024_xenium_breast_htma,Xenium,CODEX,14.95385
1,126,CD274,NSCLC,121,gene,4.0,low,NSCLC_4.0_low,2024_xenium_breast_htma,Xenium,CODEX,13.420459
2,127,CD274,NSCLC,50,gene,4.0,high,NSCLC_4.0_high,2024_xenium_breast_htma,Xenium,CODEX,29.915389
3,128,CD274,NSCLC,40,gene,4.0,high,NSCLC_4.0_high,2024_xenium_breast_htma,Xenium,CODEX,21.450493
4,129,CD274,NSCLC,377,gene,5.0,high,NSCLC_5.0_high,2024_xenium_breast_htma,Xenium,CODEX,31.309282


In [7]:
width = 800
height = 800
fontsize = 15

# Scatter of protein-level expression (x) against iST count (y): one facet
# column per protein technology, one facet row per iST platform.
fig = px.scatter(
    df_melted,
    x="protein_expression",
    y="count",
    color="protein_platform",
    facet_col="protein_platform",
    facet_row="platform",
    width=width,
    height=height,
)

# Same styling for both axis families; matches=None unlinks the axes so each
# facet can have its own range.
axis_style = dict(
    matches=None,
    title_text='',
    title_font=dict(size=fontsize),
    showline=True,
    linewidth=1,
    linecolor='black',
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

# Hand-picked x range for each facet column.
for col_idx, x_upper in enumerate([500, 20000, 80], start=1):
    fig.update_xaxes(col=col_idx, range=[0, x_upper])

# Customize legend and layout
fig.update_layout(
    title_text=f'Protein PD-L1 expression vs. iST {gene} expression at core level',
    legend_title_text="",
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1,
        xanchor="center",
        x=0.5,
    ),
    font=dict(size=fontsize, color='black'),
    showlegend=True,
)

# Facet labels arrive as '<column>=<value>'; map each one to the bare value.
row_labels = {f'platform={name}': name for name in df_melted.platform.unique()}
col_labels = {f'protein_platform={name}': name for name in df_melted.protein_platform.unique()}

for annotation in fig.layout.annotations:
    # Row (iST platform) labels: rotated and placed near the right edge.
    if annotation.text in row_labels:
        annotation.text = row_labels[annotation.text]
        annotation.font.size = fontsize
        annotation.font.color = 'black'
        annotation.x = 0.98
        annotation.textangle = 90

    # Column (protein technology) labels: horizontal, placed below the plots.
    if annotation.text in col_labels:
        annotation.text = col_labels[annotation.text]
        annotation.font.size = fontsize
        annotation.font.color = 'black'
        annotation.y = -0.06
        annotation.textangle = 0

fig.show()

# Export the figure as PNG and EPS.
for extension in ('png', 'eps'):
    pio.write_image(fig, f'{wd}/figures/protein_validation/scatter_plot.{extension}', scale=4, width=width, height=height)