In [1]:
import pandas as pd
import plotly.io as pio
import plotly.graph_objects as go
import sys
import geopandas as gpd

from st_utils import correct_platform_panel
from st_utils import correct_tissue_names
from st_utils import get_qced_cell_id, get_processed
from constants import PIXEL_TO_UM
from constants import matching_cores_2024 as matching_cores
from constants import SAMPLES


def update_core(row, sample_col='Sample'):
    if 'tumor2' in row[sample_col]:
        return row['core'] + 300
    elif 'normal' in row[sample_col]:
        return row['core'] + 200
    else:
        return row['core']

year = '2024'

samples = [x for x in SAMPLES]
if year == '2024':
    samples = [x for x in SAMPLES if '2024' in x]
elif year == '2023':
    samples = [x for x in SAMPLES if '2024' not in x]
else:
    samples = [x for x in SAMPLES]
print (samples)

['2024_xenium_breast_htma', '2024_xenium_breast_tumor2', '2024_merscope_breast_htma', '2024_merscope_breast_tumor2', '2024_cosmx_multitissue_htma', '2024_cosmx_multitissue_tumor2']


In [2]:
# Generate all cell area file

gdf_all = gpd.read_parquet(f'data/all_xenium_merscope_cosmx_segmentation_{year}.parquet.gzip')
gdf_plot_all = gdf_all.copy()
gdf_plot_all = gdf_plot_all[~gdf_plot_all['Sample'].str.contains('cosmx')]
gdf_plot_all['Platform'] = gdf_plot_all['Sample'].apply(lambda x: x.split("_")[-3].upper())
gdf_plot_all['area_sqpx'] = gdf_plot_all['geometry'].area
gdf_plot_all['area_squm'] = gdf_plot_all.apply(
    lambda row:row['area_sqpx'] * PIXEL_TO_UM[row['Platform'].lower()] * PIXEL_TO_UM[row['Platform'].lower()] if row['Platform'] == 'COSMX' else row['area_sqpx'],
    axis=1)
df_area_all = gdf_plot_all[['cell_id', 'core', 'tissue_type', 'Sample', 'area_squm']]
df_area_all.to_parquet(f'data/all_xenium_merscope_cell_area_{year}.parquet.gzip', compression='gzip', index=False)


  gdf_plot_all['area_sqpx'] = gdf_plot_all['geometry'].area


In [3]:
                                                                                                                                                                                                            # Get CosMx cell area 
df_area_before = pd.DataFrame()
for sample in samples:
    if 'cosmx' in sample:
        print (sample)
        df_c = get_processed(sample, 'cell_level')[['cell_id','core','tissue_type','Area']]
        df_c = correct_tissue_names(sample,df_c)
        df_c = df_c.rename(columns={'Area':'area_squm'})
        df_c['area_squm'] = df_c['area_squm'] * 0.12**2
        df_c['Sample'] = sample
        df_c = df_c[['cell_id','core','tissue_type','Sample','area_squm']]
        df_area_before = pd.concat([df_area_before, df_c])

# Get Xenium and Merscope cell area
df_xe_mer = pd.read_parquet(f'data/all_xenium_merscope_cell_area_{year}.parquet.gzip', engine='pyarrow')

for sample in samples:
    if 'cosmx' not in sample:
        print (sample)
        df_xe_mer_single = df_xe_mer.loc[df_xe_mer['Sample']==sample]
        df_xe_mer_single = correct_tissue_names(sample,df_xe_mer_single)
        df_area_before = pd.concat([df_area_before, df_xe_mer_single], ignore_index=False)

2024_cosmx_multitissue_htma
2024_cosmx_multitissue_tumor2
2024_xenium_breast_htma
2024_xenium_breast_tumor2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tissue_type'] = df['tissue_type'].replace(htma_correct_names)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tissue_type'] = df['tissue_type'].replace(tumor2_correct_names)


2024_merscope_breast_htma
2024_merscope_breast_tumor2


# Load data

In [4]:
ids, _ = get_qced_cell_id(10,0)

df_area_before.core = df_area_before.core.astype('int')
df_area_before['core'] = df_area_before.apply(update_core, axis=1)
df_area_before.core = df_area_before.core.astype('str')
df_area_before = df_area_before.loc[~df_area_before['tissue_type'].isin(['Marker','MARKER'])]
df_area_before = df_area_before.drop_duplicates()
df_area_before['Filtration'] = 'Before'

df_area_after = df_area_before.loc[df_area_before['cell_id'].isin(ids)]
df_area_after['Filtration'] = 'After'

df_area_all = pd.concat([df_area_before, df_area_after])
df_area_all['TMA'] = df_area_all['Sample'].apply(lambda x: x.split('_')[-1].upper())
df_area_all = df_area_all.sort_values(by=['Sample'])
df_area_all['Platform_Panel'] = df_area_all['Sample'].apply(lambda x: f"{x.split('_')[-3].upper()}_{x.split('_')[-2].capitalize()}")
df_area_all = correct_platform_panel(df_area_all, 'Platform_Panel')
df_area_all = df_area_all.reset_index()
matching_cores = [str(x) for x in matching_cores]
df_area_all = df_area_all.loc[df_area_all['core'].isin(matching_cores)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_area_after['Filtration'] = 'After'


In [5]:
df = df_area_all.rename(columns={"tissue_type":"Tissue Type"})
grouped_df = df[['Tissue Type', 'Platform_Panel', 'area_squm']].groupby(['Tissue Type', 'Platform_Panel']).median().reset_index()
pivoted_df = grouped_df.pivot_table(index='Tissue Type', columns='Platform_Panel', values='area_squm')
pivoted_df.to_csv(f'tables/cell_area_{year}.csv')

# Plot

In [8]:

# fontsize = 40
# width = 1100
# height = 900
# plot_metric = 'area_squm'
# groupby = 'Platform_Panel'
# val_range = [-5,300]

# df = df_area_all.copy()

# fig = go.Figure()
# fig.add_trace(go.Violin(x=df[groupby][df['Filtration'] == 'Before'],
#                         y=df[f'{plot_metric}'][df['Filtration'] == 'Before'],
#                         legendgroup='Yes', scalegroup='Yes', name='Before',
#                         side='negative',
#                         line_color='red',
#                         showlegend=False)
#              )
# fig.add_trace(go.Violin(x=df[groupby][df['Filtration'] == 'After'],
#                         y=df[f'{plot_metric}'][df['Filtration'] == 'After'],
#                         legendgroup='No', scalegroup='No', name='After',
#                         side='positive',
#                         line_color='green',
#                         showlegend=False)
#              )
# fig.update_traces(meanline_visible=True)
# fig.update_yaxes(title_text=f"Segmented cell area (um^2)",
#                 title_font=dict(size=fontsize), range=val_range, tickfont=dict(size=fontsize-2), showline=True, linewidth=2, linecolor='black')
# fig.update_xaxes(title_text="", tickfont=dict(size=fontsize-2), showline=True, linewidth=2, linecolor='black')

# fig.update_layout(
#     violingap=0.35,  # Set the gap between violins
#     violinmode='overlay',  # Overlay mode for violins
#     width=width,  # Set the width of the figure
#     height=height,  # Set the height of the figure
#     legend=dict(
#         orientation="h",  # horizontal alignment
#         yanchor="bottom", 
#         y=0.8,  # position legend just above the plot
#         xanchor="right",
#         x=0.70  # center the legend
#     ),
#     font=dict(color='black', size=fontsize),
#     paper_bgcolor='rgba(0,0,0,0)',
#     plot_bgcolor='rgba(0,0,0,0)',
# )

# fig.show()
# pio.write_image(fig, f'figures/Fig_4_Segmentation/Main_Fig_4_B_cell_area.png', scale=4, width=width, height=height)
# # pio.write_image(fig, f'figures/Fig_4_Segmentation/Main_Fig_4_B_cell_area.eps', scale=4, width=width, height=height)
# pio.write_image(fig, f'figures/Fig_4_Segmentation/Main_Fig_4_B_cell_area.svg', scale=4, width=width, height=height)
