# Secret Pre-Landscape Notebook MERSCOPE

In [1]:
# Enable the autoreload extension in mode 2 so edits to imported modules
# (e.g. a local celldega checkout) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Set ANYWIDGET_HMR=1 in the kernel environment
# (presumably enables anywidget hot module reloading during widget development — TODO confirm).
%env ANYWIDGET_HMR=1

env: ANYWIDGET_HMR=1


In [5]:
import pandas as pd
from skimage.io import imread
import os
import celldega as dega
# Show which file the `dega.pre` subpackage was imported from, to confirm
# which celldega installation the kernel is using.
print(dega.pre.__file__)

# Technology identifier passed to the dega.pre helpers in later cells.
technology = 'MERSCOPE'

/Users/whuan/dev/celldega/src/celldega/pre/__init__.py


In [6]:
# Sample and region processed by this notebook.
sample = '2024_merscope_breast_htma'
region = 0

# Root directory for raw data and generated landscape files.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
DATA_DIR = '/Users/whuan/dev/ist_benchmarking/data'

# Raw input locations (per sample / per region).
data_dir = f'{DATA_DIR}/{sample}'
data_dir_region = f'{DATA_DIR}/{sample}/region_{region}'

# Output locations for the landscape files (per sample / per region).
path_landscape_files = f'{DATA_DIR}/landscapes/{sample}'
path_landscape_files_region = f'{DATA_DIR}/landscapes/{sample}/region_{region}'


for folder in [data_dir, data_dir_region, path_landscape_files, path_landscape_files_region]:
    if not os.path.exists(folder):
        # os.makedirs also creates missing parents (e.g. `{DATA_DIR}/landscapes`);
        # os.mkdir would raise FileNotFoundError on a fresh data directory.
        os.makedirs(folder, exist_ok=True)
        print (folder)

## Copy Data from Google Bucket

In [7]:
raw_data_bucket = 'fc-b8e703d3-de2d-4532-94cc-efe864b4feea/SPARC/Revisions/202405032008_SPARCRevisionsHTMAJN3May24_VMSC11302'

for file in [
    'images/mosaic_DAPI_z1.tif',
    'images/mosaic_Cellbound1_z1.tif',
    'images/micron_to_mosaic_pixel_transform.csv',
    'cell_metadata.csv',
    'detected_transcripts.csv',
    'cell_boundaries.parquet',
    'cell_by_gene.csv'
    ]:

    if os.path.exists(f"{data_dir_region}/{file.split('/')[-1]}"):
        print (f'{file} is previously downloaded')
    else:
        cmd = f'gsutil cp gs://{raw_data_bucket}/region_{region}/{file} {data_dir_region}/'
        print (cmd)
        ! {cmd}

images/mosaic_DAPI_z1.tif is previously downloaded
images/mosaic_Cellbound1_z1.tif is previously downloaded
images/micron_to_mosaic_pixel_transform.csv is previously downloaded
cell_metadata.csv is previously downloaded
detected_transcripts.csv is previously downloaded
cell_boundaries.parquet is previously downloaded
cell_by_gene.csv is previously downloaded


## Make Pyramidal Image

In [8]:
# Settings for the image pyramid; `image_scale` is also passed to the cell-metadata step.
channel = 'dapi'
image_scale = 1

# Read the DAPI mosaic and run it through the 16-to-8-bit helper before tiling.
img = imread(f'{data_dir_region}/mosaic_DAPI_z1.tif')
img_8bit = dega.pre.check_and_convert_16_to_8_bit(img)

# Write the pyramid for this channel under the region's landscape folder.
dega.pre.make_deepzoom_pyramid(
    img_8bit,
    f"{path_landscape_files_region}/pyramid_images",
    channel,
    clahe_tile_size=64,
    clahe_contrast_limit=20,
    suffix=".webp[Q=100]",
)

## Cell Metadata

In [9]:
# Inputs: raw cell metadata and the micron-to-mosaic-pixel transform.
path_meta_cell_micron = f'{data_dir_region}/cell_metadata.csv'
path_transformation_matrix = f'{data_dir_region}/micron_to_mosaic_pixel_transform.csv'

# Output: cell metadata written into the region's landscape files.
path_meta_cell_image = f'{path_landscape_files_region}/cell_metadata.parquet'

In [10]:
# Write the cell metadata parquet for the landscape from the raw metadata CSV
# and the transformation matrix. Uses the shared `technology` variable instead
# of a repeated 'MERSCOPE' literal, consistent with the other dega.pre calls.
dega.pre.make_meta_cell_image_coord(
    technology,
    path_transformation_matrix,
    path_meta_cell_micron,
    path_meta_cell_image,
    image_scale=image_scale,
)

## Transcripts

In [11]:
%%time
tile_size = 250 #
path_trx = f'{data_dir_region}/detected_transcripts.csv'
path_trx_tiles = f'{path_landscape_files_region}/transcript_tiles'

tile_bounds = dega.pre.make_trx_tiles(
                'MERSCOPE',
                path_trx,
                path_transformation_matrix,
                path_trx_tiles,
                coarse_tile_size=5000,
                fine_tile_size=tile_size,
                chunk_size=100000,
                verbose=False,
                image_scale=1,
                max_workers=4)
                

tile_bounds


python(47532) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Processing chunks: 100%|██████████| 15/15 [00:00<00:00, 223.91it/s]
Processing coarse tiles: 231tile [01:05,  3.54tile/s]

CPU times: user 2min 12s, sys: 3min 3s, total: 5min 16s
Wall time: 1min 5s





{'x_min': 435.0, 'x_max': 53836.16, 'y_min': 131.36, 'y_max': 100389.39}

## Cell Boundaries

In [12]:
%%time

path_cell_boundaries = f'{data_dir_region}/cell_boundaries.parquet'
path_meta_cell_micron = f'{data_dir_region}/cell_metadata.csv'
path_transformation_matrix = f'{data_dir_region}/micron_to_mosaic_pixel_transform.csv'
path_output = f'{path_landscape_files_region}/cell_segmentation'

dega.pre.make_cell_boundary_tiles(
    'MERSCOPE',
    path_cell_boundaries,
    path_meta_cell_micron,
    path_transformation_matrix,
    path_output,
    coarse_tile_size=5000,
    fine_tile_size=250,
    tile_bounds=tile_bounds,
    image_scale=1,
    max_workers=8)


Processing coarse tiles: 100%|██████████| 11/11 [02:46<00:00, 15.14s/it]


CPU times: user 3min 9s, sys: 35.2 s, total: 3min 44s
Wall time: 3min 9s


## Gene Metadata

In [18]:
# Input: cell-by-gene matrix for this region (also read by the next cell).
path_cbg = f'{data_dir_region}/cell_by_gene.csv'

# Output: gene metadata parquet inside the region's landscape files.
path_output = f'{path_landscape_files_region}/meta_gene.parquet'

dega.pre.make_meta_gene(
    technology,
    path_cbg,
    path_output,
)


cbg is a dense DataFrame. Proceeding with dense operations.
Calculating mean expression
Calculating variance
Calculating maximum expression
Calculating proportion of non-zero expression


In [14]:
# Load the cell-by-gene matrix indexed by the 'cell' column, name the column
# axis, and store the values as sparse floats with 0 as the fill value.
cbg = (
    pd.read_csv(path_cbg)
    .set_index('cell')
    .rename_axis('__index_level_0__', axis='columns')
    .astype(pd.SparseDtype("float", fill_value=0))
)

# Build a lookup from EntityID to the index of the landscape cell metadata.
df_meta = pd.read_parquet(f"{path_landscape_files_region}/cell_metadata.parquet")
entity_to_cell_id_dict = pd.Series(df_meta.index.values, index=df_meta.EntityID).to_dict()

# Re-index the matrix by the mapped cell ids (via a temporary 'cell' column).
cbg = cbg.assign(cell=cbg.index.map(entity_to_cell_id_dict)).set_index('cell')
display(cbg.head())

dega.pre.save_cbg_gene_parquets(path_landscape_files_region, cbg, verbose=True)

__index_level_0__,PDK4,CD79B,CD9,CD4,TNFRSF17,CEACAM6,LAG3,ESR1,KRT23,CTSG,...,Blank-21,Blank-22,Blank-23,Blank-24,Blank-25,Blank-26,Blank-27,Blank-28,Blank-29,Blank-30
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
w6wHcCNCq,0,0,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RDvx4MllU,0,0,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
XLdToFTaV,0,0,0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5kDxZ8TiG,0,0,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
iDvhSyThC,0,0,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Processing gene 0: PDK4
Processing gene 100: CD86
Processing gene 200: BANK1


## Get Max Zoom
Compute the maximum zoom level of the image pyramid. This value should be saved to a file in the landscape files so that the front-end can read it and use it in the code.

In [15]:
# Folder containing the DAPI pyramid tiles for this region.
pyramid_dir = f"{path_landscape_files_region}/pyramid_images"
path_image_pyramid = f"{pyramid_dir}/dapi_files"

# Ask celldega for the maximum zoom level found in that folder.
max_pyramid_zoom = dega.pre.get_max_zoom_level(path_image_pyramid)

print(max_pyramid_zoom)

17


## Save Landscape Parameters JSON

In [16]:
# Display settings for the single DAPI channel; the colour triple is
# [0, 0, 255] (blue if interpreted as RGB).
dapi_color = [0, 0, 255]
image_info = [
    {"name": "dapi", "button_name": "DAPI", "color": dapi_color},
]

# Tile size recorded in the landscape parameters.
tile_size = 250

In [17]:
# Write the landscape parameters for this region, pointing at the DAPI
# pyramid folder and recording the tile size, image format and channel info.
dega.pre.save_landscape_parameters(
    technology,
    path_landscape_files_region,
    'dapi_files',
    image_format='.webp',
    image_info=image_info,
    tile_size=tile_size,
)

/Users/whuan/dev/ist_benchmarking/data/landscapes/2024_merscope_breast_htma/region_0/pyramid_images/dapi_files


In [16]:
# Destination bucket/prefix for the finished landscape files.
des_bucket_name = "fc-secure-cbb15268-8969-436a-818b-ae40f52e3b41/ist_data/landscape_files/"

# Assemble the recursive, parallel gsutil copy command. It is only printed
# here, not executed.
cmd = " ".join(["gsutil", "-m", "cp", "-r", path_landscape_files, f"gs://{des_bucket_name}"])
print (cmd)

gsutil -m cp -r /Users/whuan/dev/ist_benchmarking/data/landscapes/2024_merscope_breast_htma gs://fc-secure-cbb15268-8969-436a-818b-ae40f52e3b41/ist_data/landscape_files/
