# Pre-process_Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_outs

In [1]:
# Load IPython's autoreload extension and select mode 2 for it
%load_ext autoreload
%autoreload 2
# Set ANYWIDGET_HMR=1 in the kernel environment (presumably for anywidget hot reloading — confirm)
%env ANYWIDGET_HMR=1

env: ANYWIDGET_HMR=1


In [2]:
import numpy as np
import pandas as pd

# macOS requirement: prepend the Homebrew library directory to DYLD_LIBRARY_PATH.
# Only applied on macOS, and without leaving a trailing ':' (an empty path entry)
# when the variable was previously unset.
import os
import sys

if sys.platform == 'darwin':
    os.environ['DYLD_LIBRARY_PATH'] = ':'.join(
        entry
        for entry in ('/opt/homebrew/lib', os.environ.get('DYLD_LIBRARY_PATH', ''))
        if entry
    )

import celldega as dega

import tifffile
import zarr

import matplotlib.pyplot as plt
from matplotlib.colors import to_hex

import geopandas as gpd
import shapely

import tarfile

In [3]:
ls ../data/xenium_data/Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_outs/

analysis.tar.gz               cells.zarr.zip
analysis.zarr.zip             experiment.xenium
analysis_summary.html         gene_panel.json
aux_outputs.tar.gz            metrics_summary.csv
cell_boundaries.csv.gz        morphology.ome.tif
cell_boundaries.parquet       [1m[36mmorphology_focus[m[m/
cell_feature_matrix.h5        nucleus_boundaries.csv.gz
cell_feature_matrix.tar.gz    nucleus_boundaries.parquet
cell_feature_matrix.zarr.zip  transcripts.parquet
cells.csv.gz                  transcripts.zarr.zip
cells.parquet


In [4]:
ls ../data/xenium_landscapes/

[1m[36mXenium_V1_hBoneMarrow_acute_lymphoid_leukemia_section_outs_landscape_files[m[m/
[1m[36mXenium_V1_hBoneMarrow_nondiseased_section_outs_landscape_files[m[m/


In [5]:
# Name of the Xenium output folder; the input and output paths below are derived from it
dataset_name = 'Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_outs'

In [6]:
# Directory holding the raw Xenium outputs for this dataset
base_path = f'../data/xenium_data/{dataset_name}/'

In [7]:
# Directory that receives every file this notebook generates
path_landscape_files = f'../data/xenium_landscapes/{dataset_name}_landscape_files/'

In [8]:
# Display the resolved input directory
base_path

'../data/xenium_data/Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_outs/'

In [9]:
# Display the resolved output directory
path_landscape_files

'../data/xenium_landscapes/Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_outs_landscape_files/'

In [10]:
# Create the output directory together with any missing parent directories;
# nothing happens when the directory already exists.
os.makedirs(path_landscape_files, exist_ok=True)

## Unzip Xenium Data

#### Decompress Cell Feature Matrix MTX Files

In [11]:
# Archive to decompress: the cell feature matrix (MTX files per the section header)
tar_file_path = base_path + 'cell_feature_matrix.tar.gz'
# Directory that receives the extracted contents
output_directory = path_landscape_files

# Use the 'data' extraction filter when this Python version provides it, so archive
# members cannot be written outside output_directory; older interpreters fall back
# to the previous unfiltered behaviour.
extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

with tarfile.open(tar_file_path, "r:gz") as tar:
    tar.extractall(path=output_directory, **extract_kwargs)

print(f"File {tar_file_path} has been decompressed to {output_directory}")


File ../data/xenium_data/Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_outs/cell_feature_matrix.tar.gz has been decompressed to ../data/xenium_landscapes/Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_outs_landscape_files/


#### Decompress Xenium Analysis Files

In [12]:
# Archive to decompress: the Xenium analysis outputs
tar_file_path = base_path + 'analysis.tar.gz'
# Directory that receives the extracted contents
output_directory = path_landscape_files

# Use the 'data' extraction filter when this Python version provides it, so archive
# members cannot be written outside output_directory; older interpreters fall back
# to the previous unfiltered behaviour.
extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

with tarfile.open(tar_file_path, "r:gz") as tar:
    tar.extractall(path=output_directory, **extract_kwargs)

print(f"File {tar_file_path} has been decompressed to {output_directory}")


File ../data/xenium_data/Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_outs/analysis.tar.gz has been decompressed to ../data/xenium_landscapes/Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_outs_landscape_files/


# Cell-by-Gene (CBG) Matrix

In [13]:
# Load the cell-by-gene matrix from the extracted cell_feature_matrix directory and display it
cbg = dega.pre.read_cbg_mtx(f'{path_landscape_files}cell_feature_matrix/')
cbg

read mtx file from  ../data/xenium_landscapes/Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_outs_landscape_files/cell_feature_matrix/


1,A2ML1,AAMP,AAR2,AARSD1,ABAT,ABCA1,ABCA3,ABCA4,ABCB1,ABCB4,...,DeprecatedCodeword_18589,DeprecatedCodeword_18601,DeprecatedCodeword_18609,DeprecatedCodeword_18616,DeprecatedCodeword_18620,DeprecatedCodeword_18632,DeprecatedCodeword_18637,DeprecatedCodeword_18639,DeprecatedCodeword_18646,DeprecatedCodeword_18649
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaaaadoa-1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaaaclhf-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaaafcfj-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaaagamp-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaaaiako-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
oikoinno-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
oikojopg-1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
oikokaeg-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
oikooakd-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Compute per-gene metadata from the cell-by-gene matrix (the log below reports mean and variance).
# NOTE(review): meta_gene_exp is not read again in the visible part of this notebook — confirm it is needed.
meta_gene_exp = dega.pre.calc_meta_gene_data(cbg)

calculating mean expression from sparse float data
calculating variance by looping over rows


### Gene Metadata

In [15]:
# Input: extracted cell_feature_matrix directory; output: meta_gene.parquet in the landscape directory
path_cbg = f'{path_landscape_files}cell_feature_matrix/'
path_output = f'{path_landscape_files}meta_gene.parquet'

# Build the gene metadata file for the Xenium technology
dega.pre.make_meta_gene('Xenium', path_cbg, path_output)

read mtx file from  ../data/xenium_landscapes/Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_outs_landscape_files/cell_feature_matrix/
calculating mean expression from sparse float data
calculating variance by looping over rows


### Cell-by-gene Files

In [17]:
# NOTE(review): disabled call — as it stands this notebook does not write per-gene cell-by-gene parquet files.
# dega.pre.save_cbg_gene_parquets(path_landscape_files, cbg, verbose=True)

### Image Tiles

In [19]:
# Location of the first morphology-focus OME-TIFF inside the raw Xenium outputs
file_path = f'{base_path}morphology_focus/morphology_focus_0000.ome.tif'

# Read the first series of the file into memory as one array
# (tifffile is already imported in the imports cell at the top of the notebook)
with tifffile.TiffFile(file_path) as tif:
    series = tif.series[0]
    image_data = series.asarray()


<tifffile.TiffFile 'morphology_focus_0000.ome.tif'> OME series cannot read multi-file pyramids


In [20]:
# Show the array dimensions; the image cells below index the first axis (0-3) to pick a channel
image_data.shape

(4, 34119, 39776)

### DAPI, BOUND, RNA, and PROT Channels

In [22]:
# Channel 0 of the morphology stack, exported below as the 'dapi' pyramid
image_data_scaled = image_data[0, :, :]

# Double the intensities. For integer images, cap the input at half of the dtype's
# maximum first, so bright pixels saturate instead of wrapping around on overflow
# (values at or below half the maximum are doubled exactly as before).
if np.issubdtype(image_data_scaled.dtype, np.integer):
    image_data_scaled = np.minimum(image_data_scaled, np.iinfo(image_data_scaled.dtype).max // 2)
image_data_scaled = image_data_scaled * 2

# Save the image data to a regular TIFF file without compression
tifffile.imwrite(path_landscape_files + 'output_regular.tif', image_data_scaled, compression=None)
# Downsize with factor 0.5, convert to JPEG, then build the Deep Zoom pyramid
image_ds = dega.pre.reduce_image_size(path_landscape_files + 'output_regular.tif', 0.5, path_landscape_files)
image_jpeg = dega.pre.convert_to_jpeg(image_ds, quality=100)
dega.pre.make_deepzoom_pyramid(image_jpeg, path_landscape_files + 'pyramid_images/', 'dapi')

In [23]:
# Channel 1 of the morphology stack, exported below as the 'bound' pyramid
image_data_scaled = image_data[1, :, :]

# Double the intensities. For integer images, cap the input at half of the dtype's
# maximum first, so bright pixels saturate instead of wrapping around on overflow
# (values at or below half the maximum are doubled exactly as before).
if np.issubdtype(image_data_scaled.dtype, np.integer):
    image_data_scaled = np.minimum(image_data_scaled, np.iinfo(image_data_scaled.dtype).max // 2)
image_data_scaled = image_data_scaled * 2

# Save the image data to a regular TIFF file without compression
tifffile.imwrite(path_landscape_files + 'output_regular.tif', image_data_scaled, compression=None)
# Downsize with factor 0.5, convert to JPEG, then build the Deep Zoom pyramid
image_ds = dega.pre.reduce_image_size(path_landscape_files + 'output_regular.tif', 0.5, path_landscape_files)
image_jpeg = dega.pre.convert_to_jpeg(image_ds, quality=100)
dega.pre.make_deepzoom_pyramid(image_jpeg, path_landscape_files + 'pyramid_images/', 'bound')

In [24]:
# Channel 2 of the morphology stack, exported below as the 'rna' pyramid
image_data_scaled = image_data[2, :, :]

# Double the intensities. For integer images, cap the input at half of the dtype's
# maximum first, so bright pixels saturate instead of wrapping around on overflow
# (values at or below half the maximum are doubled exactly as before).
if np.issubdtype(image_data_scaled.dtype, np.integer):
    image_data_scaled = np.minimum(image_data_scaled, np.iinfo(image_data_scaled.dtype).max // 2)
image_data_scaled = image_data_scaled * 2

# Save the image data to a regular TIFF file without compression
tifffile.imwrite(path_landscape_files + 'output_regular.tif', image_data_scaled, compression=None)
# Downsize with factor 0.5, convert to JPEG, then build the Deep Zoom pyramid
image_ds = dega.pre.reduce_image_size(path_landscape_files + 'output_regular.tif', 0.5, path_landscape_files)
image_jpeg = dega.pre.convert_to_jpeg(image_ds, quality=100)
dega.pre.make_deepzoom_pyramid(image_jpeg, path_landscape_files + 'pyramid_images/', 'rna')

In [25]:
# Channel 3 of the morphology stack, exported below as the 'prot' pyramid
image_data_scaled = image_data[3, :, :]

# Double the intensities. For integer images, cap the input at half of the dtype's
# maximum first, so bright pixels saturate instead of wrapping around on overflow
# (values at or below half the maximum are doubled exactly as before).
if np.issubdtype(image_data_scaled.dtype, np.integer):
    image_data_scaled = np.minimum(image_data_scaled, np.iinfo(image_data_scaled.dtype).max // 2)
image_data_scaled = image_data_scaled * 2

# Save the image data to a regular TIFF file without compression
tifffile.imwrite(path_landscape_files + 'output_regular.tif', image_data_scaled, compression=None)
# Downsize with factor 0.5, convert to JPEG, then build the Deep Zoom pyramid
image_ds = dega.pre.reduce_image_size(path_landscape_files + 'output_regular.tif', 0.5, path_landscape_files)
image_jpeg = dega.pre.convert_to_jpeg(image_ds, quality=100)
dega.pre.make_deepzoom_pyramid(image_jpeg, path_landscape_files + 'pyramid_images/', 'prot')

### Cell Metadata

In [26]:
# Helper for opening a Zarr hierarchy stored on disk
def open_zarr(path: str) -> zarr.Group:
    """Return the Zarr group found at ``path``.

    A path ending in ".zip" is opened through a read-mode ``zarr.ZipStore``;
    any other path is opened through a ``zarr.DirectoryStore``.
    """
    if path.endswith(".zip"):
        store = zarr.ZipStore(path, mode="r")
    else:
        store = zarr.DirectoryStore(path)
    return zarr.group(store=store)

# Open the cells Zarr archive from the raw Xenium outputs; it contains the segmentation mask Zarr arrays
root = open_zarr(base_path + "cells.zarr.zip")

# Optional inspection helpers, left disabled:
# # Look at group array info and structure
# root.info
# root.tree() # shows structure, array dimensions, data types


In [27]:
# Read the full 'homogeneous_transform' array from the 'masks' group into memory and display it
transformation_matrix = root['masks']['homogeneous_transform'][:]
transformation_matrix

array([[4.705882, 0.      , 0.      , 0.      ],
       [0.      , 4.705882, 0.      , 0.      ],
       [0.      , 0.      , 1.      , 0.      ],
       [0.      , 0.      , 0.      , 1.      ]], dtype=float32)

In [28]:
# Write rows/columns 0-2 of the transform as a space-separated file without header or index.
# NOTE(review): the slice leaves out the 4th row and column of the 4x4 matrix —
# confirm nothing downstream needs them.
pd.DataFrame(transformation_matrix[:3, :3]).to_csv(
    f'{path_landscape_files}xenium_transform.csv', sep=' ', header=False, index=False
)

In [29]:
# Transform file written by the previous cell
path_transformation_matrix = f'{path_landscape_files}xenium_transform.csv'
# Input: cells.csv.gz from the raw Xenium outputs
path_meta_cell_micron = f'{base_path}cells.csv.gz'
# Output: cell metadata parquet in the landscape directory
path_meta_cell_image = f'{path_landscape_files}cell_metadata.parquet'

In [30]:
# Load the graph-based clustering result (first CSV column becomes the index)
# and store the 'Cluster' labels with the pandas string dtype
df_meta = pd.read_csv(
    f'{path_landscape_files}analysis/clustering/gene_expression_graphclust/clusters.csv',
    index_col=0,
).astype({'Cluster': 'string'})

# The frame is expected to have exactly one column; give it the name 'cluster'
df_meta.columns = ['cluster']

In [31]:
# Build the cell metadata file at path_meta_cell_image from cells.csv.gz and the saved
# transform file, passing the cluster labels along as extra metadata (df_meta)
dega.pre.make_meta_cell_image_coord(
    'Xenium', 
    path_transformation_matrix, 
    path_meta_cell_micron, 
    path_meta_cell_image, 
    df_meta=df_meta
)

### Cluster Colors

In [32]:
# Number of rows per cluster label
ser_counts = df_meta['cluster'].value_counts()
# Cluster labels in the order value_counts returned them
clusters = ser_counts.index.tolist()

In [33]:
# Pool the colors of every Matplotlib colormap whose name contains "tab"
palettes = [plt.get_cmap(cmap_name).colors for cmap_name in plt.colormaps() if "tab" in cmap_name]
flat_colors = [rgb for palette in palettes for rgb in palette]

# Hex-encode the pooled RGB tuples
flat_colors_hex = [to_hex(rgb) for rgb in flat_colors]

# One color per cluster, cycling through the pool; labels containing "Blank" get white
colors = [
    "#FFFFFF" if "Blank" in label else flat_colors_hex[position % len(flat_colors_hex)]
    for position, label in enumerate(clusters)
]

# Cluster table indexed by cluster label: assigned color plus the count from ser_counts
ser_color = pd.Series(colors, index=clusters, name='color')
meta_cluster = pd.DataFrame(ser_color)
meta_cluster['count'] = ser_counts

meta_cluster.to_parquet(path_landscape_files + 'meta_cluster.parquet')

### Transcripts

In [34]:
# %%time 
# technology = 'Xenium'
# path_trx = base_path + 'transcripts.parquet'
# path_trx_tiles = path_landscape_files + 'transcript_tiles'
# tile_bounds = dega.pre.make_trx_tiles(
#     'Xenium', 
#     path_trx, 
#     path_transformation_matrix, 
#     path_trx_tiles,
#     tile_size=100,
#     # verbose=True
# )

CPU times: user 2h 59min 13s, sys: 2h 18min 17s, total: 5h 17min 31s
Wall time: 12h 31min 20s


### Cell Boundaries

In [35]:
%%time
# Write cell boundary tiles into the 'cell_segmentation' folder of the landscape directory.
# NOTE: `tile_bounds` is not assigned in this cell. The only assignment in this notebook is
# the return value of dega.pre.make_trx_tiles in the Transcripts section, so that call must
# have run in the current kernel before this cell.
path_cell_boundaries = base_path + 'cell_boundaries.parquet'
path_output = path_landscape_files + 'cell_segmentation'
dega.pre.make_cell_boundary_tiles(
    'Xenium',
    path_cell_boundaries, 
    path_meta_cell_micron, 
    path_transformation_matrix, 
    path_output,
    tile_size=100,
    tile_bounds=tile_bounds
)

CPU times: user 11min 24s, sys: 1min 4s, total: 12min 28s
Wall time: 12min 41s


### Gene Metadata

In [36]:
# Input: extracted cell_feature_matrix directory; output: gene_metadata.parquet in the landscape directory
path_cbg = f'{path_landscape_files}cell_feature_matrix/'
path_output = f'{path_landscape_files}gene_metadata.parquet'

# Build the gene metadata file for the Xenium technology
dega.pre.make_meta_gene('Xenium', path_cbg, path_output)

read mtx file from  ../data/xenium_landscapes/Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_outs_landscape_files/cell_feature_matrix/
calculating mean expression from sparse float data
calculating variance by looping over rows


### Max Zoom

In [37]:
# Tile directory of the DAPI pyramid (presumably written by the 'dapi' make_deepzoom_pyramid call — confirm)
path_image_pyramid = f'{path_landscape_files}pyramid_images/dapi_files/'

# Look up and print the highest zoom level found in that pyramid
max_pyramid_zoom = dega.pre.get_max_zoom_level(path_image_pyramid)
print(max_pyramid_zoom)

15


### Cluster Gene Expression

In [39]:
# Read only the cell id and centroid columns; the cell id becomes the index
usecols = ['cell_id', 'x_centroid', 'y_centroid']
meta_cell = pd.read_csv(f'{base_path}cells.csv.gz', usecols=usecols, index_col=0)

# Rename the two centroid columns and display the frame
meta_cell.columns = ['center_x', 'center_y']
meta_cell

Unnamed: 0_level_0,center_x,center_y
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
aaaaadoa-1,2871.859619,347.729767
aaaaclhf-1,2882.301025,349.938110
aaaafcfj-1,2880.217041,338.575897
aaaagamp-1,2852.795166,356.880615
aaaaiako-1,2854.036133,361.754639
...,...,...
oikoinno-1,6281.145996,4459.715332
oikojopg-1,6231.592285,4434.088379
oikokaeg-1,6243.616211,4449.573730
oikooakd-1,6224.635254,4435.554688


In [40]:
# Attach the cluster label to each cell; pandas aligns the assigned Series on the index,
# so cells missing from df_meta end up with a missing value
meta_cell['cluster'] = df_meta['cluster']

In [41]:
# Mean expression per cluster: one Series per cluster label, later joined column-wise
list_ser = []

# Drop missing labels before iterating. pandas represents them as NA/NaN, never as None,
# so an `is not None` check would let them through and produce an empty cell selection
# (followed by a division by a length of zero).
for inst_cat in meta_cell['cluster'].dropna().unique().tolist():
    inst_cells = meta_cell[meta_cell['cluster'] == inst_cat].index.tolist()

    # Sum over the cluster's cells divided by the number of cells
    inst_ser = cbg.loc[inst_cells].sum()/len(inst_cells)
    inst_ser.name = inst_cat

    list_ser.append(inst_ser)

df_sig = pd.concat(list_ser, axis=1)


In [42]:
# Rebuild the signature table from the per-cluster Series collected above
df_sig = pd.concat(list_ser, axis=1)
# handling weird behavior where there is a multiindex it appears:
# re-assign both axes from plain Python lists of their current labels
df_sig.columns = df_sig.columns.tolist()
df_sig.index = df_sig.index.tolist()

In [47]:
# Keep only row labels that contain none of the control/placeholder tags
keep_genes = [
    x for x in df_sig.index.tolist()
    if 'Unassigned' not in x and 'NegControl' not in x and 'DeprecatedCodeword' not in x
]

# Restrict the rows to the kept genes and order the columns like `clusters`
df_sig = df_sig.loc[keep_genes, clusters]
df_sig.shape

(4645, 35)

In [48]:
# Convert the sparse signature table to dense form and write it to df_sig.parquet
df_sig.sparse.to_dense().to_parquet(path_landscape_files + 'df_sig.parquet')

### Save Landscape Parameters JSON

In [45]:
# One entry per image pyramid: its name, the button label, and an RGB color triple
image_info = [
    {"name": "dapi", "button_name": "DAPI", "color": [0, 0, 255]},
    {"name": "bound", "button_name": "BOUND", "color": [0, 255, 0]},
    {"name": "rna", "button_name": "RNA", "color": [255, 0, 0]},
    {"name": "prot", "button_name": "PROT", "color": [255, 255, 255]},
]

In [46]:
# Save the landscape parameters for this dataset, passing the 'dapi_files' pyramid name,
# the tile size used by the tiling cells (100), and the per-image display settings
dega.pre.save_landscape_parameters(
    'Xenium', 
    path_landscape_files,
    'dapi_files',
    tile_size=100,
    image_info=image_info
)

../data/xenium_landscapes/Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_outs_landscape_files/pyramid_images/dapi_files/
