# Custom segmentation Xenium data

We here provide commands to segment Xenium data into the different segmentations. Raw data can be accessed via Xenium. If data is not available there anymore, raw data can be requested with the corresponding authors.
This notebook is not intended to be executed but rather is a collection of batch commands to execute segmentation and provides the commands used by us.

## Baysor segmentation

In [1]:
import pandas as pd

In [2]:
#tx_file = pd.read_csv('/external_data/other/Vizgen_redownload/HumanLiverCancerPatient1/detected_transcripts.csv', engine="pyarrow")
tx_file = pd.read_csv('transcripts.csv.gz', compression="gzip")

In [5]:
tx_file['cell_id'] = tx_file['cell_id'] + 1

In [7]:
x_min, y_min = tx_file[['x_location', 'y_location']].min()
x_max, y_max = tx_file[['x_location', 'y_location']].max()

x_max - x_min, y_max - y_min

(10261.238762, 7026.7352137)

In [8]:
import numpy as np
tx_file['x_bin'] = pd.cut(tx_file['x_location'], bins=10, labels=list(np.arange(10)))
tx_file['y_bin'] = pd.cut(tx_file['y_location'], bins=7, labels=list(np.arange(7)))

# Group by the bins to get 49 rectangles
tx_file['bin'] = tx_file['x_bin'].astype(str) + '_' + tx_file['y_bin'].astype(str)

In [9]:
tx_file['fov_coarse'] = tx_file['bin'].astype("category").cat.codes
fov_list = tx_file['fov_coarse'].unique()
fov_list

array([35, 28, 42, 49, 13, 20, 14, 21,  7, 27, 36, 29, 43, 50, 57, 56, 34,
       64, 63,  8,  1,  0, 15, 22, 37, 30, 44, 51, 58,  2,  9, 65, 16, 23,
       31, 38, 45, 52, 59, 66,  3, 10, 17, 32, 39, 24, 46, 53, 67, 60, 11,
       18,  4, 25, 40, 47, 54, 33, 61, 68,  5, 12, 26, 19, 41, 48, 55, 62,
        6, 69], dtype=int8)

In [11]:
#import os
#os.mkdir('baysor_transcripts_new')

In [12]:
for i in tx_file['fov_coarse'].unique():
    tx_file[tx_file['fov_coarse']==i].to_csv('baysor_transcripts_new/transcripts_xenium_' + str(i) + '.csv')

In [13]:
' '.join(fov_list.astype(str))

'35 28 42 49 13 20 14 21 7 27 36 29 43 50 57 56 34 64 63 8 1 0 15 22 37 30 44 51 58 2 9 65 16 23 31 38 45 52 59 66 3 10 17 32 39 24 46 53 67 60 11 18 4 25 40 47 54 33 61 68 5 12 26 19 41 48 55 62 6 69'

In [None]:
%%bash
docker run -it -v /home/cane:/home/cane --rm vpetukhov/baysor:master

cd /data/extra_files/xenium/Xenium_V1_FF_Mouse_Brain_Coronal_outs/baysor_transcripts_new/
N=5
(
for VARIABLE in 35 28 42 49 13 20 14 21 7 27 36 29 43 50 57 56 34 64 63 8 1 0 15 22 37 30 44 51 58 2 9 65 16 23 31 38 45 52 59 66 3 10 17 32 39 24 46 53 67 60 11 18 4 25 40 47 54 33 61 68 5 12 26 19 41 48 55 62 6 69; do
    ((i=i%N)); ((i++==0)) && wait
    baysor run -x x_location -y y_location -z z_location --gene feature_name transcripts_xenium_$VARIABLE.csv :cell_id -o segmentation_fov_prior_$VARIABLE.csv -c config.toml --plot --save-polygons=geojson &  done
)

In [10]:
from glob import glob
json_files = glob('baysor_transcripts/*.json')

In [6]:
csv_file = f'baysor_transcripts/segmentation_fov_22_counts.tsv'
df_transcript = pd.read_csv(csv_file, sep='\t')

In [None]:
import json
import geojson
import pandas as pd
from glob import glob

# Get a list of all JSON files
json_files = glob('baysor_transcripts/segmentation_fov_*_polygons.json')

# Initialize an empty list to hold the GeoJSON data
merged_data = []

# Loop through each JSON file
for file in json_files:
    # Extract fov from the file name
    fov = file.split('_')[3]

    # Open the JSON file
    with open(file, 'r') as f:
        # Load the JSON data
        data = json.load(f)

    # Load the corresponding CSV file
    csv_file = f'baysor_transcripts/segmentation_fov_{fov}_cell_stats.csv'
    df = pd.read_csv(csv_file)
    df['cell'] = df['cell'].astype(str)

    # Convert the JSON data to GeoJSON and append to the list
    for i, geometry in enumerate(data['geometries']):
        print(i, len(data['geometries']))
        # Add the 'cell' key with the corresponding value from the CSV file
        geometry['properties'] = {'cell': fov + '_' + df['cell'].iloc[i]}
        geojson_data = geojson.Polygon(geometry['coordinates'])
        merged_data.append(geojson_data)

# Create a GeoJSON GeometryCollection
geometry_collection = geojson.GeometryCollection(merged_data)

# Write the merged GeoJSON data to a new file
with open('baysor_transcripts/merged.geojson', 'w') as f:
    json.dump(geometry_collection, f)

In [3]:
import json

# Open the GeoJSON file
with open('baysor-cell-polygons.geojson', 'r') as f:
    # Load the GeoJSON data
    data = json.load(f)

In [None]:
import json

# Open the GeoJSON file
with open('baysor_transcripts/merged.geojson', 'r') as f:
    # Load the GeoJSON data
    data_baysor = json.load(f)

In [None]:
import pandas as pd
import glob
import os

# Get a list of all CSV files
csv_files = glob.glob('baysor_transcripts/segmentation_fov_*.csv')
csv_files = [file for file in csv_files if "_cell_stats.csv" not in file]

# Initialize an empty list to hold the dataframes
df_list = []

# Loop through each CSV file
for file in csv_files:
    # Load the data
    df = pd.read_csv(file)

    # Extract fov from the file name
    fov = os.path.basename(file).split('_')[2].split('.')[0]

    # Add a new column 'fov'
    df['fov'] = fov

    # Append the dataframe to the list
    df_list.append(df)

# Concatenate all dataframes in the list
merged_df = pd.concat(df_list, ignore_index=True)
merged_df.to_csv('baysor_transcripts/merged_segmentation.csv')
merged_df.head()

In [10]:
merged_df_sub = merged_df[['transcript_id', 'cell', 'is_noise', 'x', 'y', 'z']]
merged_df_sub['cell'] = 'cell-' + merged_df_sub['cell'].astype(str)
merged_df_sub.to_csv('baysor_transcripts/merged_segmentation_sub.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df_sub['cell'] = 'cell-' + merged_df_sub['cell'].astype(str)


## Xeniumranger

In [None]:
%%bash

export PATH=/data/extra_files/opts/xeniumranger-xenium2.0:$PATH

In [None]:
%%bash

xeniumranger import-segmentation \
    --id Xenium_V1_FF_Mouse_Brain_Coronal_baysor \
    --xenium-bundle /data/extra_files/xenium/Xenium_V1_FF_Mouse_Brain_Coronal_outs \
    --viz-polygons baysor-cell-polygons.geojson \
    --transcript-assignment baysor_transcripts/merged_segmentation.csv \
    --units microns

In [None]:
%%bash

xeniumranger resegment --id=Xenium_V1_FF_Mouse_Brain_Coronal_multimodal \
                       --xenium-bundle=/data/extra_files/xenium/Xenium_V1_FF_Mouse_Brain_Coronal_outs \
                       --expansion-distance=3 \
                       --localcores=20 \
                       --localmem=64

## ProSeg

For ProSeg nucleus set folder to nucleus segmentation

In [None]:
%%bash

proseg transcripts.csv.gz --xenium   
proseg-to-baysor \
    transcript-metadata.csv.gz \
    cell-polygons.geojson.gz \
    --output-transcript-metadata baysor-transcript-metadata.csv \
    --output-cell-polygons baysor-cell-polygons.geojson

In [None]:
%%bash

xeniumranger import-segmentation \
    --id Xenium_V1_FF_Mouse_Brain_Coronal_proseg \
    --xenium-bundle /data/extra_files/xenium/Xenium_V1_FF_Mouse_Brain_Coronal_outs \
    --viz-polygons baysor-cell-polygons.geojson \
    --transcript-assignment baysor-transcript-metadata.csv \
    --units microns