In [1]:
import pandas as pd
import numpy as np
import torch
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
import h5py

In [2]:
# Root folder for this notebook's parquet input and output. The trailing slash
# matters: file names are appended to it with plain string concatenation.
data_path = (
    r"/dartfs/rc/nosnapshots/V/VaickusL-nb/EDIT_Interns_2024/projects/"
    r"single_cell_xenium_pred/final_workspace/poc/data/"
)

In [3]:
# Load the per-cell table (cell id, centroid coordinates, expression vector)
# and preview its first rows.
cells_parquet = data_path + 'spatially_var_10_df.parquet'
df = pd.read_parquet(cells_parquet)
df.head()

Unnamed: 0_level_0,cell_id,arr_index,x_centroid,y_centroid,gene_exp_vector
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,aaaabecc-1,0,3990.303955,4050.244385,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,aaaabggh-1,1,3913.579102,4072.067383,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,aaaacpcd-1,2,7792.178223,41386.574219,"[0.0, 0.0, 0.005249355919659138, 0.00524935591..."
3,aaaaeclm-1,3,7743.524902,41405.804688,"[0.0, 0.0, 0.0068259648978710175, 0.0, 0.01360..."
4,aaaaiale-1,4,7781.018066,41439.066406,"[0.0, 0.0, 0.0, 0.0, 0.004914014600217342, 0.0..."


<h1>Patching</h1>

In [4]:
# Location of the H&E whole-slide image (OME-TIFF) that the cells are patched against.
he_path = (
    r"/dartfs/rc/nosnapshots/V/VaickusL-nb/EDIT_Interns_2024/projects/"
    r"single_cell_xenium_pred/open_source_temp_data/"
    r"Xenium_V1_FFPE_Human_Breast_IDC_With_Addon_he_image.ome.tif"
)

In [5]:
# Read the H&E slide from disk.
# NOTE(review): this is the only import outside the top import cell; consider
# moving it there so all dependencies are visible in one place.
import tifffile

he_load = tifffile.imread(
    he_path,
    is_ome=True,   # treat the file as OME-TIFF
    level=0,       # pyramid level 0 (presumably full resolution; confirm)
    aszarr=False,  # do not return a zarr store
)

In [6]:
# Wrap the slide in a PIL image. The (1, 2, 0) permutation moves the array's
# first axis to the end, presumably channels-first -> channels-last; confirm
# against he_load.shape.
he_img = Image.fromarray(np.transpose(he_load, (1, 2, 0)))

In [7]:
# Slide dimensions as PIL reports them: (width, height).
wsi_size = (he_img.width, he_img.height)
wsi_size

(48441, 53833)

In [8]:
# Edge length of one square patch; the same value divides both the slide
# extents and the cell centroid coordinates in the cells below.
patch_size = 1024

In [9]:
# Whole patches that fit along each axis (x first, then y). Floor division
# means a partial patch at the right/bottom edge is not counted here.
num_x_patches, num_y_patches = (extent // patch_size for extent in wsi_size)

In [10]:
# Show the per-axis patch counts, one per line (x first, then y).
print(num_x_patches, num_y_patches, sep="\n")

47
52


In [11]:
# Map each cell to the grid column / row of the patch that contains its
# centroid. The results stay float because the centroid columns are float.
# Assumes the centroids share the slide's coordinate system — TODO confirm.
df['patch_x'] = df['x_centroid'].floordiv(patch_size)
df['patch_y'] = df['y_centroid'].floordiv(patch_size)

In [12]:
# Flatten (patch_x, patch_y) into a single row-major id. The row stride is
# num_x_patches + 1 — presumably to leave room for the partial patch column at
# the right edge, which the floor-divided count above does not include.
df['patch_index'] = df['patch_y'].mul(num_x_patches + 1).add(df['patch_x'])

In [13]:
# Number of cells in each occupied patch.
# A single value_counts() pass replaces the previous loop, which re-filtered
# the whole frame once per patch id (rows x patches comparisons).
# The list is ordered by descending count rather than by set iteration order;
# only its mean is read afterwards, so the ordering is irrelevant.
lens = df['patch_index'].value_counts().tolist()

In [14]:
# Mean number of cells per occupied patch, truncated to an integer for display.
avg_cells_per_patch = int(np.mean(lens))
print("Avg Cells/Patch:", avg_cells_per_patch)

Avg Cells/Patch: 332


<h1>Train/Val/Test Split</h1>

In [15]:
# Placeholder label on every row; replaced once the split is assigned further down.
df['set'] = np.full(shape=len(df), fill_value="unassigned")

In [16]:
# Shuffle the rows and renumber them 0..n-1.
# The shuffle was previously unseeded, so every run produced a different row
# order; a fixed seed makes it reproducible under Restart & Run All.
ROW_SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=ROW_SHUFFLE_SEED).reset_index(drop=True)
df.head()

Unnamed: 0,cell_id,arr_index,x_centroid,y_centroid,gene_exp_vector,patch_x,patch_y,patch_index,set
0,bcpifpik-1,46651,10540.112305,34214.886719,"[0.0, 0.0, 0.0, 0.0, 0.014184635132551193, 0.0...",10.0,33.0,1594.0,unassigned
1,fnlcbdep-1,232526,10725.459961,49715.078125,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10.0,48.0,2314.0,unassigned
2,afdjlnki-1,12819,16873.613281,27834.960938,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",16.0,27.0,1312.0,unassigned
3,blmlpfic-1,68400,26947.134766,50727.183594,"[0.0, 0.005479465704411268, 0.0054794657044112...",26.0,49.0,2378.0,unassigned
4,balepllg-1,41088,37204.671875,15922.397461,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",36.0,15.0,756.0,unassigned


In [17]:
# Distinct patch ids as an array; the order follows set iteration and is
# therefore arbitrary.
distinct_patch_ids = set(df['patch_index'])
patch_indexes = np.array(list(distinct_patch_ids))

In [18]:
# 75th and 85th percentile of the distinct patch ids, truncated to int.
# NOTE(review): the split cells further down draw a random partition of patch
# ids and never read these thresholds — possibly a leftover from a positional
# split; confirm before relying on them.
train_upper, val_upper = (int(q) for q in np.percentile(patch_indexes, [75, 85]))

In [19]:
# Show the two percentile thresholds, one per line (train first, then val).
print(train_upper, val_upper, sep="\n")

1857
2084


In [20]:
# Patch-level split: patch ids (not individual cells) are partitioned, so all
# cells sharing a patch id end up in the same subset.
# Fractions: 75% train, 10% validation, the remainder (~15%) test.
#
# The shuffle was previously unseeded (np.random.shuffle on global state), so
# the split membership changed on every run. A dedicated seeded Generator makes
# it reproducible without touching NumPy's global random state.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

# permutation() returns a shuffled copy, so the array produced by unique() is
# not mutated in place.
unique_patch_indices = split_rng.permutation(df['patch_index'].unique())
num_patches = len(unique_patch_indices)

# Cut points expressed as counts of patches (truncated toward zero).
train_split = int(0.75 * num_patches)
val_split = int(0.1 * num_patches)

train_indices = unique_patch_indices[:train_split]
val_indices = unique_patch_indices[train_split:train_split + val_split]
test_indices = unique_patch_indices[train_split + val_split:]

In [21]:
# Label every cell with the subset its patch id belongs to; anything that is
# neither a train nor a val patch is labelled 'test'.
# The previous per-row lambda evaluated `x in <ndarray>`, a linear scan of the
# id array for every row. isin() performs the same membership test vectorised.
# NOTE(review): rows with a missing patch id may be labelled differently from
# the lambda (NaN never compared equal there) — confirm none exist.
in_train = df['patch_index'].isin(train_indices)
in_val = df['patch_index'].isin(val_indices)
df['set'] = np.where(in_train, 'train', np.where(in_val, 'val', 'test'))

In [22]:
# Number of cells in each subset, printed in train / val / test order.
for split_name in ('train', 'val', 'test'):
    print((df['set'] == split_name).sum())

433895
58785
84283


In [23]:
# Explicit column selection (and column order) for the frame named df_save.
save_columns = [
    'cell_id', 'arr_index', 'x_centroid', 'y_centroid', 'gene_exp_vector',
    'patch_index', 'set', 'patch_x', 'patch_y',
]
df_save = df[save_columns]

In [24]:
# Preview the first rows now that the 'set' column holds the split labels.
df.head()

Unnamed: 0,cell_id,arr_index,x_centroid,y_centroid,gene_exp_vector,patch_x,patch_y,patch_index,set
0,bcpifpik-1,46651,10540.112305,34214.886719,"[0.0, 0.0, 0.0, 0.0, 0.014184635132551193, 0.0...",10.0,33.0,1594.0,train
1,fnlcbdep-1,232526,10725.459961,49715.078125,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10.0,48.0,2314.0,train
2,afdjlnki-1,12819,16873.613281,27834.960938,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",16.0,27.0,1312.0,train
3,blmlpfic-1,68400,26947.134766,50727.183594,"[0.0, 0.005479465704411268, 0.0054794657044112...",26.0,49.0,2378.0,train
4,balepllg-1,41088,37204.671875,15922.397461,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",36.0,15.0,756.0,test


In [26]:
# Spot-check a single stored expression vector (the row labelled 0).
df.at[0, 'gene_exp_vector']

[0.0, 0.0, 0.0, 0.0, 0.014184635132551193, 0.0, 0.0, 0.0, 0.0, 0.0]

In [25]:
# Persist the patched, split-labelled table in the same folder as the input.
# Writes df_save (the explicit column selection made earlier in the notebook)
# rather than the working frame df: df_save was otherwise never used, and
# saving it pins exactly which columns, in which order, reach the file.
df_save.to_parquet(data_path + "spatially_var_10_patched_df.parquet")