In [1]:
import pathlib
from umap_manager import UMAPManager

In [2]:
# Root folder that holds one sub-folder per downloaded case, and the case to analyse.
# Plain string literal: there is nothing to interpolate, so no f-string prefix is needed.
DATA_DIR = pathlib.Path('../TCGA/downloads')
case = 'TCGA-3C-AALK-01Z-00-DX1'

# instantiate manager
manager = UMAPManager(DATA_DIR / case)

# Disable computation of density column
# manager.compute_density = False

# Filter by class
manager.class_filters = ['CancerEpithelium']

# Sample 500 random cells
manager.sample_size = 500

# Exclude some column patterns: any column whose name contains one of the
# listed substrings is dropped from the data.
manager.exclude_columns = [
    c for c in manager.columns
    if any(substr in c for substr in ['Identifier.', 'ClassifProbab.', 'Unconstrained.'])
]

# View data (last expression of the cell, so the notebook renders it as a table)
manager.data

Reading HIPS data.
Found 412219 features.
Computing density column.


Unnamed: 0,Orientation.Orientation,Size.Area,Size.ConvexHullArea,Size.MajorAxisLength,Size.MinorAxisLength,Size.Perimeter,Shape.Circularity,Shape.Eccentricity,Shape.EquivalentDiameter,Shape.Extent,...,Cytoplasm.Haralick.Entropy.Range,Cytoplasm.Haralick.DifferenceVariance.Mean,Cytoplasm.Haralick.DifferenceVariance.Range,Cytoplasm.Haralick.DifferenceEntropy.Mean,Cytoplasm.Haralick.DifferenceEntropy.Range,Cytoplasm.Haralick.IMC1.Mean,Cytoplasm.Haralick.IMC1.Range,Cytoplasm.Haralick.IMC2.Mean,Cytoplasm.Haralick.IMC2.Range,density
8,-1.560015,191.0,198.0,18.077746,13.593226,50.142136,0.954635,0.659241,15.594510,0.864253,...,0.374739,0.007623,0.002885,2.340702,0.495576,-0.344098,0.107820,0.947012,0.042932,0.000836
798,1.483843,131.0,132.0,14.125594,11.949627,39.899495,1.034061,0.533253,12.914890,0.916084,...,0.293540,0.012237,0.002102,1.473465,0.307352,-0.329444,0.140064,0.904443,0.073312,0.032736
133,-0.114574,103.0,106.0,12.056126,11.004057,35.656854,1.018031,0.408550,11.451798,0.936364,...,0.364879,0.010166,0.002104,1.670250,0.306937,-0.308579,0.117119,0.918620,0.062797,0.001738
542,-0.377372,112.0,133.0,15.043640,10.445137,43.556349,0.741865,0.719664,11.941643,0.615385,...,0.482832,0.007439,0.002968,2.345356,0.510511,-0.325563,0.133434,0.937030,0.053573,0.027621
344,-0.089966,166.0,173.0,17.092478,12.606436,47.313708,0.931846,0.675300,14.538149,0.864583,...,0.526835,0.008922,0.003034,1.978687,0.504024,-0.307952,0.153480,0.930489,0.071927,0.002143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,0.853129,130.0,142.0,15.936713,10.908067,43.556349,0.861093,0.729049,12.865502,0.714286,...,0.758513,0.009680,0.004214,1.757825,0.591658,-0.282230,0.245203,0.895365,0.127535,0.002329
187,-0.027338,106.0,109.0,14.090210,9.630296,36.485281,1.000646,0.729975,11.617375,0.905983,...,0.415982,0.010439,0.002337,1.660885,0.358797,-0.378287,0.127703,0.958158,0.038006,0.002376
183,-0.872050,128.0,131.0,13.982076,11.803096,39.899495,1.010381,0.536093,12.766153,0.888889,...,0.305275,0.009364,0.002267,1.830165,0.328262,-0.237859,0.112984,0.872258,0.096840,0.001422
156,0.169029,173.0,179.0,16.273710,13.717787,47.313708,0.971140,0.538005,14.841511,0.887179,...,0.580695,0.008615,0.003921,2.149957,0.636197,-0.383342,0.159946,0.959173,0.043161,0.001002


In [3]:
# Show image
# Last expression of the cell, so the notebook renders whatever `manager.image`
# returns; in the recorded run that was a VBox widget wrapping a Map.
manager.image

VBox(children=(Map(center=[40960.0, 47744.0], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom…

In [4]:
# Show cell thumbnails
# In the recorded run this call displayed two GridBox widgets of PNG images.
manager.show_cell_thumbnails()

GridBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00"\x00\x00\x00$\x08\x02\x00\x00\…

GridBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00"\x00\x00\x00$\x08\x02\x00\x00\…

In [5]:
# Split the sampled cells: the first `n_training` rows train the transform,
# the remaining rows are held out and only projected with it.
n_training = 400

# Read the frame once so both slices are guaranteed to come from the same rows,
# even if `manager.data` were to rebuild or resample on access.
sampled_cells = manager.data
train_set = sampled_cells[:n_training]
test_set = sampled_cells[n_training:]

RESULT_DIR = pathlib.Path('./umap_results')
# Make sure the output folder exists before parquet files are written into it;
# harmless if it is already there.
RESULT_DIR.mkdir(parents=True, exist_ok=True)

# First dimensionality reduction will train a UMAP Transform
manager.reduce_dims(train_set, plot=True, parquet_path=RESULT_DIR / 'CancerEpithelium_train_set_400.parquet')

# Subsequent dimensionality reductions will leverage trained UMAP Transform
manager.reduce_dims(test_set, plot=True, parquet_path=RESULT_DIR / 'CancerEpithelium_test_set_100.parquet')

Training UMAP Transform.
Completed training in 5.196994 seconds.
Running inference on 400 cells with trained UMAP Transform.
Completed inference in 0.00071 seconds.


FigureWidget({
    'data': [{'marker': {'color': {'bdata': ('AAABAAIAAwAEAAUABgAHAAgACQAKAA' ... 'UBhgGHAYgBiQGKAYsBjAGNAY4BjwE='),
                                   'dtype': 'i2'}},
              'mode': 'markers',
              'type': 'scatter',
              'uid': '2e119973-b038-43a8-befe-5d9511dcc04a',
              'x': {'bdata': ('03l0QDxXykBRcARBlPWEQMogukDQX6' ... 'AA48RAklf8QB+F8EBWS+pA0jdNQA=='),
                    'dtype': 'f4'},
              'y': {'bdata': ('fd8rQclUQUH71DBBoqQQQcG9SEGTTk' ... 'EGvD1B05skQWOVGUGPahZB4AseQQ=='),
                    'dtype': 'f4'}}],
    'layout': {'template': '...'}
})

VBox()

Running inference on 100 cells with trained UMAP Transform.
Completed inference in 1.829277 seconds.


FigureWidget({
    'data': [{'marker': {'color': {'bdata': ('AAECAwQFBgcICQoLDA0ODxAREhMUFR' ... '9QUVJTVFVWV1hZWltcXV5fYGFiYw=='),
                                   'dtype': 'i1'}},
              'mode': 'markers',
              'type': 'scatter',
              'uid': 'c63ec3a0-edd5-4209-9fbe-9288e1e1fe1b',
              'x': {'bdata': ('wnwkQNEhoj/DEMtAsVhKQNe4RUB7cx' ... 'D8S+JACE4MQdJW3EAr5mlA9U3sQA=='),
                    'dtype': 'f4'},
              'y': {'bdata': ('ixVIQQBeSkF8GDNB2BfxQIdPVEHoFg' ... 'FwrTxBfDktQb7AM0EJyChBFzEsQQ=='),
                    'dtype': 'f4'}}],
    'layout': {'template': '...'}
})

VBox()

In [6]:
# For each target cell in a given list of ids, find the 10 most similar cells and display them
# NOTE(review): not visible from here whether these ids are index labels or row
# positions in `manager.data` — confirm against UMAPManager.nearest_neighbors.
# In the recorded run the call ran inference on 500 cells and then on the 2 targets.
target_cells = [1, 2]
manager.nearest_neighbors(target_cells, n=10, show=True)

Running inference on 500 cells with trained UMAP Transform.
Completed inference in 0.499766 seconds.
Running inference on 2 cells with trained UMAP Transform.
Completed inference in 0.007837 seconds.
Cell 1:


GridBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x002\x00\x00\x00,\x08\x02\x00\x00\…

10 most similar cells:


GridBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x002\x00\x00\x00>\x08\x02\x00\x00\…


Cell 2:


GridBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00$\x00\x00\x00$\x08\x02\x00\x00\…

10 most similar cells:


GridBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00&\x00\x00\x00$\x08\x02\x00\x00\…




In [7]:
# Save trained transform to file
# The path sits directly under DATA_DIR (alongside the case folders); it is kept
# in `transform_path` so a later cell can load the same file.
transform_path = DATA_DIR / 'my_umap.transform'
manager.save_transform(transform_path)

Saved UMAP Transform to ../TCGA/downloads/my_umap.transform.


In [8]:
# Build a manager for a second case, configured the same way as the first one:
# same class filter, same sample size, same excluded column patterns.
manager_2 = UMAPManager(DATA_DIR / 'TCGA-3C-AALI-01Z-00-DX1')
manager_2.class_filters = ['CancerEpithelium']
manager_2.sample_size = 500

# Exclude every column whose name contains one of these substrings
excluded_patterns_2 = ('Identifier.', 'ClassifProbab.', 'Unconstrained.')
manager_2.exclude_columns = [
    column
    for column in manager_2.columns
    if any(pattern in column for pattern in excluded_patterns_2)
]

# Reuse the transform file saved earlier rather than training a new one
manager_2.load_transform(transform_path)

# Run inference on this case with the loaded transform, plotting the result
# and passing the parquet destination for the reduced data
reduced_path_2 = RESULT_DIR / 'CancerEpithelium_test_set_500.parquet'
manager_2.reduce_dims(plot=True, parquet_path=reduced_path_2)

Reading HIPS data.
Found 489293 features.
Loaded UMAP Transform from ../TCGA/downloads/my_umap.transform.
Computing density column.
Running inference on 500 cells with trained UMAP Transform.
Completed inference in 0.244721 seconds.


FigureWidget({
    'data': [{'marker': {'color': {'bdata': ('AAABAAIAAwAEAAUABgAHAAgACQAKAA' ... 'HqAesB7AHtAe4B7wHwAfEB8gHzAQ=='),
                                   'dtype': 'i2'}},
              'mode': 'markers',
              'type': 'scatter',
              'uid': 'fb31f8fa-dc04-4f12-8130-e2667658461b',
              'x': {'bdata': ('CMxmQEPCWEBMhFBAARLhQOUtzkBxca' ... 'tBl/zVQBcgzUB8iF9AOa7RQKoi70A='),
                    'dtype': 'f4'},
              'y': {'bdata': ('N2QkQde1DkHDdDBBw80uQefBL0HvMy' ... 'BBHwYwQUvjMUEymBpBLewPQT3DPEE='),
                    'dtype': 'f4'}}],
    'layout': {'template': '...'}
})

VBox()