Skip to content

Commit

Permalink
Merge 379ad41 into 4929fdb
Browse files Browse the repository at this point in the history
  • Loading branch information
alex-l-kong authored Mar 2, 2021
2 parents 4929fdb + 379ad41 commit 1439a66
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 26 deletions.
15 changes: 14 additions & 1 deletion ark/phenotyping/create_som_matrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# - markers: list of channel columns to use
# - numPasses: passes to make through dataset for training
# - pixelSubsetDir: path to directory containing the subsetted pixel data
# - normValsPath: path to the 99.9% normalized values file
# - pixelWeightsPath: path to the SOM weights file

library(arrow)
Expand All @@ -27,8 +28,11 @@ numPasses <- strtoi(args[3])
# get path to subsetted mat directory
pixelSubsetDir <- args[4]

# get the normalized values write path
normValsPath <- args[5]

# get the weights write path
pixelWeightsPath <- args[5]
pixelWeightsPath <- args[6]

# read the subsetted pixel mat data for training
print("Reading the subsetted pixel matrix data for SOM training")
Expand All @@ -50,6 +54,9 @@ for (i in 1:length(fovs)) {
}

# perform 99.9% normalization on the subsetted data
quantiles <- data.frame(matrix(NA, nrow=1, ncol=length(markers)))
colnames(quantiles) <- markers

print("Performing 99.9% normalization")

for (marker in markers) {
Expand All @@ -59,8 +66,14 @@ for (marker in markers) {
if (marker_quantile != 0) {
pixelSubsetData[, marker] = pixelSubsetData[, marker] / marker_quantile
}

quantiles[marker] = marker_quantile
}

# write 99.9% normalized values to feather
print("Save 99.9% normalized values for each marker")
arrow::write_feather(as.data.table(quantiles), normValsPath)

# run the SOM training step
print("Run the SOM training")
somResults <- SOM(data=pixelSubsetData, rlen=numPasses)
Expand Down
21 changes: 14 additions & 7 deletions ark/phenotyping/run_trained_som.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# - fovs: list of fovs to cluster
# - markers: list of channel columns to use
# - pixelMatDir: path to directory containing the complete pixel data
# - normValsPath: path to the 99.9% normalized values file
# - pixelWeightsPath: path to the SOM weights file
# - pixelClusterDir: path to directory where the clustered data will be written to

Expand All @@ -21,16 +22,24 @@ fovs <- unlist(strsplit(args[1], split=","))
# get path to pixel mat directory
pixelMatDir <- args[2]

# get path to the 99.9% normalized values
normValsPath <- args[3]

# get path to the weights
pixelWeightsPath <- args[3]
pixelWeightsPath <- args[4]

# get the cluster write path directory
pixelClusterDir <- args[4]
pixelClusterDir <- args[5]

# read the weights
print("Reading the weights matrix")
somWeights <- as.matrix(arrow::read_feather(pixelWeightsPath))

# read the normalization values
# NOTE: keep normVals as the 1-row matrix returned by as.matrix() so that the
# marker column names are preserved and normVals[1, marker] lookups work in the
# per-fov normalization loop; as.numeric() would strip both the dimensions and
# the column names, making 2-D indexing by marker name fail
normVals <- as.matrix(arrow::read_feather(normValsPath))

# get the marker names from the weights matrix
markers <- colnames(somWeights)

Expand All @@ -44,11 +53,9 @@ for (i in 1:length(fovs)) {

# 99.9% normalize pixel data
for (marker in markers) {
marker_quantile <- quantile(fovPixelData[, marker], 0.999)

# this prevents all-zero columns from getting normalized and becoming NA/Inf
if (marker_quantile != 0) {
fovPixelData[, marker] = fovPixelData[, marker] / marker_quantile
if (normVals[1, marker] != 0) {
fovPixelData[, marker] = fovPixelData[, marker] / normVals[1, marker]
}
}

Expand Down
28 changes: 22 additions & 6 deletions ark/phenotyping/som_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,8 @@ def create_pixel_matrix(img_xr, seg_labels, base_dir,


def train_som(fovs, channels, base_dir,
sub_dir='pixel_mat_subsetted', weights_name='weights.feather', num_passes=1):
sub_dir='pixel_mat_subsetted', norm_vals_name='norm_vals.feather',
weights_name='weights.feather', num_passes=1):
"""Run the SOM training on the subsetted pixel data.
Saves weights to base_dir/weights_name.
Expand All @@ -184,6 +185,8 @@ def train_som(fovs, channels, base_dir,
The path to the data directory
sub_dir (str):
The name of the subsetted data directory
norm_vals_name (str):
The name of the file to store the 99.9% normalized values
weights_name (str):
The name of the weights file
num_passes (int):
Expand All @@ -192,6 +195,7 @@ def train_som(fovs, channels, base_dir,

# define the paths to the data
subsetted_path = os.path.join(base_dir, sub_dir)
norm_vals_path = os.path.join(base_dir, norm_vals_name)
weights_path = os.path.join(base_dir, weights_name)

# if path to the subsetted file does not exist
Expand All @@ -211,7 +215,7 @@ def train_som(fovs, channels, base_dir,

# run the SOM training process
process_args = ['Rscript', '/create_som_matrix.R', ','.join(fovs), ','.join(channels),
str(num_passes), subsetted_path, weights_path]
str(num_passes), subsetted_path, norm_vals_path, weights_path]
process = subprocess.Popen(process_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

# continuously poll the process for output/error to display in Jupyter notebook
Expand All @@ -227,7 +231,8 @@ def train_som(fovs, channels, base_dir,


def cluster_pixels(fovs, base_dir, pre_dir='pixel_mat_preprocessed',
weights_name='weights.feather', cluster_dir='pixel_mat_clustered'):
norm_vals_name='norm_vals.feather', weights_name='weights.feather',
cluster_dir='pixel_mat_clustered'):
"""Uses trained weights to assign cluster labels on full pixel data
Saves data with cluster labels to cluster_dir
Expand All @@ -240,6 +245,8 @@ def cluster_pixels(fovs, base_dir, pre_dir='pixel_mat_preprocessed',
pre_dir (str):
Name of the directory which contains the preprocessed pixel data,
defaults to pixel_mat_preprocessed
norm_vals_name (str):
The name of the file containing the 99.9% normalization values
weights_name (str):
The name of the weights file
cluster_dir (str):
Expand All @@ -248,6 +255,7 @@ def cluster_pixels(fovs, base_dir, pre_dir='pixel_mat_preprocessed',

# define the paths to the data
preprocessed_path = os.path.join(base_dir, pre_dir)
norm_vals_path = os.path.join(base_dir, norm_vals_name)
weights_path = os.path.join(base_dir, weights_name)
clustered_path = os.path.join(base_dir, cluster_dir)

Expand All @@ -256,6 +264,10 @@ def cluster_pixels(fovs, base_dir, pre_dir='pixel_mat_preprocessed',
raise FileNotFoundError('Pixel preprocessed directory %s does not exist in base_dir %s' %
(pre_dir, base_dir))

if not os.path.exists(norm_vals_path):
raise FileNotFoundError('Normalized values file %s does not exist in base_dir %s' %
(norm_vals_path, base_dir))

# if path to the weights file does not exist
if not os.path.exists(weights_path):
raise FileNotFoundError('Weights file %s does not exist in base_dir %s' %
Expand All @@ -266,10 +278,14 @@ def cluster_pixels(fovs, base_dir, pre_dir='pixel_mat_preprocessed',
misc_utils.verify_in_list(provided_fovs=fovs,
subsetted_fovs=io_utils.remove_file_extensions(files))

# ensure the weights columns are valid indexes
weights = feather.read_dataframe(os.path.join(base_dir, weights_name))
# ensure the norm vals columns are valid indexes
norm_vals = feather.read_dataframe(os.path.join(base_dir, norm_vals_name))
sample_fov = feather.read_dataframe(os.path.join(base_dir, pre_dir, files[0]))
misc_utils.verify_in_list(norm_vals_columns=norm_vals.columns.values,
pixel_data_columns=sample_fov.columns.values)

# ensure the weights columns are valid indexes
weights = feather.read_dataframe(os.path.join(base_dir, weights_name))
misc_utils.verify_in_list(weights_columns=weights.columns.values,
pixel_data_columns=sample_fov.columns.values)

Expand All @@ -279,7 +295,7 @@ def cluster_pixels(fovs, base_dir, pre_dir='pixel_mat_preprocessed',

# run the trained SOM on the dataset, assigning clusters
process_args = ['Rscript', '/run_trained_som.R', ','.join(fovs),
preprocessed_path, weights_path, clustered_path]
preprocessed_path, norm_vals_path, weights_path, clustered_path]

process = subprocess.Popen(process_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

Expand Down
68 changes: 56 additions & 12 deletions ark/phenotyping/som_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@


def mocked_train_som(fovs, channels, base_dir,
sub_dir='pixel_mat_subsetted', weights_name='weights.feather'):
sub_dir='pixel_mat_subsetted', norm_vals_name='norm_vals.feather',
weights_name='weights.feather', num_passes=1):
# define the matrix we'll be training on
pixel_mat_sub = pd.DataFrame(columns=channels)

Expand All @@ -31,17 +32,29 @@ def mocked_train_som(fovs, channels, base_dir,
# FlowSOM flattens the weights dimensions, ex. 10x10x10 becomes 100x10
weights = np.random.rand(100, len(channels))

# take 100 random rows from pixel_mat_sub, and element-wise multiply weights by that
# get the 99.9% normalized values and divide weights by that
weights = weights / np.quantile(weights, 0.999, axis=0)

# save 99.9% normalized values
norm_vals = np.expand_dims(np.quantile(weights, 0.999, axis=0).T, axis=0)
quantiles = pd.DataFrame(norm_vals, columns=channels)
feather.write_dataframe(quantiles, os.path.join(base_dir, norm_vals_name))

# take 100 random rows from pixel_mat_sub, element-wise multiply weights by that and num_passes
multiply_factor = pixel_mat_sub.sample(n=100).values
weights = weights * multiply_factor
weights = weights * multiply_factor * num_passes

# write weights to feather, the result in R will be more like a DataFrame
weights = pd.DataFrame(weights, columns=channels)
feather.write_dataframe(weights, os.path.join(base_dir, weights_name))


def mocked_cluster_pixels(fovs, base_dir, pre_dir='pixel_mat_preprocessed',
weights_name='weights.feather', cluster_dir='pixel_mat_clustered'):
norm_vals_name='norm_vals.feather', weights_name='weights.feather',
cluster_dir='pixel_mat_clustered'):
# read in the norm_vals matrix
norm_vals = feather.read_dataframe(os.path.join(base_dir, norm_vals_name))

# read in the weights matrix
weights = feather.read_dataframe(os.path.join(base_dir, weights_name))

Expand All @@ -52,6 +65,9 @@ def mocked_cluster_pixels(fovs, base_dir, pre_dir='pixel_mat_preprocessed',
# only take the specified channel columns
fov_mat_pre = fov_mat_pre[weights.columns.values]

# perform 99.9% normalization
fov_mat_pre = fov_mat_pre.div(norm_vals, axis=1)

# get the mean weight for each channel column
sub_means = weights.mean(axis=1)

Expand Down Expand Up @@ -251,16 +267,38 @@ def test_train_som(mocker):
weights = feather.read_dataframe(os.path.join(temp_dir, 'weights.feather'))
assert weights.shape == (100, 4)

# assert that the weights columns are the same as chan_list
misc_utils.verify_same_elements(weights_channels=weights.columns.values,
provided_channels=chan_list)

# assert that the normalized file has been created
assert os.path.exists(os.path.join(temp_dir, 'norm_vals.feather'))

# assert the shape of norm_vals contains 1 row and number of columns = len(chan_list)
norm_vals = feather.read_dataframe(os.path.join(temp_dir, 'norm_vals.feather'))
assert norm_vals.shape == (1, 4)

# assert that the norm_vals columns are the same as chan_list
misc_utils.verify_same_elements(norm_vals_channels=norm_vals.columns.values,
provided_channels=chan_list)


def test_cluster_pixels(mocker):
# basic error checks: bad path to preprocessed and weights matrices
# basic error checks: bad path to preprocessed data, norm vals matrix, and weights matrix
with tempfile.TemporaryDirectory() as temp_dir:
with pytest.raises(FileNotFoundError):
som_utils.cluster_pixels(fovs=['fov0'], base_dir=temp_dir, pre_dir='bad_path')

# create a preprocessed directory for the undefined norm vals and weights tests
os.mkdir(os.path.join(temp_dir, 'pixel_mat_preprocessed'))

with pytest.raises(FileNotFoundError):
som_utils.cluster_pixels(fovs=['fov0'], base_dir=temp_dir,
norm_vals_name='bad_path.feather')

norm_vals = pd.DataFrame(np.random.rand(1, 2), columns=['Marker1', 'Marker2'])
feather.write_dataframe(norm_vals, os.path.join(temp_dir, 'norm_vals.feather'))

with pytest.raises(FileNotFoundError):
som_utils.cluster_pixels(fovs=['fov0'], base_dir=temp_dir,
weights_name='bad_path.feather')
Expand All @@ -285,20 +323,26 @@ def test_cluster_pixels(mocker):
'pixel_mat_preprocessed',
fov + '.feather'))

# not all of the provided fovs exist
with pytest.raises(ValueError):
weights = pd.DataFrame(np.random.rand(100, 4), columns=chan_list)
feather.write_dataframe(weights, os.path.join(temp_dir, 'weights.feather'))
norm_vals = pd.DataFrame(np.random.rand(1, 2), columns=['Marker4', 'Marker5'])
feather.write_dataframe(norm_vals, os.path.join(temp_dir, 'norm_vals.feather'))

som_utils.cluster_pixels(fovs=['fov2', 'fov3'], base_dir=temp_dir)

# column name mismatch between weights channels and pixel data channels
with pytest.raises(ValueError):
weights = pd.DataFrame(np.random.rand(100, 2), columns=['Marker4', 'Marker5'])
feather.write_dataframe(weights, os.path.join(temp_dir, 'weights.feather'))

# column name mismatch for norm_vals
som_utils.cluster_pixels(fovs=fovs, base_dir=temp_dir)

# column name mismatch for weights
som_utils.cluster_pixels(fovs=fovs, base_dir=temp_dir)

# not all the provided fovs exist
som_utils.cluster_pixels(fovs=['fov2', 'fov3'], base_dir=temp_dir)

# create a dummy normalized values matrix and write to feather
norm_vals = pd.DataFrame(np.ones((1, 4)), columns=chan_list)
feather.write_dataframe(norm_vals, os.path.join(temp_dir, 'norm_vals.feather'))

# create a dummy weights matrix and write to feather
weights = pd.DataFrame(np.random.rand(100, 4), columns=chan_list)
feather.write_dataframe(weights, os.path.join(temp_dir, 'weights.feather'))
Expand Down

0 comments on commit 1439a66

Please sign in to comment.