Merge 379ad41 into 4929fdb

angelolab · Mar 2, 2021 · 1439a66 · 1439a66
2 parents 4929fdb + 379ad41
commit 1439a66
Show file tree

Hide file tree

Showing 4 changed files with 106 additions and 26 deletions.
diff --git a/ark/phenotyping/create_som_matrix.R b/ark/phenotyping/create_som_matrix.R
@@ -6,6 +6,7 @@
 # - markers: list of channel columns to use
 # - numPasses: passes to make through dataset for training
 # - pixelSubsetDir: path to directory containing the subsetted pixel data
+# - normValsPath: path to the 99.9% normalized values file
 # - pixelWeightsPath: path to the SOM weights file
 
 library(arrow)
@@ -27,8 +28,11 @@ numPasses <- strtoi(args[3])
 # get path to subsetted mat directory
 pixelSubsetDir <- args[4]
 
+# get the normalized values write path
+normValsPath <- args[5]
+
 # get the weights write path
-pixelWeightsPath <- args[5]
+pixelWeightsPath <- args[6]
 
 # read the subsetted pixel mat data for training
 print("Reading the subsetted pixel matrix data for SOM training")
@@ -50,6 +54,9 @@ for (i in 1:length(fovs)) {
 }
 
 # perform 99.9% normalization on the subsetted data
+quantiles <- data.frame(matrix(NA, nrow=1, ncol=length(markers)))
+colnames(quantiles) <- markers
+
 print("Performing 99.9% normalization")
 
 for (marker in markers) {
@@ -59,8 +66,14 @@ for (marker in markers) {
     if (marker_quantile != 0) {
         pixelSubsetData[, marker] = pixelSubsetData[, marker] / marker_quantile
     }
+
+    quantiles[marker] = marker_quantile
 }
 
+# write 99.9% normalized values to feather
+print("Save 99.9% normalized values for each marker")
+arrow::write_feather(as.data.table(quantiles), normValsPath)
+
 # run the SOM training step
 print("Run the SOM training")
 somResults <- SOM(data=pixelSubsetData, rlen=numPasses)

diff --git a/ark/phenotyping/run_trained_som.R b/ark/phenotyping/run_trained_som.R
@@ -5,6 +5,7 @@
 # - fovs: list of fovs to cluster
 # - markers: list of channel columns to use
 # - pixelMatDir: path to directory containing the complete pixel data
+# - normValsPath: path to the 99.9% normalized values file
 # - pixelWeightsPath: path to the SOM weights file
 # - pixelClusterDir: path to directory where the clustered data will be written to
 
@@ -21,16 +22,24 @@ fovs <- unlist(strsplit(args[1], split=","))
 # get path to pixel mat directory
 pixelMatDir <- args[2]
 
+# get path to the 99.9% normalized values
+normValsPath <- args[3]
+
 # get path to the weights
-pixelWeightsPath <- args[3]
+pixelWeightsPath <- args[4]
 
 # get the cluster write path directory
-pixelClusterDir <- args[4]
+pixelClusterDir <- args[5]
 
 # read the weights
-print("Reading the weights matrix")
 somWeights <- as.matrix(arrow::read_feather(pixelWeightsPath))
 
+# read the normalization values (one row, one column per marker)
+normVals <- as.matrix(arrow::read_feather(normValsPath))
+
+# keep row 1 as a 1-row matrix: drop = FALSE preserves dims and marker column names
+normVals <- normVals[1, , drop = FALSE]
+
 # get the marker names from the weights matrix
 markers <- colnames(somWeights)
 
@@ -44,11 +53,9 @@ for (i in 1:length(fovs)) {
 
     # 99.9% normalize pixel data
     for (marker in markers) {
-        marker_quantile <- quantile(fovPixelData[, marker], 0.999)
-
         # this prevents all-zero columns from getting normalized and becoming NA/Inf
-        if (marker_quantile != 0) {
-            fovPixelData[, marker] = fovPixelData[, marker] / marker_quantile
+        if (normVals[1, marker] != 0) {
+            fovPixelData[, marker] = fovPixelData[, marker] / normVals[1, marker]
         }
     }
 

diff --git a/ark/phenotyping/som_utils.py b/ark/phenotyping/som_utils.py
@@ -170,7 +170,8 @@ def create_pixel_matrix(img_xr, seg_labels, base_dir,
 
 
 def train_som(fovs, channels, base_dir,
-              sub_dir='pixel_mat_subsetted', weights_name='weights.feather', num_passes=1):
+              sub_dir='pixel_mat_subsetted', norm_vals_name='norm_vals.feather',
+              weights_name='weights.feather', num_passes=1):
     """Run the SOM training on the subsetted pixel data.
 
     Saves weights to base_dir/weights_name.
@@ -184,6 +185,8 @@ def train_som(fovs, channels, base_dir,
             The path to the data directory
         sub_dir (str):
             The name of the subsetted data directory
+        norm_vals_name (str):
+            The name of the file to store the 99.9% normalized values
         weights_name (str):
             The name of the weights file
         num_passes (int):
@@ -192,6 +195,7 @@ def train_som(fovs, channels, base_dir,
 
     # define the paths to the data
     subsetted_path = os.path.join(base_dir, sub_dir)
+    norm_vals_path = os.path.join(base_dir, norm_vals_name)
     weights_path = os.path.join(base_dir, weights_name)
 
     # if path to the subsetted file does not exist
@@ -211,7 +215,7 @@ def train_som(fovs, channels, base_dir,
 
     # run the SOM training process
     process_args = ['Rscript', '/create_som_matrix.R', ','.join(fovs), ','.join(channels),
-                    str(num_passes), subsetted_path, weights_path]
+                    str(num_passes), subsetted_path, norm_vals_path, weights_path]
     process = subprocess.Popen(process_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
 
     # continuously poll the process for output/error to display in Jupyter notebook
@@ -227,7 +231,8 @@ def train_som(fovs, channels, base_dir,
 
 
 def cluster_pixels(fovs, base_dir, pre_dir='pixel_mat_preprocessed',
-                   weights_name='weights.feather', cluster_dir='pixel_mat_clustered'):
+                   norm_vals_name='norm_vals.feather', weights_name='weights.feather',
+                   cluster_dir='pixel_mat_clustered'):
     """Uses trained weights to assign cluster labels on full pixel data
 
     Saves data with cluster labels to cluster_dir
@@ -240,6 +245,8 @@ def cluster_pixels(fovs, base_dir, pre_dir='pixel_mat_preprocessed',
         pre_dir (str):
             Name of the directory which contains the preprocessed pixel data,
             defaults to pixel_mat_preprocessed
+        norm_vals_name (str):
+            The name of the file containing the 99.9% normalization values
         weights_name (str):
             The name of the weights file
         cluster_dir (str):
@@ -248,6 +255,7 @@ def cluster_pixels(fovs, base_dir, pre_dir='pixel_mat_preprocessed',
 
     # define the paths to the data
     preprocessed_path = os.path.join(base_dir, pre_dir)
+    norm_vals_path = os.path.join(base_dir, norm_vals_name)
     weights_path = os.path.join(base_dir, weights_name)
     clustered_path = os.path.join(base_dir, cluster_dir)
 
@@ -256,6 +264,10 @@ def cluster_pixels(fovs, base_dir, pre_dir='pixel_mat_preprocessed',
         raise FileNotFoundError('Pixel preprocessed directory %s does not exist in base_dir %s' %
                                 (pre_dir, base_dir))
 
+    if not os.path.exists(norm_vals_path):
+        raise FileNotFoundError('Normalized values file %s does not exist in base_dir %s' %
+                                (norm_vals_path, base_dir))
+
     # if path to the weights file does not exist
     if not os.path.exists(weights_path):
         raise FileNotFoundError('Weights file %s does not exist in base_dir %s' %
@@ -266,10 +278,14 @@ def cluster_pixels(fovs, base_dir, pre_dir='pixel_mat_preprocessed',
     misc_utils.verify_in_list(provided_fovs=fovs,
                               subsetted_fovs=io_utils.remove_file_extensions(files))
 
-    # ensure the weights columns are valid indexes
-    weights = feather.read_dataframe(os.path.join(base_dir, weights_name))
+    # ensure the norm vals columns are valid indexes
+    norm_vals = feather.read_dataframe(os.path.join(base_dir, norm_vals_name))
     sample_fov = feather.read_dataframe(os.path.join(base_dir, pre_dir, files[0]))
+    misc_utils.verify_in_list(norm_vals_columns=norm_vals.columns.values,
+                              pixel_data_columns=sample_fov.columns.values)
 
+    # ensure the weights columns are valid indexes
+    weights = feather.read_dataframe(os.path.join(base_dir, weights_name))
     misc_utils.verify_in_list(weights_columns=weights.columns.values,
                               pixel_data_columns=sample_fov.columns.values)
 
@@ -279,7 +295,7 @@ def cluster_pixels(fovs, base_dir, pre_dir='pixel_mat_preprocessed',
 
     # run the trained SOM on the dataset, assigning clusters
     process_args = ['Rscript', '/run_trained_som.R', ','.join(fovs),
-                    preprocessed_path, weights_path, clustered_path]
+                    preprocessed_path, norm_vals_path, weights_path, clustered_path]
 
     process = subprocess.Popen(process_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
 

diff --git a/ark/phenotyping/som_utils_test.py b/ark/phenotyping/som_utils_test.py
@@ -14,7 +14,8 @@
 
 
 def mocked_train_som(fovs, channels, base_dir,
-                     sub_dir='pixel_mat_subsetted', weights_name='weights.feather'):
+                     sub_dir='pixel_mat_subsetted', norm_vals_name='norm_vals.feather',
+                     weights_name='weights.feather', num_passes=1):
     # define the matrix we'll be training on
     pixel_mat_sub = pd.DataFrame(columns=channels)
 
@@ -31,17 +32,29 @@ def mocked_train_som(fovs, channels, base_dir,
     # FlowSOM flattens the weights dimensions, ex. 10x10x10 becomes 100x10
     weights = np.random.rand(100, len(channels))
 
-    # take 100 random rows from pixel_mat_sub, and element-wise multiply weights by that
+    # get the 99.9% normalized values and divide weights by that
+    weights = weights / np.quantile(weights, 0.999, axis=0)
+
+    # save 99.9% normalized values
+    norm_vals = np.expand_dims(np.quantile(weights, 0.999, axis=0).T, axis=0)
+    quantiles = pd.DataFrame(norm_vals, columns=channels)
+    feather.write_dataframe(quantiles, os.path.join(base_dir, norm_vals_name))
+
+    # take 100 random rows from pixel_mat_sub, element-wise multiply weights by that and num_passes
     multiply_factor = pixel_mat_sub.sample(n=100).values
-    weights = weights * multiply_factor
+    weights = weights * multiply_factor * num_passes
 
     # write weights to feather, the result in R will be more like a DataFrame
     weights = pd.DataFrame(weights, columns=channels)
     feather.write_dataframe(weights, os.path.join(base_dir, weights_name))
 
 
 def mocked_cluster_pixels(fovs, base_dir, pre_dir='pixel_mat_preprocessed',
-                          weights_name='weights.feather', cluster_dir='pixel_mat_clustered'):
+                          norm_vals_name='norm_vals.feather', weights_name='weights.feather',
+                          cluster_dir='pixel_mat_clustered'):
+    # read in the norm_vals matrix
+    norm_vals = feather.read_dataframe(os.path.join(base_dir, norm_vals_name))
+
     # read in the weights matrix
     weights = feather.read_dataframe(os.path.join(base_dir, weights_name))
 
@@ -52,6 +65,9 @@ def mocked_cluster_pixels(fovs, base_dir, pre_dir='pixel_mat_preprocessed',
         # only take the specified channel columns
         fov_mat_pre = fov_mat_pre[weights.columns.values]
 
+        # perform 99.9% normalization
+        fov_mat_pre = fov_mat_pre.div(norm_vals, axis=1)
+
         # get the mean weight for each channel column
         sub_means = weights.mean(axis=1)
 
@@ -251,16 +267,38 @@ def test_train_som(mocker):
         weights = feather.read_dataframe(os.path.join(temp_dir, 'weights.feather'))
         assert weights.shape == (100, 4)
 
+        # assert that the weights columns are the same as chan_list
+        misc_utils.verify_same_elements(weights_channels=weights.columns.values,
+                                        provided_channels=chan_list)
+
+        # assert that the normalized file has been created
+        assert os.path.exists(os.path.join(temp_dir, 'norm_vals.feather'))
+
+        # assert the shape of norm_vals contains 1 row and number of columns = len(chan_list)
+        norm_vals = feather.read_dataframe(os.path.join(temp_dir, 'norm_vals.feather'))
+        assert norm_vals.shape == (1, 4)
+
+        # assert that the norm_vals columns are the same as chan_list
+        misc_utils.verify_same_elements(norm_vals_channels=norm_vals.columns.values,
+                                        provided_channels=chan_list)
+
 
 def test_cluster_pixels(mocker):
-    # basic error checks: bad path to preprocessed and weights matrices
+    # basic error checks: bad path to preprocessed data, norm vals matrix, and weights matrix
     with tempfile.TemporaryDirectory() as temp_dir:
         with pytest.raises(FileNotFoundError):
             som_utils.cluster_pixels(fovs=['fov0'], base_dir=temp_dir, pre_dir='bad_path')
 
         # create a preprocessed directory for the undefined weights test
         os.mkdir(os.path.join(temp_dir, 'pixel_mat_preprocessed'))
 
+        with pytest.raises(FileNotFoundError):
+            som_utils.cluster_pixels(fovs=['fov0'], base_dir=temp_dir,
+                                     norm_vals_name='bad_path.feather')
+
+        norm_vals = pd.DataFrame(np.random.rand(1, 2), columns=['Marker1', 'Marker2'])
+        feather.write_dataframe(norm_vals, os.path.join(temp_dir, 'norm_vals.feather'))
+
         with pytest.raises(FileNotFoundError):
             som_utils.cluster_pixels(fovs=['fov0'], base_dir=temp_dir,
                                      weights_name='bad_path.feather')
@@ -285,20 +323,26 @@ def test_cluster_pixels(mocker):
                                                                  'pixel_mat_preprocessed',
                                                                  fov + '.feather'))
 
-        # not all of the provided fovs exist
         with pytest.raises(ValueError):
-            weights = pd.DataFrame(np.random.rand(100, 4), columns=chan_list)
-            feather.write_dataframe(weights, os.path.join(temp_dir, 'weights.feather'))
+            norm_vals = pd.DataFrame(np.random.rand(1, 2), columns=['Marker4', 'Marker5'])
+            feather.write_dataframe(norm_vals, os.path.join(temp_dir, 'norm_vals.feather'))
 
-            som_utils.cluster_pixels(fovs=['fov2', 'fov3'], base_dir=temp_dir)
-
-        # column name mismatch between weights channels and pixel data channels
-        with pytest.raises(ValueError):
             weights = pd.DataFrame(np.random.rand(100, 2), columns=['Marker4', 'Marker5'])
             feather.write_dataframe(weights, os.path.join(temp_dir, 'weights.feather'))
 
+            # column name mismatch for norm_vals
             som_utils.cluster_pixels(fovs=fovs, base_dir=temp_dir)
 
+            # column name mismatch for weights
+            som_utils.cluster_pixels(fovs=fovs, base_dir=temp_dir)
+
+            # not all the provided fovs exist
+            som_utils.cluster_pixels(fovs=['fov2', 'fov3'], base_dir=temp_dir)
+
+        # create a dummy normalized values matrix and write to feather
+        norm_vals = pd.DataFrame(np.ones((1, 4)), columns=chan_list)
+        feather.write_dataframe(norm_vals, os.path.join(temp_dir, 'norm_vals.feather'))
+
         # create a dummy weights matrix and write to feather
         weights = pd.DataFrame(np.random.rand(100, 4), columns=chan_list)
         feather.write_dataframe(weights, os.path.join(temp_dir, 'weights.feather'))