Random permutation of FCMA #217

Merged
merged 10 commits on Apr 28, 2017
115 changes: 105 additions & 10 deletions brainiak/fcma/preprocessing.py
@@ -22,12 +22,21 @@
import logging
from scipy.stats.mstats import zscore
from mpi4py import MPI
from enum import Enum

from ..image import mask_images, multimask_images


logger = logging.getLogger(__name__)

__all__ = [
"RandomType",
"prepare_fcma_data",
"generate_epochs_info",
"prepare_mvpa_data",
"prepare_searchlight_mvpa_data",
]


def _separate_epochs(activity_data, epoch_list):
""" create data epoch by epoch
@@ -37,9 +37,9 @@ def _separate_epochs(activity_data, epoch_list):

Parameters
----------
activity\_data: list of 2D array in shape [nVoxels, nTRs]
activity_data: list of 2D array in shape [nVoxels, nTRs]
the masked activity data organized in voxel*TR formats of all subjects
epoch\_list: list of 3D array in shape [condition, nEpochs, nTRs]
epoch_list: list of 3D array in shape [condition, nEpochs, nTRs]
specification of epochs and conditions
assuming all subjects have the same number of epochs
len(epoch_list) equals the number of subjects
@@ -83,8 +92,72 @@ def _separate_epochs(activity_data, epoch_list):
return raw_data, labels


def _randomize_single_subject(data, seed=None):
"""Randomly permute the voxels of the subject.

The subject's data is organized as Voxel x TR;
this method shuffles the voxel dimension.

Parameters
----------
data: 2D array in shape [nVoxels, nTRs]
Activity data.
seed: Optional[int]
Seed for random state used implicitly for shuffling.

Returns
Contributor:

Good! Now just need to remove the Returns section from docs of both of these functions since they don't return anything anymore.

data: 2D array in shape [nVoxels, nTRs]
Activity data with the voxel dimension shuffled.
"""
if seed is not None:
np.random.seed(seed)
np.random.shuffle(data)
return data
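
A minimal standalone sketch, not part of this diff, of what the shuffle above does: np.random.shuffle permutes the first (voxel) axis in place, and seeding beforehand makes the permutation repeatable.

import numpy as np

toy = np.arange(8, dtype=np.float32).reshape((4, 2))  # 4 "voxels" x 2 "TRs"
np.random.seed(0)
np.random.shuffle(toy)  # rows (voxels) are permuted in place; each row's TR order is kept
# Re-seeding with 0 and shuffling a fresh copy reproduces the same row order.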


def _randomize_subject_list(data_list, random):
"""Randomly permute the voxels of a subject list.

The method shuffles the subjects one by one according to
the random type. If the type is RandomType.NORANDOM, the
original list is returned.

Parameters
----------
data_list: list of 2D array in shape [nVoxels, nTRs]
Activity data list.
random: RandomType
Randomization type.

Returns
data_list: list of 2D array in shape [nVoxels, nTRs]
(Randomized) activity data list.
"""
if random == RandomType.REPRODUCIBLE:
for i in range(len(data_list)):
data_list[i] = _randomize_single_subject(data_list[i], seed=i)
elif random == RandomType.UNREPRODUCIBLE:
for i in range(len(data_list)):
data_list[i] = _randomize_single_subject(data_list[i])
Contributor:

This is better, but still mixed usage. If you're going to do the change in place, don't return a value. If you're going to return a value, it should be a new value. So if you want to use in-place, it should look like:

for subject in data_list:
    _randomize_single_subject(subject)

So neither _randomize_single_subject nor this function should return a value. Also, note since you're modifying in place, you don't need to assign to indexes, so you can use the simpler for loop.

Member Author:

done

return data_list


class RandomType(Enum):
"""Define the random types as enumeration

NORANDOM means do not randomize the data;
REPRODUCIBLE means randomize the data with a fixed seed so that the
permutation holds between different runs;
UNREPRODUCIBLE means truly randomize the data which returns different
results in different runs.
"""
NORANDOM = 0
REPRODUCIBLE = 1
UNREPRODUCIBLE = 2
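
A rough usage sketch, assuming images, conditions, and mask1 have been loaded as in the example scripts further down; this is not code from the PR itself, just an illustration of the new random argument of prepare_fcma_data:

from brainiak.fcma.preprocessing import RandomType, prepare_fcma_data

# Permute the voxels the same way on every run (seeded per subject index):
raw_data, _, labels = prepare_fcma_data(images, conditions, mask1,
                                        random=RandomType.REPRODUCIBLE)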


def prepare_fcma_data(images, conditions, mask1, mask2=None,
comm=MPI.COMM_WORLD):
random=RandomType.NORANDOM, comm=MPI.COMM_WORLD):
"""Prepare data for correlation-based computation and analysis.

Generate epochs of interests, then broadcast to all workers.
@@ -102,6 +175,9 @@ def prepare_fcma_data(images, conditions, mask1, mask2=None,
If it is not specified, the method will assign None to the returning
variable raw_data2 and the self-correlation on raw_data1 will be
computed
random: Optional[RandomType]
Randomize the data within subject or not.
Default NORANDOM
comm: MPI.Comm
MPI communicator to use for MPI operations.

@@ -122,14 +198,17 @@ def prepare_fcma_data(images, conditions, mask1, mask2=None,
raw_data1 = []
raw_data2 = []
if rank == 0:
logger.info('start to apply masks and separate epochs')
if mask2 is not None:
masks = (mask1, mask2)
activity_data1, activity_data2 = zip(*multimask_images(images,
masks,
np.float32))
activity_data2 = _randomize_subject_list(activity_data2, random)
raw_data2, _ = _separate_epochs(activity_data2, conditions)
else:
activity_data1 = list(mask_images(images, mask1, np.float32))
activity_data1 = _randomize_subject_list(activity_data1, random)
raw_data1, labels = _separate_epochs(activity_data1, conditions)
time1 = time.time()
raw_data_length = len(raw_data1)
@@ -170,7 +249,7 @@ def generate_epochs_info(epoch_list):

Returns
-------
epoch\_info: list of tuple (label, sid, start, end).
epoch_info: list of tuple (label, sid, start, end).
label is the condition labels of the epochs;
sid is the subject id, corresponding to the index of raw_data;
start is the start TR of an epoch (inclusive);
@@ -250,7 +329,8 @@ def prepare_mvpa_data(images, conditions, mask):
return processed_data, labels


def prepare_searchlight_mvpa_data(images, conditions, data_type=np.float32):
def prepare_searchlight_mvpa_data(images, conditions, data_type=np.float32,
random=RandomType.NORANDOM):
""" obtain the data for activity-based voxel selection using Searchlight

Average the activity within epochs and z-scoring within subject,
@@ -266,10 +346,12 @@ def prepare_searchlight_mvpa_data(images, conditions, data_type=np.float32):
Condition specification.
data_type
Type to cast image to.
random: Optional[RandomType]
Randomize the data within subject or not.

Returns
-------
processed\_data: 4D array in shape [brain 3D + epoch]
processed_data: 4D array in shape [brain 3D + epoch]
averaged epoch by epoch processed data

labels: 1D array
@@ -290,10 +372,23 @@ def prepare_searchlight_mvpa_data(images, conditions, data_type=np.float32):
# counting the epochs per subject for z-scoring
subject_count = np.zeros(len(conditions), dtype=np.int32)

logger.info('start to apply masks and separate epochs')
for sid, f in enumerate(images):
data = f.get_data().astype(data_type)
[d1, d2, d3, d4] = data.shape
if random == RandomType.REPRODUCIBLE:
data = _randomize_single_subject(data.reshape((d1 * d2 * d3, d4)),
Contributor:

You could make this reshaping logic part of _randomize_single_subject (i.e. have it flatten, randomize, then reshape into whatever shape the original array was).

Member Author:

I didn't do this because I would like to keep the input argument data of _randomize_single_subject an nVoxels x nTRs 2D array, for clarity.

seed=sid).reshape((d1,
d2,
d3,
d4))
elif random == RandomType.UNREPRODUCIBLE:
data = _randomize_single_subject(
data.reshape((d1 * d2 * d3, d4))).reshape((d1,
d2,
d3,
d4))
if processed_data is None:
[d1, d2, d3, _] = data.shape
processed_data = np.empty([d1, d2, d3, num_epochs],
dtype=data_type)
# averaging
Expand All @@ -303,9 +398,9 @@ def prepare_searchlight_mvpa_data(images, conditions, data_type=np.float32):
processed_data[:, :, :, idx] = \
np.mean(data[:, :, :, epoch[2]:epoch[3]], axis=3)

logger.info(
'file %s is loaded and processed, with data shape %s' %
(f.get_filename(), data.shape)
logger.debug(
'file %s is loaded and processed, with data shape %s',
f.get_filename(), data.shape
)
# z-scoring
cur_epoch = 0
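
For reference, the reviewer's suggestion above, folding the flatten/shuffle/restore logic into _randomize_single_subject so callers would not have to reshape, might look roughly like the following sketch. The helper name and behavior here are hypothetical; the merged code keeps the 2D-only helper.

import numpy as np

def _randomize_single_subject_any_shape(data, seed=None):
    # Hypothetical shape-preserving variant, not the code merged in this PR.
    if seed is not None:
        np.random.seed(seed)
    original_shape = data.shape
    flat = data.reshape((-1, original_shape[-1]))  # collapse spatial dims into voxels
    np.random.shuffle(flat)  # permute the voxel dimension
    return flat.reshape(original_shape)
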
16 changes: 15 additions & 1 deletion brainiak/fcma/voxelselector.py
@@ -163,7 +163,8 @@ def _master(self):
the length of array equals the number of voxels
"""
logger.info(
'Master starts to allocate tasks'
'Master at rank %d starts to allocate tasks',
MPI.COMM_WORLD.Get_rank()
)
results = []
comm = MPI.COMM_WORLD
@@ -181,6 +182,10 @@ def _master(self):
if current_task[1] == 0:
using_size = i
break
logger.debug(
'master starts to send a task to worker %d' %
i
)
comm.send(current_task,
dest=i,
tag=self._WORKTAG)
@@ -235,6 +240,10 @@ def _worker(self, clf):
-------
None
"""
logger.debug(
'worker %d is running, waiting for tasks from master at rank %d' %
(MPI.COMM_WORLD.Get_rank(), self.master_rank)
)
comm = MPI.COMM_WORLD
status = MPI.Status()
while 1:
@@ -264,6 +273,11 @@ def _correlation_computation(self, task):
time1 = time.time()
s = task[0]
nEpochs = len(self.raw_data)
logger.debug(
'start to compute the correlation: #epochs: %d, '
'#processed voxels: %d, #total voxels to compute against: %d' %
(nEpochs, task[1], self.num_voxels2)
)
corr = np.zeros((task[1], nEpochs, self.num_voxels2),
np.float32, order='C')
count = 0
11 changes: 10 additions & 1 deletion brainiak/io.py
@@ -23,6 +23,7 @@
from pathlib import Path
from typing import Callable, Iterable, List, Union

import logging
import nibabel as nib
import numpy as np

@@ -31,6 +32,8 @@

from .image import SingleConditionSpec

logger = logging.getLogger(__name__)


def load_images_from_dir(in_dir: Union[str, Path], suffix: str = "nii.gz",
) -> Iterable[SpatialImage]:
@@ -58,6 +61,9 @@ def load_images_from_dir(in_dir: Union[str, Path], suffix: str = "nii.gz",
in_dir = Path(in_dir)
files = sorted(in_dir.glob("*" + suffix))
for f in files:
logger.debug(
'Starting to read file %s', f
)
Member:

I would say you should use the debug level for the messages you added in "io.py".

Also, do not break lines if they do not exceed the maximum line length.

Finally, the other message placement, before yield, makes more sense, but I would change the wording to "Starting to read [...]" and put a full stop at the end of the message.

Member Author:

done, but I would put some brief logger info at the beginning of some stages (reading files, applying masks, separating epochs, etc.)

Member:

Sure, but not in "io.py".

Member Author:

Why not? I think the io methods are the best places for them.
BTW, I feel that io.load_images_from_dir and io.load_images should be merged.

Member Author:

anyway, I will leave io.py intact

Member:

Because they are not useful. Why would a function log an info-level message saying it has been called every time it is called? Such messages could be emitted by the caller, if relevant.

We can discuss merging the load functions, but I would rather not, because "_dir" requires a suffix as well, so we would need to explain that.

Member Author:

well, the suffix can be an optional argument. Let's discuss this in the next standup.

Member:

Please write sentences in log messages, at least in this file.

Member Author:

done

yield nib.load(str(f))


Expand Down Expand Up @@ -86,6 +92,9 @@ def load_images(image_paths: Iterable[Union[str, Path]]
string_path = str(image_path)
else:
string_path = image_path
logger.debug(
'Starting to read file %s', string_path
)
yield nib.load(string_path)


Expand Down Expand Up @@ -130,7 +139,7 @@ def load_labels(path: Union[str, Path]) -> List[SingleConditionSpec]:
List[SingleConditionSpec]
List of SingleConditionSpec stored in labels file.
"""
condition_specs = np.load(path)
condition_specs = np.load(str(path))
return [c.view(SingleConditionSpec) for c in condition_specs]


Expand Down
17 changes: 15 additions & 2 deletions examples/fcma/mvpa_voxel_selection.py
@@ -39,7 +39,7 @@
MPI.COMM_WORLD.Get_size()
)
data_dir = sys.argv[1]
extension = sys.argv[2]
suffix = sys.argv[2]
mask_file = sys.argv[3]
epoch_file = sys.argv[4]

@@ -53,11 +53,24 @@
'mask size: %d' %
np.sum(mask)
)
images = io.load_images_from_dir(data_dir, extension)
images = io.load_images_from_dir(data_dir, suffix=suffix)
conditions = io.load_labels(epoch_file)
data, labels = prepare_searchlight_mvpa_data(images, conditions)

# setting the random argument produces random voxel selection results
# for non-parametric statistical analysis.
# There are three random options:
# RandomType.NORANDOM is the default
# RandomType.REPRODUCIBLE permutes the voxels in the same way every run
# RandomType.UNREPRODUCIBLE permutes the voxels differently across runs
# example
#from brainiak.fcma.preprocessing import RandomType
#data, labels = prepare_searchlight_mvpa_data(images, conditions,
# random=RandomType.UNREPRODUCIBLE)

# the following line is an example of leaving a subject out
#epoch_info = [x for x in epoch_info if x[1] != 0]

num_subjs = int(sys.argv[5])
# create a Searchlight object
sl = Searchlight(sl_rad=1)
18 changes: 16 additions & 2 deletions examples/fcma/voxel_selection.py
@@ -38,19 +38,33 @@
MPI.COMM_WORLD.Get_size()
)
data_dir = sys.argv[1]
extension = sys.argv[2]
suffix = sys.argv[2]
mask_file = sys.argv[3]
epoch_file = sys.argv[4]
images = io.load_images_from_dir(data_dir, extension)
images = io.load_images_from_dir(data_dir, suffix=suffix)
mask = io.load_boolean_mask(mask_file)
conditions = io.load_labels(epoch_file)
raw_data, _, labels = prepare_fcma_data(images, conditions, mask)

# setting the random argument produces random voxel selection results
# for non-parametric statistical analysis.
# There are three random options:
# RandomType.NORANDOM is the default
# RandomType.REPRODUCIBLE permutes the voxels in the same way every run
# RandomType.UNREPRODUCIBLE permutes the voxels differently across runs
# example:
# from brainiak.fcma.preprocessing import RandomType
# raw_data, _, labels = prepare_fcma_data(images, conditions, mask,
# random=RandomType.REPRODUCIBLE)

# if providing two masks, just append the second mask as the last input argument
# and specify raw_data2
# example:
# images = io.load_images_from_dir(data_dir, suffix=suffix)
# mask2 = io.load_boolean_mask('face_scene/mask.nii.gz')
# raw_data, raw_data2, labels = prepare_fcma_data(images, conditions, mask,
# mask2)

epochs_per_subj = int(sys.argv[5])
num_subjs = int(sys.argv[6])
# the following line is an example of leaving a subject out