Random permutation of FCMA #217

Merged
merged 10 commits on Apr 28, 2017
115 changes: 105 additions & 10 deletions brainiak/fcma/preprocessing.py
@@ -22,12 +22,21 @@
import logging
from scipy.stats.mstats import zscore
from mpi4py import MPI
from enum import Enum

from ..image import mask_images, multimask_images


logger = logging.getLogger(__name__)

__all__ = [
"RandomType",
"prepare_fcma_data",
"generate_epochs_info",
"prepare_mvpa_data",
"prepare_searchlight_mvpa_data",
]


def _separate_epochs(activity_data, epoch_list):
""" create data epoch by epoch
@@ -37,9 +37,9 @@ def _separate_epochs(activity_data, epoch_list):

Parameters
----------
activity\_data: list of 2D array in shape [nVoxels, nTRs]
activity_data: list of 2D array in shape [nVoxels, nTRs]
the masked activity data organized in voxel*TR formats of all subjects
epoch\_list: list of 3D array in shape [condition, nEpochs, nTRs]
epoch_list: list of 3D array in shape [condition, nEpochs, nTRs]
specification of epochs and conditions
assuming all subjects have the same number of epochs
len(epoch_list) equals the number of subjects
@@ -83,8 +92,72 @@ def _separate_epochs(activity_data, epoch_list):
return raw_data, labels


def _randomize_single_subject(data, seed=None):
"""Randomly permute the voxels of the subject.

The subject's data is organized as Voxel x TR;
this method shuffles the voxel dimension.

Parameters
----------
data: 2D array in shape [nVoxels, nTRs]
Activity data.
seed: Optional[int]
Seed for random state used implicitly for shuffling.

Returns
Contributor:

Good! Now just need to remove the Returns section from docs of both of these functions since they don't return anything anymore.

data: 2D array in shape [nVoxels, nTRs]
Activity data with the voxel dimension shuffled.
"""
if seed is not None:
np.random.seed(seed)
np.random.shuffle(data)
return data
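
A minimal standalone sketch, not part of this diff, of what the shuffle above does: np.random.shuffle permutes the first (voxel) axis in place, and seeding beforehand makes the permutation repeatable.

import numpy as np

toy = np.arange(8, dtype=np.float32).reshape((4, 2))  # 4 "voxels" x 2 "TRs"
np.random.seed(0)
np.random.shuffle(toy)  # rows (voxels) are permuted in place; each row's TR order is kept
# Re-seeding with 0 and shuffling a fresh copy reproduces the same row order.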


def _randomize_subject_list(data_list, random):
"""Randomly permute the voxels of a subject list.

The method shuffles the subjects one by one according to
the random type. If the type is RandomType.NORANDOM, the
original list is returned.

Parameters
----------
data_list: list of 2D array in shape [nVoxels, nTRs]
Activity data list.
random: RandomType
Randomization type.

Returns
data_list: list of 2D array in shape [nVoxels, nTRs]
(Randomized) activity data list.
"""
if random == RandomType.REPRODUCIBLE:
for i in range(len(data_list)):
data_list[i] = _randomize_single_subject(data_list[i], seed=i)
elif random == RandomType.UNREPRODUCIBLE:
for i in range(len(data_list)):
data_list[i] = _randomize_single_subject(data_list[i])
Contributor:

This is better, but still mixed usage. If you're going to do the change in place, don't return a value. If you're going to return a value, it should be a new value. So if you want to use in-place, it should look like:

for subject in data_list:
    _randomize_single_subject(subject)

So neither _randomize_single_subject nor this function should return a value. Also, note since you're modifying in place, you don't need to assign to indexes, so you can use the simpler for loop.

Member Author:

done

return data_list


class RandomType(Enum):
"""Define the random types as enumeration

NORANDOM means do not randomize the data;
REPRODUCIBLE means randomize the data with a fixed seed so that the
permutation holds between different runs;
UNREPRODUCIBLE means truly randomize the data which returns different
results in different runs.
"""
NORANDOM = 0
REPRODUCIBLE = 1
UNREPRODUCIBLE = 2
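
A rough usage sketch, assuming images, conditions, and mask1 have been loaded as in the example scripts further down; this is not code from the PR itself, just an illustration of the new random argument of prepare_fcma_data:

from brainiak.fcma.preprocessing import RandomType, prepare_fcma_data

# Permute the voxels the same way on every run (seeded per subject index):
raw_data, _, labels = prepare_fcma_data(images, conditions, mask1,
                                        random=RandomType.REPRODUCIBLE)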


def prepare_fcma_data(images, conditions, mask1, mask2=None,
comm=MPI.COMM_WORLD):
random=RandomType.NORANDOM, comm=MPI.COMM_WORLD):
"""Prepare data for correlation-based computation and analysis.

Generate epochs of interests, then broadcast to all workers.
@@ -102,6 +175,9 @@ def prepare_fcma_data(images, conditions, mask1, mask2=None,
If it is not specified, the method will assign None to the returning
variable raw_data2 and the self-correlation on raw_data1 will be
computed
random: Optional[RandomType]
Randomize the data within subject or not.
Default NORANDOM
comm: MPI.Comm
MPI communicator to use for MPI operations.

@@ -122,14 +198,17 @@ def prepare_fcma_data(images, conditions, mask1, mask2=None,
raw_data1 = []
raw_data2 = []
if rank == 0:
logger.info('start to apply masks and separate epochs')
if mask2 is not None:
masks = (mask1, mask2)
activity_data1, activity_data2 = zip(*multimask_images(images,
masks,
np.float32))
activity_data2 = _randomize_subject_list(activity_data2, random)
raw_data2, _ = _separate_epochs(activity_data2, conditions)
else:
activity_data1 = list(mask_images(images, mask1, np.float32))
activity_data1 = _randomize_subject_list(activity_data1, random)
raw_data1, labels = _separate_epochs(activity_data1, conditions)
time1 = time.time()
raw_data_length = len(raw_data1)
@@ -170,7 +249,7 @@ def generate_epochs_info(epoch_list):

Returns
-------
epoch\_info: list of tuple (label, sid, start, end).
epoch_info: list of tuple (label, sid, start, end).
label is the condition labels of the epochs;
sid is the subject id, corresponding to the index of raw_data;
start is the start TR of an epoch (inclusive);
@@ -250,7 +329,8 @@ def prepare_mvpa_data(images, conditions, mask):
return processed_data, labels


def prepare_searchlight_mvpa_data(images, conditions, data_type=np.float32):
def prepare_searchlight_mvpa_data(images, conditions, data_type=np.float32,
random=RandomType.NORANDOM):
""" obtain the data for activity-based voxel selection using Searchlight

Average the activity within epochs and z-scoring within subject,
@@ -266,10 +346,12 @@ def prepare_searchlight_mvpa_data(images, conditions, data_type=np.float32):
Condition specification.
data_type
Type to cast image to.
random: Optional[RandomType]
Randomize the data within subject or not.

Returns
-------
processed\_data: 4D array in shape [brain 3D + epoch]
processed_data: 4D array in shape [brain 3D + epoch]
averaged epoch by epoch processed data

labels: 1D array
@@ -290,10 +372,23 @@ def prepare_searchlight_mvpa_data(images, conditions, data_type=np.float32):
# counting the epochs per subject for z-scoring
subject_count = np.zeros(len(conditions), dtype=np.int32)

logger.info('start to apply masks and separate epochs')
for sid, f in enumerate(images):
data = f.get_data().astype(data_type)
[d1, d2, d3, d4] = data.shape
if random == RandomType.REPRODUCIBLE:
data = _randomize_single_subject(data.reshape((d1 * d2 * d3, d4)),
Contributor:

You could make this reshaping logic part of _randomize_single_subject (i.e. have it flatten, randomize, then reshape into whatever shape the original array was).

Member Author:

I didn't do this because I would like to keep the input argument data of _randomize_single_subject an nVoxels x nTRs 2D array, for clarity.

seed=sid).reshape((d1,
d2,
d3,
d4))
elif random == RandomType.UNREPRODUCIBLE:
data = _randomize_single_subject(
data.reshape((d1 * d2 * d3, d4))).reshape((d1,
d2,
d3,
d4))
if processed_data is None:
[d1, d2, d3, _] = data.shape
processed_data = np.empty([d1, d2, d3, num_epochs],
dtype=data_type)
# averaging
Expand All @@ -303,9 +398,9 @@ def prepare_searchlight_mvpa_data(images, conditions, data_type=np.float32):
processed_data[:, :, :, idx] = \
np.mean(data[:, :, :, epoch[2]:epoch[3]], axis=3)

logger.info(
'file %s is loaded and processed, with data shape %s' %
(f.get_filename(), data.shape)
logger.debug(
'file %s is loaded and processed, with data shape %s',
f.get_filename(), data.shape
)
# z-scoring
cur_epoch = 0
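
For reference, the reviewer's suggestion above, folding the flatten/shuffle/restore logic into _randomize_single_subject so callers would not have to reshape, might look roughly like the following sketch. The helper name and behavior here are hypothetical; the merged code keeps the 2D-only helper.

import numpy as np

def _randomize_single_subject_any_shape(data, seed=None):
    # Hypothetical shape-preserving variant, not the code merged in this PR.
    if seed is not None:
        np.random.seed(seed)
    original_shape = data.shape
    flat = data.reshape((-1, original_shape[-1]))  # collapse spatial dims into voxels
    np.random.shuffle(flat)  # permute the voxel dimension
    return flat.reshape(original_shape)
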
16 changes: 15 additions & 1 deletion brainiak/fcma/voxelselector.py
@@ -163,7 +163,8 @@ def _master(self):
the length of array equals the number of voxels
"""
logger.info(
'Master starts to allocate tasks'
'Master at rank %d starts to allocate tasks',
MPI.COMM_WORLD.Get_rank()
)
results = []
comm = MPI.COMM_WORLD
@@ -181,6 +182,10 @@ def _master(self):
if current_task[1] == 0:
using_size = i
break
logger.debug(
'master starts to send a task to worker %d' %
i
)
comm.send(current_task,
dest=i,
tag=self._WORKTAG)
@@ -235,6 +240,10 @@ def _worker(self, clf):
-------
None
"""
logger.debug(
'worker %d is running, waiting for tasks from master at rank %d' %
(MPI.COMM_WORLD.Get_rank(), self.master_rank)
)
comm = MPI.COMM_WORLD
status = MPI.Status()
while 1:
@@ -264,6 +273,11 @@ def _correlation_computation(self, task):
time1 = time.time()
s = task[0]
nEpochs = len(self.raw_data)
logger.debug(
'start to compute the correlation: #epochs: %d, '
'#processed voxels: %d, #total voxels to compute against: %d' %
(nEpochs, task[1], self.num_voxels2)
)
corr = np.zeros((task[1], nEpochs, self.num_voxels2),
np.float32, order='C')
count = 0
11 changes: 10 additions & 1 deletion brainiak/io.py
@@ -23,6 +23,7 @@
from pathlib import Path
from typing import Callable, Iterable, List, Union

import logging
import nibabel as nib
import numpy as np

@@ -31,6 +32,8 @@

from .image import SingleConditionSpec

logger = logging.getLogger(__name__)


def load_images_from_dir(in_dir: Union[str, Path], suffix: str = "nii.gz",
) -> Iterable[SpatialImage]:
@@ -58,6 +61,9 @@ def load_images_from_dir(in_dir: Union[str, Path], suffix: str = "nii.gz",
in_dir = Path(in_dir)
files = sorted(in_dir.glob("*" + suffix))
for f in files:
logger.debug(
'Starting to read file %s', f
)
Member:

I would say you should use the debug level for the messages you added in "io.py".

Also, do not break lines if they do not exceed the maximum line length.

Finally, the other message placement, before yield, makes more sense, but I would change the wording to "Starting to read [...]" and put a full stop at the end of the message.

Member Author:

done, but I would put some brief logger info at the beginning of some stages (reading files, applying masks, separating epochs, etc.)

Member:

Sure, but not in "io.py".

Member Author:

Why not? I think the io methods are the best places for them.
BTW, I feel that io.load_images_from_dir and io.load_images should be merged.

Member Author:

anyway, I will leave io.py intact

Member:

Because they are not useful. Why would a function log an info-level message saying it has been called every time it is called? Such messages could be emitted by the caller, if relevant.

We can discuss merging the load functions, but I would rather not, because "_dir" requires a suffix as well, so we would need to explain that.

Member Author:

well, the suffix can be an optional argument. Let's discuss this in the next standup.

Member:

Please write sentences in log messages, at least in this file.

Member Author:

done

yield nib.load(str(f))


Expand Down Expand Up @@ -86,6 +92,9 @@ def load_images(image_paths: Iterable[Union[str, Path]]
string_path = str(image_path)
else:
string_path = image_path
logger.debug(
'Starting to read file %s', string_path
)
yield nib.load(string_path)


Expand Down Expand Up @@ -130,7 +139,7 @@ def load_labels(path: Union[str, Path]) -> List[SingleConditionSpec]:
List[SingleConditionSpec]
List of SingleConditionSpec stored in labels file.
"""
condition_specs = np.load(path)
condition_specs = np.load(str(path))
return [c.view(SingleConditionSpec) for c in condition_specs]


Expand Down
17 changes: 15 additions & 2 deletions examples/fcma/mvpa_voxel_selection.py
@@ -39,7 +39,7 @@
MPI.COMM_WORLD.Get_size()
)
data_dir = sys.argv[1]
extension = sys.argv[2]
suffix = sys.argv[2]
mask_file = sys.argv[3]
epoch_file = sys.argv[4]

@@ -53,11 +53,24 @@
'mask size: %d' %
np.sum(mask)
)
images = io.load_images_from_dir(data_dir, extension)
images = io.load_images_from_dir(data_dir, suffix=suffix)
conditions = io.load_labels(epoch_file)
data, labels = prepare_searchlight_mvpa_data(images, conditions)

# setting the random argument produces random voxel selection results
# for non-parametric statistical analysis.
# There are three random options:
# RandomType.NORANDOM is the default
# RandomType.REPRODUCIBLE permutes the voxels in the same way every run
# RandomType.UNREPRODUCIBLE permutes the voxels differently across runs
# example
#from brainiak.fcma.preprocessing import RandomType
#data, labels = prepare_searchlight_mvpa_data(images, conditions,
# random=RandomType.UNREPRODUCIBLE)

# the following line is an example of leaving a subject out
#epoch_info = [x for x in epoch_info if x[1] != 0]

num_subjs = int(sys.argv[5])
# create a Searchlight object
sl = Searchlight(sl_rad=1)
18 changes: 16 additions & 2 deletions examples/fcma/voxel_selection.py
@@ -38,19 +38,33 @@
MPI.COMM_WORLD.Get_size()
)
data_dir = sys.argv[1]
extension = sys.argv[2]
suffix = sys.argv[2]
mask_file = sys.argv[3]
epoch_file = sys.argv[4]
images = io.load_images_from_dir(data_dir, extension)
images = io.load_images_from_dir(data_dir, suffix=suffix)
mask = io.load_boolean_mask(mask_file)
conditions = io.load_labels(epoch_file)
raw_data, _, labels = prepare_fcma_data(images, conditions, mask)

# setting the random argument produces random voxel selection results
# for non-parametric statistical analysis.
# There are three random options:
# RandomType.NORANDOM is the default
# RandomType.REPRODUCIBLE permutes the voxels in the same way every run
# RandomType.UNREPRODUCIBLE permutes the voxels differently across runs
# example:
# from brainiak.fcma.preprocessing import RandomType
# raw_data, _, labels = prepare_fcma_data(images, conditions, mask,
# random=RandomType.REPRODUCIBLE)

# if providing two masks, just append the second mask as the last input argument
# and specify raw_data2
# example:
# images = io.load_images_from_dir(data_dir, suffix=suffix)
# mask2 = io.load_boolean_mask('face_scene/mask.nii.gz')
# raw_data, raw_data2, labels = prepare_fcma_data(images, conditions, mask,
# mask2)

epochs_per_subj = int(sys.argv[5])
num_subjs = int(sys.argv[6])
# the following line is an example of leaving a subject out