Packaged our code as a python module called stat159lambda
AlonDaks committed Dec 9, 2015
1 parent 4adfa6c commit 2ad3680
Showing 27 changed files with 1,291 additions and 0 deletions.
Empty file added code/__init__.py
Empty file.
5 changes: 5 additions & 0 deletions code/stat159lambda/Makefile
@@ -0,0 +1,5 @@
test:
nosetests utils

coverage:
nosetests utils --with-coverage --cover-package=utils
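Both targets assume they are invoked from code/stat159lambda, since nosetests is given the relative utils package path.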
Empty file added code/stat159lambda/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions code/stat159lambda/config.py
@@ -0,0 +1,12 @@
import os

REPO_HOME_PATH = os.path.realpath(
    os.path.join(os.path.dirname(__file__), '..', '..'))

USE_CACHED_DATA = os.environ.get('STAT159_CACHED_DATA', 'True') == 'True'

NUM_PROCESSES = int(os.environ.get('STAT159_NUM_PROCESSES', 5))

NUM_VOXELS = 1108800

NUM_TOTAL_VOLUMES = 3543
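
A minimal sketch (not part of this commit) of how these settings respond to the environment variables defined above; the values here are illustrative:

import os

# Override the cache and parallelism settings before the package
# configuration module is imported, since it reads os.environ at import time.
os.environ['STAT159_CACHED_DATA'] = 'False'
os.environ['STAT159_NUM_PROCESSES'] = '4'

from stat159lambda import config as cf
assert not cf.USE_CACHED_DATA
assert cf.NUM_PROCESSES == 4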
Empty file added code/stat159lambda/reproduction/__init__.py
Empty file.
31 changes: 31 additions & 0 deletions code/stat159lambda/reproduction/inter_run_diagnostics.py
@@ -0,0 +1,31 @@
from __future__ import print_function
from __future__ import division
import numpy as np
import sys
import matplotlib.pyplot as plt
import gc
from stat159lambda.config import REPO_HOME_PATH


def calc_vol_rms_diff(data_file_path):
    data = np.load(data_file_path)
    diff_data = np.diff(data, axis=1)
    del data
    gc.collect()
    vol_rms_diff = np.sqrt(np.mean(diff_data**2, axis=0))
    # Skip the first 9 differences, which involve the earliest volumes
    return vol_rms_diff[9:]


def save_plot(vol_rms_diff, subj_num):
plt.plot(vol_rms_diff)
plt.savefig('{0}/figures/subj{1}_vol_rms_diff.png'.format(
REPO_HOME_PATH, subj_num))


if __name__ == '__main__':
subj_num = sys.argv[1]
data_file_path = '{0}/data/processed/sub{1}_rcds_2d.npy'.format(
REPO_HOME_PATH, subj_num)
vol_rms_diff = calc_vol_rms_diff(data_file_path)
save_plot(vol_rms_diff, subj_num)
del vol_rms_diff
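
A hedged usage sketch of this module from Python rather than the command line, assuming subject 1's rcds 2D file has already been generated by preprocess.py:

from stat159lambda.config import REPO_HOME_PATH
from stat159lambda.reproduction.inter_run_diagnostics import (
    calc_vol_rms_diff, save_plot)

# Same file the __main__ block constructs for subject 1
data_file_path = '{0}/data/processed/sub1_rcds_2d.npy'.format(REPO_HOME_PATH)
vol_rms_diff = calc_vol_rms_diff(data_file_path)
save_plot(vol_rms_diff, 1)  # writes figures/subj1_vol_rms_diff.png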
84 changes: 84 additions & 0 deletions code/stat159lambda/reproduction/preprocess.py
@@ -0,0 +1,84 @@
from __future__ import print_function
from __future__ import division
import json
import nibabel as nib
import numpy as np
import sys
from glob import glob
from os.path import exists
from multiprocessing import Pool
from stat159lambda import config as cf
from stat159lambda.config import REPO_HOME_PATH

with open('{0}/data/data_path.json'.format(REPO_HOME_PATH)) as f:
    DATA_PATHS = json.load(f)

INCORRECT_NUM_ARGS_MESSAGE = 'Invalid number of arguments: specify alignment type'
ILLEGAL_ARG_MESSAGE = 'alignment argument must be one of <linear>, <non_linear>, <rcds>'


def concatenate_runs(alignment):
for i, subject in enumerate(DATA_PATHS['subjects']):
nii_file_name = '{0}/data/processed/sub{1}_{2}'.format(
REPO_HOME_PATH, i + 1, alignment)
if not exists(nii_file_name + '.nii') or not cf.USE_CACHED_DATA:
run_data = []
for j, run in enumerate(subject['runs']):
task_path = run[alignment]['path']
img = nib.load(REPO_HOME_PATH + '/' + task_path)
data = img.get_data()
                # Adjacent runs overlap by 4 volumes at each boundary; trim
                # the overlap so no volume appears twice after concatenation
                if j == 0:
                    run_data.append(data[..., :-4])
                elif 1 <= j <= 6:
                    run_data.append(data[..., 4:-4])
                else:
                    run_data.append(data[..., 4:])
concatenated_run_data = np.concatenate(run_data, axis=3)
concatenated_img = nib.Nifti1Image(concatenated_run_data,
np.eye(4))
nib.save(concatenated_img, nii_file_name)
print('Saved {0}'.format(nii_file_name + '.nii'))
else:
print('Using cached version of {0}'.format(nii_file_name + '.nii'))


def reshape_data_to_2d(alignment):
files_to_reshape = np.sort(glob('{0}/data/processed/sub*_{1}.nii'.format(
REPO_HOME_PATH, alignment)))
for f in files_to_reshape:
file_name_2d = f.replace('.nii', '_2d')
if not exists(file_name_2d + '.npy') or not cf.USE_CACHED_DATA:
data = nib.load(f).get_data()
data_chunks = partition_4d_data(data)
            p = Pool(cf.NUM_PROCESSES)
            reshaped_chunks = p.imap(reshape_4d_data, data_chunks)
            data_2d = merge_2d_data(reshaped_chunks)
            # Release the worker processes once the chunks are merged
            p.close()
            p.join()
            np.save(file_name_2d, data_2d)
print('Saved {0}'.format(file_name_2d + '.npy'))
else:
print('Using cached version of {0}'.format(file_name_2d + '.npy'))


def partition_4d_data(data):
num_partitions = cf.NUM_PROCESSES
    num_volumes = data.shape[-1]
    partition_indices = [(num_volumes // num_partitions) * i
for i in range(1, num_partitions)]
return np.split(data, partition_indices, axis=3)


def reshape_4d_data(data):
return np.reshape(data, (-1, data.shape[-1]))


def merge_2d_data(data_slices):
return np.hstack(tuple(data_slices))

# def band_pass_filter():

if __name__ == '__main__':
if len(sys.argv) != 2:
raise ValueError(INCORRECT_NUM_ARGS_MESSAGE)
alignment = sys.argv[1]
if alignment not in ['linear', 'non_linear', 'rcds']:
raise ValueError(ILLEGAL_ARG_MESSAGE)
concatenate_runs(alignment)
reshape_data_to_2d(alignment)
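
A toy illustration (synthetic arrays, not repository data) of the run-trimming rule in concatenate_runs, where adjacent runs overlap by 4 volumes:

import numpy as np

# Three fake runs of 12 volumes each, on a 1x1x1 voxel grid
runs = [np.zeros((1, 1, 1, 12)) + k for k in range(3)]
pieces = [runs[0][..., :-4], runs[1][..., 4:-4], runs[2][..., 4:]]
stitched = np.concatenate(pieces, axis=3)
print(stitched.shape)  # (1, 1, 1, 20): 8 + 4 + 8 volumes survive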
85 changes: 85 additions & 0 deletions code/stat159lambda/reproduction/similarity.py
@@ -0,0 +1,85 @@
from __future__ import print_function
from __future__ import division
import nibabel as nib
import numpy as np
from scipy import stats
from multiprocessing import Process
import itertools
import sys
from os import path
import sharedmem as sm
import gc
from stat159lambda import config as cf
from stat159lambda.config import REPO_HOME_PATH

INCORRECT_NUM_ARGS_MESSAGE = 'Invalid number of arguments: specify alignment type, subject1 and subject2'
ILLEGAL_ALIGNMENT_ARG_MESSAGE = 'alignment argument must be one of <linear>, <non_linear>, <rcds>'
ILLEGAL_SUBJECTS_ARG_MESSAGE = 'subject arguments must be integers corresponding to subject numbers'


def parallelize_correlation():
    # shared_subject1 and shared_subject2 are module-level shared-memory
    # arrays created in __main__ before this function is called. Round the
    # chunk size up so the last chunk also covers any remainder voxels.
    chunk_size = int(np.ceil(len(shared_subject1) / cf.NUM_PROCESSES))
    output_correlations = sm.empty(len(shared_subject1))
    processes = [
        Process(target=correlation,
                args=(shared_subject1, shared_subject2, i * chunk_size,
                      min((i + 1) * chunk_size, len(shared_subject1)),
                      output_correlations)) for i in range(cf.NUM_PROCESSES)
    ]
for p in processes:
p.start()
for p in processes:
p.join()
return output_correlations


def correlation(shared_subject1, shared_subject2, start_index, stop_index,
                output_correlations):
    for i in range(start_index, stop_index):
        output_correlations[i] = stats.pearsonr(shared_subject1[i, :],
                                                shared_subject2[i, :])[0]


def check_command_line_arguments(arguments):
if len(arguments) != 4:
raise ValueError(INCORRECT_NUM_ARGS_MESSAGE)
alignment = arguments[1]
if alignment not in ['linear', 'non_linear', 'rcds']:
raise ValueError(ILLEGAL_ALIGNMENT_ARG_MESSAGE)
    try:
        subject1 = int(arguments[2])
        subject2 = int(arguments[3])
    except ValueError:
        raise ValueError(ILLEGAL_SUBJECTS_ARG_MESSAGE)
    subject1_file_path = '{0}/data/processed/sub{1}_{2}_2d.npy'.format(
        REPO_HOME_PATH, subject1, alignment)
    subject2_file_path = '{0}/data/processed/sub{1}_{2}_2d.npy'.format(
        REPO_HOME_PATH, subject2, alignment)
    if not path.exists(subject1_file_path):
        raise ValueError(
            'File missing: {0}, run preprocess.py to generate necessary file'.format(
                subject1_file_path))
    if not path.exists(subject2_file_path):
        raise ValueError(
            'File missing: {0}, run preprocess.py to generate necessary file'.format(
                subject2_file_path))
return (subject1_file_path, subject2_file_path)


if __name__ == '__main__':
subject1_file_path, subject2_file_path = check_command_line_arguments(
sys.argv)
correlation_file_name = '{0}/data/processed/r_sub{1}_sub{2}_{3}'.format(
REPO_HOME_PATH, sys.argv[2], sys.argv[3], sys.argv[1])
if not path.exists(correlation_file_name +
'.npy') or not cf.USE_CACHED_DATA:
shared_subject1 = sm.copy(np.load(subject1_file_path))
gc.collect()
shared_subject2 = sm.copy(np.load(subject2_file_path))
gc.collect()
voxel_correlations = parallelize_correlation()
np.save(correlation_file_name, voxel_correlations)
print('Saved {0}'.format(correlation_file_name + '.npy'))
else:
print('Using cached version of {0}'.format(correlation_file_name +
'.npy'))
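
A hedged sketch of the per-voxel statistic computed by correlation(), on synthetic arrays and without the shared-memory or multiprocessing plumbing:

import numpy as np
from scipy import stats

# Two fake subjects: 10 voxels by 50 volumes, with correlated signals
rng = np.random.RandomState(0)
subject1 = rng.randn(10, 50)
subject2 = subject1 + 0.5 * rng.randn(10, 50)

# Pearson r between matching voxel time courses, one value per voxel
r = np.array([stats.pearsonr(subject1[i, :], subject2[i, :])[0]
              for i in range(10)])
print(r.shape)  # (10,)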
Empty file added code/stat159lambda/utils/__init__.py
Empty file.
154 changes: 154 additions & 0 deletions code/stat159lambda/utils/linear_modeling.py
@@ -0,0 +1,154 @@
# Python 3 compatibility
from __future__ import division, print_function
import numpy as np
import matplotlib.pyplot as plt
import numpy.linalg as npl
from scipy.stats import t as t_dist
from scipy.ndimage import gaussian_filter
import stat159lambda.utils.scene_slicer as ssm
import nibabel as nib


def get_design_matrix(data_path, scene_csv_path):
    """
    Parameters
    ----------
    data_path: path to a 4D NIfTI file
    scene_csv_path: path to the scenes CSV used by SceneSlicer
    Returns
    -------
    Design matrix with 3 columns: a column of ones, the linear drift
    column, and the day/night indicator column
    """
    data = nib.load(data_path)
    n_trs = data.shape[-1]
    X = np.ones((n_trs, 3))
    ss = ssm.SceneSlicer(data_path, scene_csv_path)
    day_night, int_ext = ss.get_scene_slices()
    X[:, 1] = np.linspace(-1, 1, n_trs)
    X[:, 2] = day_night
    return X

# Design matrix that is passed into the random forest classifier
def get_rf_design_matrix(voxels, data):
    ss = ssm.SceneSlicer('test_data.nii', 'scenes.csv')
    day_night, int_ext = ss.get_scene_slices()
    new_X = np.zeros((data.shape[-1], len(voxels)))
    for num in range(len(voxels)):
        new_X[:, num] = data[voxels[num]]
    return new_X, day_night


def plot_design_matrix(X):
"""
Returns
-------
None
"""
plt.imshow(X, aspect=0.1, cmap='gray', interpolation='nearest')
plt.xticks([])


def get_betas_Y(X, data):
    """
    Returns
    -------
    B: 2D array, p x number of voxels, of fitted coefficients
    Y.T: 2D array, number of volumes x number of voxels, the reshaped data
    """
    Y = np.reshape(data, (-1, data.shape[-1]))
    B = npl.pinv(X).dot(Y.T)
    return B, Y.T


def get_betas_4d(B, data):
"""
Returns
-------
4D array, beta for each voxel; need this format to plot
"""
return np.reshape(B.T, data.shape[:-1] + (-1,))


def plot_betas(b_vols, col):
    """
    Parameters
    ----------
    b_vols: 4D array of betas, one volume per regressor
    col: integer between 0 and p-1 selecting the regressor to plot
    Returns
    -------
    None
    """
    if col >= b_vols.shape[-1]:
        raise RuntimeError("Error: select a column between 0 and p-1")
    c = b_vols.shape[2] // 2
    plt.imshow(b_vols[:, :, c, col], cmap='gray', interpolation='nearest')


def t_stat(y, X, c):
    """ betas, t statistic and significance test given data,
    design matrix, contrast
    This is OLS estimation; we assume the errors to have independent
    and identical normal distributions around zero for each $i$ in
    $\epsilon_i$ (i.i.d.).
    """
# Make sure y, X, c are all arrays
y = np.asarray(y)
X = np.asarray(X)
c = np.atleast_2d(c).T # As column vector
# Calculate the parameters - b hat
beta = npl.pinv(X).dot(y)
# The fitted values - y hat
fitted = X.dot(beta)
# Residual error
errors = y - fitted
# Residual sum of squares
RSS = (errors**2).sum(axis=0)
# Degrees of freedom is the number of observations n minus the number
# of independent regressors we have used. If all the regressor
# columns in X are independent then the (matrix rank of X) == p
# (where p the number of columns in X). If there is one column that
# can be expressed as a linear sum of the other columns then
# (matrix rank of X) will be p - 1 - and so on.
df = X.shape[0] - npl.matrix_rank(X)
# Mean residual sum of squares
MRSS = RSS / df
# calculate bottom half of t statistic
SE = np.sqrt(MRSS * c.T.dot(npl.pinv(X.T.dot(X)).dot(c)))
t = c.T.dot(beta) / SE
return abs(t[0])


def get_top_32(t, thresh=100/1108800):
    """
    Parameters
    ----------
    t: 1D array of t-statistics for each voxel
    thresh: fraction of voxels to keep
    Returns
    -------
    1D array of the positions of the voxels with the largest t-statistics
    (all t-statistics are positive)
    """
    a = np.int32(round(len(t) * thresh))
    return t.argsort()[-a:][::-1]


def get_index_4d(top_32_voxels, data):
    """
    Parameters
    ----------
    top_32_voxels: 1D array of flat indices of the top voxels
    data: 4D array of voxel data
    Returns
    -------
    List of (x, y, z) indices into the 4D array, one per top voxel
    """
    shape = data.shape[:-1]
    axes = np.unravel_index(top_32_voxels, shape)
    return list(zip(*axes))


def plot_single_voxel(data, top_100_voxels):
    """
    Returns
    -------
    None
    """
    plt.plot(data[get_index_4d(top_100_voxels, data)[0]])
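
A small worked example (synthetic data, illustrative only) of t_stat with a contrast selecting the day/night regressor, assuming the stat159lambda package is importable:

import numpy as np
from stat159lambda.utils.linear_modeling import t_stat

# 100 synthetic volumes, design as in get_design_matrix: intercept,
# linear drift, and a binary day/night indicator
rng = np.random.RandomState(0)
X = np.ones((100, 3))
X[:, 1] = np.linspace(-1, 1, 100)
X[:, 2] = rng.randint(0, 2, 100)
y = 2 * X[:, 2] + rng.randn(100)  # signal loads on the day/night column
print(t_stat(y, X, [0, 0, 1]))    # large |t| for the third regressor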