Packaged our code as a python module called stat159lambda
AlonDaks committed Dec 9, 2015
1 parent 4adfa6c commit 2ad3680
Showing 27 changed files with 1,291 additions and 0 deletions.
Empty file added code/__init__.py
Empty file.
5 changes: 5 additions & 0 deletions code/stat159lambda/Makefile
@@ -0,0 +1,5 @@
test:
nosetests utils

coverage:
nosetests utils --with-coverage --cover-package=utils
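Both targets assume they are invoked from code/stat159lambda, since nosetests is given the relative utils package path.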
Empty file added code/stat159lambda/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions code/stat159lambda/config.py
@@ -0,0 +1,12 @@
import os

REPO_HOME_PATH = os.path.realpath(
    os.path.join(os.path.dirname(__file__), '..', '..'))

USE_CACHED_DATA = os.environ.get('STAT159_CACHED_DATA', 'True') == 'True'

NUM_PROCESSES = int(os.environ.get('STAT159_NUM_PROCESSES', 5))

NUM_VOXELS = 1108800

NUM_TOTAL_VOLUMES = 3543
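
A minimal sketch (not part of this commit) of how these settings respond to the environment variables defined above; the values here are illustrative:

import os

# Override the cache and parallelism settings before the package
# configuration module is imported, since it reads os.environ at import time.
os.environ['STAT159_CACHED_DATA'] = 'False'
os.environ['STAT159_NUM_PROCESSES'] = '4'

from stat159lambda import config as cf
assert not cf.USE_CACHED_DATA
assert cf.NUM_PROCESSES == 4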
Empty file added code/stat159lambda/reproduction/__init__.py
Empty file.
31 changes: 31 additions & 0 deletions code/stat159lambda/reproduction/inter_run_diagnostics.py
@@ -0,0 +1,31 @@
from __future__ import print_function
from __future__ import division
import numpy as np
import sys
import matplotlib.pyplot as plt
import gc
from stat159lambda.config import REPO_HOME_PATH


def calc_vol_rms_diff(data_file_path):
    data = np.load(data_file_path)
    diff_data = np.diff(data, axis=1)
    del data
    gc.collect()
    vol_rms_diff = np.sqrt(np.mean(diff_data**2, axis=0))
    # Skip the first 9 differences, which involve the earliest volumes
    return vol_rms_diff[9:]


def save_plot(vol_rms_diff, subj_num):
plt.plot(vol_rms_diff)
plt.savefig('{0}/figures/subj{1}_vol_rms_diff.png'.format(
REPO_HOME_PATH, subj_num))


if __name__ == '__main__':
subj_num = sys.argv[1]
data_file_path = '{0}/data/processed/sub{1}_rcds_2d.npy'.format(
REPO_HOME_PATH, subj_num)
vol_rms_diff = calc_vol_rms_diff(data_file_path)
save_plot(vol_rms_diff, subj_num)
del vol_rms_diff
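
A hedged usage sketch of this module from Python rather than the command line, assuming subject 1's rcds 2D file has already been generated by preprocess.py:

from stat159lambda.config import REPO_HOME_PATH
from stat159lambda.reproduction.inter_run_diagnostics import (
    calc_vol_rms_diff, save_plot)

# Same file the __main__ block constructs for subject 1
data_file_path = '{0}/data/processed/sub1_rcds_2d.npy'.format(REPO_HOME_PATH)
vol_rms_diff = calc_vol_rms_diff(data_file_path)
save_plot(vol_rms_diff, 1)  # writes figures/subj1_vol_rms_diff.png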
84 changes: 84 additions & 0 deletions code/stat159lambda/reproduction/preprocess.py
@@ -0,0 +1,84 @@
from __future__ import print_function
from __future__ import division
import json
import nibabel as nib
import numpy as np
import sys
from glob import glob
from os.path import exists
from multiprocessing import Pool
from stat159lambda import config as cf
from stat159lambda.config import REPO_HOME_PATH

with open('{0}/data/data_path.json'.format(REPO_HOME_PATH)) as f:
    DATA_PATHS = json.load(f)

INCORRECT_NUM_ARGS_MESSAGE = 'Invalid number of arguments: specify alignment type'
ILLEGAL_ARG_MESSAGE = 'alignment argument must be one of <linear>, <non_linear>, <rcds>'


def concatenate_runs(alignment):
for i, subject in enumerate(DATA_PATHS['subjects']):
nii_file_name = '{0}/data/processed/sub{1}_{2}'.format(
REPO_HOME_PATH, i + 1, alignment)
if not exists(nii_file_name + '.nii') or not cf.USE_CACHED_DATA:
run_data = []
for j, run in enumerate(subject['runs']):
task_path = run[alignment]['path']
img = nib.load(REPO_HOME_PATH + '/' + task_path)
data = img.get_data()
                # Adjacent runs overlap by 4 volumes at each boundary; trim
                # the overlap so no volume appears twice after concatenation
                if j == 0:
                    run_data.append(data[..., :-4])
                elif 1 <= j <= 6:
                    run_data.append(data[..., 4:-4])
                else:
                    run_data.append(data[..., 4:])
concatenated_run_data = np.concatenate(run_data, axis=3)
concatenated_img = nib.Nifti1Image(concatenated_run_data,
np.eye(4))
nib.save(concatenated_img, nii_file_name)
print('Saved {0}'.format(nii_file_name + '.nii'))
else:
print('Using cached version of {0}'.format(nii_file_name + '.nii'))


def reshape_data_to_2d(alignment):
files_to_reshape = np.sort(glob('{0}/data/processed/sub*_{1}.nii'.format(
REPO_HOME_PATH, alignment)))
for f in files_to_reshape:
file_name_2d = f.replace('.nii', '_2d')
if not exists(file_name_2d + '.npy') or not cf.USE_CACHED_DATA:
data = nib.load(f).get_data()
data_chunks = partition_4d_data(data)
            p = Pool(cf.NUM_PROCESSES)
            reshaped_chunks = p.imap(reshape_4d_data, data_chunks)
            data_2d = merge_2d_data(reshaped_chunks)
            # Release the worker processes once the chunks are merged
            p.close()
            p.join()
            np.save(file_name_2d, data_2d)
print('Saved {0}'.format(file_name_2d + '.npy'))
else:
print('Using cached version of {0}'.format(file_name_2d + '.npy'))


def partition_4d_data(data):
num_partitions = cf.NUM_PROCESSES
    num_volumes = data.shape[-1]
    partition_indices = [(num_volumes // num_partitions) * i
for i in range(1, num_partitions)]
return np.split(data, partition_indices, axis=3)


def reshape_4d_data(data):
return np.reshape(data, (-1, data.shape[-1]))


def merge_2d_data(data_slices):
return np.hstack(tuple(data_slices))

# def band_pass_filter():

if __name__ == '__main__':
if len(sys.argv) != 2:
raise ValueError(INCORRECT_NUM_ARGS_MESSAGE)
alignment = sys.argv[1]
if alignment not in ['linear', 'non_linear', 'rcds']:
raise ValueError(ILLEGAL_ARG_MESSAGE)
concatenate_runs(alignment)
reshape_data_to_2d(alignment)
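
A toy illustration (synthetic arrays, not repository data) of the run-trimming rule in concatenate_runs, where adjacent runs overlap by 4 volumes:

import numpy as np

# Three fake runs of 12 volumes each, on a 1x1x1 voxel grid
runs = [np.zeros((1, 1, 1, 12)) + k for k in range(3)]
pieces = [runs[0][..., :-4], runs[1][..., 4:-4], runs[2][..., 4:]]
stitched = np.concatenate(pieces, axis=3)
print(stitched.shape)  # (1, 1, 1, 20): 8 + 4 + 8 volumes survive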
85 changes: 85 additions & 0 deletions code/stat159lambda/reproduction/similarity.py
@@ -0,0 +1,85 @@
from __future__ import print_function
from __future__ import division
import nibabel as nib
import numpy as np
from scipy import stats
from multiprocessing import Process
import itertools
import sys
from os import path
import sharedmem as sm
import gc
from stat159lambda import config as cf
from stat159lambda.config import REPO_HOME_PATH

INCORRECT_NUM_ARGS_MESSAGE = 'Invalid number of arguments: specify alignment type, subject1 and subject2'
ILLEGAL_ALIGNMENT_ARG_MESSAGE = 'alignment argument must be one of <linear>, <non_linear>, <rcds>'
ILLEGAL_SUBJECTS_ARG_MESSAGE = 'subject arguments must be integers corresponding to subject numbers'


def parallelize_correlation():
    # shared_subject1 and shared_subject2 are module-level shared-memory
    # arrays created in __main__ before this function is called. Round the
    # chunk size up so the last chunk also covers any remainder voxels.
    chunk_size = int(np.ceil(len(shared_subject1) / cf.NUM_PROCESSES))
    output_correlations = sm.empty(len(shared_subject1))
    processes = [
        Process(target=correlation,
                args=(shared_subject1, shared_subject2, i * chunk_size,
                      min((i + 1) * chunk_size, len(shared_subject1)),
                      output_correlations)) for i in range(cf.NUM_PROCESSES)
    ]
for p in processes:
p.start()
for p in processes:
p.join()
return output_correlations


def correlation(shared_subject1, shared_subject2, start_index, stop_index,
                output_correlations):
    for i in range(start_index, stop_index):
        output_correlations[i] = stats.pearsonr(shared_subject1[i, :],
                                                shared_subject2[i, :])[0]


def check_command_line_arguments(arguments):
if len(arguments) != 4:
raise ValueError(INCORRECT_NUM_ARGS_MESSAGE)
alignment = arguments[1]
if alignment not in ['linear', 'non_linear', 'rcds']:
raise ValueError(ILLEGAL_ALIGNMENT_ARG_MESSAGE)
    try:
        subject1 = int(arguments[2])
        subject2 = int(arguments[3])
    except ValueError:
        raise ValueError(ILLEGAL_SUBJECTS_ARG_MESSAGE)
    subject1_file_path = '{0}/data/processed/sub{1}_{2}_2d.npy'.format(
        REPO_HOME_PATH, subject1, alignment)
    subject2_file_path = '{0}/data/processed/sub{1}_{2}_2d.npy'.format(
        REPO_HOME_PATH, subject2, alignment)
    if not path.exists(subject1_file_path):
        raise ValueError(
            'File missing: {0}, run preprocess.py to generate necessary file'.format(
                subject1_file_path))
    if not path.exists(subject2_file_path):
        raise ValueError(
            'File missing: {0}, run preprocess.py to generate necessary file'.format(
                subject2_file_path))
return (subject1_file_path, subject2_file_path)


if __name__ == '__main__':
subject1_file_path, subject2_file_path = check_command_line_arguments(
sys.argv)
correlation_file_name = '{0}/data/processed/r_sub{1}_sub{2}_{3}'.format(
REPO_HOME_PATH, sys.argv[2], sys.argv[3], sys.argv[1])
if not path.exists(correlation_file_name +
'.npy') or not cf.USE_CACHED_DATA:
shared_subject1 = sm.copy(np.load(subject1_file_path))
gc.collect()
shared_subject2 = sm.copy(np.load(subject2_file_path))
gc.collect()
voxel_correlations = parallelize_correlation()
np.save(correlation_file_name, voxel_correlations)
print('Saved {0}'.format(correlation_file_name + '.npy'))
else:
print('Using cached version of {0}'.format(correlation_file_name +
'.npy'))
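
A hedged sketch of the per-voxel statistic computed by correlation(), on synthetic arrays and without the shared-memory or multiprocessing plumbing:

import numpy as np
from scipy import stats

# Two fake subjects: 10 voxels by 50 volumes, with correlated signals
rng = np.random.RandomState(0)
subject1 = rng.randn(10, 50)
subject2 = subject1 + 0.5 * rng.randn(10, 50)

# Pearson r between matching voxel time courses, one value per voxel
r = np.array([stats.pearsonr(subject1[i, :], subject2[i, :])[0]
              for i in range(10)])
print(r.shape)  # (10,)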
Empty file added code/stat159lambda/utils/__init__.py
Empty file.
154 changes: 154 additions & 0 deletions code/stat159lambda/utils/linear_modeling.py
@@ -0,0 +1,154 @@
# Python 3 compatibility
from __future__ import division, print_function
import numpy as np
import matplotlib.pyplot as plt
import numpy.linalg as npl
from scipy.stats import t as t_dist
from scipy.ndimage import gaussian_filter
import stat159lambda.utils.scene_slicer as ssm
import nibabel as nib


def get_design_matrix(data_path, scene_csv_path):
    """
    Parameters
    ----------
    data_path: path to a 4D NIfTI file
    scene_csv_path: path to the scenes CSV used by SceneSlicer
    Returns
    -------
    Design matrix with 3 columns: a column of ones, the linear drift
    column, and the day/night indicator column
    """
    data = nib.load(data_path)
    n_trs = data.shape[-1]
    X = np.ones((n_trs, 3))
    ss = ssm.SceneSlicer(data_path, scene_csv_path)
    day_night, int_ext = ss.get_scene_slices()
    X[:, 1] = np.linspace(-1, 1, n_trs)
    X[:, 2] = day_night
    return X

# Design matrix that is passed into the random forest classifier
def get_rf_design_matrix(voxels, data):
    ss = ssm.SceneSlicer('test_data.nii', 'scenes.csv')
    day_night, int_ext = ss.get_scene_slices()
    new_X = np.zeros((data.shape[-1], len(voxels)))
    for num in range(len(voxels)):
        new_X[:, num] = data[voxels[num]]
    return new_X, day_night


def plot_design_matrix(X):
"""
Returns
-------
None
"""
plt.imshow(X, aspect=0.1, cmap='gray', interpolation='nearest')
plt.xticks([])


def get_betas_Y(X, data):
    """
    Returns
    -------
    B: 2D array, p x number of voxels, of fitted coefficients
    Y.T: 2D array, number of volumes x number of voxels, the reshaped data
    """
    Y = np.reshape(data, (-1, data.shape[-1]))
    B = npl.pinv(X).dot(Y.T)
    return B, Y.T


def get_betas_4d(B, data):
"""
Returns
-------
4D array, beta for each voxel; need this format to plot
"""
return np.reshape(B.T, data.shape[:-1] + (-1,))


def plot_betas(b_vols, col):
    """
    Parameters
    ----------
    b_vols: 4D array of betas, one volume per regressor
    col: integer between 0 and p-1 selecting the regressor to plot
    Returns
    -------
    None
    """
    if col >= b_vols.shape[-1]:
        raise RuntimeError("Error: select a column between 0 and p-1")
    c = b_vols.shape[2] // 2
    plt.imshow(b_vols[:, :, c, col], cmap='gray', interpolation='nearest')


def t_stat(y, X, c):
    """ betas, t statistic and significance test given data,
    design matrix, contrast
    This is OLS estimation; we assume the errors to have independent
    and identical normal distributions around zero for each $i$ in
    $\epsilon_i$ (i.i.d.).
    """
# Make sure y, X, c are all arrays
y = np.asarray(y)
X = np.asarray(X)
c = np.atleast_2d(c).T # As column vector
# Calculate the parameters - b hat
beta = npl.pinv(X).dot(y)
# The fitted values - y hat
fitted = X.dot(beta)
# Residual error
errors = y - fitted
# Residual sum of squares
RSS = (errors**2).sum(axis=0)
# Degrees of freedom is the number of observations n minus the number
# of independent regressors we have used. If all the regressor
# columns in X are independent then the (matrix rank of X) == p
# (where p the number of columns in X). If there is one column that
# can be expressed as a linear sum of the other columns then
# (matrix rank of X) will be p - 1 - and so on.
df = X.shape[0] - npl.matrix_rank(X)
# Mean residual sum of squares
MRSS = RSS / df
# calculate bottom half of t statistic
SE = np.sqrt(MRSS * c.T.dot(npl.pinv(X.T.dot(X)).dot(c)))
t = c.T.dot(beta) / SE
return abs(t[0])


def get_top_32(t, thresh=100/1108800):
    """
    Parameters
    ----------
    t: 1D array of t-statistics for each voxel
    thresh: fraction of voxels to keep
    Returns
    -------
    1D array of the positions of the voxels with the largest t-statistics
    (all t-statistics are positive)
    """
    a = np.int32(round(len(t) * thresh))
    return t.argsort()[-a:][::-1]


def get_index_4d(top_32_voxels, data):
    """
    Parameters
    ----------
    top_32_voxels: 1D array of flat indices of the top voxels
    data: 4D array of voxel data
    Returns
    -------
    List of (x, y, z) indices into the 4D array, one per top voxel
    """
    shape = data.shape[:-1]
    axes = np.unravel_index(top_32_voxels, shape)
    return list(zip(*axes))


def plot_single_voxel(data, top_100_voxels):
    """
    Returns
    -------
    None
    """
    plt.plot(data[get_index_4d(top_100_voxels, data)[0]])
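
A small worked example (synthetic data, illustrative only) of t_stat with a contrast selecting the day/night regressor, assuming the stat159lambda package is importable:

import numpy as np
from stat159lambda.utils.linear_modeling import t_stat

# 100 synthetic volumes, design as in get_design_matrix: intercept,
# linear drift, and a binary day/night indicator
rng = np.random.RandomState(0)
X = np.ones((100, 3))
X[:, 1] = np.linspace(-1, 1, 100)
X[:, 2] = rng.randint(0, 2, 100)
y = 2 * X[:, 2] + rng.randn(100)  # signal loads on the day/night column
print(t_stat(y, X, [0, 0, 1]))    # large |t| for the third regressor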