Merge pull request #199 from timothy1191xa/lin_reg

Updated linear_regression
berkeley-stat159 · Dec 12, 2015 · 75f0d20 · 75f0d20
2 parents 09ae1c1 + de285e4
commit 75f0d20
Show file tree

Hide file tree

Showing 14 changed files with 453 additions and 71 deletions.
diff --git a/Makefile b/Makefile
@@ -6,7 +6,7 @@ clean:
 	find . -name "*.so" -o -name "*.pyc" -o -name "*.pyx.md5" | xargs rm -f
 
 coverage:
-	nosetests code/utils/tests data/tests/ --with-coverage --cover-package=code/utils/functions,data/tests/test_get_check_hashes.py
+	nosetests code/utils/tests data/tests/ --with-coverage --cover-package=code/utils/functions,data/data_hashes.py
 
 test:
 	nosetests code/utils/tests data/tests/

diff --git a/code/utils/diagnostics.py → code/utils/functions/diagnostics.py b/code/utils/diagnostics.py → code/utils/functions/diagnostics.py
@@ -6,7 +6,7 @@
 """
 
 import numpy as np
-import pdb
+
 
 def vol_std(data):
     """ Return standard deviation across voxels for 4D array `data`
@@ -23,10 +23,12 @@ def vol_std(data):
         One dimensonal array where ``std_values[i]`` gives the standard
         deviation of all voxels contained in ``data[..., i]``.
     """
-    std_values = [np.std(np.ravel(data[...,i])) for i in range(0, np.shape(data)[3])]
-    return np.array(std_values)
+    T = data.shape[-1]
+    data_2d = np.reshape(data, (-1, T))
+    return np.std(data_2d, axis=0)
+
 
-def iqr_outliers(arr_1d, iqr_scale=1):
+def iqr_outliers(arr_1d, iqr_scale=1.5):
     """ Return indices of outliers identified by interquartile range
 
     Parameters
@@ -49,16 +51,13 @@ def iqr_outliers(arr_1d, iqr_scale=1):
     """
     # Hint : np.lookfor('centile')
     # Hint : np.lookfor('nonzero')
-    sevenfifth = np.percentile(arr_1d,75)
-    twenfifth = np.percentile(arr_1d,25)
-    iqr = sevenfifth-twenfifth
-    low = twenfifth-iqr_scale*iqr
-    hi = iqr_scale*iqr+sevenfifth
-    outliers1 = np.array(np.nonzero(arr_1d > hi))[0]
-    outliers2 = np.array(np.nonzero((arr_1d < low)))[0]
-    outliers = np.hstack((outliers1,outliers2))
-    outliers.sort()
-    return outliers,(low, hi)
+    pct_25, pct_75 = np.percentile(arr_1d, [25, 75])
+    iqr = pct_75 - pct_25
+    lo_thresh = pct_25 - iqr * iqr_scale
+    hi_thresh = pct_75 + iqr * iqr_scale
+    is_outlier = (arr_1d < lo_thresh) | (arr_1d > hi_thresh)
+    return np.nonzero(is_outlier)[0], (lo_thresh, hi_thresh)
+
 
 def vol_rms_diff(arr_4d):
     """ Return root mean square of differences between sequential volumes
@@ -76,12 +75,11 @@ def vol_rms_diff(arr_4d):
         the mean (across voxels) of the squared difference between volume i and
         volume i + 1.
     """
-
-    rms_values=[]
-    for i in range(arr_4d.shape[-1]-1):
-        diff_vol = arr_4d[...,i+1] - arr_4d[...,i]
-        rms_values.append(np.sqrt(np.mean(diff_vol ** 2)))
-    return np.array(rms_values)
+    T = arr_4d.shape[-1]
+    diff_data = np.diff(arr_4d, axis=-1)
+    diff_data_2d = np.reshape(diff_data, (-1, T-1))
+    return np.sqrt(np.mean(diff_data_2d ** 2, axis=0))
+
 
 def extend_diff_outliers(diff_indices):
     """ Extend difference-based outlier indices `diff_indices` by pairing
@@ -95,22 +93,14 @@ def extend_diff_outliers(diff_indices):
 
     Returns
     -------
-    extended_indices : 
+    extended_indices : array
         Array where each index ``j`` in `diff_indices has been replaced by two
         indices, ``j`` and ``j+1``, unless ``j+1`` is present in
         ``diff_indices``.  For example, if the input was ``[3, 7, 8, 12, 20]``,
         ``[3, 4, 7, 8, 9, 12, 13, 20, 21]``.
     """
-    i=0
-    while (i < len(diff_indices)):
-        if i is (len(diff_indices)-1):
-            diff_indices = np.insert(diff_indices,i+1,diff_indices[i]+1)
-            break
-        if diff_indices[i+1] == (diff_indices[i]+1):
-            i=i+1
-            continue
-        diff_indices = np.insert(diff_indices,i+1,diff_indices[i]+1)
-        i=i+2
-    return diff_indices
-
-
+    # Make two columns with the second being the first plus 1
+    ext_outliers = np.column_stack((diff_indices, diff_indices + 1))
+    # Use numpy reshape ordering to get the values from the first row, then the
+    # second row etc...
+    return np.unique(ext_outliers.ravel())
diff --git a/code/utils/glm.py → code/utils/functions/glm.py b/code/utils/glm.py → code/utils/functions/glm.py
diff --git a/code/utils/smoothing.py → code/utils/functions/smoothing.py b/code/utils/smoothing.py → code/utils/functions/smoothing.py
diff --git a/code/utils/tests/test_diagnostics.py b/code/utils/tests/test_diagnostics.py
@@ -0,0 +1,108 @@
+"""test_diagnostics.py
+Tests for the functions in diagnostics.py module 
+
+Run with:
+
+    nosetests test_diagnostics.py
+"""
+import os, sys, pdb
+import numpy as np
+from nose.tools import assert_equal
+from numpy.testing import assert_almost_equal, assert_array_equal
+
+#Append path to functions
+sys.path.append(os.path.join(os.path.dirname(__file__), "../functions/"))
+from diagnostics import *
+
+def test_vol_std():
+    # We make a fake 4D image
+    shape_3d = (2, 3, 4)
+    V = np.prod(shape_3d)
+    T = 10  # The number of 3D volumes
+    # Make a 2D array that we will reshape to 4D
+    arr_2d = np.random.normal(size=(V, T))
+    expected_stds = np.std(arr_2d, axis=0)
+    # Reshape to 4D
+    arr_4d = np.reshape(arr_2d, shape_3d + (T,))
+    actual_stds = vol_std(arr_4d)
+    assert_almost_equal(expected_stds, actual_stds)
+
+def test_vol_rms_diff():
+    # We make a fake 4D image
+    shape_3d = (2, 3, 4)
+    V = np.prod(shape_3d)
+    T = 10  # The number of 3D volumes
+    # Make a 2D array that we will reshape to 4D
+    arr_2d = np.random.normal(size=(V, T))
+    differences = np.diff(arr_2d, axis=1)
+    exp_rms = np.sqrt(np.mean(differences ** 2, axis=0))
+    # Reshape to 4D and run function
+    arr_4d = np.reshape(arr_2d, shape_3d + (T,))
+    actual_rms = vol_rms_diff(arr_4d)
+    assert_almost_equal(actual_rms, exp_rms)
+
+def test_iqr_outliers():
+    # Test with simplest possible array
+    arr = np.arange(101)  # percentile same as value
+    # iqr = 50
+    exp_lo = 25 - 75
+    exp_hi = 75 + 75
+    indices, thresholds = iqr_outliers(arr)
+    assert_array_equal(indices, [])
+    assert_equal(thresholds, (exp_lo, exp_hi))
+    # Reverse, same values
+    indices, thresholds = iqr_outliers(arr[::-1])
+    assert_array_equal(indices, [])
+    assert_equal(thresholds, (exp_lo, exp_hi))
+    # Add outliers
+    arr[0] = -51
+    arr[1] = 151
+    arr[100] = 1  # replace lost value to keep centiles same
+    indices, thresholds = iqr_outliers(arr)
+    assert_array_equal(indices, [0, 1])
+    assert_equal(thresholds, (exp_lo, exp_hi))
+    # Reversed, then the indices are reversed
+    indices, thresholds = iqr_outliers(arr[::-1])
+    assert_array_equal(indices, [99, 100])
+    assert_equal(thresholds, (exp_lo, exp_hi))
+
+def test_iqr_scaling():
+    # Check that the scaling of IQR works
+    # Test with simplest possible array
+    arr = np.arange(101)  # percentile same as value
+    # iqr = 50
+    exp_lo = 25 - 100
+    exp_hi = 75 + 100
+    indices, thresholds = iqr_outliers(arr, 2)
+    assert_array_equal(indices, [])
+    assert_equal(thresholds, (exp_lo, exp_hi))
+    # Add outliers - but these aren't big enough now
+    arr[0] = -51
+    arr[1] = 151
+    indices, thresholds = iqr_outliers(arr, 2)
+    assert_array_equal(indices, [])
+    # Add outliers - that are big enough
+    arr[0] = -76
+    arr[1] = 176
+    arr[100] = 1  # replace lost value to keep centiles same
+    indices, thresholds = iqr_outliers(arr, 2)
+    assert_array_equal(indices, [0, 1])
+
+def test_extend_diff_outliers():
+    # Test function to extend difference outlier indices
+    indices = np.array([3, 7, 12, 20])
+    assert_array_equal(extend_diff_outliers(indices),
+                       [3, 4, 7, 8, 12, 13, 20, 21])
+
+
+def test_sequential_input():
+    indices = np.array([4, 5, 9, 10])
+    assert_array_equal(extend_diff_outliers(indices),
+                       [4, 5, 6, 9, 10, 11])
+    indices = np.array([1, 2, 4, 5, 9, 10])
+    assert_array_equal(extend_diff_outliers(indices),
+                       [1, 2, 3, 4, 5, 6, 9, 10, 11])
+    indices = np.array([3, 7, 8, 12, 20])
+    assert_array_equal(extend_diff_outliers(indices),
+                       [3, 4, 7, 8, 9, 12, 13, 20, 21])
+
diff --git a/code/utils/tests/test_glm.py b/code/utils/tests/test_glm.py
@@ -0,0 +1,71 @@
+""" Tests for glm function in glm module
+This checks the glm function.
+
+Run at the tests directory with:
+    nosetests code/utils/tests/test_glm.py
+
+"""
+# Loading modules.
+import numpy as np
+import numpy.linalg as npl
+import nibabel as nib
+import os
+import sys
+from numpy.testing import assert_almost_equal, assert_array_equal
+
+
+# Add path to functions to the system path.
+sys.path.append(os.path.join(os.path.dirname(__file__), "../functions/"))
+
+
+# Load our GLM functions. 
+from glm import glm_beta, glm_mrss
+
+def test_glm_beta():
+    # Read in the image data.
+    img = nib.load('data/ds114/sub009/BOLD/task002_run001/ds114_sub009_t2r1.nii')
+    data = img.get_data()
+    # Read in the convolutions.
+    p = 2
+    convolved1 = np.loadtxt('data/ds114/sub009/behav/task002_run001/ds114_sub009_t2r1_conv.txt')
+    # Create design matrix. 
+    X_matrix = np.ones((len(convolved1), p))
+    X_matrix[:, 1] = convolved1
+
+    # Calculate betas, copied from the exercise. 
+    data_2d = np.reshape(data, (-1, data.shape[-1]))
+    B = npl.pinv(X_matrix).dot(data_2d.T)
+    B_4d = np.reshape(B.T, img.shape[:-1] + (-1,))
+
+    # Run function.
+    test_B_4d = glm_beta(data, X_matrix)
+    assert_almost_equal(B_4d, test_B_4d)
+
+
+def test_glm_mrss():
+    img = nib.load('data/ds114/sub009/BOLD/task002_run001/ds114_sub009_t2r1.nii')
+    data = img.get_data()
+    convolved1 = np.loadtxt('data/ds114/sub009/behav/task002_run001/ds114_sub009_t2r1_conv.txt')
+    X_matrix = np.ones((len(convolved1), 2))
+    X_matrix[:, 1] = convolved1
+    data_2d = np.reshape(data, (-1, data.shape[-1]))
+    B = npl.pinv(X_matrix).dot(data_2d.T)
+    B_4d = np.reshape(B.T, img.shape[:-1] + (-1,))
+    test_B_4d = glm_beta(data, X_matrix)
+
+    # Pick a single voxel to check mrss functiom.
+    # Calculate actual fitted values, residuals, and MRSS of voxel.
+    fitted = X_matrix.dot(B_4d[12, 22, 10])
+    residuals = data[12, 22, 10] - fitted
+    MRSS = np.sum(residuals**2)/(X_matrix.shape[0] - npl.matrix_rank(X_matrix))
+
+    # Calculate using glm_diagnostics function.
+    test_MRSS, test_fitted, test_residuals = glm_mrss(test_B_4d, X_matrix, data)
+    assert_almost_equal(MRSS, test_MRSS[12, 22, 10])
+    assert_almost_equal(fitted, test_fitted[12, 22, 10])
+    assert_almost_equal(residuals, test_residuals[12, 22, 10])
+
+
+
+
+
diff --git a/code/utils/tests/test_smoothing.py b/code/utils/tests/test_smoothing.py
@@ -0,0 +1,38 @@
+""" Tests for smoothvoxels in smooth module
+Run at the tests directory with:
+    nosetests code/utils/tests/test_smoothing.py
+"""
+
+import os
+import sys
+import numpy as np
+import itertools
+import scipy.ndimage
+from scipy.ndimage.filters import gaussian_filter
+import matplotlib.pyplot as plt
+import nibabel as nib
+from numpy.testing import assert_almost_equal
+from nose.tools import assert_not_equals
+
+
+# Add path to functions to the system path.
+sys.path.append(os.path.join(os.path.dirname(__file__), "../functions/"))
+
+# Load smoothing function.
+from smoothing import smoothing
+
+def test_smooth():
+    # Read in the image data.
+    img = nib.load('data/ds114/sub009/BOLD/task002_run001/ds114_sub009_t2r1.nii')
+    data = img.get_data()
+
+    # Run the smoothing function with sigma 0 at time 12
+    non_smoothed_data = smoothing(data, 0, 12)
+
+	# assert that data at time 12 and non_smoothed_data are equal since sigma = 0
+    assert_almost_equal(data[..., 12], non_smoothed_data)
+
+	# Run the smoothvoxels function with sigma 5 at time 100
+    smoothed_data = smoothing(data, 5, 100)
+	# assert that data at time 16 and smoothed_data are not equal
+    assert_not_equals(data[..., 100].all(), smoothed_data.all())
diff --git a/data/Makefile b/data/Makefile
@@ -18,9 +18,9 @@ download_test_data:
 	wget -P ../data/ds114/sub009/behav/task002_run001/ http://www.jarrodmillman.com/rcsds/_downloads/ds114_sub009_t2r1_conv.txt
 
 test: 
-	 nosetests tests -w ..
+	 nosetests tests 
 
 coverage:
-	 nosetests tests -w .. --with-coverage --cover-package=data.py 
+	 nosetests tests --with-coverage --cover-package=data_hashes.py 
 
 
diff --git a/data/data_hashes.py b/data/data_hashes.py
@@ -11,27 +11,6 @@
 import pdb
 import json
 
-def create_dict(filename):
-    """Return a dictionary of hashes from a .txt file
-
-       Parameters
-       ----------
-       filename: .txt file containing the each file hash in the
-           first column and the file location in the data in the
-            second column.
-       Returns
-       ---------
-       newDict: a dictionary of hashes
-    """
-    newDict={}
-    f=open(filename)
-    num_lines = sum(1 for line in open(filename))
-    for line,i in zip(f,range(0,num_lines)):
-        info = line.split()
-        newDict[info[1]]=info[0]
-    f.close()
-    return newDict
-
 def generate_file_md5(filename, blocksize=2**20):
     """Generate the md5 hash for filename
     Parameters

diff --git a/data/ds114/sub009/BOLD/task002_run001/ds114_sub009_t2r1.nii b/data/ds114/sub009/BOLD/task002_run001/ds114_sub009_t2r1.nii