Merge pull request #34 from lisaannyu/master
Design matrix and array of voxels in top 20% of t-statistics
AlonDaks committed Dec 6, 2015
2 parents b8e8907 + b0bf7d2 commit c886f50
Showing 14 changed files with 186 additions and 100 deletions.
75 changes: 19 additions & 56 deletions code/utils/linear_modeling.py
@@ -53,15 +53,9 @@ def get_betas_Y(X, data):
-------
B: 2D array, p x B, the number of voxels
"""
print(data.shape[-1])
print(type(data))
# Y = np.reshape(data, (data.shape[-1], -1))
Y = np.reshape(data, (-1, data.shape[-1]))
print(Y.shape)
# B = npl.pinv(X).dot(Y)
Y = np.reshape(data,(-1,data.shape[-1]))
B = npl.pinv(X).dot(Y.T)
print(B.shape)
return B, Y.T
return B,Y.T


def get_betas_4d(B, data):
@@ -70,7 +64,6 @@ def get_betas_4d(B, data):
-------
4D array, beta for each voxel; need this format to plot
"""
print(B.shape)
return np.reshape(B.T, data.shape[:-1] + (-1,))


@@ -121,64 +114,36 @@ def t_stat(y, X, c):
# calculate bottom half of t statistic
SE = np.sqrt(MRSS * c.T.dot(npl.pinv(X.T.dot(X)).dot(c)))
t = c.T.dot(beta) / SE
return abs(t)


#def get_top_100(t,thresh=100/1108800):
# """
# Parameters
# ----------
# t: 1D array of t-statistics for each voxel
#
# Returns
# -------
# 1D array of position of voxels in top 100 of t-statistics (all are
# positive)
# """
# a = np.int32(round(len(t) * thresh))
# # top_100_voxels = np.argpartition(t, -a)[-a:]
# # problem: nans, try
# top_100_voxels = np.argpartition(~np.isnan(t),-1)[-a:]

# return top_100_voxels


#def get_index_4d(top_100_voxels, data):
# """
# Returns
# -------
# Indices in terms of 4D array of each voxel in top 20% of t-statistics
# """
# shape = data[...,-1].shape
# axes = np.unravel_index(top_100_voxels, shape)
# return zip(axes[0], axes[1], axes[2]) # sequence too large, try n = 32

# Solve the problem by using 32 for the next two functions instead
def get_top_32(t, thresh=100/1108800):
"""
Parameters
----------
return abs(t[0])


def get_top_32(t,thresh=100/1108800):
"""
Parameters
----------
t: 1D array of t-statistics for each voxel
Returns
-------
1D array of position of voxels in top 32 of t-statistics (all are positive)
"""
a = np.int32(round(len(t) * thresh))
top_32_voxels = np.argpartition(~np.isnan(t), -1)[-a:]
return top_32_voxels

return t.argsort()[-a:][::-1]


def get_index_4d(top_32_voxels, data):
"""
Parameters
---------
top_32_voxels: 1D array of indices of top 32 voxels
Returns
-------
Indices in terms of 4D array of each voxel in top 20% of t-statistics
"""
shape = data[..., -1].shape
axes = np.unravel_index(top_32_voxels, shape)
return zip(axes[0], axes[1], axes[2])

"""
shape = data[...,-1].shape
axes = np.unravel_index(top_32_voxels,shape)
return zip(*axes)

def plot_single_voxel(data, top_100_voxels):
"""
@@ -187,5 +152,3 @@ def plot_single_voxel(data, top_100_voxels):
None
"""
plt.plot(data[get_index_4d(top_100_voxels, data)[0]])
# fix so only get top voxel
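
For context (not part of the commit), here is a minimal sketch of how the functions in this module chain together; the design-matrix columns, contrast vector, and threshold are assumptions based on the docstrings above and on script.py below.

import nibabel as nib
import numpy as np
import linear_modeling

img = nib.load('bold_dico_dico_rcds_nl.nii')        # file name taken from script.py in this commit
data = img.get_data()                               # 4D array: (x, y, z, time)

n_vols = data.shape[-1]
X = np.ones((n_vols, 3))                            # assumed design: intercept, day/night, drift
# X[:, 1] would hold the day/night indicator derived from the scene labels
X[:, 2] = np.linspace(-1, 1, n_vols)                # linear drift

B, Y = linear_modeling.get_betas_Y(X, data)         # B: regressors x voxels, Y: time x voxels
t = linear_modeling.t_stat(Y, X, [0, 1, 0])         # t-statistics for the day/night column
top = linear_modeling.get_top_32(t, 32.0 / t.size)  # indices of the 32 largest t-statistics
coords = linear_modeling.get_index_4d(top, data)    # (i, j, k) coordinates of those voxels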

Empty file modified code/utils/rf.py
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion code/utils/script.py
@@ -13,7 +13,7 @@
import rf

# Load data
img = nib.load('test_data.nii')
img = nib.load('bold_dico_dico_rcds_nl.nii')
data = img.get_data()

# Design Matrix
65 changes: 25 additions & 40 deletions code/utils/tests/test_linear_modeling.py
@@ -11,22 +11,14 @@
import matplotlib.pyplot as plt
import numpy.linalg as npl
from scipy.stats import t as t_dist
from .. import scene_slicer, plot

from .. import linear_modeling
from numpy.testing import assert_almost_equal
from numpy.testing import assert_equal, assert_almost_equal

# Make an X, y
X = np.ones((4, 2))
X[:, 0] = np.random.randint(10, size=4)
y = np.zeros((4, 6))
y[:, 0] = X[:, 0] * 2 + 3
y[:, 1] = X[:, 0] * -1.5 + 2
y[:, 2] = X[:, 0]
y[:, 3] = X[:, 0] - 2
y[:, 4] = X[:, 0] * 2
y[:, 5] = X[:, 0] * -1 + 1
data = np.reshape(y, (1, 2, 3, 4))
X[:, 0] = np.array([4, 8, 2, 9])
data = np.array([[[1, 0, 7, 0], [1, 0, 4, 4], [3, 7, 9, 7]]])

# def test_get_design_matrix():
# not done writing this function, need to wait
@@ -35,20 +27,21 @@
# not sure how to test plots


def test_get_betas():
actual = linear_modeling.get_betas(X, y[:, 0])
expected = np.array([[2., 3]]).T
def test_get_betas_Y():
actual = linear_modeling.get_betas_Y(X, data)[0]
expected = np.array([[-0.85496183, -0.11450382, -0.01526718],
[6.91603053, 2.90839695, 6.58778626]])
assert_almost_equal(expected, actual)
actual = linear_modeling.get_betas_Y(X, data)[1]
expected = np.array([[1, 1, 3], [0, 0, 7], [7, 4, 9], [0, 4, 7]])
assert_almost_equal(expected, actual)


def test_get_betas_4d():
actual = linear_modeling.get_betas_4d(
linear_modeling.get_betas(X, data), data)
expected = np.array([[[
[2.00000000e+00, -1.50000000e+00
], [1.00000000e+00, 1.00000000e+00], [2.00000000e+00, -1.00000000e+00]
], [[3.00000000e+00, 2.00000000e+00], [-3.55271368e-15, -2.00000000e+00],
[-7.10542736e-15, 1.00000000e+00]]]])
linear_modeling.get_betas_Y(X, data)[0], data)
expected = np.array([[[-0.85496183, 6.91603053], [-0.11450382, 2.90839695],
[-0.01526718, 6.58778626]]])
assert_almost_equal(expected, actual)

# def test_plot_betas():
Expand All @@ -57,33 +50,25 @@ def test_get_betas_4d():


def test_t_stat():
actual = linear_modeling.t_stat(y, X, [0, 1])
expected = np.array([[4.90591615e+14, 3.77185234e+15, -5.32661312e-01, -
2.17767996e+15, -5.32661312e-01, 6.46867339e+14]])
actual = linear_modeling.t_stat(
linear_modeling.get_betas_Y(X, data)[1], X, [0, 1])
expected = np.array([2.7475368, 1.04410995, 1.90484058])
assert_almost_equal(expected, actual)


def test_get_ts():
actual = linear_modeling.get_ts(y, X, [0, 1], data)
expected = np.array([9.84892820e+14, 6.46867339e+14, -4.81391897e-01, -
8.59684933e+14, -4.81391897e-01, 7.99631096e+14])
assert_almost_equal(actual, expected)


def test_get_top_100():
a = np.arange(21)
assert_equal(linear_modeling.get_top_100(a,.2), [17, 18, 19, 20])
a = np.arange(21)[::-1]
assert_equal(linear_modeling.get_top_100(a,.2), [3, 2, 1, 0])
actual = linear_modeling.get_top_100(linear_modeling.get_ts(y, X, [0, 1],
data),.2)
expected = np.array([0])
def test_get_top_32():
a = np.array([6, 4, 1, 2, 8, 8, 1, 9, 5, 2, 1, 9, 5, 4, 3, 6, 5, 3, 5, 8])
assert_equal(linear_modeling.get_top_32(a, .2), [11, 7, 19, 4])
actual = linear_modeling.get_top_32(
linear_modeling.t_stat(
linear_modeling.get_betas_Y(X, data)[1], X, [0, 1]), .5)
expected = np.array([0, 2])
assert_almost_equal(actual, expected)


def test_get_index_4d():
actual = linear_modeling.get_index_4d([3, 8, 23, 1], (2, 3, 4))
assert_equal(actual, [(0, 0, 3), (0, 2, 0), (1, 2, 3), (0, 0, 1)])
actual = linear_modeling.get_index_4d([0, 2], data)
assert_equal(actual, [(0, 0), (0, 2)])

# def test_plot_single_voxel():
# not sure how to test plots
51 changes: 51 additions & 0 deletions code/utils/tests/test_parse_demographics.py
@@ -0,0 +1,51 @@
from __future__ import absolute_import
from .. import parse_demographics

import os
import csv


def prepare_for_tests():
with open('demographics.csv', 'w') as csvfile:
file_writer = csv.writer(csvfile, delimiter=',', quotechar='"')
file_writer.writerow(['id', 'gender', 'age', 'forrest_seen_count'])
file_writer.writerow(['1', 'm', '30-35', '5'])
file_writer.writerow(['2', 'm', '30-35', '1'])
test_object = parse_demographics.parse_csv('demographics.csv')
return test_object


def test_seen_most_times():
test_subjects = prepare_for_tests()
seen_count = parse_demographics.seen_most_times(test_subjects)
assert seen_count[0] == 5
assert seen_count[1] == 1
delete_file()


def test_seen_least_times():
test_subjects = prepare_for_tests()
seen_count = parse_demographics.seen_least_times(test_subjects)
assert seen_count[0] == 1
assert seen_count[1] == 2
delete_file()


def test_find_id_by_gender():
test_subjects = prepare_for_tests()
id_list = parse_demographics.find_id_by_gender(test_subjects, 'm')
assert len(id_list) == 2
assert id_list[0] == 'm'
assert id_list[1] == 'm'
delete_file()


def test_find_count_by_id():
test_subjects = prepare_for_tests()
count = parse_demographics.find_count_by_id(test_subjects, 1)
assert count == 5
delete_file()


def delete_file():
os.remove('demographics.csv')
10 changes: 7 additions & 3 deletions slides/Makefile
@@ -1,9 +1,13 @@
.PHONY: all clean
.PHONY: all clean progress.pdf final.pdf

all: clean progress.pdf
all: clean progress.pdf final.pdf

clean:
rm -f progress.pdf
rm -f progress.pdf final.pdf

progress.pdf: progress.md
pandoc -t beamer -s progress.md -o progress.pdf


final.pdf: final.md
pandoc -t beamer -s final.md -o final.pdf
Binary file added slides/day_night_3.png
Binary file added slides/day_night_on_off.png
Binary file added slides/design_matrix_new_3.png
83 changes: 83 additions & 0 deletions slides/final.md
@@ -0,0 +1,83 @@
% Project Lambda
% Jordeen Chang, Alon Daks, Ying Luo, Lisa Ann Yu
% December 3, 2015

## The Paper
- A high-resolution 7-Tesla fMRI dataset from complex natural stimulation with an audio movie

## Abstract

- Brief Overview of Original Study:
- 20 participants recorded at a high field strength of 7 Tesla while listening to *Forrest Gump*
- Collected fMRI data for the entire movie in .nii.gz format, with additional information about the movie scenes
- Used an MRI scanner and pulse oximetry to conduct blood oxygen level dependent (BOLD) imaging, structural MRI imaging, and a physiological assay
- Its goal is to provide data for others to explore auditory cognition, language and music perception, social perception, etc.
- Our Goal:
- Reproduce a subset of the analysis conducted by Hanke et al.
- Apply machine learning to see if we can predict whether a subject was listening to a day or a night movie scene based on brain state

# Data Extraction & Exploration

## Extraction
- First had to overcome the large amount of data to sift through
- Testing only on Subject 1 for the sake of time and efficiency: limited the project scope from 320 GB to 5 GB
- Of the raw, linearly aligned, and nonlinearly aligned data, chose to use only the nonlinear alignment
- Smoothed the data by applying a Gaussian filter (see the sketch after this list)
- Designed a clean schema by introducing a data_path.json file to reference where each raw data file is located
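
As a rough illustration of that smoothing step, here is a minimal sketch; the file path and filter width are placeholders, not the project's actual settings.

```python
# Sketch only: spatially smooth one subject's 4D BOLD data with a Gaussian filter.
import nibabel as nib
from scipy.ndimage import gaussian_filter

img = nib.load('sub001/bold_dico_dico_rcds_nl.nii')   # hypothetical path
data = img.get_data()                                  # shape (x, y, z, time)

# sigma = 0 on the time axis keeps volumes independent; 2 voxels is illustrative.
smoothed = gaussian_filter(data, sigma=(2, 2, 2, 0))
```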

##
![This is a plot of the SDs across volumes in the 4-D array for subject 1, run 1, task 1. Though the SDs all lie within the range (171, 173.5), there is still quite a bit of variability within that range.](sd.jpg?raw=true)

# Methods & Results

## Process Overview: Reproducing
- The paper mentions two methods for correlation; we chose to reproduce the first: taking the BOLD time series and calculating the voxel-wise Pearson correlation on the raw data that Matthew provided.
- Using multiple EC2 instances to run the analysis, so that we can load both two-hour time courses in a pair and parallelize the process, since the correlation takes 10 minutes on our laptops.
- Defined a UNIX environment variable STAT159_CACHED_DATA that lets users choose whether the data should be recomputed (see the sketch after this list).
- Results are still being processed.
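
A minimal sketch of how such a cache switch might work; only the variable name comes from the slide, and the file name and compute step are placeholders.

```python
# Sketch: honor the STAT159_CACHED_DATA environment variable before recomputing.
import os
import numpy as np

def compute_correlations():
    # Placeholder for the expensive voxel-wise correlation described above.
    return np.zeros(1)

def load_or_compute_correlations(cache_path='correlations.npy'):
    # Reuse the cached result only when the user has asked for cached data.
    if os.environ.get('STAT159_CACHED_DATA') == 'True' and os.path.exists(cache_path):
        return np.load(cache_path)
    result = compute_correlations()
    np.save(cache_path, result)
    return result
```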

## Analysis Overview: Predicting
- Parsed “scenes.csv” and “demographics.csv” for information about the movie scenes and the subjects
- Conducted a t-test for each voxel to determine whether the brain signal differs significantly between day and night scenes
- Retrieved the 32 voxels with the largest change between the groups.

##
Step 1: Determine which time slices are on (day) and which are off (night)

![Day-night time course for Task 1, Run 1. Very few scenes take place at night.](day_night_on_off.png)

##
Step 2: Create a Design Matrix

![Intercept, day/night, linear drift](design_matrix_new_3.png?raw=true)
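
A minimal sketch of how such a design matrix could be assembled; the number of volumes and the `day_night` indicator below are placeholders for the project's real values.

```python
# Sketch: design matrix with an intercept, a day/night indicator, and a linear drift term.
import numpy as np

n_vols = 451                                   # hypothetical number of volumes in the run
day_night = np.random.randint(0, 2, n_vols)    # stand-in for the parsed scene labels

X = np.ones((n_vols, 3))
X[:, 1] = day_night                            # 1 = day scene, 0 = night scene
X[:, 2] = np.linspace(-1, 1, n_vols)           # linear drift regressor
```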

##
Step 3: Calculate the betas for each voxel

![Day_Night](day_night_3.png?raw=true)

##
![Linear Drift](linear_drift_3.png?raw=true)

##
![Intercept (baseline)](intercept_3.png?raw=true)

##
Step 4: Test the hypothesis that beta = 0 to generate t-statistics
![Largest 32 t-statistics. Note: We took the absolute value of the t-statistics to find the farthest distances from 0. Additionally, since we are always comparing the estimate of beta to 0, the t-statistic should be positive.](top_32_bar.png?raw=true)
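
For reference, the statistic computed per voxel in `linear_modeling.t_stat` (shown earlier in this diff) is

$$ t = \frac{c^\top \hat{\beta}}{\sqrt{\mathrm{MRSS}\; c^\top (X^\top X)^{+} c}}, $$

where $\hat{\beta}$ is the least-squares estimate, MRSS is the mean residual sum of squares, and $c$ is the contrast vector selecting the day/night column; the code reports $|t|$.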

##
![Time course for the voxel with the highest t-statistic.](highest_t_day_night.png?raw=true)

## Analysis Overview: Predicting (cont.)
- Using a random forest with 1000 trees and the voxels as the features (see the sketch after this list)
- 80% training, 20% testing
- 83% accuracy, but 86% of the data is day scenes, so the model barely beats the baseline
- Next steps include cross-validation on the feature set and running prediction on other labels in the data, such as sentiment and interior/exterior
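
A minimal sketch of this prediction setup; the feature matrix and labels below are random stand-ins for the project's extracted voxel time courses and scene labels, and the scikit-learn parameters are illustrative.

```python
# Sketch: day/night prediction from the top voxels with a random forest.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

features = np.random.rand(451, 32)             # stand-in: time points x top voxels
labels = np.random.randint(0, 2, 451)          # stand-in: day (1) / night (0) labels

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)   # 80% train / 20% test

clf = RandomForestClassifier(n_estimators=1000, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))               # compare against the 86%-day baseline
```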

## Future Work
- Considered exploring possible physiological responses to certain movie scenes by gender and age, but did not have enough computing resources or data to do so
- Predict familiarity with *Forrest Gump*
- Difference in voxel activation between participants with high and low amounts of prior exposure
- Machine learning
- However, fMRI is expensive, so the number of participants is relatively small and the predictive power of such a study would be rather limited
Binary file added slides/highest_t_day_night.png
Binary file added slides/intercept_3.png
Binary file added slides/linear_drift_3.png
Binary file added slides/top_32_bar.png
