diff --git a/code/utils/linear_modeling.py b/code/utils/linear_modeling.py index f72c192..c9116cf 100644 --- a/code/utils/linear_modeling.py +++ b/code/utils/linear_modeling.py @@ -53,15 +53,9 @@ def get_betas_Y(X, data): ------- B: 2D array, p x B, the number of voxels """ - print(data.shape[-1]) - print(type(data)) - # Y = np.reshape(data, (data.shape[-1], -1)) - Y = np.reshape(data, (-1, data.shape[-1])) - print(Y.shape) - # B = npl.pinv(X).dot(Y) + Y = np.reshape(data,(-1,data.shape[-1])) B = npl.pinv(X).dot(Y.T) - print(B.shape) - return B, Y.T + return B,Y.T def get_betas_4d(B, data): @@ -70,7 +64,6 @@ def get_betas_4d(B, data): ------- 4D array, beta for each voxel; need this format to plot """ - print(B.shape) return np.reshape(B.T, data.shape[:-1] + (-1,)) @@ -121,43 +114,13 @@ def t_stat(y, X, c): # calculate bottom half of t statistic SE = np.sqrt(MRSS * c.T.dot(npl.pinv(X.T.dot(X)).dot(c))) t = c.T.dot(beta) / SE - return abs(t) - - -#def get_top_100(t,thresh=100/1108800): -# """ -# Parameters -# ---------- -# t: 1D array of t-statistics for each voxel -# -# Returns -# ------- -# 1D array of position of voxels in top 100 of t-statistics (all are -# positive) -# """ -# a = np.int32(round(len(t) * thresh)) -# # top_100_voxels = np.argpartition(t, -a)[-a:] -# # problem: nans, try -# top_100_voxels = np.argpartition(~np.isnan(t),-1)[-a:] - -# return top_100_voxels - - -#def get_index_4d(top_100_voxels, data): -# """ -# Returns -# ------- -# Indices in terms of 4D array of each voxel in top 20% of t-statistics -# """ -# shape = data[...,-1].shape -# axes = np.unravel_index(top_100_voxels, shape) -# return zip(axes[0], axes[1], axes[2]) # sequence too large, try n = 32 - -# Solve the problem by using 32 for next two functions intead -def get_top_32(t, thresh=100/1108800): - """ - Parameters - ---------- + return abs(t[0]) + + +def get_top_32(t,thresh=100/1108800): + """ + Parameters + ---------- t: 1D array of t-statistics for each voxel Returns @@ -165,20 +128,22 @@ def get_top_32(t, thresh=100/1108800): 1D array of position of voxels in top 32 of t-statistics (all are positive """ a = np.int32(round(len(t) * thresh)) - top_32_voxels = np.argpartition(~np.isnan(t), -1)[-a:] - return top_32_voxels - + return t.argsort()[-a:][::-1] + def get_index_4d(top_32_voxels, data): """ + Parameters + --------- + top_32_voxels: 1D array of indices of top 32 voxels + Returns ------- Indices in terms of 4D array of each voxel in top 20% of t-statistics - """ - shape = data[..., -1].shape - axes = np.unravel_index(top_32_voxels, shape) - return zip(axes[0], axes[1], axes[2]) - + """ + shape = data[...,-1].shape + axes = np.unravel_index(top_32_voxels,shape) + return zip(*axes) def plot_single_voxel(data, top_100_voxels): """ @@ -187,5 +152,3 @@ def plot_single_voxel(data, top_100_voxels): None """ plt.plot(data[get_index_4d(data, top_100_voxels)[0]]) -# fix so only get top voxel - diff --git a/code/utils/rf.py b/code/utils/rf.py old mode 100644 new mode 100755 diff --git a/code/utils/script.py b/code/utils/script.py index c561c0d..c5695b7 100644 --- a/code/utils/script.py +++ b/code/utils/script.py @@ -13,7 +13,7 @@ import rf # Load data -img = nib.load('test_data.nii') +img = nib.load('bold_dico_dico_rcds_nl.nii') data = img.get_data() # Design Matrix diff --git a/code/utils/tests/test_linear_modeling.py b/code/utils/tests/test_linear_modeling.py index dfd6670..e3d9d77 100644 --- a/code/utils/tests/test_linear_modeling.py +++ b/code/utils/tests/test_linear_modeling.py @@ -11,22 +11,14 @@ import 
matplotlib.pyplot as plt import numpy.linalg as npl from scipy.stats import t as t_dist -from .. import scene_slicer, plot from .. import linear_modeling -from numpy.testing import assert_almost_equal +from numpy.testing import assert_equal, assert_almost_equal # Make an X, y X = np.ones((4, 2)) -X[:, 0] = np.random.randint(10, size=4) -y = np.zeros((4, 6)) -y[:, 0] = X[:, 0] * 2 + 3 -y[:, 1] = X[:, 0] * -1.5 + 2 -y[:, 2] = X[:, 0] -y[:, 3] = X[:, 0] - 2 -y[:, 4] = X[:, 0] * 2 -y[:, 5] = X[:, 0] * -1 + 1 -data = np.reshape(y, (1, 2, 3, 4)) +X[:, 0] = np.array([4, 8, 2, 9]) +data = np.array([[[1, 0, 7, 0], [1, 0, 4, 4], [3, 7, 9, 7]]]) # def test_get_design_matrix(): # not done writing this function, need to wait @@ -35,20 +27,21 @@ # not sure how to test plots -def test_get_betas(): - actual = linear_modeling.get_betas(X, y[:, 0]) - expected = np.array([[2., 3]]).T +def test_get_betas_Y(): + actual = linear_modeling.get_betas_Y(X, data)[0] + expected = np.array([[-0.85496183, -0.11450382, -0.01526718], + [6.91603053, 2.90839695, 6.58778626]]) + assert_almost_equal(expected, actual) + actual = linear_modeling.get_betas_Y(X, data)[1] + expected = np.array([[1, 1, 3], [0, 0, 7], [7, 4, 9], [0, 4, 7]]) assert_almost_equal(expected, actual) def test_get_betas_4d(): actual = linear_modeling.get_betas_4d( - linear_modeling.get_betas(X, data), data) - expected = np.array([[[ - [2.00000000e+00, -1.50000000e+00 - ], [1.00000000e+00, 1.00000000e+00], [2.00000000e+00, -1.00000000e+00] - ], [[3.00000000e+00, 2.00000000e+00], [-3.55271368e-15, -2.00000000e+00], - [-7.10542736e-15, 1.00000000e+00]]]]) + linear_modeling.get_betas_Y(X, data)[0], data) + expected = np.array([[[-0.85496183, 6.91603053], [-0.11450382, 2.90839695], + [-0.01526718, 6.58778626]]]) assert_almost_equal(expected, actual) # def test_plot_betas(): @@ -57,33 +50,25 @@ def test_get_betas_4d(): def test_t_stat(): - actual = linear_modeling.t_stat(y, X, [0, 1]) - expected = np.array([[4.90591615e+14, 3.77185234e+15, -5.32661312e-01, - - 2.17767996e+15, -5.32661312e-01, 6.46867339e+14]]) + actual = linear_modeling.t_stat( + linear_modeling.get_betas_Y(X, data)[1], X, [0, 1]) + expected = np.array([2.7475368, 1.04410995, 1.90484058]) assert_almost_equal(expected, actual) -def test_get_ts(): - actual = linear_modeling.get_ts(y, X, [0, 1], data) - expected = np.array([9.84892820e+14, 6.46867339e+14, -4.81391897e-01, - - 8.59684933e+14, -4.81391897e-01, 7.99631096e+14]) - assert_almost_equal(actual, expected) - - -def test_get_top_100(): - a = np.arange(21) - assert_equal(linear_modeling.get_top_100(a,.2), [17, 18, 19, 20]) - a = np.arange(21)[::-1] - assert_equal(linear_modeling.get_top_100(a,.2), [3, 2, 1, 0]) - actual = linear_modeling.get_top_100(linear_modeling.get_ts(y, X, [0, 1], - data),.2) - expected = np.array([0]) +def test_get_top_32(): + a = np.array([6, 4, 1, 2, 8, 8, 1, 9, 5, 2, 1, 9, 5, 4, 3, 6, 5, 3, 5, 8]) + assert_equal(linear_modeling.get_top_32(a, .2), [11, 7, 19, 4]) + actual = linear_modeling.get_top_32( + linear_modeling.t_stat( + linear_modeling.get_betas_Y(X, data)[1], X, [0, 1]), .5) + expected = np.array([0, 2]) assert_almost_equal(actual, expected) def test_get_index_4d(): - actual = linear_modeling.get_index_4d([3, 8, 23, 1], (2, 3, 4)) - assert_equal(actual, [(0, 0, 3), (0, 2, 0), (1, 2, 3), (0, 0, 1)]) + actual = linear_modeling.get_index_4d([0, 2], data) + assert_equal(actual, [(0, 0), (0, 2)]) # def test_plot_single_voxel(): # not sure how to test plots diff --git a/code/utils/tests/test_parse_demographics.py 
b/code/utils/tests/test_parse_demographics.py new file mode 100755 index 0000000..f121c2e --- /dev/null +++ b/code/utils/tests/test_parse_demographics.py @@ -0,0 +1,51 @@ +from __future__ import absolute_import +from .. import parse_demographics + +import os +import csv + + +def prepare_for_tests(): + with open('demographics.csv', 'w') as csvfile: + file_writer = csv.writer(csvfile, delimiter=',', quotechar='"') + file_writer.writerow(['id', 'gender', 'age', 'forrest_seen_count']) + file_writer.writerow(['1', 'm', '30-35', '5']) + file_writer.writerow(['2', 'm', '30-35', '1']) + test_object = parse_demographics.parse_csv('demographics.csv') + return test_object + + +def test_seen_most_times(): + test_subjects = prepare_for_tests() + seen_count = parse_demographics.seen_most_times(test_subjects) + assert seen_count[0] == 5 + assert seen_count[1] == 1 + delete_file() + + +def test_seen_least_times(): + test_subjects = prepare_for_tests() + seen_count = parse_demographics.seen_least_times(test_subjects) + assert seen_count[0] == 1 + assert seen_count[1] == 2 + delete_file() + + +def test_find_id_by_gender(): + test_subjects = prepare_for_tests() + id_list = parse_demographics.find_id_by_gender(test_subjects, 'm') + assert len(id_list) == 2 + assert id_list[0] == 'm' + assert id_list[1] == 'm' + delete_file() + + +def test_find_count_by_id(): + test_subjects = prepare_for_tests() + count = parse_demographics.find_count_by_id(test_subjects, 1) + assert count == 5 + delete_file() + + +def delete_file(): + os.remove('demographics.csv') diff --git a/slides/Makefile b/slides/Makefile index 6f463ac..8634020 100644 --- a/slides/Makefile +++ b/slides/Makefile @@ -1,9 +1,13 @@ -.PHONY: all clean +.PHONY: all clean progress.pdf final.pdf -all: clean progress.pdf +all: clean progress.pdf final.pdf clean: - rm -f progress.pdf + rm -f progress.pdf final.pdf progress.pdf: progress.md pandoc -t beamer -s progress.md -o progress.pdf + + +final.pdf: final.md + pandoc -t beamer -s final.md -o final.pdf diff --git a/slides/day_night_3.png b/slides/day_night_3.png new file mode 100644 index 0000000..6d8a222 Binary files /dev/null and b/slides/day_night_3.png differ diff --git a/slides/day_night_on_off.png b/slides/day_night_on_off.png new file mode 100644 index 0000000..a8e74e4 Binary files /dev/null and b/slides/day_night_on_off.png differ diff --git a/slides/design_matrix_new_3.png b/slides/design_matrix_new_3.png new file mode 100644 index 0000000..d25ca11 Binary files /dev/null and b/slides/design_matrix_new_3.png differ diff --git a/slides/final.md b/slides/final.md new file mode 100644 index 0000000..5d78a98 --- /dev/null +++ b/slides/final.md @@ -0,0 +1,83 @@ +% Project Lambda +% Jordeen Chang, Alon Daks, Ying Luo, Lisa Ann Yu +% December 3, 2015 + +## The Paper +- A high-resolution 7-Tesla fMRI dataset from complex natural stimulation with an audio movie + +## Abstract + +- Brief Overview of Original Study: + - 20 participants recorded at a high field strength of 7 Tesla while listening to *Forrest Gump* + - Collected fMRI data for the entire movie in .nii.gz format with additional information regarding movie scenes + - Used an MRI scanner and pulse oximetry to conduct blood oxygen level dependent (BOLD) imaging, structural MRI imaging, and a physiological assay + - Its goal is to provide data for others to explore auditory cognition, language and music perception, social perception, etc. +- Our Goal: + - Reproduce a subset of the analysis conducted by Hanke et
al. + - Apply machine learning to see if we can predict whether a subject was listening to a day or a night movie scene based on brain state + +# Data Extraction & Exploration + +## Extraction +- First had to overcome the large amount of data to sift through + - Testing only on Subject 1 for sake of time and efficiency: limited project scope from 320 GB to 5 GB + - Chose to use only the nonlinear alignment out of the three available versions (raw data, linear alignment, and nonlinear alignment) +- Smoothed the data by applying a Gaussian filter +- Designed a clean schema by introducing a data_path.json file to reference where each raw data file is located + +## +![This is a plot of the SDs across volumes in the 4-D array for subject 1, run 1 and task 1. Though the SDs are all within the range (171, 173.5), there is still quite a bit of variability within that range.](sd.jpg) + +# Methods & Results + +## Process Overview: Reproducing +- Two methods for correlation are mentioned in the paper; we chose to reproduce the first: taking the BOLD time-series and calculating the voxel-wise Pearson correlation on the raw data that Matthew provided. +- Using multiple EC2 instances to run the analysis, so that we can load both two-hour time courses in a pair and parallelize the process, since the correlation takes 10 minutes on our laptops. +- Defined a UNIX environment variable STAT159_CACHED_DATA that allows users to choose whether they want the data to be recomputed. +- Results are still being processed. + +## Analysis Overview: Predicting +- Parsed through “scenes.csv” and “demographics.csv” for information about movie scenes and subjects +- Conducted a t-test for each voxel to determine if the brain signal is significantly different between day and night scenes +- Retrieved the top 32 voxels with the largest change between the groups. + +## +Step 1: Determine which scenes are on (day) and which are off (night) + +![Day-night time course for Task 1, Run 1. Very few scenes take place at night.](day_night_on_off.png) + +## +Step 2: Create a Design Matrix + +![Intercept, day/night, linear drift](design_matrix_new_3.png) + +## +Step 3: Calculate the betas for each voxel + +![Day/Night](day_night_3.png) + +## +![Linear Drift](linear_drift_3.png) + +## +![Intercept (baseline)](intercept_3.png) + +## +Step 4: Test the hypothesis that beta = 0 to generate t-statistics +![Largest 32 t-statistics. Note: We took the absolute value of the t-statistics to find the farthest distances from 0. Additionally, since we are always comparing the estimate of beta to 0, the t-statistic should be positive.](top_32_bar.png) + +## +![Time course for the voxel with the highest t-statistic.](highest_t_day_night.png) + +## Analysis Overview: Predicting (cont.) +- Using a random forest (1000 trees) with the voxels as the features +- 80% training, 20% testing +- 83% accuracy, but the data is 86% day scenes, so the model is not yet beating the majority-class baseline.
+- Next steps include cross-validation on the feature set and running the prediction on other parts of the data, such as sentiment and interior/exterior scenes + +## Future Work +- Considered exploring possible physiological responses to certain movie scenes by gender and age, but there were not enough computing resources and data to do so +- Predict familiarity with *Forrest Gump* + - Difference in voxel activation between participants with high and low amounts of prior exposure + - Machine learning + - However, there is a relatively small number of participants, since fMRI is expensive, so the predictive ability of such a study would be rather limited diff --git a/slides/highest_t_day_night.png b/slides/highest_t_day_night.png new file mode 100644 index 0000000..08731f7 Binary files /dev/null and b/slides/highest_t_day_night.png differ diff --git a/slides/intercept_3.png b/slides/intercept_3.png new file mode 100644 index 0000000..3a3e3ae Binary files /dev/null and b/slides/intercept_3.png differ diff --git a/slides/linear_drift_3.png b/slides/linear_drift_3.png new file mode 100644 index 0000000..306c3d1 Binary files /dev/null and b/slides/linear_drift_3.png differ diff --git a/slides/top_32_bar.png b/slides/top_32_bar.png new file mode 100644 index 0000000..e053d7e Binary files /dev/null and b/slides/top_32_bar.png differ
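For reviewers who want to sanity-check the day/night pipeline in this patch without loading the full dataset, below is a minimal self-contained sketch of the voxel-wise GLM -> t-statistic -> top-voxel selection that code/utils/linear_modeling.py implements (get_betas_Y, t_stat, get_top_32). It is not the repository's code: the glm_top_voxels name, the toy random data, and the array shapes are illustrative assumptions.

import numpy as np
import numpy.linalg as npl


def glm_top_voxels(data, X, c, n_top=32):
    # Flatten the 4D image to (n_voxels, n_trs), mirroring get_betas_Y
    Y = np.reshape(data, (-1, data.shape[-1]))
    # Least-squares betas via the pseudoinverse: shape (n_regressors, n_voxels)
    B = npl.pinv(X).dot(Y.T)
    # Residuals and mean residual sum of squares per voxel
    resid = Y.T - X.dot(B)
    df = X.shape[0] - npl.matrix_rank(X)
    mrss = np.sum(resid ** 2, axis=0) / df
    # t-statistic for the contrast c at every voxel
    c = np.asarray(c, dtype=float)
    se = np.sqrt(mrss * c.dot(npl.pinv(X.T.dot(X)).dot(c)))
    t = c.dot(B) / se
    # Flat indices of the n_top voxels with the largest |t|, largest first
    top = np.abs(t).argsort()[-n_top:][::-1]
    return t, top


# Toy usage on random data (not the study data): column 0 is a 0/1 day/night
# regressor, column 1 is the intercept, as in the tests above
rng = np.random.RandomState(0)
data = rng.randn(5, 5, 5, 40)
X = np.ones((40, 2))
X[:, 0] = rng.randint(0, 2, size=40)
t, top = glm_top_voxels(data, X, [1, 0], n_top=5)
print(np.unravel_index(top, data.shape[:-1]))  # 3D coordinates of the top voxels

As in linear_modeling.py, the fit uses the pseudoinverse, so the sketch still runs if the design matrix happens to be rank-deficient.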