
[WIP] Design matrix and array of voxels in top 20% of t-statistics #34

Merged: 14 commits, Dec 6, 2015
Changes from all commits
75 changes: 19 additions & 56 deletions code/utils/linear_modeling.py
@@ -51,15 +51,9 @@ def get_betas_Y(X, data):
     -------
     B: 2D array, p x B, the number of voxels
     """
-    print(data.shape[-1])
-    print(type(data))
-    # Y = np.reshape(data, (data.shape[-1], -1))
-    Y = np.reshape(data, (-1, data.shape[-1]))
-    print(Y.shape)
-    # B = npl.pinv(X).dot(Y)
+    Y = np.reshape(data,(-1,data.shape[-1]))
     B = npl.pinv(X).dot(Y.T)
-    print(B.shape)
-    return B, Y.T
+    return B,Y.T


@@ -68,7 +62,6 @@ def get_betas_4d(B, data):
     -------
     4D array, beta for each voxel; need this format to plot
     """
-    print(B.shape)
     return np.reshape(B.T, data.shape[:-1] + (-1,))


@@ -119,64 +112,36 @@ def t_stat(y, X, c):
     # calculate bottom half of t statistic
     SE = np.sqrt(MRSS * c.T.dot(npl.pinv(X.T.dot(X)).dot(c)))
     t = c.T.dot(beta) / SE
-    return abs(t)
-
-
-#def get_top_100(t,thresh=100/1108800):
-#    """
-#    Parameters
-#    ----------
-#    t: 1D array of t-statistics for each voxel
-#
-#    Returns
-#    -------
-#    1D array of position of voxels in top 100 of t-statistics (all are
-#    positive)
-#    """
-#    a = np.int32(round(len(t) * thresh))
-#    # top_100_voxels = np.argpartition(t, -a)[-a:]
-#    # problem: nans, try
-#    top_100_voxels = np.argpartition(~np.isnan(t),-1)[-a:]
-
-#    return top_100_voxels
-
-
-#def get_index_4d(top_100_voxels, data):
-#    """
-#    Returns
-#    -------
-#    Indices in terms of 4D array of each voxel in top 20% of t-statistics
-#    """
-#    shape = data[...,-1].shape
-#    axes = np.unravel_index(top_100_voxels, shape)
-#    return zip(axes[0], axes[1], axes[2])  # sequence too large, try n = 32
-
-# Solve the problem by using 32 for next two functions intead
-def get_top_32(t, thresh=100/1108800):
-    """
-    Parameters
-    ----------
+    return abs(t[0])
+
+
+def get_top_32(t,thresh=100/1108800):
+    """
+    Parameters
+    ----------
     t: 1D array of t-statistics for each voxel
 
     Returns
     -------
     1D array of position of voxels in top 32 of t-statistics (all are positive
     """
     a = np.int32(round(len(t) * thresh))
-    top_32_voxels = np.argpartition(~np.isnan(t), -1)[-a:]
-    return top_32_voxels
-
+    return t.argsort()[-a:][::-1]
 
 
 def get_index_4d(top_32_voxels, data):
     """
-    Parameters
-    ---------
-    top_32_voxels: 1D array of indices of top 32 voxels
-
     Returns
     -------
     Indices in terms of 4D array of each voxel in top 20% of t-statistics
-    """
-    shape = data[..., -1].shape
-    axes = np.unravel_index(top_32_voxels, shape)
-    return zip(axes[0], axes[1], axes[2])
-
+    """
+    shape = data[...,-1].shape
+    axes = np.unravel_index(top_32_voxels,shape)
+    return zip(*axes)

@@ -185,5 +150,3 @@ def plot_single_voxel(data, top_100_voxels):
     None
     """
     plt.plot(data[get_index_4d(data, top_100_voxels)[0]])
-    # fix so only get top voxel
-

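A note on the selection change above: the removed `np.argpartition(~np.isnan(t), -1)[-a:]` partitioned a boolean NaN mask rather than the t-values themselves, so it never actually ranked the statistics; `t.argsort()[-a:][::-1]` does. A minimal sketch of how the revised helpers chain together on toy data (all names, shapes, and the value of `a` here are illustrative, not from the PR):

```python
# Hypothetical usage sketch of the revised helpers; shapes are toy values.
import numpy as np
import numpy.linalg as npl

data = np.random.rand(2, 3, 4, 10)            # toy 4D volume: x, y, z, time
X = np.ones((10, 2))                          # toy design: one regressor + intercept
X[:, 0] = np.random.rand(10)

Y = np.reshape(data, (-1, data.shape[-1]))    # voxels x time, as in get_betas_Y
B = npl.pinv(X).dot(Y.T)                      # betas: p x number of voxels

t = np.random.rand(Y.shape[0])                # stand-in for per-voxel t-statistics
a = 4                                         # toy k; the PR derives it from thresh
t[np.isnan(t)] = -np.inf                      # np.sort places NaNs last, i.e. into
                                              # the "top" slots, so mask them first
top_voxels = t.argsort()[-a:][::-1]           # indices of the a largest t-values

# 4D coordinates of those voxels; note get_index_4d is defined as
# get_index_4d(top_32_voxels, data), with the voxel indices first.
coords = list(zip(*np.unravel_index(top_voxels, data.shape[:-1])))
```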
Empty file modified code/utils/rf.py
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion code/utils/script.py
@@ -13,7 +13,7 @@
 import rf
 
 # Load data
-img = nib.load('test_data.nii')
+img = nib.load('bold_dico_dico_rcds_nl.nii')
 data = img.get_data()
 
 # Design Matrix
65 changes: 25 additions & 40 deletions code/utils/tests/test_linear_modeling.py
@@ -11,22 +11,14 @@
 import matplotlib.pyplot as plt
 import numpy.linalg as npl
 from scipy.stats import t as t_dist
-from .. import scene_slicer, plot
-
 from .. import linear_modeling
-from numpy.testing import assert_almost_equal
+from numpy.testing import assert_equal, assert_almost_equal
 
 # Make an X, y
 X = np.ones((4, 2))
-X[:, 0] = np.random.randint(10, size=4)
-y = np.zeros((4, 6))
-y[:, 0] = X[:, 0] * 2 + 3
-y[:, 1] = X[:, 0] * -1.5 + 2
-y[:, 2] = X[:, 0]
-y[:, 3] = X[:, 0] - 2
-y[:, 4] = X[:, 0] * 2
-y[:, 5] = X[:, 0] * -1 + 1
-data = np.reshape(y, (1, 2, 3, 4))
+X[:, 0] = np.array([4, 8, 2, 9])
+data = np.array([[[1, 0, 7, 0], [1, 0, 4, 4], [3, 7, 9, 7]]])
 
 # def test_get_design_matrix():
 #    not done writing this function, need to wait

@@ -35,20 +27,21 @@
 #    not sure how to test plots
 
 
-def test_get_betas():
-    actual = linear_modeling.get_betas(X, y[:, 0])
-    expected = np.array([[2., 3]]).T
+def test_get_betas_Y():
+    actual = linear_modeling.get_betas_Y(X, data)[0]
+    expected = np.array([[-0.85496183, -0.11450382, -0.01526718],
+                         [6.91603053, 2.90839695, 6.58778626]])
     assert_almost_equal(expected, actual)
+    actual = linear_modeling.get_betas_Y(X, data)[1]
+    expected = np.array([[1, 1, 3], [0, 0, 7], [7, 4, 9], [0, 4, 7]])
+    assert_almost_equal(expected, actual)
 
 
 def test_get_betas_4d():
     actual = linear_modeling.get_betas_4d(
-        linear_modeling.get_betas(X, data), data)
-    expected = np.array([[[
-        [2.00000000e+00, -1.50000000e+00
-        ], [1.00000000e+00, 1.00000000e+00], [2.00000000e+00, -1.00000000e+00]
-    ], [[3.00000000e+00, 2.00000000e+00], [-3.55271368e-15, -2.00000000e+00],
-        [-7.10542736e-15, 1.00000000e+00]]]])
+        linear_modeling.get_betas_Y(X, data)[0], data)
+    expected = np.array([[[-0.85496183, 6.91603053], [-0.11450382, 2.90839695],
+                          [-0.01526718, 6.58778626]]])
     assert_almost_equal(expected, actual)
 
 # def test_plot_betas():

@@ -57,33 +50,25 @@ def test_get_betas_4d():
 
 
 def test_t_stat():
-    actual = linear_modeling.t_stat(y, X, [0, 1])
-    expected = np.array([[4.90591615e+14, 3.77185234e+15, -5.32661312e-01, -
-                          2.17767996e+15, -5.32661312e-01, 6.46867339e+14]])
+    actual = linear_modeling.t_stat(
+        linear_modeling.get_betas_Y(X, data)[1], X, [0, 1])
+    expected = np.array([2.7475368, 1.04410995, 1.90484058])
     assert_almost_equal(expected, actual)
 
 
-def test_get_ts():
-    actual = linear_modeling.get_ts(y, X, [0, 1], data)
-    expected = np.array([9.84892820e+14, 6.46867339e+14, -4.81391897e-01, -
-                         8.59684933e+14, -4.81391897e-01, 7.99631096e+14])
-    assert_almost_equal(actual, expected)
-
-
-def test_get_top_100():
-    a = np.arange(21)
-    assert_equal(linear_modeling.get_top_100(a,.2), [17, 18, 19, 20])
-    a = np.arange(21)[::-1]
-    assert_equal(linear_modeling.get_top_100(a,.2), [3, 2, 1, 0])
-    actual = linear_modeling.get_top_100(linear_modeling.get_ts(y, X, [0, 1],
-                                                                data),.2)
-    expected = np.array([0])
+def test_get_top_32():
+    a = np.array([6, 4, 1, 2, 8, 8, 1, 9, 5, 2, 1, 9, 5, 4, 3, 6, 5, 3, 5, 8])
+    assert_equal(linear_modeling.get_top_32(a, .2), [11, 7, 19, 4])
+    actual = linear_modeling.get_top_32(
+        linear_modeling.t_stat(
+            linear_modeling.get_betas_Y(X, data)[1], X, [0, 1]), .5)
+    expected = np.array([0, 2])
     assert_almost_equal(actual, expected)
 
 
 def test_get_index_4d():
-    actual = linear_modeling.get_index_4d([3, 8, 23, 1], (2, 3, 4))
-    assert_equal(actual, [(0, 0, 3), (0, 2, 0), (1, 2, 3), (0, 0, 1)])
+    actual = linear_modeling.get_index_4d([0, 2], data)
+    assert_equal(actual, [(0, 0), (0, 2)])
 
 # def test_plot_single_voxel():
 #    not sure how to test plots
51 changes: 51 additions & 0 deletions code/utils/tests/test_parse_demographics.py
@@ -0,0 +1,51 @@
from __future__ import absolute_import
from .. import parse_demographics

import os
import csv


def prepare_for_tests():
    with open('demographics.csv', 'w') as csvfile:
        file_writer = csv.writer(csvfile, delimiter=',', quotechar='"')
        file_writer.writerow(['id', 'gender', 'age', 'forrest_seen_count'])
        file_writer.writerow(['1', 'm', '30-35', '5'])
        file_writer.writerow(['2', 'm', '30-35', '1'])
    test_object = parse_demographics.parse_csv('demographics.csv')
    return test_object


def test_seen_most_times():
    test_subjects = prepare_for_tests()
    seen_count = parse_demographics.seen_most_times(test_subjects)
    assert seen_count[0] == 5
    assert seen_count[1] == 1
    delete_file()


def test_seen_least_times():
    test_subjects = prepare_for_tests()
    seen_count = parse_demographics.seen_least_times(test_subjects)
    assert seen_count[0] == 1
    assert seen_count[1] == 2
    delete_file()


def test_find_id_by_gender():
    test_subjects = prepare_for_tests()
    id_list = parse_demographics.find_id_by_gender(test_subjects, 'm')
    assert len(id_list) == 2
    assert id_list[0] == 'm'
    assert id_list[1] == 'm'
    delete_file()


def test_find_count_by_id():
    test_subjects = prepare_for_tests()
    count = parse_demographics.find_count_by_id(test_subjects, 1)
    assert count == 5
    delete_file()


def delete_file():
    os.remove('demographics.csv')
10 changes: 7 additions & 3 deletions slides/Makefile
@@ -1,9 +1,13 @@
-.PHONY: all clean
+.PHONY: all clean progress.pdf final.pdf
 
-all: clean progress.pdf
+all: clean progress.pdf final.pdf
 
 clean:
-	rm -f progress.pdf
+	rm -f progress.pdf final.pdf
 
 progress.pdf: progress.md
 	pandoc -t beamer -s progress.md -o progress.pdf
 
+
+final.pdf: final.md
+	pandoc -t beamer -s final.md -o final.pdf
Binary file added slides/day_night_3.png
Binary file added slides/day_night_on_off.png
Binary file added slides/design_matrix_new_3.png
83 changes: 83 additions & 0 deletions slides/final.md
@@ -0,0 +1,83 @@
% Project Lambda
% Jordeen Chang, Alon Daks, Ying Luo, Lisa Ann Yu
% December 3, 2015

## The Paper
- A high-resolution 7-Tesla fMRI dataset from complex natural stimulation with an audio movie

## Abstract

- Brief overview of the original study:
    - 20 participants recorded at a high field strength of 7 Tesla while listening to *Forrest Gump*
    - Collected fMRI data for the entire movie in .nii.gz format, with additional information about movie scenes
    - Used an MRI scanner and pulse oximetry to conduct blood oxygen level dependent (BOLD) imaging, structural MRI imaging, and a physiological assay
    - The study's goal is to provide data for others to explore auditory cognition, language and music perception, social perception, etc.
- Our goal:
    - Reproduce a subset of the analysis conducted by Hanke et al.
    - Apply machine learning to see whether we can predict if a subject was listening to a day or a night movie scene based on brain state

# Data Extraction & Exploration

## Extraction
- First had to overcome the sheer amount of data to sift through
- Tested only on Subject 1 for the sake of time and efficiency, limiting the project scope from 320 GB to 5 GB
- Of the three processing options (raw data, linear alignment, nonlinear alignment), chose to use only the nonlinear alignment
- Smoothed the data by applying a Gaussian filter (a sketch follows this list)
- Designed a clean schema by introducing a data_path.json file to reference where each raw data file is located
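A minimal sketch of the smoothing step, assuming `scipy.ndimage`; the filename matches script.py, but the filter width is an illustrative assumption, not the project's actual setting:

```python
# Hypothetical smoothing sketch; sigma is an assumed value.
import nibabel as nib
from scipy.ndimage import gaussian_filter

img = nib.load('bold_dico_dico_rcds_nl.nii')   # nonlinearly aligned run, as in script.py
data = img.get_data()                          # 4D array: x, y, z, time

# Smooth each volume spatially; sigma=0 on the last axis avoids blurring across time.
smoothed = gaussian_filter(data, sigma=(2, 2, 2, 0))
```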

##
![This is a plot of the SDs across volumes in the 4D array for subject 1, run 1, and task 1. Though the SDs are all within the range (171, 173.5), there is still quite a bit of variability within that range.](sd.jpg?raw=true)

# Methods & Results

## Process Overview: Reproducing
- The paper mentions two methods for correlation; we chose to reproduce the first: taking the BOLD time series and calculating a voxel-wise Pearson correlation on the raw data that Matthew provided (a sketch follows this list).
- Using multiple EC2 instances to run the analysis, so that we can load both two-hour time courses in a pair and parallelize the process, since the correlation takes about 10 minutes on our laptops.
- Defined a UNIX environment variable, STAT159_CACHED_DATA, that lets users choose whether data should be recomputed.
- Results are still being processed.
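A sketch of what the voxel-wise Pearson correlation and the cache check might look like; the helper below and the caching convention are our assumptions, not the project's exact code:

```python
import os
import numpy as np

def voxelwise_pearson(run_a, run_b):
    """Pearson r between two 4D runs, voxel by voxel (hypothetical sketch)."""
    A = run_a.reshape(-1, run_a.shape[-1])     # voxels x time
    B = run_b.reshape(-1, run_b.shape[-1])
    A = A - A.mean(axis=1, keepdims=True)      # center each voxel's time course
    B = B - B.mean(axis=1, keepdims=True)
    num = (A * B).sum(axis=1)
    den = np.sqrt((A ** 2).sum(axis=1) * (B ** 2).sum(axis=1))
    return (num / den).reshape(run_a.shape[:-1])

# Assumed convention: recompute only when STAT159_CACHED_DATA is unset.
if os.environ.get('STAT159_CACHED_DATA') is None:
    r = voxelwise_pearson(np.random.rand(4, 4, 4, 20),
                          np.random.rand(4, 4, 4, 20))
```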

## Analysis Overview: Predicting
- Parsed “scenes.csv” and “demographics.csv” for information about movie scenes and subjects
- Conducted a t-test for each voxel to determine whether the brain signal differs significantly between day and night scenes
- Retrieved the top 32 voxels with the largest change between the two groups

##
Step 1: Determine which scenes are “on” (day) and which are “off” (night)

![Day-night time course for Task 1, Run 1. Very few scenes take place at night.](day_night_on_off.png)

##
Step 2: Create a Design Matrix

![Intercept, day/night, linear drift](design_matrix_new_3.png?raw=true)
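The three columns in the figure can be sketched directly; `n_trs` and the day/night indicator below are placeholders for values parsed from scenes.csv:

```python
import numpy as np

n_trs = 3599                                   # illustrative number of volumes
day_night = np.random.randint(0, 2, n_trs)     # stand-in for the scenes.csv indicator

X = np.ones((n_trs, 3))                        # column 0: intercept (all ones)
X[:, 1] = day_night                            # column 1: day/night regressor
X[:, 2] = np.linspace(-1, 1, n_trs)            # column 2: linear drift
```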

##
Step 3: Calculate the betas for each voxel

![Day/Night](day_night_3.png?raw=true)

##
![Linear Drift](linear_drift_3.png?raw=true)

##
![Intercept (baseline)](intercept_3.png?raw=true)

##
Step 4: Test the hypothesis that beta = 0 to generate t-statistics

![Largest 32 t-statistics. Note: we took the absolute value of the t-statistics to find the farthest distances from 0. Additionally, since we are always comparing the estimate of beta to 0, the t-statistic should be positive.](top_32_bar.png?raw=true)
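The statistic here follows the standard GLM form used by `t_stat` in linear_modeling.py; a toy version with illustrative shapes (the real contrast tests the day/night column):

```python
import numpy as np
import numpy.linalg as npl

y = np.random.rand(50, 6)                      # toy data: 50 time points, 6 voxels
X = np.random.rand(50, 3)                      # toy design matrix
c = np.array([0, 1, 0])                        # contrast: test the day/night beta

beta = npl.pinv(X).dot(y)                      # least-squares fit
resid = y - X.dot(beta)
df = X.shape[0] - npl.matrix_rank(X)           # degrees of freedom
MRSS = (resid ** 2).sum(axis=0) / df           # mean residual sum of squares
SE = np.sqrt(MRSS * c.T.dot(npl.pinv(X.T.dot(X)).dot(c)))
t = np.abs(c.T.dot(beta) / SE)                 # absolute t, as in the note above
top = t.argsort()[-4:][::-1]                   # "top 32" selection, with k=4 here
```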

##
![Time course for the voxel with the highest t-statistic.](highest_t_day_night.png?raw=true)

## Analysis Overview: Predicting (cont.)
- Using a random forest with 1000 trees, with the top voxels as the features (a sketch follows this list)
- 80% training, 20% testing
- 83% accuracy, but the data is 86% day scenes, so the model is not doing too well; always predicting “day” would already score 86%
- Next steps include cross-validation on the feature set and running predictions on other labels in the data, such as sentiment and interior/exterior
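A sketch of the prediction setup, assuming scikit-learn; the feature and label arrays are synthetic, and reading “1000” as the number of trees is our interpretation of the bullet above:

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_feat = np.random.rand(200, 32)               # rows: time points; columns: top-32 voxels
y_lab = np.random.binomial(1, 0.86, 200)       # day/night labels, ~86% day as in the data

X_train, X_test, y_train, y_test = train_test_split(
    X_feat, y_lab, test_size=0.2, random_state=0)    # 80/20 split, as on the slide

clf = RandomForestClassifier(n_estimators=1000)      # "1000" read as the tree count
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))               # compare to the 0.86 majority baseline
```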

## Future Work
- Considered exploring possible physiological responses to certain movie scenes by gender and age, but we do not have enough computing resources or data to do so
- Predict familiarity with *Forrest Gump*
    - Difference in voxel activation between participants with high and low amounts of prior exposure
    - Machine learning
- However, there is a relatively small number of participants, since fMRI is expensive, so the predictive ability of such a study would be rather limited
Binary file added slides/highest_t_day_night.png
Binary file added slides/intercept_3.png
Binary file added slides/linear_drift_3.png
Binary file added slides/top_32_bar.png