Merge pull request #78 from AlonDaks/classification

Finished plots and figures for reproduction
berkeley-stat159 · Dec 14, 2015 · c0fd181 · c0fd181
2 parents 3bd3483 + 7d44c5e
commit c0fd181
Show file tree

Hide file tree

Showing 20 changed files with 302 additions and 367 deletions.
diff --git a/code/stat159lambda/classification/design_matrix.py b/code/stat159lambda/classification/design_matrix.py
@@ -6,25 +6,13 @@
 
 
 class DesignMatrix:
-    def __init__(self, data_file, volume_indices, voxels_indices):
-        self.data_file = data_file
-        self.volume_indices = volume_indices
-        self.voxels_indices = voxels_indices
-        self.X = self.generate_design_matrix_()
-        gc.collect()
-        self.y = self.generate_labels_(ssm.SceneSlicer())
+    def __init__(self, data_file):
+        self.X = np.load(data_file).T[NUM_OFFSET_VOLUMES:, :]
+        ss = ssm.SceneSlicer()
+        self.y = np.array(ss.get_scene_slices()[0][NUM_OFFSET_VOLUMES:])
 
-    def get_design_matrix(self):
-        return self.X
+    def get_design_matrix(self, volume_indices, voxels_indices):
+        return self.X[volume_indices, :][:, voxels_indices]
 
-    def get_labels(self):
-        return self.y
-
-    def generate_design_matrix_(self):
-        data = np.load(self.data_file)
-        return data.T[NUM_OFFSET_VOLUMES:, self.voxels_indices][
-            self.volume_indices, :]
-
-    def generate_labels_(self, ss):
-        return np.array(ss.get_scene_slices()[0][NUM_OFFSET_VOLUMES:][
-            self.volume_indices])
+    def get_labels(self, volume_indices):
+        return self.y[volume_indices]
diff --git a/code/stat159lambda/classification/partition_volumes.py b/code/stat159lambda/classification/partition_volumes.py
@@ -1,9 +1,12 @@
 import numpy as np
 import os
 from stat159lambda.config import REPO_HOME_PATH, NUM_OFFSET_VOLUMES, NUM_VOLUMES
+from stat159lambda.utils import scene_slicer as ss 
 
 def partition_volumes():
-	volume_indices = range(NUM_VOLUMES - NUM_OFFSET_VOLUMES)
+	volume_indices = np.array(range(NUM_VOLUMES - NUM_OFFSET_VOLUMES))
+	clean_slice_mask = ss.SceneSlicer().get_clean_slice_mask()
+	volume_indices = volume_indices[clean_slice_mask]
 	np.random.shuffle(volume_indices)
 	num_train = int(.8*len(volume_indices))
 	train_indices = volume_indices[:num_train]

diff --git a/code/stat159lambda/classification/random_forest/rf.py b/code/stat159lambda/classification/random_forest/rf.py
@@ -9,7 +9,7 @@ def __init__(self,
                  y,
                  max_features='auto',
                  depth=None,
-                 n_estimators=400):
+                 n_estimators=300):
         self.model = RandomForestClassifier(max_features=max_features,
                                             n_estimators=n_estimators,
                                             max_depth=None,

diff --git a/code/stat159lambda/classification/random_forest/rf_cross_validate.py b/code/stat159lambda/classification/random_forest/rf_cross_validate.py
@@ -1,29 +1,38 @@
+from __future__ import print_function, division
 import numpy as np
 from sklearn.cross_validation import KFold
 from sklearn.metrics import accuracy_score
 from stat159lambda.classification import design_matrix as dm
 from stat159lambda.classification.random_forest import rf
 from stat159lambda.classification import partition_volumes as pv
-from stat159lambda.config import REPO_HOME_PATH
+from stat159lambda.config import REPO_HOME_PATH, NUM_VOXELS
 from stat159lambda.linear_modeling import linear_modeling as lm
+from stat159lambda.utils import data_path as dp
 
-design_matrix = dm.DesignMatrix(
-    '{0}/data/processed/sub1_rcds_2d.npy'.format(REPO_HOME_PATH),
-    pv.get_train_indices(), range(200))
 
-X_train = design_matrix.get_design_matrix()
-y_train = np.array(design_matrix.get_labels())
+voxels_sorted_by_t_statistic = lm.VoxelExtractor(1, 'int-ext').t_stat()
+num_features_values = [200, 400, 600, 1000, 1500, 2000, 3000, 5000, 10000]
+design_matrix = dm.DesignMatrix(dp.get_smoothed_2d_path(1, 4))
+train_volume_indices = pv.get_train_indices()
+cv_values = []
+for num_features in num_features_values:
+	voxel_feature_indices = voxels_sorted_by_t_statistic[:num_features]
 
-cv_accuracies = []
-for train, test in KFold(len(X_train), 5):
-	X_cv_train = X_train[train, :]
-	y_cv_train = y_train[train]
-	X_cv_test = X_train[test, :]
-	y_cv_test = y_train[test]
-	model = rf.Classifier(X_cv_train, y_cv_train)
-	model.train()
-	y_predicted = model.predict(X_cv_test)
-	cv_accuracies.append(accuracy_score(y_predicted, y_cv_test))
+	X_train = design_matrix.get_design_matrix(train_volume_indices, voxel_feature_indices)
+	y_train = np.array(design_matrix.get_labels(train_volume_indices))
 
-print np.mean(cv_accuracies)
+	cv_accuracies = []
+	for train, test in KFold(len(X_train), 5):
+		X_cv_train = X_train[train, :]
+		y_cv_train = y_train[train]
+		X_cv_test = X_train[test, :]
+		y_cv_test = y_train[test]
+		model = rf.Classifier(X_cv_train, y_cv_train)
+		model.train()
+		y_predicted = model.predict(X_cv_test)
+		cv_accuracies.append(accuracy_score(y_predicted, y_cv_test))
+
+	cv_values.append(np.mean(cv_accuracies))
+
+np.save('cv_values', cv_values)
 
diff --git a/code/stat159lambda/classification/svm/svm.py b/code/stat159lambda/classification/svm/svm.py
@@ -25,7 +25,7 @@ class Classifier:
     -------
     None
     """
-    def __init__(self, X, y, C=1.0, kernel='rbf', degree=2):
+    def __init__(self, X, y, C=.001, kernel='linear', degree=2):
         if kernel == 'linear':
             self.model = LinearSVC(C=C)
         else:
@@ -35,30 +35,30 @@ def __init__(self, X, y, C=1.0, kernel='rbf', degree=2):
 
 
     def train(self):
-    """
-    Classifier method that fits the SVM model according to the given training
-    data X
+        """
+        Classifier method that fits the SVM model according to the given training
+        data X
 
-    Parameters
-    ----------
-    None
+        Parameters
+        ----------
+        None
 
-    Returns
-    -------
-    self : object
-    """
+        Returns
+        -------
+        self : object
+        """
         self.model.fit(self.X, self.y)
 
     def predict(self, new_data):
-    """
-    Performs classification on samples in new_data
+        """
+        Performs classification on samples in new_data
 
-    Parameters
-    ----------
-    new_data : array
+        Parameters
+        ----------
+        new_data : array
 
-    Returns
-    -------
-    pred: array that contains class labels for samples in new_data
-    """
+        Returns
+        -------
+        pred: array that contains class labels for samples in new_data
+        """
         return self.model.predict(new_data)
diff --git a/code/stat159lambda/classification/svm/svm_cross_validate.py b/code/stat159lambda/classification/svm/svm_cross_validate.py
@@ -0,0 +1,35 @@
+from __future__ import print_function, division
+import numpy as np
+from sklearn.cross_validation import KFold
+from sklearn.metrics import accuracy_score
+from stat159lambda.classification import design_matrix as dm
+from stat159lambda.classification.svm import svm
+from stat159lambda.classification import partition_volumes as pv
+from stat159lambda.config import REPO_HOME_PATH, NUM_VOXELS
+from stat159lambda.linear_modeling import linear_modeling as lm
+from stat159lambda.utils import data_path as dp
+
+
+voxels_sorted_by_t_statistic = lm.VoxelExtractor(1, 'int-ext').t_stat()
+# num_features_values = range(100, NUM_VOXELS/100, 100)
+# for num_features in num_features_values:
+voxel_feature_indices = voxels_sorted_by_t_statistic[:1000]
+design_matrix = dm.DesignMatrix(dp.get_smoothed_2d_path(1, 4), pv.get_train_indices(), voxel_feature_indices)
+
+print('classifying')
+X_train = design_matrix.get_design_matrix()
+y_train = np.array(design_matrix.get_labels())
+
+cv_accuracies = []
+for train, test in KFold(len(X_train), 5):
+	X_cv_train = X_train[train, :]
+	y_cv_train = y_train[train]
+	X_cv_test = X_train[test, :]
+	y_cv_test = y_train[test]
+	model = svm.Classifier(X_cv_train, y_cv_train)
+	model.train()
+	y_predicted = model.predict(X_cv_test)
+	cv_accuracies.append(accuracy_score(y_predicted, y_cv_test))
+
+print(np.mean(cv_accuracies))
+
diff --git a/code/stat159lambda/classification/test_indices.npy b/code/stat159lambda/classification/test_indices.npy
diff --git a/code/stat159lambda/classification/train_indices.npy b/code/stat159lambda/classification/train_indices.npy
diff --git a/code/stat159lambda/config.py b/code/stat159lambda/config.py
@@ -12,16 +12,16 @@
 
 USE_CACHED_DATA = os.environ.get('STAT159_CACHED_DATA', 'True') == 'True'
 
-NUM_PROCESSES = int(os.environ.get('STAT159_NUM_PROCESSES', 5))
 
 # fMRI data specific constants
-
 NUM_OFFSET_VOLUMES = 9
 
 NUM_VOLUMES = 3543
 
 NUM_VOXELS = 1108800
 
+VOXEL_DIMENSIONS = (132, 175, 48)
+
 NUM_RUNS = 8
 
 SUBJECTS = [1, 2, 3, 5, 6]

diff --git a/code/stat159lambda/linear_modeling/linear_modeling.py b/code/stat159lambda/linear_modeling/linear_modeling.py
@@ -10,7 +10,6 @@
 
 
 class VoxelExtractor:
-
     def __init__(self, subject, interest_col_str, data=None):
         """
         VoxelExtractor generates the t-statistic for each voxel based on a
@@ -28,13 +27,13 @@ def __init__(self, subject, interest_col_str, data=None):
         self.subject = subject
         self.interest_col_str = interest_col_str
         if not data:
-            data_path = dp.get_smoothed_path_2d(self.subject, 4)
-            data = nib.load(data_path).get_data()
-            data = data[:, :, :, NUM_OFFSET_VOLUMES:]
-        # Data is shaped as number of voxels by time
-        self.data = np.reshape(data, (-1, data.shape[-1]))
+            data_path = dp.get_smoothed_2d_path(self.subject, 4)
+            data = np.load(data_path)
+            data = data[:, NUM_OFFSET_VOLUMES:]
+        self.data = data
         self.design = None
         self.B = None
+        self.t_values = None
 
     def get_design_matrix(self):
         """
@@ -51,7 +50,8 @@ def get_design_matrix(self):
             elif self.interest_col_str == "day-night":
                 interest_col_ind = 0
             else:
-                print("Incorrect interest column name: please use either 'int-ext' or 'day-night'")
+                print(
+                    "Incorrect interest column name: please use either 'int-ext' or 'day-night'")
             interest_col = ss.get_scene_slices()[interest_col_ind]
             n_trs = self.data.shape[-1]
             design = np.ones((n_trs, 3))
@@ -70,9 +70,13 @@ def plot_design_matrix(self):
         if self.design is None:
             self.get_design_matrix()
         design_fig = plt.gcf()
-        plt.imshow(self.design, aspect=0.1, cmap='gray', interpolation='nearest')
+        plt.imshow(self.design,
+                   aspect=0.1,
+                   cmap='gray',
+                   interpolation='nearest')
         plt.xticks([])
-        design_fig_path = '{0}/figures/design_fig_{1}.png'.format(REPO_HOME_PATH, self.interest_col_str)
+        design_fig_path = '{0}/figures/design_fig_{1}.png'.format(
+            REPO_HOME_PATH, self.interest_col_str)
         design_fig.savefig(design_fig_path, dpi=100)
         plt.clf()
 
@@ -98,21 +102,23 @@ def t_stat(self):
         """
         if self.design is None:
             self.get_design_matrix()
-        y = np.asarray(self.data.T)
-        X = np.asarray(self.design)
-        c = [0, 0, 1]
-        c = np.atleast_2d(c).T
-        beta = npl.pinv(X).dot(y)
-        fitted = X.dot(beta)
-        errors = y - fitted
-        RSS = (errors**2).sum(axis=0)
-        df = X.shape[0] - npl.matrix_rank(X)
-        MRSS = RSS / df
-        SE = np.sqrt(MRSS * c.T.dot(npl.pinv(X.T.dot(X)).dot(c)))
-        SE[SE == 0] = np.amin(SE[SE != 0])
-        t = c.T.dot(beta) / SE
-        self.t_values = abs(t[0])
-        self.t_indices = np.array(self.t_values).argsort()[::-1][:self.t_values.size]
+        if self.t_values is None:
+            y = self.data.T
+            X = self.design
+            c = [0, 0, 1]
+            c = np.atleast_2d(c).T
+            beta = npl.pinv(X).dot(y)
+            fitted = X.dot(beta)
+            errors = y - fitted
+            RSS = (errors**2).sum(axis=0)
+            df = X.shape[0] - npl.matrix_rank(X)
+            MRSS = RSS / df
+            SE = np.sqrt(MRSS * c.T.dot(npl.pinv(X.T.dot(X)).dot(c)))
+            SE[SE == 0] = np.amin(SE[SE != 0])
+            t = c.T.dot(beta) / SE
+            self.t_values = abs(t[0])
+        self.t_indices = np.array(self.t_values).argsort(
+        )[::-1][:self.t_values.size]
         return self.t_indices
 
     def plot_single_voxel(self, voxel_index):
@@ -125,6 +131,7 @@ def plot_single_voxel(self, voxel_index):
         """
         voxel_img = plt.gcf()
         plt.plot(self.data[voxel_index, :])
-        voxel_img_path = '{0}/figures/voxel_{1}.png'.format(REPO_HOME_PATH, voxel_index)
+        voxel_img_path = '{0}/figures/voxel_{1}.png'.format(REPO_HOME_PATH,
+                                                            voxel_index)
         voxel_img.savefig(voxel_img_path, dpi=100)
         plt.clf()