diff --git a/code/stat159lambda/classification/svm/svm.py b/code/stat159lambda/classification/svm/svm.py
index 1f23818..ed3fa49 100644
--- a/code/stat159lambda/classification/svm/svm.py
+++ b/code/stat159lambda/classification/svm/svm.py
@@ -4,6 +4,27 @@
 
 
 class Classifier:
+    """
+    An instance of the Classifier class has the following attributes:
+    (1) model : Support Vector Machine for Regression or Scalable Linear
+    Support Vector Machine, depending on kernel specification
+    (2) X : vector of training data
+    (3) y : vector of target values
+
+    Parameters
+    ----------
+    X : array
+    y : array
+    C : float that is the penalty parameter of the error term
+    kernel : string that specifies the kernel type to be used in the
+    classification algorithm
+    degree : int that specifies the degree of the polynomial kernel
+    function
+
+    Returns
+    -------
+    None
+    """
     def __init__(self, X, y, C=1.0, kernel='rbf', degree=2):
         if kernel == 'linear':
             self.model = LinearSVC(C=C)
@@ -12,8 +33,32 @@ def __init__(self, X, y, C=1.0, kernel='rbf', degree=2):
         self.X = X
         self.y = y
+
     def train(self):
+        """
+        Classifier method that fits the SVM model according to the given training
+        data X
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        self : object
+        """
         self.model.fit(self.X, self.y)
 
     def predict(self, new_data):
+        """
+        Performs classification on samples in new_data
+
+        Parameters
+        ----------
+        new_data : array
+
+        Returns
+        -------
+        pred: array that contains class labels for samples in new_data
+        """
         return self.model.predict(new_data)
 
diff --git a/code/stat159lambda/reproduction/analyze_similarity.py b/code/stat159lambda/reproduction/analyze_similarity.py
index c9c1abb..9e51015 100644
--- a/code/stat159lambda/reproduction/analyze_similarity.py
+++ b/code/stat159lambda/reproduction/analyze_similarity.py
@@ -13,24 +13,58 @@
 
 
 def get_pairwise_correlations():
+    """
+    Finds and returns the paths to the correlations of all possible pairs of
+    subjects (if the paths exist)
+
+    Parameters
+    ----------
+    None
+
+    Returns
+    -------
+    paths : string array
+    """
     subject_pairs = itertools.combinations(SUBJECTS, 2)
     return [np.load(dp.get_correlation_path(subj_a, subj_b))
             for subj_a, subj_b in subject_pairs]
 
 
 def get_correlations(aggregation='pooled'):
+    """
+    Calculates correlations either using means or the pooled data, depending
+    on specification
+
+    Parameters
+    ----------
+    aggregation : string (optional)
+
+    Returns
+    -------
+    correlations : array
+    """
     correlations = np.concatenate(tuple(get_pairwise_correlations()))
     if aggregation == 'mean':
         correlations = get_pairwise_correlations()
         correlations = np.mean(np.matrix(correlations).T, axis=1)
         correlations = correlations[~np.isnan(correlations)]
         return np.squeeze(np.asarray(correlations))
-    if aggregation == 'pooled':
-        correlations = np.concatenate(tuple(get_pairwise_correlations()))
-        return correlations[~np.isnan(correlations)]
+    return correlations[~np.isnan(correlations)]
 
 
 def save_correlation_histogram(aggregation):
+    """
+    Plots and saves the histogram of all correlations calculated by the
+    specified aggregation into figures folder
+
+    Parameters
+    ----------
+    aggregation : string
+
+    Returns
+    -------
+    None
+    """
     plt.hist(get_correlations(aggregation), bins=40)
     output_file_name = '{0}/figures/{1}_correlation_histogram.png'.format(
         REPO_HOME_PATH, aggregation)
@@ -40,6 +74,18 @@
 
 
 def save_correlation_percentiles(aggregation):
+    """
+    Calculates and saves the correlation percentiles calculated by the
+    specified aggregation into figures folder
+
+    Parameters
+    ----------
+    aggregation : string
+
+    Returns
+    -------
+    None
+    """
     correlations = get_correlations(aggregation)
     results = [[p, np.percentile(correlations, p)] for p in PERCENTILES]
     output_file_name = '{0}/figures/{1}_correlation_percentiles.txt'.format(
diff --git a/code/stat159lambda/reproduction/inter_run_diagnostics.py b/code/stat159lambda/reproduction/inter_run_diagnostics.py
index 022cc83..2b315b4 100644
--- a/code/stat159lambda/reproduction/inter_run_diagnostics.py
+++ b/code/stat159lambda/reproduction/inter_run_diagnostics.py
@@ -8,6 +8,19 @@
 
 
 def calc_vol_rms_diff(data_file_path):
+    """
+    Finds the difference between data[n+1] and data[n] for all elements in data
+    array to calculate the root mean squares. Does not include the data points
+    when they are tuning in the first 17 seconds.
+
+    Parameters
+    ----------
+    data_file_path : string
+
+    Returns
+    -------
+    vol_rms_diff : array
+    """
     data = np.load(open(data_file_path))
     diff_data = np.diff(data, axis=1)
     del data
@@ -17,6 +30,19 @@ def calc_vol_rms_diff(data_file_path):
 
 
 def save_plot(vol_rms_diff, subj_num):
+    """
+    Plots the root mean square differences for a particular subject and saves
+    that plot into the figures folder
+
+    Parameters
+    ----------
+    vol_rms_diff : array
+    subj_num : int
+
+    Returns
+    -------
+    None
+    """
     plt.plot(vol_rms_diff)
     plt.savefig('{0}/figures/subj{1}_vol_rms_diff.png'.format(
         REPO_HOME_PATH, subj_num))
diff --git a/code/stat159lambda/reproduction/similarity.py b/code/stat159lambda/reproduction/similarity.py
index 66d6418..0dabd07 100644
--- a/code/stat159lambda/reproduction/similarity.py
+++ b/code/stat159lambda/reproduction/similarity.py
@@ -13,6 +13,20 @@
 
 
 def pearson_r(X, Y):
+    """
+    Calculates the correlation between every row of two matrices. Assumes the
+    two matrices given are the same shape.
+
+    Parameters
+    ----------
+    X : array representation of an (n x n) matrix
+    Y : array representation of an (n x n) matrix
+
+    Returns
+    -------
+    r : vector of length n, where each element is the correlation of rows X_n
+    and Y_n
+    """
     X_centered = X - np.mean(X, axis=1)[:, np.newaxis]
     Y_centered = Y - np.mean(Y, axis=1)[:, np.newaxis]
     return inner1d(X_centered, Y_centered) / (np.linalg.norm(X_centered,
@@ -22,6 +36,19 @@
 
 
 def correlation(subj_a_data, subj_b_data):
+    """
+    Calculates the averaged correlation using every pair of data points between two
+    subjects.
+
+    Parameters
+    ----------
+    subj_a_data : array
+    subj_b_data : array
+
+    Returns
+    -------
+    correlations : float
+    """
     run_split_a_data = np.split(subj_a_data, RUN_DIVISIONS[:-1], axis=1)
     run_split_b_data = np.split(subj_b_data, RUN_DIVISIONS[:-1], axis=1)
     correlations = np.zeros(NUM_VOXELS)
@@ -32,6 +59,21 @@
 
 
 def calculate_and_save_correlation(subj_1_num, subj_2_num):
+    """
+    Calculates correlation using smoothed 2-D data with 8 full width half
+    maximum mm, and saves values into a designated correlation_path. If a file
+    with calculated correlations already exists, uses that cached version
+    instead.
+
+    Parameters
+    ----------
+    subj_1_num : int
+    subj_2_num : int
+
+    Returns
+    -------
+    None
+    """
     correlation_path = dp.get_correlation_path(subj_1_num, subj_2_num)
     if not exists(correlation_path) or not USE_CACHED_DATA:
         subj_1_data = np.load(dp.get_smoothed_2d_path(subj_1_num, 8))
diff --git a/code/stat159lambda/utils/data_path.py b/code/stat159lambda/utils/data_path.py
index 4893871..d5511cb 100644
--- a/code/stat159lambda/utils/data_path.py
+++ b/code/stat159lambda/utils/data_path.py
@@ -5,6 +5,18 @@
 
 
 def get_raw_path(subj_num, run_num):
+    """
+    Derives the absolute path to data for a particular subject and run
+
+    Parameters
+    ----------
+    subj_num : int
+    run_num : int
+
+    Returns
+    -------
+    path : string
+    """
     if subj_num < 10:
         subj_num = '0' + str(subj_num)
     return '{0}/data/raw/sub0{1}/task001_run00{2}/bold_dico_dico_rcds_nl.nii'.format(
@@ -12,25 +24,103 @@
 
 
 def get_concatenated_path(subj_num):
+    """
+    Derives the absolute path to data for a particular subject only
+
+    Parameters
+    ----------
+    subj_num : int
+
+    Returns
+    -------
+    path : string
+    """
     return '{0}/data/processed/sub{1}_rcds.npy'.format(REPO_HOME_PATH,
                                                        subj_num)
 
 
 def get_smoothed_path(subj_num, fwhm_mm):
+    """
+    Derives the absolute path to the smoothed data for a particular subject and
+    particular full width half maximum smoothed version
+
+    Parameters
+    ----------
+    subj_num : int
+    fwhm_mm : int
+
+    Returns
+    -------
+    path: string
+    """
     return '{0}/data/processed/sub{1}_rcds_smoothed_{2}_mm.npy'.format(
         REPO_HOME_PATH, subj_num, fwhm_mm)
 
 
 def get_smoothed_2d_path(subj_num, fwhm_mm):
+    """
+    Derives the absolute path to the smoothed 2-D data for a particular subject
+    and particular full width half maximum smoothed version
+
+    Parameters
+    ----------
+    subj_num : int
+    fwhm_mm : int
+
+    Returns
+    -------
+    path: string
+    """
     return '{0}/data/processed/sub{1}_rcds_smoothed_{2}_mm_2d.npy'.format(
         REPO_HOME_PATH, subj_num, fwhm_mm)
 
 
 def get_correlation_path(subj_1_num, subj_2_num):
+    """
+    Derives the absolute path to the calculated correlations between two
+    subjects
+
+    Parameters
+    ----------
+    subj_1_num : int
+    subj_2_num : int
+
+    Returns
+    -------
+    path: string
+    """
     return '{0}/data/processed/sub{1}_sub{2}_correlation.npy'.format(
         REPO_HOME_PATH, subj_1_num, subj_2_num)
 
 
 def get_2d_path(subj_num):
+    """
+    Derives the absolute path to the 2-D data for a particular subject,
+    originally contained in a 4-D array
+
+    Parameters
+    ----------
+    subj_num : int
+
+    Returns
+    -------
+    path : string
+    """
     return '{0}/data/processed/sub{1}_rcds_2d.npy'.format(REPO_HOME_PATH,
                                                           subj_num)
+
+
+def get_correlation_hist_path(aggregation):
+    """
+    Derives the absolute path to the correlations calculated by using either
+    the means or pooled data
+
+    Parameters
+    ----------
+    aggregation : string
+
+    Returns
+    -------
+    path : string
+    """
+    return '{0}/figures/{1}_correlation_histogram.png'.format(REPO_HOME_PATH,
+                                                              aggregation)
\ No newline at end of file