From e92e3ff0d446569cd876d4b6de8927ff5fb786f4 Mon Sep 17 00:00:00 2001 From: Boying Gong Date: Wed, 11 Nov 2015 15:01:18 -0800 Subject: [PATCH 1/4] delete file --- data/hash.py | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 data/hash.py diff --git a/data/hash.py b/data/hash.py deleted file mode 100644 index ea569c0..0000000 --- a/data/hash.py +++ /dev/null @@ -1,22 +0,0 @@ -from __future__ import print_function, division - -import hashlib -import os -import os.path -import os -import json - -import data - -d = {} - -rootDir = 'ds005' - -for dirName, subdirList, fileList in os.walk(rootDir): - # print('Found directory: %s' % dirName) - for fname in fileList: - a = os.path.join(dirName, fname) - d[a] = data.generate_file_md5(a, blocksize=2**20) - - -json.dump(d, open("hashList.txt",'w')) \ No newline at end of file From 863fff594e380178c795ba81cb9aab76ec5e98f6 Mon Sep 17 00:00:00 2001 From: Boying Gong Date: Wed, 9 Dec 2015 15:34:27 -0800 Subject: [PATCH 2/4] add confusion matrix and ROC curve --- code/scripts/logistic.py | 22 +++++++++++++++-- code/utils/logistic_function.py | 42 +++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) create mode 100644 code/utils/logistic_function.py diff --git a/code/scripts/logistic.py b/code/scripts/logistic.py index 86392aa..b61d8e0 100644 --- a/code/scripts/logistic.py +++ b/code/scripts/logistic.py @@ -2,6 +2,16 @@ import matplotlib.pyplot as plt from sklearn.cross_validation import cross_val_score from sklearn.linear_model import LogisticRegression +import sys + +# Path to function +pathtofunction = '../utils' +# Append path to sys +sys.path.append(pathtofunction) + +from logistic_function import plot_roc + +pathtofolder = '../../data/' pathtofolder = '../../data/' @@ -9,6 +19,7 @@ beh_lambda = np.array([]) beh_score = np.array([]) val_score = np.array([]) +AUC_val = np.array([]) for i in np.arange(1, nsub+1): run1 = np.loadtxt(pathtofolder + 'ds005/sub0'+ str(i).zfill(2)+ '/behav/task001_run001/behavdata.txt', skiprows = 1) @@ -18,7 +29,7 @@ '/behav/task001_run003/behavdata.txt', skiprows = 1) behav = np.concatenate((run1, run2, run3), axis=0) behav = behav[np.logical_or.reduce([behav[:,5] == x for x in [0,1]])] - X = zip(np.ones(len(behav)), behav[:, 1],behav[:, 2]) + X = zip(np.ones(len(behav)), behav[:, 1], behav[:, 2]) y = behav[:, 5] logreg = LogisticRegression(C=1e5) # C=1e5 specifies a regularization strength @@ -34,8 +45,15 @@ scores = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10) val_score = np.append(val_score, scores.mean()) + # calculate the AUC and plot ROC curve for each subject + logreg_proba = logreg.predict_proba(X) + fig, AUC = plot_roc(logreg_proba, y) + fig.savefig(pathtofolder + 'ds005/models/roc_curve_sub0'+ str(i).zfill(2)) + AUC_val = np.append(AUC_val, scores.mean()) np.savetxt(pathtofolder + 'ds005/models/lambda.txt', beh_lambda) np.savetxt(pathtofolder + 'ds005/models/reg_score.txt', beh_score) -np.savetxt(pathtofolder + 'ds005/models/cross_val_score.txt', val_score) \ No newline at end of file +np.savetxt(pathtofolder + 'ds005/models/cross_val_score.txt', val_score) +np.savetxt(pathtofolder + 'ds005/models/AUC_val.txt', AUC_val) + diff --git a/code/utils/logistic_function.py b/code/utils/logistic_function.py new file mode 100644 index 0000000..76e60d7 --- /dev/null +++ b/code/utils/logistic_function.py @@ -0,0 +1,42 @@ +import numpy as np +import matplotlib.pyplot as plt + +def plot_roc(logreg_proba, y): + + thresholds = np.linspace(1,0,101) + + ROC = np.zeros((101,2)) + + for i in range(101): + t = thresholds[i] + + # Classifier / label agree and disagreements for current threshold. + TP_t = np.logical_and( logreg_proba[:,1] > t, y==1 ).sum() + TN_t = np.logical_and( logreg_proba[:,1] <=t, y==0 ).sum() + FP_t = np.logical_and( logreg_proba[:,1] > t, y==0 ).sum() + FN_t = np.logical_and( logreg_proba[:,1] <=t, y==1 ).sum() + + # Compute false positive rate for current threshold. + FPR_t = FP_t / float(FP_t + TN_t) + ROC[i,0] = FPR_t + + # Compute true positive rate for current threshold. + TPR_t = TP_t / float(TP_t + FN_t) + ROC[i,1] = TPR_t + + # Plot the ROC curve. + fig = plt.figure(figsize=(6,6)) + plt.plot(ROC[:,0], ROC[:,1], lw=2) + plt.xlim(-0.1,1.1) + plt.ylim(-0.1,1.1) + plt.xlabel('$FPR(t)$') + plt.ylabel('$TPR(t)$') + plt.grid() + + AUC = 0. + for i in range(100): + AUC += (ROC[i+1,0]-ROC[i,0]) * (ROC[i+1,1]+ROC[i,1]) + AUC *= 0.5 + + plt.title('ROC curve, AUC = %.4f'%AUC) + return fig, AUC \ No newline at end of file From 4c694941cab5500fec8ec6c3fbdb0289708306a3 Mon Sep 17 00:00:00 2001 From: Boying Gong Date: Wed, 9 Dec 2015 17:24:54 -0800 Subject: [PATCH 3/4] add comment --- code/utils/logistic_function.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/code/utils/logistic_function.py b/code/utils/logistic_function.py index 76e60d7..30cd974 100644 --- a/code/utils/logistic_function.py +++ b/code/utils/logistic_function.py @@ -2,6 +2,15 @@ import matplotlib.pyplot as plt def plot_roc(logreg_proba, y): + """ + function to plot the ROC (receiver operating characteristic) curve and + calculate the corresponding AUC (Area Under Curve). + Input: + logreg_proba: The estimate probability for each class calculated from + logistic regression model. + y: The actual class of response,. + Output: The ROC curve and and correspong AUC value. + """ thresholds = np.linspace(1,0,101) From 306fc9f6e30e97ceb18ea5b83a02357590f5939d Mon Sep 17 00:00:00 2001 From: Boying Gong Date: Fri, 11 Dec 2015 11:38:44 -0800 Subject: [PATCH 4/4] integrate code for logistic --- code/scripts/logistic.py | 21 ++++--- code/utils/logistic_function.py | 102 ++++++++++++++++++++++++-------- 2 files changed, 88 insertions(+), 35 deletions(-) diff --git a/code/scripts/logistic.py b/code/scripts/logistic.py index b61d8e0..72a080a 100644 --- a/code/scripts/logistic.py +++ b/code/scripts/logistic.py @@ -9,9 +9,7 @@ # Append path to sys sys.path.append(pathtofunction) -from logistic_function import plot_roc - -pathtofolder = '../../data/' +from logistic_function import create_confusion, getMin_thrs, plot_roc pathtofolder = '../../data/' @@ -19,7 +17,9 @@ beh_lambda = np.array([]) beh_score = np.array([]) val_score = np.array([]) -AUC_val = np.array([]) +Min_thrs = np.array([]) +AUC_smr = np.array([]) +fig = plt.figure(figsize=(20,20)) for i in np.arange(1, nsub+1): run1 = np.loadtxt(pathtofolder + 'ds005/sub0'+ str(i).zfill(2)+ '/behav/task001_run001/behavdata.txt', skiprows = 1) @@ -47,13 +47,16 @@ val_score = np.append(val_score, scores.mean()) # calculate the AUC and plot ROC curve for each subject logreg_proba = logreg.predict_proba(X) - fig, AUC = plot_roc(logreg_proba, y) - fig.savefig(pathtofolder + 'ds005/models/roc_curve_sub0'+ str(i).zfill(2)) - AUC_val = np.append(AUC_val, scores.mean()) - + confusion = create_confusion(logreg_proba, y) + addsub = fig.add_subplot(4, 4, i) + addsub, AUC = plot_roc(confusion, addsub) + Min_thrs = np.append(Min_thrs, getMin_thrs(confusion)) + AUC_smr = np.append(AUC_smr, AUC) np.savetxt(pathtofolder + 'ds005/models/lambda.txt', beh_lambda) np.savetxt(pathtofolder + 'ds005/models/reg_score.txt', beh_score) np.savetxt(pathtofolder + 'ds005/models/cross_val_score.txt', val_score) -np.savetxt(pathtofolder + 'ds005/models/AUC_val.txt', AUC_val) +np.savetxt(pathtofolder + 'ds005/models/Min_thrs.txt', Min_thrs.reshape(16,3)) +np.savetxt(pathtofolder + 'ds005/models/AUC_smr.txt', AUC_smr) +fig.savefig(pathtofolder + 'ds005/models/roc_curve') diff --git a/code/utils/logistic_function.py b/code/utils/logistic_function.py index 30cd974..baad02c 100644 --- a/code/utils/logistic_function.py +++ b/code/utils/logistic_function.py @@ -1,40 +1,90 @@ import numpy as np import matplotlib.pyplot as plt -def plot_roc(logreg_proba, y): - """ - function to plot the ROC (receiver operating characteristic) curve and - calculate the corresponding AUC (Area Under Curve). - Input: - logreg_proba: The estimate probability for each class calculated from - logistic regression model. - y: The actual class of response,. - Output: The ROC curve and and correspong AUC value. +def create_confusion(logreg_proba, y, thrs_inc=0.01): """ - - thresholds = np.linspace(1,0,101) - - ROC = np.zeros((101,2)) - - for i in range(101): - t = thresholds[i] - + Creates the confusion matrix based on various levels of discriminate + probability thresholds + + Parameters + ---------- + actual: Actual responses, 1-d array with values 0 or 1 + fitted: Fitted probabilities, 1-d array with values between 0 and 1 + thrs_inc: increment of threshold probability (default 0.05) + + Returns + ------- + Confusion Matrix : Array of dim (X, 5) where X is the number of different + thresholds + Column 1: Threshold value between 0, 1 + Columns 2-5 show counts for: + Column 2: True postive + Column 3: True negative + Column 4: False postive + Column 5: False negative + """ + thrs_array = np.linspace(0, 1, 1/thrs_inc +1) + confusion = np.ones((len(thrs_array), 5)) + confusion[:,0] = thrs_array + for i in range(int(1/thrs_inc +1)): + t = thrs_array[i] # Classifier / label agree and disagreements for current threshold. TP_t = np.logical_and( logreg_proba[:,1] > t, y==1 ).sum() TN_t = np.logical_and( logreg_proba[:,1] <=t, y==0 ).sum() FP_t = np.logical_and( logreg_proba[:,1] > t, y==0 ).sum() FN_t = np.logical_and( logreg_proba[:,1] <=t, y==1 ).sum() + confusion[i, 1:5] = [TP_t, TN_t, FP_t, FN_t] + return confusion + + +def getMin_thrs(confusion): + """ + Returns the threshold with the smallest number of wrong predictions + + Parameters: + ----------- + Confustion matrix: 2-d array with 5 columns + + Returns: + -------- + thrs: min threshold that gives minimum wrong predictions: columns 3 + + column 4 + false_pos: number of incorrect trues + false_neg: number of incorrect falses + """ + thrs_min = np.argmin(confusion[:,3]+ confusion[:,4]) + col_out = confusion[thrs_min, :] + thrs = col_out[0] + false_pos = col_out[3] + false_neg = col_out[4] + return thrs, false_pos, false_neg + +def plot_roc(confusion, fig): + """ + function to plot the ROC (receiver operating characteristic) curve and + calculate the corresponding AUC (Area Under Curve). + + Parameters: + ----------- + Confustion matrix: 2-d array with 5 columns + + Returns: + -------- + fig: The ROC curve + AUC: Correspong AUC value + """ + ROC = np.zeros((confusion.shape[0],2)) + for i in range(confusion.shape[0]): # Compute false positive rate for current threshold. - FPR_t = FP_t / float(FP_t + TN_t) + FPR_t = confusion[i, 3] / float(confusion[i, 3] + confusion[i, 2]) ROC[i,0] = FPR_t - + # Compute true positive rate for current threshold. - TPR_t = TP_t / float(TP_t + FN_t) + TPR_t = confusion[i, 1] / float(confusion[i, 1] + confusion[i, 4]) ROC[i,1] = TPR_t - + # Plot the ROC curve. - fig = plt.figure(figsize=(6,6)) plt.plot(ROC[:,0], ROC[:,1], lw=2) plt.xlim(-0.1,1.1) plt.ylim(-0.1,1.1) @@ -43,9 +93,9 @@ def plot_roc(logreg_proba, y): plt.grid() AUC = 0. - for i in range(100): + for i in range(confusion.shape[0]-1): AUC += (ROC[i+1,0]-ROC[i,0]) * (ROC[i+1,1]+ROC[i,1]) - AUC *= 0.5 - + AUC *= -0.5 + plt.title('ROC curve, AUC = %.4f'%AUC) - return fig, AUC \ No newline at end of file + return fig, AUC