From e92e3ff0d446569cd876d4b6de8927ff5fb786f4 Mon Sep 17 00:00:00 2001
From: Boying Gong <jorothy_gong@berkeley.edu>
Date: Wed, 11 Nov 2015 15:01:18 -0800
Subject: [PATCH 1/4] delete file

---
 data/hash.py | 22 ----------------------
 1 file changed, 22 deletions(-)
 delete mode 100644 data/hash.py

diff --git a/data/hash.py b/data/hash.py
deleted file mode 100644
index ea569c0..0000000
--- a/data/hash.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from __future__ import print_function, division
-
-import hashlib
-import os
-import os.path
-import os
-import json
-
-import data
-
-d = {}
-
-rootDir = 'ds005'
-
-for dirName, subdirList, fileList in os.walk(rootDir):
-    # print('Found directory: %s' % dirName)
-    for fname in fileList:
-        a = os.path.join(dirName, fname)
-        d[a] = data.generate_file_md5(a, blocksize=2**20)
-
-
-json.dump(d, open("hashList.txt",'w'))
\ No newline at end of file

From 863fff594e380178c795ba81cb9aab76ec5e98f6 Mon Sep 17 00:00:00 2001
From: Boying Gong <jorothy_gong@berkeley.edu>
Date: Wed, 9 Dec 2015 15:34:27 -0800
Subject: [PATCH 2/4] add confusion matrix and ROC curve

---
 code/scripts/logistic.py        | 22 +++++++++++++++--
 code/utils/logistic_function.py | 42 +++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+), 2 deletions(-)
 create mode 100644 code/utils/logistic_function.py

diff --git a/code/scripts/logistic.py b/code/scripts/logistic.py
index 86392aa..b61d8e0 100644
--- a/code/scripts/logistic.py
+++ b/code/scripts/logistic.py
@@ -2,6 +2,16 @@
 import matplotlib.pyplot as plt
 from sklearn.cross_validation import cross_val_score
 from sklearn.linear_model import LogisticRegression
+import sys
+
+# Path to function
+pathtofunction = '../utils'
+# Append path to sys
+sys.path.append(pathtofunction)
+
+from logistic_function import plot_roc
+
+pathtofolder = '../../data/'
 
 pathtofolder = '../../data/'
 
@@ -9,6 +19,7 @@
 beh_lambda = np.array([])
 beh_score = np.array([])
 val_score = np.array([])
+AUC_val = np.array([])
 for i in np.arange(1, nsub+1):
     run1 = np.loadtxt(pathtofolder + 'ds005/sub0'+ str(i).zfill(2)+
                       '/behav/task001_run001/behavdata.txt', skiprows = 1)
@@ -18,7 +29,7 @@
                       '/behav/task001_run003/behavdata.txt', skiprows = 1)
     behav = np.concatenate((run1, run2, run3), axis=0)
     behav = behav[np.logical_or.reduce([behav[:,5] == x for x in [0,1]])]
-    X = zip(np.ones(len(behav)), behav[:, 1],behav[:, 2])
+    X = np.column_stack((np.ones(len(behav)), behav[:, 1], behav[:, 2]))
     y = behav[:, 5]
     logreg = LogisticRegression(C=1e5)
     # C=1e5 specifies a regularization strength
@@ -34,8 +45,15 @@
     scores = cross_val_score(LogisticRegression(), X, y, 
         scoring='accuracy', cv=10)
     val_score = np.append(val_score, scores.mean())
+    # calculate the AUC and plot ROC curve for each subject
+    logreg_proba = logreg.predict_proba(X)
+    fig, AUC = plot_roc(logreg_proba, y)
+    fig.savefig(pathtofolder + 'ds005/models/roc_curve_sub0'+ str(i).zfill(2))
+    AUC_val = np.append(AUC_val, scores.mean())
     
 
 np.savetxt(pathtofolder + 'ds005/models/lambda.txt', beh_lambda)
 np.savetxt(pathtofolder + 'ds005/models/reg_score.txt', beh_score)
-np.savetxt(pathtofolder + 'ds005/models/cross_val_score.txt', val_score)
\ No newline at end of file
+np.savetxt(pathtofolder + 'ds005/models/cross_val_score.txt', val_score)
+np.savetxt(pathtofolder + 'ds005/models/AUC_val.txt', AUC_val)
+
diff --git a/code/utils/logistic_function.py b/code/utils/logistic_function.py
new file mode 100644
index 0000000..76e60d7
--- /dev/null
+++ b/code/utils/logistic_function.py
@@ -0,0 +1,42 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+def plot_roc(logreg_proba, y):
+    
+    thresholds = np.linspace(1,0,101)
+
+    ROC = np.zeros((101,2))
+
+    for i in range(101):
+        t = thresholds[i]
+
+        # Classifier / label agree and disagreements for current threshold.
+        TP_t = np.logical_and( logreg_proba[:,1] > t, y==1 ).sum()
+        TN_t = np.logical_and( logreg_proba[:,1] <=t, y==0 ).sum()
+        FP_t = np.logical_and( logreg_proba[:,1] > t, y==0 ).sum()
+        FN_t = np.logical_and( logreg_proba[:,1] <=t, y==1 ).sum()
+
+        # Compute false positive rate for current threshold.
+        FPR_t = FP_t / float(FP_t + TN_t)
+        ROC[i,0] = FPR_t
+
+        # Compute true  positive rate for current threshold.
+        TPR_t = TP_t / float(TP_t + FN_t)
+        ROC[i,1] = TPR_t
+
+    # Plot the ROC curve.
+    fig = plt.figure(figsize=(6,6))
+    plt.plot(ROC[:,0], ROC[:,1], lw=2)
+    plt.xlim(-0.1,1.1)
+    plt.ylim(-0.1,1.1)
+    plt.xlabel('$FPR(t)$')
+    plt.ylabel('$TPR(t)$')
+    plt.grid()
+
+    AUC = 0.
+    for i in range(100):
+        AUC += (ROC[i+1,0]-ROC[i,0]) * (ROC[i+1,1]+ROC[i,1])
+    AUC *= 0.5
+
+    plt.title('ROC curve, AUC = %.4f'%AUC)
+    return fig, AUC
\ No newline at end of file

From 4c694941cab5500fec8ec6c3fbdb0289708306a3 Mon Sep 17 00:00:00 2001
From: Boying Gong <jorothy_gong@berkeley.edu>
Date: Wed, 9 Dec 2015 17:24:54 -0800
Subject: [PATCH 3/4] add comment

---
 code/utils/logistic_function.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/code/utils/logistic_function.py b/code/utils/logistic_function.py
index 76e60d7..30cd974 100644
--- a/code/utils/logistic_function.py
+++ b/code/utils/logistic_function.py
@@ -2,6 +2,15 @@
 import matplotlib.pyplot as plt
 
 def plot_roc(logreg_proba, y):
+    """ 
+    function to plot the ROC (receiver operating characteristic) curve and 
+        calculate the corresponding AUC (Area Under Curve).
+    Input: 
+        logreg_proba: The estimate probability for each class calculated from 
+                    logistic regression model. 
+        y: The actual class of response,.
+    Output: The ROC curve and and correspong AUC value.
+    """
     
     thresholds = np.linspace(1,0,101)
 

From 306fc9f6e30e97ceb18ea5b83a02357590f5939d Mon Sep 17 00:00:00 2001
From: Boying Gong <jorothy_gong@berkeley.edu>
Date: Fri, 11 Dec 2015 11:38:44 -0800
Subject: [PATCH 4/4] integrate code for logistic

---
 code/scripts/logistic.py        |  21 ++++---
 code/utils/logistic_function.py | 102 ++++++++++++++++++++++++--------
 2 files changed, 88 insertions(+), 35 deletions(-)

diff --git a/code/scripts/logistic.py b/code/scripts/logistic.py
index b61d8e0..72a080a 100644
--- a/code/scripts/logistic.py
+++ b/code/scripts/logistic.py
@@ -9,9 +9,7 @@
 # Append path to sys
 sys.path.append(pathtofunction)
 
-from logistic_function import plot_roc
-
-pathtofolder = '../../data/'
+from logistic_function import create_confusion, getMin_thrs, plot_roc
 
 pathtofolder = '../../data/'
 
@@ -19,7 +17,9 @@
 beh_lambda = np.array([])
 beh_score = np.array([])
 val_score = np.array([])
-AUC_val = np.array([])
+Min_thrs = np.array([])
+AUC_smr = np.array([])
+fig = plt.figure(figsize=(20,20))
 for i in np.arange(1, nsub+1):
     run1 = np.loadtxt(pathtofolder + 'ds005/sub0'+ str(i).zfill(2)+
                       '/behav/task001_run001/behavdata.txt', skiprows = 1)
@@ -47,13 +47,16 @@
     val_score = np.append(val_score, scores.mean())
     # calculate the AUC and plot ROC curve for each subject
     logreg_proba = logreg.predict_proba(X)
-    fig, AUC = plot_roc(logreg_proba, y)
-    fig.savefig(pathtofolder + 'ds005/models/roc_curve_sub0'+ str(i).zfill(2))
-    AUC_val = np.append(AUC_val, scores.mean())
-    
+    confusion = create_confusion(logreg_proba, y)
+    addsub = fig.add_subplot(4, 4, i)
+    addsub, AUC = plot_roc(confusion, addsub)
+    Min_thrs = np.append(Min_thrs, getMin_thrs(confusion))
+    AUC_smr = np.append(AUC_smr, AUC)
 
 np.savetxt(pathtofolder + 'ds005/models/lambda.txt', beh_lambda)
 np.savetxt(pathtofolder + 'ds005/models/reg_score.txt', beh_score)
 np.savetxt(pathtofolder + 'ds005/models/cross_val_score.txt', val_score)
-np.savetxt(pathtofolder + 'ds005/models/AUC_val.txt', AUC_val)
+np.savetxt(pathtofolder + 'ds005/models/Min_thrs.txt', Min_thrs.reshape(-1,3))
+np.savetxt(pathtofolder + 'ds005/models/AUC_smr.txt', AUC_smr)
+fig.savefig(pathtofolder + 'ds005/models/roc_curve')
 
diff --git a/code/utils/logistic_function.py b/code/utils/logistic_function.py
index 30cd974..baad02c 100644
--- a/code/utils/logistic_function.py
+++ b/code/utils/logistic_function.py
@@ -1,40 +1,90 @@
 import numpy as np
 import matplotlib.pyplot as plt
 
-def plot_roc(logreg_proba, y):
-    """ 
-    function to plot the ROC (receiver operating characteristic) curve and 
-        calculate the corresponding AUC (Area Under Curve).
-    Input: 
-        logreg_proba: The estimate probability for each class calculated from 
-                    logistic regression model. 
-        y: The actual class of response,.
-    Output: The ROC curve and and correspong AUC value.
+def create_confusion(logreg_proba, y, thrs_inc=0.01):
     """
-    
-    thresholds = np.linspace(1,0,101)
-
-    ROC = np.zeros((101,2))
-
-    for i in range(101):
-        t = thresholds[i]
-
+        Creates the confusion matrix based on various levels of discriminating
+        probability thresholds
+        
+        Parameters
+        ----------
+        logreg_proba: Fitted probabilities, 2-d array; column index 1 is P(y=1)
+        y: Actual responses, 1-d array with values 0 or 1
+        thrs_inc: increment of threshold probability (default 0.01)
+        
+        Returns
+        -------
+        Confusion Matrix : Array of dim (X, 5) where X is the number of different
+        thresholds
+        Column 1: Threshold value between 0, 1
+        Columns 2-5 show counts for:
+        Column 2: True positive
+        Column 3: True negative
+        Column 4: False positive
+        Column 5: False negative
+        """
+    thrs_array = np.linspace(0, 1, int(1 / thrs_inc + 1))
+    confusion = np.ones((len(thrs_array), 5))
+    confusion[:,0] = thrs_array
+    for i in range(len(thrs_array)):
+        t = thrs_array[i]
         # Classifier / label agree and disagreements for current threshold.
         TP_t = np.logical_and( logreg_proba[:,1] > t, y==1 ).sum()
         TN_t = np.logical_and( logreg_proba[:,1] <=t, y==0 ).sum()
         FP_t = np.logical_and( logreg_proba[:,1] > t, y==0 ).sum()
         FN_t = np.logical_and( logreg_proba[:,1] <=t, y==1 ).sum()
+        confusion[i, 1:5] = [TP_t, TN_t, FP_t, FN_t]
+    return confusion
+
+
+def getMin_thrs(confusion):
+    """
+        Finds the threshold row with the fewest wrong predictions
+        
+        Parameters:
+        -----------
+        confusion: Confusion matrix, 2-d array with 5 columns laid out as
+        threshold, true pos, true neg, false pos, false neg
+        
+        Returns:
+        --------
+        thrs: first threshold minimising false positives + false negatives
+        false_pos: number of incorrect trues at that threshold
+        false_neg: number of incorrect falses at that threshold
+        """
+    wrong = confusion[:, 3] + confusion[:, 4]
+    best_row = confusion[np.argmin(wrong), :]
+    thrs = best_row[0]
+    false_pos = best_row[3]
+    false_neg = best_row[4]
+    return thrs, false_pos, false_neg
 
+
+def plot_roc(confusion, fig):
+    """
+        function to plot the ROC (receiver operating characteristic) curve and
+        calculate the corresponding AUC (Area Under Curve).
+        
+        Parameters:
+        -----------
+        confusion: Confusion matrix, 2-d array with 5 columns
+        
+        Returns:
+        --------
+        fig: The `fig` argument, returned as given (curve goes on current axes)
+        AUC: Corresponding AUC value
+        """
+    ROC = np.zeros((confusion.shape[0],2))
+    for i in range(confusion.shape[0]):
         # Compute false positive rate for current threshold.
-        FPR_t = FP_t / float(FP_t + TN_t)
+        FPR_t = confusion[i, 3] / float(confusion[i, 3] + confusion[i, 2])
         ROC[i,0] = FPR_t
-
+        
         # Compute true  positive rate for current threshold.
-        TPR_t = TP_t / float(TP_t + FN_t)
+        TPR_t = confusion[i, 1] / float(confusion[i, 1] + confusion[i, 4])
         ROC[i,1] = TPR_t
-
+    
     # Plot the ROC curve.
-    fig = plt.figure(figsize=(6,6))
     plt.plot(ROC[:,0], ROC[:,1], lw=2)
     plt.xlim(-0.1,1.1)
     plt.ylim(-0.1,1.1)
@@ -43,9 +93,9 @@ def plot_roc(logreg_proba, y):
     plt.grid()
 
     AUC = 0.
-    for i in range(100):
+    for i in range(confusion.shape[0]-1):
         AUC += (ROC[i+1,0]-ROC[i,0]) * (ROC[i+1,1]+ROC[i,1])
-    AUC *= 0.5
-
+    AUC *= -0.5
+    
     plt.title('ROC curve, AUC = %.4f'%AUC)
-    return fig, AUC
\ No newline at end of file
+    return fig, AUC