From 306fc9f6e30e97ceb18ea5b83a02357590f5939d Mon Sep 17 00:00:00 2001
From: Boying Gong
Date: Fri, 11 Dec 2015 11:38:44 -0800
Subject: [PATCH] integrate code for logistic

---
 code/scripts/logistic.py        |  21 ++++---
 code/utils/logistic_function.py | 102 ++++++++++++++++++++++++--------
 2 files changed, 88 insertions(+), 35 deletions(-)

diff --git a/code/scripts/logistic.py b/code/scripts/logistic.py
index b61d8e0..72a080a 100644
--- a/code/scripts/logistic.py
+++ b/code/scripts/logistic.py
@@ -9,9 +9,7 @@
 # Append path to sys
 sys.path.append(pathtofunction)
 
-from logistic_function import plot_roc
-
-pathtofolder = '../../data/'
+from logistic_function import create_confusion, getMin_thrs, plot_roc
 
 pathtofolder = '../../data/'
 
@@ -19,7 +17,9 @@
 beh_lambda = np.array([])
 beh_score = np.array([])
 val_score = np.array([])
-AUC_val = np.array([])
+Min_thrs = np.array([])
+AUC_smr = np.array([])
+fig = plt.figure(figsize=(20,20))
 for i in np.arange(1, nsub+1):
     run1 = np.loadtxt(pathtofolder + 'ds005/sub0'+ str(i).zfill(2)+
                       '/behav/task001_run001/behavdata.txt', skiprows = 1)
@@ -47,13 +47,16 @@
     val_score = np.append(val_score, scores.mean())
     # calculate the AUC and plot ROC curve for each subject
     logreg_proba = logreg.predict_proba(X)
-    fig, AUC = plot_roc(logreg_proba, y)
-    fig.savefig(pathtofolder + 'ds005/models/roc_curve_sub0'+ str(i).zfill(2))
-    AUC_val = np.append(AUC_val, scores.mean())
-
+    confusion = create_confusion(logreg_proba, y)
+    addsub = fig.add_subplot(4, 4, i)
+    addsub, AUC = plot_roc(confusion, addsub)
+    Min_thrs = np.append(Min_thrs, getMin_thrs(confusion))
+    AUC_smr = np.append(AUC_smr, AUC)
 
 np.savetxt(pathtofolder + 'ds005/models/lambda.txt', beh_lambda)
 np.savetxt(pathtofolder + 'ds005/models/reg_score.txt', beh_score)
 np.savetxt(pathtofolder + 'ds005/models/cross_val_score.txt', val_score)
-np.savetxt(pathtofolder + 'ds005/models/AUC_val.txt', AUC_val)
+np.savetxt(pathtofolder + 'ds005/models/Min_thrs.txt', Min_thrs.reshape(16,3))
+np.savetxt(pathtofolder + 'ds005/models/AUC_smr.txt', AUC_smr)
+fig.savefig(pathtofolder + 'ds005/models/roc_curve')
 
diff --git a/code/utils/logistic_function.py b/code/utils/logistic_function.py
index 30cd974..baad02c 100644
--- a/code/utils/logistic_function.py
+++ b/code/utils/logistic_function.py
@@ -1,40 +1,90 @@
 import numpy as np
 import matplotlib.pyplot as plt
 
-def plot_roc(logreg_proba, y):
-    """
-    function to plot the ROC (receiver operating characteristic) curve and
-    calculate the corresponding AUC (Area Under Curve).
-    Input:
-    logreg_proba: The estimate probability for each class calculated from
-    logistic regression model.
-    y: The actual class of response,.
-    Output: The ROC curve and and correspong AUC value.
+def create_confusion(logreg_proba, y, thrs_inc=0.01):
     """
-
-    thresholds = np.linspace(1,0,101)
-
-    ROC = np.zeros((101,2))
-
-    for i in range(101):
-        t = thresholds[i]
-
+    Creates the confusion matrix counts across a range of classification
+    probability thresholds
+
+    Parameters
+    ----------
+    logreg_proba: Fitted class probabilities from the logistic regression
+    y: Actual responses, 1-d array with values 0 or 1
+    thrs_inc: increment of threshold probability (default 0.01)
+
+    Returns
+    -------
+    Confusion Matrix : Array of dim (X, 5) where X is the number of different
+    thresholds
+    Column 1: Threshold value between 0 and 1
+    Columns 2-5 show counts for:
+    Column 2: True positive
+    Column 3: True negative
+    Column 4: False positive
+    Column 5: False negative
+    """
+    thrs_array = np.linspace(0, 1, int(1/thrs_inc) + 1)
+    confusion = np.ones((len(thrs_array), 5))
+    confusion[:,0] = thrs_array
+    for i in range(len(thrs_array)):
+        t = thrs_array[i]
         # Classifier / label agree and disagreements for current threshold.
         TP_t = np.logical_and( logreg_proba[:,1] > t, y==1 ).sum()
         TN_t = np.logical_and( logreg_proba[:,1] <=t, y==0 ).sum()
         FP_t = np.logical_and( logreg_proba[:,1] > t, y==0 ).sum()
         FN_t = np.logical_and( logreg_proba[:,1] <=t, y==1 ).sum()
+        confusion[i, 1:5] = [TP_t, TN_t, FP_t, FN_t]
+    return confusion
+
+
+def getMin_thrs(confusion):
+    """
+    Returns the threshold with the smallest number of wrong predictions
+
+    Parameters:
+    -----------
+    confusion: confusion matrix, 2-d array with 5 columns
+
+    Returns:
+    --------
+    thrs: threshold that minimizes the wrong predictions, i.e. column 4 +
+    column 5 (false positives + false negatives)
+    false_pos: number of false positives at that threshold
+    false_neg: number of false negatives at that threshold
+    """
+    thrs_min = np.argmin(confusion[:,3]+ confusion[:,4])
+    col_out = confusion[thrs_min, :]
+    thrs = col_out[0]
+    false_pos = col_out[3]
+    false_neg = col_out[4]
+    return thrs, false_pos, false_neg
+
+def plot_roc(confusion, fig):
+    """
+    Plot the ROC (receiver operating characteristic) curve and
+    calculate the corresponding AUC (Area Under the Curve).
+
+    Parameters:
+    -----------
+    confusion: confusion matrix, 2-d array with 5 columns
+    fig: matplotlib figure or subplot to draw the curve on
+
+    Returns:
+    --------
+    fig: the figure/subplot with the ROC curve drawn on it
+    AUC: the corresponding AUC value
+    """
+    ROC = np.zeros((confusion.shape[0],2))
+    for i in range(confusion.shape[0]):
         # Compute false positive rate for current threshold.
-        FPR_t = FP_t / float(FP_t + TN_t)
+        FPR_t = confusion[i, 3] / float(confusion[i, 3] + confusion[i, 2])
         ROC[i,0] = FPR_t
-
+
         # Compute true positive rate for current threshold.
-        TPR_t = TP_t / float(TP_t + FN_t)
+        TPR_t = confusion[i, 1] / float(confusion[i, 1] + confusion[i, 4])
         ROC[i,1] = TPR_t
-
+
     # Plot the ROC curve.
-    fig = plt.figure(figsize=(6,6))
     plt.plot(ROC[:,0], ROC[:,1], lw=2)
     plt.xlim(-0.1,1.1)
     plt.ylim(-0.1,1.1)
@@ -43,9 +93,9 @@ def plot_roc(logreg_proba, y):
     plt.grid()
 
     AUC = 0.
-    for i in range(100):
+    for i in range(confusion.shape[0]-1):
         AUC += (ROC[i+1,0]-ROC[i,0]) * (ROC[i+1,1]+ROC[i,1])
-    AUC *= 0.5
-
+    AUC *= -0.5 # thresholds ascend so FPR decreases; negate for positive area
+
     plt.title('ROC curve, AUC = %.4f'%AUC)
-    return fig, AUC
\ No newline at end of file
+    return fig, AUC
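Reviewer note (not part of the patch): a minimal sketch of how the three
functions are meant to chain together. It assumes scikit-learn's
LogisticRegression; the toy X, y below stand in for the ds005 design matrix
and 0/1 behavioural response used in logistic.py, and the output file name
is illustrative only.

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.linear_model import LogisticRegression

    from logistic_function import create_confusion, getMin_thrs, plot_roc

    # Toy stand-in data: 200 trials, 2 regressors, binary response.
    rng = np.random.RandomState(0)
    X = rng.randn(200, 2)
    y = (X[:, 0] + 0.5 * rng.randn(200) > 0).astype(int)

    logreg = LogisticRegression().fit(X, y)
    logreg_proba = logreg.predict_proba(X)  # column 1 holds P(y == 1)

    # Confusion counts at thresholds 0.00, 0.01, ..., 1.00.
    confusion = create_confusion(logreg_proba, y, thrs_inc=0.01)

    # Threshold minimizing false positives + false negatives.
    thrs, false_pos, false_neg = getMin_thrs(confusion)

    # Draw the ROC curve and get the trapezoid-rule AUC; plot_roc draws
    # through pyplot, so the target subplot must be current (it is,
    # immediately after add_subplot).
    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(1, 1, 1)
    ax, AUC = plot_roc(confusion, ax)
    fig.savefig('roc_curve_example.png')

The sign flip in plot_roc (AUC *= -0.5) works because create_confusion
orders thresholds from 0 to 1, so the ROC points run from (1, 1) down to
(0, 0) and the raw trapezoid sum comes out negative.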