Merge pull request #110 from boyinggong/dev
Merging for now, add subject label later
BenjaminHsieh committed Dec 12, 2015
2 parents a2565ae + 306fc9f commit 122e406
Showing 2 changed files with 125 additions and 3 deletions.
27 changes: 24 additions & 3 deletions code/scripts/logistic.py
@@ -2,13 +2,24 @@
import matplotlib.pyplot as plt
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
import sys

# Path to function
pathtofunction = '../utils'
# Append path to sys
sys.path.append(pathtofunction)

from logistic_function import create_confusion, getMin_thrs, plot_roc

pathtofolder = '../../data/'

nsub = 16
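# Per-subject accumulators, appended to inside the loop and saved below.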
beh_lambda = np.array([])
beh_score = np.array([])
val_score = np.array([])
Min_thrs = np.array([])
AUC_smr = np.array([])
fig = plt.figure(figsize=(20,20))
for i in np.arange(1, nsub+1):
run1 = np.loadtxt(pathtofolder + 'ds005/sub0'+ str(i).zfill(2)+
'/behav/task001_run001/behavdata.txt', skiprows = 1)
@@ -18,7 +29,7 @@
'/behav/task001_run003/behavdata.txt', skiprows = 1)
behav = np.concatenate((run1, run2, run3), axis=0)
behav = behav[np.logical_or.reduce([behav[:,5] == x for x in [0,1]])]
X = zip(np.ones(len(behav)), behav[:, 1], behav[:, 2])
y = behav[:, 5]
logreg = LogisticRegression(C=1e5)
# C is the inverse of regularization strength; C=1e5 means very weak regularization
@@ -34,8 +45,18 @@
scores = cross_val_score(LogisticRegression(), X, y,
scoring='accuracy', cv=10)
val_score = np.append(val_score, scores.mean())

# calculate the AUC and plot ROC curve for each subject
logreg_proba = logreg.predict_proba(X)
confusion = create_confusion(logreg_proba, y)
addsub = fig.add_subplot(4, 4, i)
addsub, AUC = plot_roc(confusion, addsub)
Min_thrs = np.append(Min_thrs, getMin_thrs(confusion))
AUC_smr = np.append(AUC_smr, AUC)

np.savetxt(pathtofolder + 'ds005/models/lambda.txt', beh_lambda)
np.savetxt(pathtofolder + 'ds005/models/reg_score.txt', beh_score)
np.savetxt(pathtofolder + 'ds005/models/cross_val_score.txt', val_score)
np.savetxt(pathtofolder + 'ds005/models/Min_thrs.txt', Min_thrs.reshape(16,3))
np.savetxt(pathtofolder + 'ds005/models/AUC_smr.txt', AUC_smr)
fig.savefig(pathtofolder + 'ds005/models/roc_curve')
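
A note on the design matrix: zip() returns a list of tuples under Python 2, which this code targets, but a one-shot iterator under Python 3, where scikit-learn would fail to index it. A minimal version-independent sketch of the same construction (the small behav array here is a made-up stand-in for the loaded behavioral data):

import numpy as np

# Hypothetical stand-in for the loaded behavioral data: 6 columns, as in behavdata.txt.
behav = np.zeros((4, 6))

# Intercept column of ones, then the two regressors, stacked with NumPy
# instead of zip() so the result is a proper 2-d array.
X = np.column_stack((np.ones(len(behav)), behav[:, 1], behav[:, 2]))
print(X.shape)  # (4, 3)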

101 changes: 101 additions & 0 deletions code/utils/logistic_function.py
@@ -0,0 +1,101 @@
import numpy as np
import matplotlib.pyplot as plt

def create_confusion(logreg_proba, y, thrs_inc=0.01):
"""
Creates the confusion matrix based on various levels of the discrimination
probability threshold
Parameters
----------
logreg_proba: fitted probabilities from predict_proba, 2-d array whose
second column is the probability of response 1
y: actual responses, 1-d array with values 0 or 1
thrs_inc: increment of the threshold probability (default 0.01)
Returns
-------
Confusion Matrix : Array of dim (X, 5) where X is the number of different
thresholds
Column 1: Threshold value between 0, 1
Columns 2-5 show counts for:
Column 2: True positive
Column 3: True negative
Column 4: False positive
Column 5: False negative
"""
thrs_array = np.linspace(0, 1, int(1/thrs_inc) + 1)
confusion = np.ones((len(thrs_array), 5))
confusion[:,0] = thrs_array
for i in range(len(thrs_array)):
t = thrs_array[i]
# Classifier / label agree and disagreements for current threshold.
TP_t = np.logical_and( logreg_proba[:,1] > t, y==1 ).sum()
TN_t = np.logical_and( logreg_proba[:,1] <=t, y==0 ).sum()
FP_t = np.logical_and( logreg_proba[:,1] > t, y==0 ).sum()
FN_t = np.logical_and( logreg_proba[:,1] <=t, y==1 ).sum()
confusion[i, 1:5] = [TP_t, TN_t, FP_t, FN_t]
return confusion
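
A quick illustration of create_confusion on made-up data (in the scripts the function is imported from logistic_function; the probabilities and labels below are purely hypothetical):

import numpy as np

# Two-column probabilities as returned by predict_proba: P(y=0), P(y=1).
logreg_proba = np.array([[0.9, 0.1],
                         [0.4, 0.6],
                         [0.2, 0.8],
                         [0.7, 0.3]])
y = np.array([0, 1, 1, 0])

# Coarse threshold grid for readability: 0, 0.25, 0.5, 0.75, 1.
confusion = create_confusion(logreg_proba, y, thrs_inc=0.25)
# Each row: threshold, TP, TN, FP, FN at that threshold.
print(confusion)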


def getMin_thrs(confusion):
"""
Returns the threshold with the smallest number of wrong predictions
Parameters:
-----------
confusion: confusion matrix, 2-d array with 5 columns
Returns:
--------
thrs: threshold that minimizes the number of wrong predictions
(column 3 + column 4, false positives plus false negatives)
false_pos: number of false positives at that threshold
false_neg: number of false negatives at that threshold
"""
thrs_min = np.argmin(confusion[:,3]+ confusion[:,4])
col_out = confusion[thrs_min, :]
thrs = col_out[0]
false_pos = col_out[3]
false_neg = col_out[4]
return thrs, false_pos, false_neg
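
Continuing the toy example above, getMin_thrs picks the row whose combined error count (FP + FN) is smallest:

thrs, false_pos, false_neg = getMin_thrs(confusion)
print(thrs, false_pos, false_neg)  # 0.5, 0.0, 0.0 for the toy data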


def plot_roc(confusion, fig):
"""
Plots the ROC (receiver operating characteristic) curve and
calculates the corresponding AUC (Area Under the Curve).
Parameters:
-----------
confusion: confusion matrix, 2-d array with 5 columns
fig: matplotlib subplot to draw the curve on
Returns:
--------
fig: the subplot with the ROC curve drawn
AUC: corresponding AUC value
"""
ROC = np.zeros((confusion.shape[0],2))
for i in range(confusion.shape[0]):
# Compute false positive rate for current threshold.
FPR_t = confusion[i, 3] / float(confusion[i, 3] + confusion[i, 2])
ROC[i,0] = FPR_t

# Compute true positive rate for current threshold.
TPR_t = confusion[i, 1] / float(confusion[i, 1] + confusion[i, 4])
ROC[i,1] = TPR_t

# Plot the ROC curve.
plt.plot(ROC[:,0], ROC[:,1], lw=2)
plt.xlim(-0.1,1.1)
plt.ylim(-0.1,1.1)
plt.xlabel('$FPR(t)$')
plt.ylabel('$TPR(t)$')
plt.grid()

# Trapezoidal-rule integration of TPR over FPR; FPR falls from 1 to 0 as
# the threshold rises, so the signed sum is negated to give a positive area.
AUC = 0.
for i in range(confusion.shape[0]-1):
AUC += (ROC[i+1,0]-ROC[i,0]) * (ROC[i+1,1]+ROC[i,1])
AUC *= -0.5

plt.title('ROC curve, AUC = %.4f'%AUC)
return fig, AUC
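
And a sketch of drawing the toy example's ROC curve, mirroring the per-subject subplot usage in logistic.py (reusing the confusion matrix from the snippet above):

import matplotlib.pyplot as plt

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)  # plot_roc draws on the current axes
ax, AUC = plot_roc(confusion, ax)
print('AUC = %.4f' % AUC)      # 1.0000 for the separable toy data
fig.savefig('toy_roc.png')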
