Merge pull request #211 from mingujo/min-log_reg_re

Min log reg re
berkeley-stat159 · Dec 13, 2015 · 4b30121 · 4b30121
2 parents 85253f6 + 2a296ae
commit 4b30121
Show file tree

Hide file tree

Showing 7 changed files with 491 additions and 81 deletions.
diff --git a/code/utils/organize_behavior_data.py → ...utils/functions/organize_behavior_data.py b/code/utils/organize_behavior_data.py → ...utils/functions/organize_behavior_data.py
@@ -1,22 +1,49 @@
 """
 Purpose:
 -----------------------------------------------------------------------------------
-We make the extraction of txt file to be easier by building these functions. 
+To explore the behavior data for each subject, we make it convenient by building these
+functions. Especially, logistic regression on behavior data needs the data in pandas.
+data_frame. This does it for you.
 -----------------------------------------------------------------------------------
 """
 
 
 
 import numpy as np
 import pandas as pd
+import os
 
-project_location="../../"
-data_location=project_location+"data/ds005/"
+data_location=os.path.join(os.path.dirname(__file__), '../../../data/ds005/')
+
+
+def load_in_dataframe(subject_number):
+	""" Return the subject behav data combining all 3 runs and excluding invalid data (-1) in data.frame
+    Parameters
+    ----------
+    subject_number : int
+    	Subject Number
+    
+    Returns
+    -------
+    behav_total_run : data.frame
+    	the subject's behav data (combining all 3 runs)
+        
+    """
+
+	#convert 3 behavior datas in 1 subject into data frames.
+	run1 = pd.read_table(data_location+'sub'+str(subject_number).zfill(3)+'/behav/task001_run001/behavdata.txt')
+	run2 = pd.read_table(data_location+'sub'+str(subject_number).zfill(3)+'/behav/task001_run002/behavdata.txt')
+	run3 = pd.read_table(data_location+'sub'+str(subject_number).zfill(3)+'/behav/task001_run003/behavdata.txt')
+
+	#append all the runs in one pandas data frame
+	r=run1.append(run2)
+	run_total=r.append(run3) 
+
+	return run_total
 
 """load and combine 3 behavior datas from 3 runs"""
 def load_behav_txt(subject_number):
 	""" Return the subject behav data combining all 3 runs and excluding invalid data (-1) in np.array
-
     Parameters
     ----------
     subject_number : int
@@ -30,9 +57,9 @@ def load_behav_txt(subject_number):
     """
 
 	#load texts
-	behav1=np.loadtxt(data_location+'sub00%s/behav/task001_run001/behavdata.txt'%(subject_number),skiprows=1)
-	behav2=np.loadtxt(data_location+'sub00%s/behav/task001_run002/behavdata.txt'%(subject_number),skiprows=1)
-	behav3=np.loadtxt(data_location+'sub00%s/behav/task001_run003/behavdata.txt'%(subject_number),skiprows=1)
+	behav1=np.loadtxt(data_location+'sub'+str(subject_number).zfill(3)+'/behav/task001_run001/behavdata.txt',skiprows=1)
+	behav2=np.loadtxt(data_location+'sub'+str(subject_number).zfill(3)+'/behav/task001_run002/behavdata.txt',skiprows=1)
+	behav3=np.loadtxt(data_location+'sub'+str(subject_number).zfill(3)+'/behav/task001_run003/behavdata.txt',skiprows=1)
 
 	#concatenate them to be 1
 	behav_total_run=np.concatenate((behav1,behav2,behav3),axis=0)
@@ -47,7 +74,6 @@ def load_behav_txt(subject_number):
 
 def load_behav_text_one(subject_number, run_number):
 	""" Return the subject behav data (single run) in np.array
-
     Parameters
     ----------
     subject_number : int
@@ -62,7 +88,7 @@ def load_behav_text_one(subject_number, run_number):
         
     """
 
-	behav=np.loadtxt(data_location+'sub00%s/behav/task001_run00%s/behavdata.txt'%(subject_number,run_number),
+	behav=np.loadtxt(data_location+'sub'+str(subject_number).zfill(3)+'/behav/task001_run00%s/behavdata.txt'%(run_number),
 		skiprows=1)
 
 	#delete the rows that contain -1 in respcat (these are errors in experiment so we should take them out
@@ -73,36 +99,3 @@ def load_behav_text_one(subject_number, run_number):
 
 
 
-def load_in_dataframe(subject_number):
-	""" Return the subject behav data combining all 3 runs and excluding invalid data (-1) in data.frame
-
-    Parameters
-    ----------
-    subject_number : int
-    	Subject Number
-    
-    Returns
-    -------
-    behav_total_run : data.frame
-    	the subject's behav data (combining all 3 runs)
-        
-    """
-
-	#convert 3 behavior datas in 1 subject into data frames.
-	run1 = pd.read_table(data_location+'sub00%s/behav/task001_run001/behavdata.txt'%(subject_number))
-	run2 = pd.read_table(data_location+'sub00%s/behav/task001_run002/behavdata.txt'%(subject_number))
-	run3 = pd.read_table(data_location+'sub00%s/behav/task001_run003/behavdata.txt'%(subject_number))
-
-	#append all the runs in one pandas data frame
-	r=run1.append(run2)
-	run_total=r.append(run3) 
-
-	return run_total
-
-
-
-
-
-
-
-
diff --git a/code/utils/logistic_reg.py b/code/utils/logistic_reg.py
@@ -1,3 +1,12 @@
+"""
+Purpose:
+-----------------------------------------------------------------------------------
+We try to capture the significance of gain and loss amount condition for each subjects.
+We fit the logistic regression line based on their responses on the experiment. The slope 
+of the fitted line illustrates the subject's sensitivity on either gain or loss amount. 
+-----------------------------------------------------------------------------------
+"""
+
 import pandas as pd
 import statsmodels.api as sm
 import pylab as pl
@@ -12,8 +21,19 @@
 
 
 def add_gainlossratio(run):
-
-	""" add gain/loss ratio column """
+	""" Return the behavioral data of a subject with added 'ratio' column
+ 
+    Parameters
+    ----------
+    run : data.frame
+        Behavior data over 3 runs (combined) of a subject
+    
+    Returns
+    -------
+    run : data.frame
+        behavioral data added the ratio column (ratio : gain/loss)
+
+    """
 
 	gain = run.ix[:,1]
 	loss = run.ix[:,2]
@@ -22,14 +42,20 @@ def add_gainlossratio(run):
 	return run
 
 def organize_columns(run):
+	""" move around the columns to get the data frame be ready for logistic regression
+ 
+    Parameters
+    ----------
+    run : data.frame
+        behavioral data added the ratio column (ratio : gain/loss)
 
-	"""
+    Returns
+    -------
+    run_final : data.frame
+        behavioral data frame with organized columns
 
-	drop the unnecessary columns in the data frame,
-	and reorganize the columns for logistic regression
+    """
 
-	
-	"""
 
 	# drop onset column
 	a = run.drop('onset', 1) 
@@ -55,9 +81,19 @@ def organize_columns(run):
 
 
 def log_regression(run_final):
-
-	"""do logistic regression on train cols to predict the subject's decision"""
+	"""Do logistic regression on train cols to predict the subject's decision
 	
+	Parameters
+    ----------
+    run : data.frame
+        behavioral data frame with organized columns
+
+    Returns
+    -------
+    logit_pars : logistic regression result summary
+        the logistic regression result summary
+
+    """
 	# do logistic regression
 	x = logit("respcat ~ gain + loss", run_final).fit()
 
@@ -67,30 +103,7 @@ def log_regression(run_final):
 	#store the parameters of logistic regression
 	logit_pars = x.params
 
-	### START TO PLOT ###
-	# calculate intercept and slope
-	intercept = -logit_pars['Intercept'] / logit_pars['gain']
-	slope = -logit_pars['loss'] / logit_pars['gain']
-
-	fig = plt.figure(figsize = (10, 8))   
-
-	# plot gain and loss for respcat = 1(decides to gamble)
-	plt.plot(run_final[run_final['respcat'] == 1].values[:,2], run_final[run_final['respcat'] == 1].values[:,1], '.', label = "Gamble", mfc = 'None', mec='red')
-
-	# plot gain and loss for respcat = 0(decides to not gamble)
-	plt.plot(run_final[run_final['respcat'] == 0].values[:,2], run_final[run_final['respcat'] == 0].values[:,1], '.', label = "Not gamble", mfc = 'None', mec='blue')
-
-	# draw regression line
-	plt.plot(run_final['loss'], intercept + slope * run_final['loss'],'-', color = 'green') 
-
-	plt.xlabel('Loss ($)')
-	plt.ylabel('Gain ($)')
-	plt.legend(loc='bottom right')
-	plt.axis([2, 23, 8, 41])
-	plt.title("Logistic Regression to predict 1(gamble) 0(not gamble) with gain and loss values\n")
-	plt.savefig('log_regression.png')
-	plt.show()
-	return 
+	return logit_pars
 
 if __name__ == '__main__':
     a=add_gainlossratio(int(sys.argv[1]))

diff --git a/code/utils/scripts/log_regression_script.py b/code/utils/scripts/log_regression_script.py
@@ -1,10 +1,76 @@
-""" Linear Regression on Begavioral data """
+"""
+Purpose:
+-----------------------------------------------------------------------------------
+We try to capture the significance of gain and loss amount condition for each subjects.
+We fit the logistic regression line based on their responses on the experiment. The slope 
+of the fitted line illustrates the subject's sensitivity on either gain or loss amount. 
 
-import sys
-sys.path.append(".././utils")
+This script outputs plots for each subject and combine them into one image of subplots. 
+-----------------------------------------------------------------------------------
+"""
+
+from __future__ import absolute_import, division, print_function
+import sys, os
+#TODO : later change this
+sys.path.append(os.path.join(os.path.dirname(__file__), "../"))
+import numpy as np
+import matplotlib.pyplot as plt
+import nibabel as nib
 from logistic_reg import *
+from organize_behavior_data import *
+
+
+
+# Create the necessary directories if they do not exist
+dirs = ['../../fig','../../fig/log_reg_behav']
+for d in dirs:
+    if not os.path.exists(d):
+            os.makedirs(d)
+
+# Locate the different paths
+#TODO: the current location for this file project-epsilon/code/scripts
+project_path = '../../../'
+# TODO: change it to relevant path
+data_path = project_path+'data/ds005/'
+subject_list = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16']
+
+
+fig = plt.figure()
+for i,subject in enumerate(subject_list):
+	behav_df = load_in_dataframe(subject)
+	add_lambda = add_gainlossratio(behav_df)
+	columns_changed = organize_columns(add_lambda)
+	logit_pars = log_regression(columns_changed)
+
+	### START TO PLOT ###
+	# calculate intercept and slope
+	intercept = -logit_pars['Intercept'] / logit_pars['gain']
+	slope = -logit_pars['loss'] / logit_pars['gain']
+		#fig = plt.figure(figsize = (10, 8))   
+	ax = fig.add_subplot(4, 4, i+1) 
+	ax.set_title("Subject_%s_run001"%(str(i+1)), fontsize =10)
+	ax.set_axis_bgcolor('white')
+
+	# plot gain and loss for respcat = 1(decides to gamble)
+	l1, = ax.plot(behav_df[behav_df['respcat'] == 1].values[:,2], behav_df[behav_df['respcat'] == 1].values[:,1], '.', label = "Gamble", mfc = 'None', mec='red')
+
+	# plot gain and loss for respcat = 0(decides to not gamble)
+	l2, = ax.plot(behav_df[behav_df['respcat'] == 0].values[:,2], behav_df[behav_df['respcat'] == 0].values[:,1], '.', label = "Not gamble", mfc = 'None', mec='blue')
+
+	# draw regression line
+	ax.plot(behav_df['loss'], intercept + slope * behav_df['loss'],'-', color = 'green') 
+
+	ax.set_xlabel('Loss ($)', fontsize =10)
+	ax.set_ylabel('Gain ($)', fontsize =10)
+	ax.set_xlim([2,23])
+	ax.set_ylim([8,41])
+	ax.tick_params(axis='x', labelsize=10)
+	ax.tick_params(axis='y', labelsize=10)
+
+fig.legend((l1,l2), ('Gamble','Not Gamble'), loc = 'lower right', labelspacing = 0.5, fontsize = 10)
+fig.tight_layout()
+fig.subplots_adjust(top=0.90)
+fig.suptitle("Fitted Logistic Regression Line (1(gamble) 0(not gamble) with gain and loss values\n", fontsize=12)
+fig.savefig(dirs[1]+'/log_regression_behav_subplots.png',facecolor='white', edgecolor='white')
 
 
-a=add_gainlossratio(behav_df)
-b=organize_columns(a)
-log_regression(b)