Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

I finally passed the friggin tests for linear regression. #243

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@


import pandas as pd
import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), "./"))
#import statsmodels.api as sm
import statsmodels.formula.api as smf
import pylab as pl
Expand All @@ -11,7 +13,8 @@
from scipy.stats import t
import numpy.linalg as npl
import math

import logistic_reg
from logistic_reg import *



Expand All @@ -30,84 +33,23 @@
def load_data(subject, data_dir = "/Users/macbookpro/Desktop/stat159_Project/"):
    """Load and stack the three behavioral runs for one subject.

    Parameters
    ----------
    subject : str
        Zero-padded subject id, e.g. '001'.
    data_dir : str
        Directory that contains the ds005 dataset (must end with a path
        separator, since paths are built by string concatenation).

    Returns
    -------
    pandas.DataFrame or None
        The three runs stacked row-wise, or None when the files cannot
        be found.
    """
    try:
        # Get the directory where data is stored
        data_location = data_dir + 'ds005/sub' + subject

        # Load the data for each run
        run1 = pd.read_table(data_location+"/behav/task001_run001/behavdata.txt")
        run2 = pd.read_table(data_location+"/behav/task001_run002/behavdata.txt")
        run3 = pd.read_table(data_location+"/behav/task001_run003/behavdata.txt")

    except IOError:
        # Python-3 compatible print (original used a Python-2 print statement).
        print("Can't find files in such directory! Please enter the directory where you store ds005 dataset!")
        return

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported, behaviorally-equivalent way to stack the runs.
    run_total = pd.concat([run1, run2, run3])

    return run_total


"""
To combine all the behavioral data

Parameters:

data_dir: The working directory that you store your data

Return:

all_data: the behavioral data that contains 16 subjects, each subject has 3 runs

"""
def combine_all_data(data_dir = "/Users/macbookpro/Desktop/stat159_Project/"):
    """Stack the behavioral data of all 16 subjects into one DataFrame.

    Parameters
    ----------
    data_dir : str
        The working directory where the ds005 dataset is stored.

    Returns
    -------
    pandas.DataFrame or None
        Behavioral data for subjects 001-016 (3 runs each) with derived
        columns added, or None when any subject's files are missing.
    """
    # Subject ids '001' .. '016'
    subjects = ['%03d' % n for n in range(1, 17)]

    frames = []
    for subject in subjects:
        frame = load_data(subject, data_dir)
        # If any subject's dataset is missing, abort and return nothing.
        if frame is None:
            return
        frames.append(frame)

    all_data = pd.concat(frames)

    # Derived columns kept for the later regressions.
    all_data['diff'] = all_data['gain'] - all_data['loss']
    all_data['ratio'] = all_data['gain'] / all_data['loss']
    all_data['log_gain'] = np.log(all_data['gain'])
    all_data['log_loss'] = np.log(all_data['loss'])
    all_data['log_diff'] = np.log(all_data['diff'])
    all_data['log_ratio'] = np.log(all_data['ratio'])

    # Drop trials where the subject declined to participate (respcat == -1).
    all_data = all_data[all_data['respcat'] != -1]

    return all_data


"""
To combine all the behavioral data
To perform linear regression

Parameters:

Expand Down Expand Up @@ -163,9 +105,9 @@ def linear_regression(data, y, *arg):
for i in range(1,p):
print('==============================================================')
print(arg[i-1])
print 'Coefficient: ' + str(beta[i]), 'p-value: ' + str(pvalues[i])
print ('Coefficient: ' + str(beta[i]), 'p-value: ' + str(pvalues[i]))

return
return beta, pvalues

"""
def linear_regression_RT_with_gain_and_loss(data):
Expand Down Expand Up @@ -213,20 +155,78 @@ def linear_regression_RT_with_gain_and_loss(data):
print 'Coefficient: ' + str(beta[2]), 'p-value: ' + str(pval_loss)

return
"""



def linear_regression_fast(data, formula):
#plt.plot(data['diff'], data['RT'], '+')
# It seems like there's no siginificant relationship between diff and Response time
# When we do the regression, it shows that difference between gain and loss does not really matter either.
def simple_regression_plot(data, dep_var, exp_var):
    """Scatter dep_var against exp_var and overlay the least-squares line.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns.
    dep_var : str
        Column name of the response variable.
    exp_var : str
        Column name of the single explanatory variable.
    """
    y = data[dep_var]
    x = data[exp_var]
    n = len(data)
    # Design X matrix with an intercept column
    X = np.column_stack((np.ones(n), x))
    # Least-squares coefficients via the pseudo-inverse
    B = npl.pinv(X).dot(y)
    # BUG FIX: the original called an undefined `my_line` (NameError) and
    # never used B; evaluate the fitted line B[0] + B[1]*x directly.
    x_vals = [0, max(x)]
    y_vals = [B[0] + B[1] * v for v in x_vals]
    # Plot the simple linear regression
    plt.plot(x, y, '+')
    # NOTE(review): labels are hard-coded for the ratio-vs-RT use case;
    # consider deriving them from dep_var/exp_var.
    plt.xlabel('ratio (gain/loss)')
    plt.ylabel('Response Time')
    plt.plot(x_vals, y_vals)
    plt.title('Ratio vs Response Time with predicted line')
    return







def plot_neural_and_behav_loss_aversion(all_subjects, data, beta = None):
    """Scatter behavioral loss aversion (ln lambda) against a neural measure.

    Parameters
    ----------
    all_subjects : list
        Subject ids; its length drives the per-subject loop.
    data : list of pandas.DataFrame
        One behavioral DataFrame per subject, same order as all_subjects.
    beta : unused
        Kept for interface compatibility -- TODO confirm intended use.
    """
    lambdas = []
    loss_aversion = []

    # BUG FIX: removed stray diff-residue lines (`est = smf.ols(formula =
    # formula, ...)` referenced an undefined `formula` and `return est`
    # exited after the first iteration).
    for i in range(len(all_subjects)):
        # Fit the gain/loss logistic model (helpers from logistic_reg).
        a = add_gainlossratio(data[i])
        b = organize_columns(a)
        fit = logit("respcat ~ gain + loss", b).fit()
        logit_pars = fit.params
        # Behavioral loss aversion: lambda = -beta_loss / beta_gain.
        ratio = -logit_pars['loss'] / logit_pars['gain']
        lambdas.append( math.log(ratio) )
        loss_aversion.append( (-logit_pars['loss']) - logit_pars['gain'] ) # This will be changed!

    # Least-squares line of ln(lambda) on the neural measure.
    # Generalized from the hard-coded np.ones(16) to the actual count.
    X = np.column_stack((np.ones(len(loss_aversion)), loss_aversion))
    B = npl.pinv(X).dot(lambdas)

    def my_line(x, B = B):
        # Best prediction
        return B[0] + B[1] * x

    x_vals = [0, max(loss_aversion)]
    y_vals = [my_line(0), my_line(max(loss_aversion))]

    fig = plt.figure()
    ax = fig.add_subplot(111)

    plt.plot(loss_aversion, lambdas, '+')
    plt.plot(x_vals, y_vals)

    plt.title("Scatterplot of correspondence between neural \nloss aversion and behavioral loss aversion")
    plt.xlabel(r'Neural loss aversion [-($\beta[loss]) - \beta[gain]$]')
    plt.ylabel(r'Behavioral loss aversion [ln($\lambda)$]')

    plt.grid()
    plt.show()

    return
"""



Expand Down
74 changes: 73 additions & 1 deletion code/utils/scripts/linear_regression_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,83 @@
from linear_regression import *

# Get the data
all_subjects = ['001', '002', '003', '004', '005', '006', '007', '008',
                '009', '010', '011', '012', '013', '014', '015', '016']
# data_dir = "/Users/macbookpro/Desktop/stat159_Project/"
project_path = '../../../'
data_dir = project_path + 'data/'


########################
# combine all the data #
########################

all_data = []

# Loop for 16 subjects to get the data
for i in all_subjects:
    temp = load_data(i, data_dir)
    # BUG FIX: `return` is illegal at module level (SyntaxError);
    # stop the whole script instead when the dataset cannot be found.
    if temp is None:
        raise SystemExit("ds005 dataset not found under " + data_dir)
    all_data.append(temp)

# Concat the data
all_data = pd.concat(all_data)

# Calculate the difference for future use
all_data['diff'] = all_data['gain'] - all_data['loss']

# Calculate the ratio for future use
all_data['ratio'] = all_data['gain'] / all_data['loss']

all_data['log_gain'] = np.log(all_data['gain'])

all_data['log_loss'] = np.log(all_data['loss'])

all_data['log_diff'] = np.log(all_data['diff'])

all_data['log_ratio'] = np.log(all_data['ratio'])

# Remove the rows that have respcat == -1
# (meaning that the individual chooses to not participate the activity)
all_data = all_data[np.logical_not(all_data['respcat'] == -1)]


########################
#  Load each subject   #
########################

data_each = []

for i in all_subjects:
    data_each.append(load_data(i, data_dir))

for i in range(len(all_subjects)):
    data_each[i]['ratio'] = data_each[i]['gain'] / data_each[i]['loss']


#######################
# Perform regression  #
#######################

data = all_data

# Run the linear_regression function to get the summary
linear_regression(data, 'RT', 'gain', 'loss')

linear_regression(data, 'RT', 'ratio')

linear_regression(data, 'RT', 'diff')


#######################
#        Plot         #
#######################


74 changes: 74 additions & 0 deletions code/utils/tests/test_linear_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""

Test linear_regression.py

This is a test module.

It is designed to be run with the "nose" testing package (via the
"nosetests" script).

Nose will look for any functions with "test" in their names, and run them.

Nose reports any errors, or any failures.

So we use the tests to check that the results of our function are (still) as we
expect.


"""


# Python 3 compatibility
from __future__ import absolute_import, division, print_function
import pandas as pd
import numpy as np
from numpy.testing import assert_almost_equal
import sys, os, pdb

#Specicy the path for functions
sys.path.append(os.path.join(os.path.dirname(__file__), "../functions/"))
import linear_regression
from linear_regression import *




def test_linear_regression():
    """Check linear_regression against hand-computed betas and p-values."""
    idx = [str(k) for k in range(1, 11)]

    # Small regression fixture: one response and two predictors.
    frame = pd.DataFrame({
        'y':  pd.Series([95, 85, 80, 75, 70, 65, 60, 55, 50, 45], index=idx),
        'x1': pd.Series([85, 95, 70, 65, 70, 60, 64, 60, 51, 49], index=idx),
        'x2': pd.Series([10, 8.8, 8.4, 7.5, 7.4, 7.2, 7.0, 6.4, 5.3, 4], index=idx),
    })

    # (predictor columns, expected betas, expected p-values) -- all
    # expected values calculated by hand.
    cases = [
        (('x1',),
         np.array([0.69128736, 1.00610931]),
         np.array([0.95669000991385234, 0.00082441892685309844])),
        (('x2',),
         np.array([2.92830189, 9.03773585]),
         np.array([0.64660353670191761, 1.2010523101013017e-05])),
        (('x1', 'x2'),
         np.array([-0.01554384, 0.20355359, 7.55525124]),
         np.array([0.99826544217405555, 0.37237722050579208, 0.0049816157477362418])),
    ]

    for predictors, expected_beta, expected_p in cases:
        beta, pvalues = linear_regression(frame, 'y', *predictors)
        assert_almost_equal(expected_beta, beta)
        assert_almost_equal(expected_p, pvalues)