Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

I finally passed the friggin tests for linear regression. #243

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@


import pandas as pd
import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), "./"))
#import statsmodels.api as sm
import statsmodels.formula.api as smf
import pylab as pl
Expand All @@ -11,7 +13,8 @@
from scipy.stats import t
import numpy.linalg as npl
import math

import logistic_reg
from logistic_reg import *



Expand All @@ -30,84 +33,23 @@
def load_data(subject, data_dir = "/Users/macbookpro/Desktop/stat159_Project/"):
    """Load and stack the three behavioral runs for one subject.

    Parameters
    ----------
    subject : str
        Zero-padded subject id, e.g. '001'.
    data_dir : str
        Directory that contains the ds005 dataset (must end with a path
        separator, since paths are built by string concatenation).

    Returns
    -------
    pandas.DataFrame or None
        The three runs stacked row-wise, or None when the files cannot
        be found.
    """
    try:
        # Get the directory where data is stored
        data_location = data_dir + 'ds005/sub' + subject

        # Load the data for each run
        run1 = pd.read_table(data_location+"/behav/task001_run001/behavdata.txt")
        run2 = pd.read_table(data_location+"/behav/task001_run002/behavdata.txt")
        run3 = pd.read_table(data_location+"/behav/task001_run003/behavdata.txt")

    except IOError:
        # Python-3 compatible print (original used a Python-2 print statement).
        print("Can't find files in such directory! Please enter the directory where you store ds005 dataset!")
        return

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported, behaviorally-equivalent way to stack the runs.
    run_total = pd.concat([run1, run2, run3])

    return run_total


"""
To combine all the behavioral data

Parameters:

data_dir: The working directory that you store your data

Return:

all_data: the behavioral data that contains 16 subjects, each subject has 3 runs

"""
def combine_all_data(data_dir = "/Users/macbookpro/Desktop/stat159_Project/"):
    """Stack the behavioral data of all 16 subjects into one DataFrame.

    Parameters
    ----------
    data_dir : str
        The working directory where the ds005 dataset is stored.

    Returns
    -------
    pandas.DataFrame or None
        Behavioral data for subjects 001-016 (3 runs each) with derived
        columns added, or None when any subject's files are missing.
    """
    # Subject ids '001' .. '016'
    subjects = ['%03d' % n for n in range(1, 17)]

    frames = []
    for subject in subjects:
        frame = load_data(subject, data_dir)
        # If any subject's dataset is missing, abort and return nothing.
        if frame is None:
            return
        frames.append(frame)

    all_data = pd.concat(frames)

    # Derived columns kept for the later regressions.
    all_data['diff'] = all_data['gain'] - all_data['loss']
    all_data['ratio'] = all_data['gain'] / all_data['loss']
    all_data['log_gain'] = np.log(all_data['gain'])
    all_data['log_loss'] = np.log(all_data['loss'])
    all_data['log_diff'] = np.log(all_data['diff'])
    all_data['log_ratio'] = np.log(all_data['ratio'])

    # Drop trials where the subject declined to participate (respcat == -1).
    all_data = all_data[all_data['respcat'] != -1]

    return all_data


"""
To combine all the behavioral data
To perform linear regression

Parameters:

Expand Down Expand Up @@ -163,9 +105,9 @@ def linear_regression(data, y, *arg):
for i in range(1,p):
print('==============================================================')
print(arg[i-1])
print 'Coefficient: ' + str(beta[i]), 'p-value: ' + str(pvalues[i])
print ('Coefficient: ' + str(beta[i]), 'p-value: ' + str(pvalues[i]))

return
return beta, pvalues

"""
def linear_regression_RT_with_gain_and_loss(data):
Expand Down Expand Up @@ -213,20 +155,78 @@ def linear_regression_RT_with_gain_and_loss(data):
print 'Coefficient: ' + str(beta[2]), 'p-value: ' + str(pval_loss)

return
"""



def linear_regression_fast(data, formula):
#plt.plot(data['diff'], data['RT'], '+')
# It seems like there's no siginificant relationship between diff and Response time
# When we do the regression, it shows that difference between gain and loss does not really matter either.
def simple_regression_plot(data, dep_var, exp_var):
    """Scatter dep_var against exp_var and overlay the least-squares line.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns.
    dep_var : str
        Column name of the response variable.
    exp_var : str
        Column name of the single explanatory variable.
    """
    y = data[dep_var]
    x = data[exp_var]
    n = len(data)
    # Design X matrix with an intercept column
    X = np.column_stack((np.ones(n), x))
    # Least-squares coefficients via the pseudo-inverse
    B = npl.pinv(X).dot(y)
    # BUG FIX: the original called an undefined `my_line` (NameError) and
    # never used B; evaluate the fitted line B[0] + B[1]*x directly.
    x_vals = [0, max(x)]
    y_vals = [B[0] + B[1] * v for v in x_vals]
    # Plot the simple linear regression
    plt.plot(x, y, '+')
    # NOTE(review): labels are hard-coded for the ratio-vs-RT use case;
    # consider deriving them from dep_var/exp_var.
    plt.xlabel('ratio (gain/loss)')
    plt.ylabel('Response Time')
    plt.plot(x_vals, y_vals)
    plt.title('Ratio vs Response Time with predicted line')
    return







def plot_neural_and_behav_loss_aversion(all_subjects, data, beta = None):
    """Scatter behavioral loss aversion (ln lambda) against a neural measure.

    Parameters
    ----------
    all_subjects : list
        Subject ids; its length drives the per-subject loop.
    data : list of pandas.DataFrame
        One behavioral DataFrame per subject, same order as all_subjects.
    beta : unused
        Kept for interface compatibility -- TODO confirm intended use.
    """
    lambdas = []
    loss_aversion = []

    # BUG FIX: removed stray diff-residue lines (`est = smf.ols(formula =
    # formula, ...)` referenced an undefined `formula` and `return est`
    # exited after the first iteration).
    for i in range(len(all_subjects)):
        # Fit the gain/loss logistic model (helpers from logistic_reg).
        a = add_gainlossratio(data[i])
        b = organize_columns(a)
        fit = logit("respcat ~ gain + loss", b).fit()
        logit_pars = fit.params
        # Behavioral loss aversion: lambda = -beta_loss / beta_gain.
        ratio = -logit_pars['loss'] / logit_pars['gain']
        lambdas.append( math.log(ratio) )
        loss_aversion.append( (-logit_pars['loss']) - logit_pars['gain'] ) # This will be changed!

    # Least-squares line of ln(lambda) on the neural measure.
    # Generalized from the hard-coded np.ones(16) to the actual count.
    X = np.column_stack((np.ones(len(loss_aversion)), loss_aversion))
    B = npl.pinv(X).dot(lambdas)

    def my_line(x, B = B):
        # Best prediction
        return B[0] + B[1] * x

    x_vals = [0, max(loss_aversion)]
    y_vals = [my_line(0), my_line(max(loss_aversion))]

    fig = plt.figure()
    ax = fig.add_subplot(111)

    plt.plot(loss_aversion, lambdas, '+')
    plt.plot(x_vals, y_vals)

    plt.title("Scatterplot of correspondence between neural \nloss aversion and behavioral loss aversion")
    plt.xlabel(r'Neural loss aversion [-($\beta[loss]) - \beta[gain]$]')
    plt.ylabel(r'Behavioral loss aversion [ln($\lambda)$]')

    plt.grid()
    plt.show()

    return
"""



Expand Down
74 changes: 73 additions & 1 deletion code/utils/scripts/linear_regression_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,83 @@
from linear_regression import *

# Get the data
all_subjects = ['001', '002', '003', '004', '005', '006', '007', '008',
                '009', '010', '011', '012', '013', '014', '015', '016']
# data_dir = "/Users/macbookpro/Desktop/stat159_Project/"
project_path = '../../../'
data_dir = project_path + 'data/'


########################
# combine all the data #
########################

all_data = []

# Loop for 16 subjects to get the data
for i in all_subjects:
    temp = load_data(i, data_dir)
    # BUG FIX: `return` is illegal at module level (SyntaxError);
    # stop the whole script instead when the dataset cannot be found.
    if temp is None:
        raise SystemExit("ds005 dataset not found under " + data_dir)
    all_data.append(temp)

# Concat the data
all_data = pd.concat(all_data)

# Calculate the difference for future use
all_data['diff'] = all_data['gain'] - all_data['loss']

# Calculate the ratio for future use
all_data['ratio'] = all_data['gain'] / all_data['loss']

all_data['log_gain'] = np.log(all_data['gain'])

all_data['log_loss'] = np.log(all_data['loss'])

all_data['log_diff'] = np.log(all_data['diff'])

all_data['log_ratio'] = np.log(all_data['ratio'])

# Remove the rows that have respcat == -1
# (meaning that the individual chooses to not participate the activity)
all_data = all_data[np.logical_not(all_data['respcat'] == -1)]


########################
#  Load each subject   #
########################

data_each = []

for i in all_subjects:
    data_each.append(load_data(i, data_dir))

for i in range(len(all_subjects)):
    data_each[i]['ratio'] = data_each[i]['gain'] / data_each[i]['loss']


#######################
# Perform regression  #
#######################

data = all_data

# Run the linear_regression function to get the summary
linear_regression(data, 'RT', 'gain', 'loss')

linear_regression(data, 'RT', 'ratio')

linear_regression(data, 'RT', 'diff')


#######################
#        Plot         #
#######################


74 changes: 74 additions & 0 deletions code/utils/tests/test_linear_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""

Test linear_regression.py

This is a test module.

It is designed to be run with the "nose" testing package (via the
"nosetests" script).

Nose will look for any functions with "test" in their names, and run them.

Nose reports any errors, or any failures.

So we use the tests to check that the results of our function are (still) as we
expect.


"""


# Python 3 compatibility
from __future__ import absolute_import, division, print_function
import pandas as pd
import numpy as np
from numpy.testing import assert_almost_equal
import sys, os, pdb

#Specicy the path for functions
sys.path.append(os.path.join(os.path.dirname(__file__), "../functions/"))
import linear_regression
from linear_regression import *




def test_linear_regression():
    """Check linear_regression against hand-computed betas and p-values."""
    idx = [str(k) for k in range(1, 11)]

    # Small regression fixture: one response and two predictors.
    frame = pd.DataFrame({
        'y':  pd.Series([95, 85, 80, 75, 70, 65, 60, 55, 50, 45], index=idx),
        'x1': pd.Series([85, 95, 70, 65, 70, 60, 64, 60, 51, 49], index=idx),
        'x2': pd.Series([10, 8.8, 8.4, 7.5, 7.4, 7.2, 7.0, 6.4, 5.3, 4], index=idx),
    })

    # (predictor columns, expected betas, expected p-values) -- all
    # expected values calculated by hand.
    cases = [
        (('x1',),
         np.array([0.69128736, 1.00610931]),
         np.array([0.95669000991385234, 0.00082441892685309844])),
        (('x2',),
         np.array([2.92830189, 9.03773585]),
         np.array([0.64660353670191761, 1.2010523101013017e-05])),
        (('x1', 'x2'),
         np.array([-0.01554384, 0.20355359, 7.55525124]),
         np.array([0.99826544217405555, 0.37237722050579208, 0.0049816157477362418])),
    ]

    for predictors, expected_beta, expected_p in cases:
        beta, pvalues = linear_regression(frame, 'y', *predictors)
        assert_almost_equal(expected_beta, beta)
        assert_almost_equal(expected_p, pvalues)