In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm, chi2
import sys

In [2]:
# #preallocate image matrices for choices
# #This pertains to estimating covariance matrices of the error differences
# #See Train book on discrete choice analysis p 113
# #"This matrix can be used to transform the covariance matrix of
# #errors into the covariance matrix of error differences: ~Ωi = MiΩMi.T .
# temp = np.identity(Jm-1)
# M = np.empty((Jm, Jm-1, 12))
# for i in range(1, Jm+1):
#     M[i-1] = np.concatenate((temp[:,0:i-1], -1*np.ones((Jm-1,1)), temp[:, i-1:]), axis=1)

# #Matrices for only the chosen options
# Mi=M[y-1]



In [3]:
def DivisiveNormalization(theta, data):
    """Divide each row of ``data`` by a norm-based denominator.

    Parameters
    ----------
    theta : sequence
        ``theta[0]`` is an additive constant, ``theta[1]`` a weight on the
        row norm and ``theta[2]`` the order handed to ``np.linalg.norm``
        (labelled sigma, omega and beta elsewhere in this notebook).
    data : ndarray
        2-D array; the norm is taken along axis 1 (one norm per row).

    Returns
    -------
    ndarray
        ``data.T`` divided by the per-row denominator, i.e. the result is
        transposed relative to ``data``.
    """
    sigma, omega, beta = theta[0], theta[1], theta[2]
    row_norms = np.linalg.norm(data, beta, 1)
    scale = sigma + omega * row_norms
    # data.T has the rows of `data` along its last axis, so broadcasting
    # applies one denominator per original row.
    return data.T / scale

In [4]:
def calcPiProbitQuad(Mi, v, n_nodes=100):
    """Probability of the chosen option on each trial via Gauss-Hermite quadrature.

    Parameters
    ----------
    Mi : ndarray, shape (T, Jm-1, Jm)
        One difference matrix per trial (the matrix of the chosen option).
    v : ndarray, shape (Jm, T)
        Option values, one column per trial.
    n_nodes : int, optional
        Number of Gauss-Hermite nodes (default 100, the value previously
        hard-coded).

    Returns
    -------
    Pi : ndarray, shape (T,)
        Quadrature estimate of the choice probability for every trial.
    """
    nodes, weights = np.polynomial.hermite.hermgauss(n_nodes)

    # Value differences for each trial: vi[k, t] = sum_j Mi[t, k, j] * v[j, t].
    # This is the diagonal that the former tensordot + .diagonal() extracted,
    # computed without materialising the (T, Jm-1, T) intermediate.
    vi = np.einsum('tkj,jt->kt', Mi, v)

    root2 = 2 ** 0.5
    #first part of equation in ProbaChoice.m, line 242
    z1 = -root2 * vi
    #second part of equation in ProbaChoice.m, line 242
    z2 = -root2 * nodes

    # Broadcast to (n_nodes, Jm-1, T): one copy of z1 shifted by every node.
    zz = z1[np.newaxis, :, :] - z2[:, np.newaxis, np.newaxis]

    # Product over the Jm-1 differences -> (n_nodes, T)
    aa = np.prod(norm.cdf(zz), axis=1)

    # Weighted sum over the nodes, normalised by sqrt(pi)
    Pi = np.sum(weights[:, np.newaxis] * aa, axis=0) / np.pi ** 0.5

    return Pi
    

In [None]:
# Preview the first rows of `choice`; the data-loading cell (pd.read_csv) must have been run first.
choice.head()

In [26]:
# Show the integer index of the chosen option per row, as built in the data-loading cell.
chosen_vals

array([ 3, 19,  8, 10,  0,  6,  0,  3,  2,  8,  3,  1, 16,  4,  0, 14,  1,
       11, 13,  9, 15, 19,  9,  0,  7,  1,  1,  1,  5, 18,  1, 12,  1,  4,
       14,  1, 14,  0,  1,  1,  1,  2,  2, 13,  4,  0,  4,  0,  1,  1,  0,
       13,  6,  8,  9, 19, 18,  3,  2,  9,  6,  1,  0,  8,  1,  0,  0, 13,
        4, 17,  6,  0, 11,  1,  4,  1,  7,  4,  4, 17, 16,  0, 13,  0,  7,
       14,  1,  7,  1,  9,  0,  4,  8,  0,  0, 17,  2,  8,  3,  2,  2,  0,
        3,  1, 13, 18, 17,  0,  0,  1,  7,  3,  1, 12,  2, 16,  8,  3, 12,
        7,  0])

In [19]:
# Scratch version of calcPiChosen run at top level: computes the probability of
# each observed choice. Needs `choice_set_vals`, `chosen_vals` and `thetaDN`
# to exist in the kernel already.
data=choice_set_vals
choices = chosen_vals
# v is transposed relative to data (see DivisiveNormalization)
v=DivisiveNormalization(theta=thetaDN, data=choice_set_vals)
# NOTE(review): `probs` is allocated here but not used in this cell
probs = np.empty(data.shape)
#get the size of the choice array. Choice arrays must be the same size
Jm=data.shape[1]
temp = np.identity(Jm-1)
M = np.empty((Jm, Jm-1, Jm))




# M[i] is the (Jm-1) x Jm identity with a column of -1 inserted at position i
for i in range(Jm):
    M[i] = np.concatenate((temp[:,0:i], -1*np.ones((Jm-1,1)), temp[:, i:]), axis=1)

# Pick, for every row, the matrix belonging to its chosen option
Mi=M[chosen_vals]
pi = calcPiProbitQuad(Mi, v)

In [6]:
#Load data from Bollen et al., 2010
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
choice = pd.read_csv('/Users/amywinecoff/Documents/CITP/Research/Github/AgentChoiceSim/co1_wide.csv')  

#for now, remove the conditions with 5 options so I can figure out the code for a fixed set size
# .copy() makes the filtered frame independent of the original so that adding a
# column below does not raise SettingWithCopyWarning.
choice = choice[~choice['condition'].isin(['Top5', 'Top5_NR'])].copy()

score_cols = [c for c in choice.columns if 'score' in c]
movie_cols = [c for c in choice.columns if 'movie' in c]
# Scores are divided by 10; the result is a plain ndarray (rows x options)
choice_set_vals = np.array(choice[score_cols]/10)

# Position (0-based) of the movie column whose value equals the "choice" column
choice['chosen_num']=None
for idx, m in enumerate(movie_cols):
    choice['chosen_num'] = np.where(choice[m]==choice["choice"], idx, choice['chosen_num'])

# Fail with a clear message instead of an opaque int-conversion error
if choice['chosen_num'].isna().any():
    raise ValueError("some rows have a 'choice' value that matches none of the movie columns")
chosen_vals = np.array(choice['chosen_num'].astype(int).values)

# Score of the chosen option for every row
chosen = choice_set_vals[np.arange(len(choice_set_vals)), chosen_vals]


In [33]:
def calcPiChosen(theta, data, choices):
    """Probability of the chosen option for every row of ``data``.

    Parameters
    ----------
    theta : sequence
        Parameters passed to ``DivisiveNormalization``.
    data : ndarray, shape (n_rows, Jm)
        Option values; every row must have the same number of options.
    choices : integer array, shape (n_rows,)
        0-based index of the chosen option in each row.

    Returns
    -------
    ndarray
        Output of ``calcPiProbitQuad`` for the chosen options.
    """
    # Use the caller's theta (the previous version silently read the global thetaDN)
    v = DivisiveNormalization(theta=theta, data=data)

    #get the size of the choice array. Choice arrays must be the same size
    Jm = data.shape[1]
    temp = np.identity(Jm-1)
    M = np.empty((Jm, Jm-1, Jm))

    # M[i] is the (Jm-1) x Jm identity with a column of -1 inserted at position i
    for i in range(Jm):
        M[i] = np.concatenate((temp[:, 0:i], -1*np.ones((Jm-1, 1)), temp[:, i:]), axis=1)

    # One matrix per row: the one belonging to that row's chosen option
    Mi = M[choices]
    pi = calcPiProbitQuad(Mi, v)

    return pi

In [34]:
# Probabilities of the observed choices; `thetaDN`, `choice_set_vals` and `chosen_vals` must already be defined in the kernel.
pi_check = calcPiChosen(theta=thetaDN, data=choice_set_vals, choices=chosen_vals)

In [5]:
def calcPiAll(theta, data):
    """Choice probability of every option in every row of ``data``.

    For each option index the probability is obtained by treating that
    option as the chosen one in all rows and calling ``calcPiProbitQuad``.

    Parameters
    ----------
    theta : sequence
        Parameters passed to ``DivisiveNormalization``.
    data : ndarray, shape (n_rows, Jm)
        Option values; every row must have the same number of options.

    Returns
    -------
    ndarray, same shape as ``data``
        ``probs[r, j]`` is the probability of option ``j`` in row ``r``.
    """
    v = DivisiveNormalization(theta, data)

    probs = np.empty(data.shape)
    n_rows = data.shape[0]
    # Choice arrays must all be the same size
    Jm = data.shape[1]

    eye = np.identity(Jm - 1)
    minus_ones = -1 * np.ones((Jm - 1, 1))

    # M[ref]: identity of size Jm-1 with a column of -1 inserted at position ref
    M = np.empty((Jm, Jm - 1, Jm))
    for ref in range(Jm):
        M[ref] = np.concatenate((eye[:, 0:ref], minus_ones, eye[:, ref:]), axis=1)

    for option in range(Jm):
        # Pretend `option` was chosen in every row
        y = np.array([option] * n_rows)
        pi = calcPiProbitQuad(M[y], v)
        probs[:, option] = pi.T
    return probs

In [None]:
# ###CONFIRMATORY ANALYSIS TO TEST MATCH WITH WEBB DATA
# #create a vector of the WTP values. These values are from data(1).X, cells 8, 10, and 14
# d = np.array([
#              [4, 2.33, 1.875, 1.8, 1.5, 1.495, 1.335, 1.275, 1.125, 1.09, 1, 0.925],
#              [2.125, 2.125, 2.025, 2.0, 1.875, 1.495, 1.485, 1.335, 1.275, 1.075, 1.0, 0.625],
#              [4.0, 2.17,  2.0, 2.0, 1.875, 1.875, 1.5, 1.485, 1.335, 1.275, 1.09, 1.075],
#              ])

# # #These are the chosen options for s=1, on trials 8, 10,14
# y=np.array([4,3,2])

# # #Choice set size for trials 8,10,14 for subject 1
# Jm=12

# # #sigma, omega(w), beta
# theta = [0.0000, 0.2376, 0.9739]
# temp = np.identity(Jm-1)
# M = np.empty((Jm, Jm-1, 12))
# for i in range(1, Jm+1):
#     M[i-1] = np.concatenate((temp[:,0:i-1], -1*np.ones((Jm-1,1)), temp[:, i-1:]), axis=1)

# #Matrices for only the chosen options
# Mi=M[y-1]

# #This result has been spot checked against the values returned by the MATLAB code for data(1).X, cells 8, 10, and 14 
# v=DivisiveNormalization(theta=theta, data=d)
# pi=calcPiProbitQuad(Mi,v)

# print("probs for chosen options only: {}".format(pi))
# print(pi) 
# #[0.08327671 0.10499576 0.09305649]

# probs=calcPiAll(theta=theta,data=d)
# print("probs for all options: {}".format(probs))

# Simulation of user choices
We simulate 100,000 trials for each of the 3 choice sets. On every trial, the utility of each option is its value from `DivisiveNormalization` plus a random noise vector, and the option with the highest utility is chosen. We then check that the simulated choice frequencies are roughly in line with the analytic probabilities from `calcPiProbitQuad`.

In [None]:
#def chose_item_dn(d, num_it=1000, theta = [0.0000, 0.2376, 0.9739])
# Monte-Carlo check of the analytic probabilities.
# Requires `theta`, `d` and `y` (1-based chosen options) to be defined; in this
# notebook they only appear in the commented-out confirmatory-analysis cell.
num_it = 100000
v = DivisiveNormalization(theta=theta, data=d)

# Sizes come from the data instead of being hard-coded (previously 12 options, 3 trials)
n_options, n_trials = v.shape
freq_chosen = np.zeros(n_trials)

# the following covariance matrix has the structure
# [ 1    0.5    ...    0.5 ]
# [ 0.5    1    ...    0.5 ]
# [ 0.5   ...    1    0.5  ]
# [ 0.5   0.5   ...    1   ]
cov = np.ones((n_options, n_options)) * 0.5
cov[np.arange(n_options), np.arange(n_options)] = 1
mean = np.zeros(n_options)
for i in range(num_it):
    # One noise vector per trial, transposed to match v's (options, trials) layout
    eps = np.random.multivariate_normal(mean, cov, size=n_trials).T
    u = v + eps
    # 1.0 where the simulated argmax equals the observed choice
    item_chosen = (u.argmax(axis=0) == (y-1)).astype(float)
    freq_chosen += item_chosen / num_it
    
print(freq_chosen)

### Steps to Compute a Power Analysis Given an Experimental Design and a Value of Theta
1. Read in scores into correct np array format
2. Choose the item given its normalized value
3. Calculate the probability of the chosen item

In [None]:
# Show the score of the chosen option per row, as computed in the data-loading cell.
chosen

In [None]:

# choice_set_vals is built as an ndarray in the data-loading cell, so `.values`
# fails; np.asarray works for both an ndarray and a DataFrame.
d = np.asarray(choice_set_vals)
#sigma, omega(w), beta
# `t` was not defined anywhere in the notebook; the values are those of the
# previously commented-out theta_h1. NOTE(review): confirm these are intended.
t = [0.0000, 0.2376, 0.9739]
probs=calcPiAll(theta=t, data=d)
print(probs)

In [None]:
#def chose_item_dn(d, theta = [0.0000, 0.2376, 0.9739]):
def chose_item(theta, data, return_utility=False, rng=None):
    """Simulate one choice per row of ``data``.

    Utilities are the values from ``DivisiveNormalization`` plus multivariate
    normal noise; the option with the largest utility is chosen.

    Parameters
    ----------
    theta : sequence
        Parameters passed to ``DivisiveNormalization``.
    data : ndarray, shape (num_subj, Jm)
        Option values, one row per decision.
    return_utility : bool, optional
        If True, also return the simulated utilities.
    rng : numpy random Generator or RandomState, optional
        Source of randomness. When None (default) the global ``np.random``
        state is used, as before.

    Returns
    -------
    item_chosen : ndarray, shape (num_subj,)
        0-based index of the chosen option in every row.
    u : ndarray, shape (Jm, num_subj)
        Only when ``return_utility`` is True.
    """
    # The previous version first called calcPiAll(theta=t, ...) with an
    # undefined global `t` and never used the result; that call is removed.
    num_subj = data.shape[0]
    Jm = data.shape[1]

    v = DivisiveNormalization(theta=theta, data=data)
    # the following covariance matrix has the structure
    # [ 1    0.5    ...    0.5 ]
    # [ 0.5    1    ...    0.5 ]
    # [ 0.5   ...    1    0.5  ]
    # [ 0.5   0.5   ...    1   ]
    cov = np.ones((Jm, Jm)) * 0.5
    cov[np.arange(Jm), np.arange(Jm)] = 1
    mean = np.zeros(Jm)

    sampler = np.random if rng is None else rng
    # Transpose so that noise matches v's (options, rows) layout
    eps = sampler.multivariate_normal(mean, cov, size=num_subj).T
    u = v + eps
    item_chosen = u.argmax(axis=0)

    if return_utility:
        return item_chosen, u
    return item_chosen


In [None]:
def calcModelLL(data, theta, null_theta=None):
    """Log-likelihood of simulated choices under a DN model and, optionally, a null model.

    Choices are simulated with ``chose_item`` under ``theta`` (so the result
    varies between calls). The log-likelihood of those choices is computed
    from the analytic probabilities of ``calcPiAll`` under ``theta`` and, if
    given, under ``null_theta``.

    Parameters
    ----------
    data : ndarray, shape (n_rows, Jm)
        Option values, one row per decision.
    theta : sequence
        Parameters of the alternative model.
    null_theta : sequence, optional
        Parameters of the null model. When None, no null likelihood is computed.

    Returns
    -------
    LL : float
        Log-likelihood under ``theta``.
    null_LL : float or None
        Log-likelihood under ``null_theta``; None when ``null_theta`` is None.
    """
    #This is not really right. Need to figure out how to solve the probability issue since this is calculating based on the theoretical prob, which is not the same as the observed prob
    ##TODO: Fix this so that it works on variable data size. Right now only running on 20-movie decisions
    #probably need to calculate this based on the calculated u, not on the theoretical probs
    probs = calcPiAll(theta=theta, data=data)
    item_chosen = chose_item(theta=theta, data=data)

    rows = np.arange(len(probs))
    #add epsilon to prevent log(0)
    eps = sys.float_info.epsilon
    chosen_probs = probs[rows, item_chosen] + eps
    LL = np.sum(np.log(chosen_probs))

    # Explicit None check: a truthiness test is ambiguous for numpy arrays
    if null_theta is not None:
        null_probs = calcPiAll(theta=null_theta, data=data) + eps
        # Same simulated choices, evaluated under the null model
        null_chosen_probs = null_probs[rows, item_chosen]
        null_LL = np.sum(np.log(null_chosen_probs))
    else:
        null_LL = None

    return LL, null_LL
    

In [None]:
def vuong_test(p1, p2):
    r"""
    https://gist.github.com/jseabold/6617976
    Vuong-test for non-nested models.
    Parameters
    ----------
    p1 : array-like
        f1(Y=y_i | x_i)
    p2 : array-like
        f2(Y=y_i | x_i)
    Returns
    -------
    v : float
        The Vuong statistic.
    p : float
        One-sided tail probability ``norm.sf(|v|)`` of the standard normal.
    Notes
    -----
    The statistic is computed without any adjustment for the number of
    parameters of the two models.
    Let f_j(y_i|x_i) denote the predicted probability that random variable Y
    equals y_i under the assumption that the distribution is f_j(y_i|x_i) for
    j = 1,2. Let
    .. math::
       m_i = log(\frac{f_1(y_i|x_i)}{f_2(y_i|x_i)})
    The test statistic from Vuong to test the hypothesis of Model 1 vs.
    Model 2 is
    .. math::
       v = \frac{\sqrt{n}(1/n \sum_{i=1}^{n}m_i)}{\sqrt{1/n \sum_{i=1}^{n}(m_i - \bar{m})^2}}
    This statistic has a limiting standard normal distribution. Values of
    v greater than ~2, indicate that model 1 is preferred. Values of V
    less than ~-2 indicate the model 2 is preferred. Values of |V| < ~2 are
    inconclusive.
    References
    ----------
    Greene, W. Econometric Analysis.
    Vuong, Q.H. 1989 "Likelihood ratio tests for model selection and
        non-nested hypotheses." Econometrica. 57: 307-333.
    """
    # Pointwise log-likelihood ratios
    m = np.log(np.asarray(p1, dtype=float)) - np.log(np.asarray(p2, dtype=float))
    n = len(m)
    # m.std() is the population standard deviation (ddof=0), matching the formula above
    v = n ** .5 * m.mean() / m.std()
    # `norm` is imported from scipy.stats at the top of the notebook; the
    # previous `stats.norm` referred to a name that was never imported.
    return v, norm.sf(np.abs(v))

In [None]:
def nestedLRT(LL, nullLL, df=None):
    """Likelihood-ratio test of a model against a nested null model.

    Parameters
    ----------
    LL : float
        Log-likelihood of the full model.
    nullLL : float
        Log-likelihood of the null (restricted) model.
    df : int, optional
        Degrees of freedom of the chi-square reference distribution. When
        None (default), it is the number of positions at which the globals
        ``thetaDN`` and ``thetaDNNull`` differ, as in the original version.

    Returns
    -------
    LR : float
        The statistic ``2 * (LL - nullLL)``.
    p : float
        Upper-tail chi-square probability of ``LR`` with ``df`` degrees of freedom.
    """
    if df is None:
        # Backward-compatible fallback that relies on notebook-level globals
        df = len([ele for idx, ele in enumerate(thetaDN) if thetaDNNull[idx]!=ele])
    LR = 2*(LL-nullLL)
    # Survival function instead of 1 - cdf: same quantity, more accurate for small p
    p = chi2.sf(LR, df)

    return LR, p

In [None]:
# NOTE(review): unfinished statement; `calc` is not defined in the visible notebook, so this cell raises NameError as written.
dnLL_preds = calc

In [10]:
# Likelihood-ratio test of the DN model against the omega = 0 null.
# Requires `d`, calcModelLL and nestedLRT to be defined first; the NameError
# recorded under this cell comes from running it before calcModelLL existed.
thetaDN=[0.114, 0.177, 1]#Webb 2020 sigma and omega only
thetaDNNull = [0.114, 0, 1]#Fix omega to 0 to test the hypothesis that normalization occurs
dnLL, nullLL, = calcModelLL(d, thetaDN, thetaDNNull)
LR, p = nestedLRT(dnLL, nullLL)

print(LR, p)

NameError: name 'calcModelLL' is not defined

In [None]:
# Probabilities of every option under a second parameter set (order: sigma, omega, beta); requires `d`.
thetaDNb=[0.012, 0.412, 25.74]
dnb_probs=calcPiAll(theta=thetaDNb, data=d)

In [None]:
# Manual likelihood-ratio test. `LL` was never assigned at notebook level;
# `dnLL` is the value returned together with `nullLL` by calcModelLL.
print(dnLL, nullLL)
LR = 2*(dnLL-nullLL)
print(LR)
# NOTE(review): df=2 here, while thetaDN and thetaDNNull differ in one entry; confirm.
p = chi2.sf(LR, 2)
print(p)

In [None]:
# Simulate choices under theta_h1 and compare its log-likelihood with the
# omega = 0 null. Requires `d` to be defined (the array below is commented out).
#d = np.array([
 #            [4, 2.33, 1.875, 1.8, 1.5, 1.495, 1.335, 1.275, 1.125, 1.09, 1, 0.925],              
  #           [2.125, 2.125, 2.025, 2.0, 1.875, 1.495, 1.485, 1.335, 1.275, 1.075, 1.0, 0.625],
   #          [4.0, 2.17,  2.0, 2.0, 1.875, 1.875, 1.5, 1.485, 1.335, 1.275, 1.09, 1.075],
    #        ])
#omega allowed to vary. Set to value in Webb et al., 2020
# NOTE(review): omega is 0.117 here but 0.177 in the thetaDN cell; confirm which is intended.
theta_h1 = [1.0, 0.117, 1.0]
#Null model: omega fixed to 0, so the test below asks whether omega differs from 0
theta_h0 = [theta_h1[0], 0, theta_h1[2]]

# LLs[0] is the log-likelihood under theta_h1, LLs[1] under theta_h0
LLs = calcModelLL(theta=theta_h1, data=d, null_theta=theta_h0)
#LL_h1 = calcModelLL(theta=theta_h1, data=d)

print("LL for H0 model: {}".format(LLs[1]))
print("LL for H1 model: {}".format(LLs[0]))
#print(LL_h1)#-362.68216377703664
# Likelihood-ratio statistic
LR = 2*(LLs[0]-LLs[1])
print(LR)

In [None]:
# Upper-tail chi-square probability of LR with 1 degree of freedom;
# chi2.sf is the same quantity as 1 - cdf but more accurate for small p.
p = chi2.sf(LR, 1)
print(p)

In [None]:
# choice_set_vals is an ndarray (see the data-loading cell), which has no
# `.values`; np.asarray works for both an ndarray and a DataFrame.
d = np.asarray(choice_set_vals)
d

In [None]:
# choice_set_vals is an ndarray (see the data-loading cell), which has no
# `.values`; np.asarray works for both an ndarray and a DataFrame.
d = np.asarray(choice_set_vals) /20
#omega allowed to vary. Set to value in Webb et al., 2020
theta_h1 = [0.44, 0.0006, 1.0]
#Null model: omega fixed to 0, so the test below asks whether omega differs from 0
theta_h0 = [theta_h1[0], 0, theta_h1[2]]

# LLs[0] is the log-likelihood under theta_h1, LLs[1] under theta_h0
LLs = calcModelLL(theta=theta_h1, data=d, null_theta=theta_h0)
#LL_h1 = calcModelLL(theta=theta_h1, data=d)

print("LL for H0 model: {}".format(LLs[1]))
print("LL for H1 model: {}".format(LLs[0]))
#print(LL_h1)#-362.68216377703664
LR = 2*(LLs[0]-LLs[1])
print(LR)
# Survival function instead of 1 - cdf: same quantity, more accurate for small p
p = chi2.sf(LR, 1)
print(p)

In [None]:
# Probabilities of every option under the null parameters theta_h0; requires `theta_h0` and `d` from the cells above.
null_probs=calcPiAll(theta=theta_h0, data=d)

In [None]:
# Wrap the null-model probabilities in a DataFrame and preview the first 10 rows.
null_probs_df= pd.DataFrame(null_probs)
null_probs_df.head(10)

In [None]:
# choice_set_vals is an ndarray, which has no `.head`; slicing shows the first
# 10 rows and also works if it is a DataFrame.
choice_set_vals[:10]

In [None]:
# save as Python
#!jupyter nbconvert --to script DivisiveNormalization.ipynb