In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm, chi2
import sys

In [3]:
def DivisiveNormalization(theta, data):
    denom = theta[0] + np.multiply(theta[1], np.linalg.norm(data, theta[2], 1))
    v=np.divide(data.T, denom)
    
    return v

In [4]:
def calcPiProbitQuad(Mi, v):
    
    MiT=np.transpose(Mi, axes=(0,2,1))
    T=v.shape[0]
    [x, w] = np.polynomial.hermite.hermgauss(100)

    #I honestly don't really know how tensordot works, but these lines of code return the correct values
    c = np.tensordot(MiT,v, axes=([1,0]))
    cT=np.transpose(c, axes=(0,2,1))
    vi = cT.diagonal() #This matches vi in MATLAB for s=1, trials 8,10,14
    
    #first part of equation in ProbaChoice.m, line 242
    z1=np.multiply(-2**0.5, vi)

    #second part of equation in ProbaChoice.m, line 242
    z2=np.multiply(-2**0.5, x)

    #These values have been validated
    zz = [z1-ele for ele in z2]

    aa=np.prod(norm.cdf(zz), axis=1)
    #Pi have been validated
    Pi=np.divide(np.sum(np.multiply(w.reshape(100,1), aa), axis=0), np.pi**0.5)
    
    return Pi
    

In [47]:
def calcPiChosen(theta, data, choices, compute_DN=True, **kwargs):
    
    if compute_DN:
        v=DivisiveNormalization(theta=theta, data=data)
    else:
        v = kwargs.get("v", None)
    
    probs = np.empty(data.shape)
    #get the size of the choice array. Choice arrays must be the same size
    Jm=data.shape[1]
    temp = np.identity(Jm-1)
    M = np.empty((Jm, Jm-1, Jm))


    for i in range(Jm):
        M[i] = np.concatenate((temp[:,0:i], -1*np.ones((Jm-1,1)), temp[:, i:]), axis=1)

    Mi=M[choices]
    pi = calcPiProbitQuad(Mi, v)
    
    return pi

In [19]:
# data=choice_set_vals
# choices = chosen_vals
# v=DivisiveNormalization(theta=thetaDN, data=choice_set_vals)
# probs = np.empty(data.shape)
# #get the size of the choice array. Choice arrays must be the same size
# Jm=data.shape[1]
# temp = np.identity(Jm-1)
# M = np.empty((Jm, Jm-1, Jm))




# for i in range(Jm):
#     M[i] = np.concatenate((temp[:,0:i], -1*np.ones((Jm-1,1)), temp[:, i:]), axis=1)

# Mi=M[chosen_vals]
# pi = calcPiProbitQuad(Mi, v)

In [5]:
def calcPiAll(theta, data):
    
    v=DivisiveNormalization(theta, data)
    
    probs = np.empty(data.shape)
    #get the size of the choice array. Choice arrays must be the same size
    Jm=data.shape[1]
    temp = np.identity(Jm-1)
    M = np.empty((Jm, Jm-1, Jm))


    for i in range(Jm):
        M[i] = np.concatenate((temp[:,0:i], -1*np.ones((Jm-1,1)), temp[:, i:]), axis=1)
    
    for i in range(Jm):
        y=np.array([i]*data.shape[0])
        #print(y)
        
        #Matrices for only the chosen options
        Mi=M[y]
        #print(Mi)
        
        pi=calcPiProbitQuad(Mi,v)
        probs[:,i]=pi.T
        #print("pi is {}".format(pi))
    return probs

In [77]:
def chose_item(theta, data, return_utility=False):
    probs=calcPiAll(theta=theta, data=data)
    num_subj = data.shape[0]
    Jm = data.shape[1]

    v = DivisiveNormalization(theta=theta, data=data)

    cov = np.ones((Jm, Jm)) * 0.5
    cov[np.arange(Jm), np.arange(Jm)] = 1
    mean = np.zeros(Jm)
    #for i in range(num_it):
    eps = np.random.multivariate_normal(mean, cov, size=num_subj).T
    #print(eps)
    u = v + eps
    item_chosen = u.argmax(axis=0)
    
    if return_utility:
        return item_chosen, u
    else:
        return item_chosen


In [50]:
#item_chosen, utility = chose_item(theta=thetaDN, data=choice_set_vals, return_utility=True)
#item_chosen, utility = chose_item(theta=theta, data=data, return_utility=True) 
    
#chosen_probs = calcPiChosen(theta=thetaDN, data=choice_set_vals, choices=item_chosen, compute_DN=False, v=utility)
#probs = calcPiChosen(theta=thetaDN, data=choice_set_vals, choices=item_chosen)

In [51]:
probs

array([0.05067833, 0.04932959, 0.05233648, 0.04869672, 0.04939065,
       0.04994267, 0.04921606, 0.04861186, 0.04702758, 0.04940131,
       0.04785192, 0.05806789, 0.05074143, 0.04461944, 0.05241813,
       0.04964373, 0.05023651, 0.04910548, 0.04983168, 0.04631339,
       0.05004627, 0.05044568, 0.05030846, 0.04972277, 0.04851939,
       0.05046718, 0.04932908, 0.04361602, 0.05686752, 0.04994705,
       0.04946589, 0.05142896, 0.05434028, 0.04838589, 0.0491843 ,
       0.05741014, 0.04983541, 0.04517124, 0.05184058, 0.05502792,
       0.04859988, 0.05523065, 0.04759689, 0.04925558, 0.04965296,
       0.06064578, 0.05557062, 0.04430633, 0.05097432, 0.04919554,
       0.05342853, 0.05201958, 0.05087096, 0.04977667, 0.04881539,
       0.04954218, 0.05694637, 0.05514526, 0.04586673, 0.0512188 ,
       0.04994239, 0.04884701, 0.04965848, 0.04554893, 0.04901566,
       0.04719087, 0.0482337 , 0.0572407 , 0.04754371, 0.04895584,
       0.04978869, 0.05594186, 0.05      , 0.04989403, 0.05125

In [49]:
chosen_probs

array([0.33985989, 0.51910933, 0.19556666, 0.22958043, 0.28236234,
       0.25740071, 0.2850636 , 0.57648276, 0.3640491 , 0.46651057,
       0.25928401, 0.38885703, 0.27520808, 0.55783343, 0.51220286,
       0.49333278, 0.64197997, 0.9647931 , 0.32171358, 0.20929643,
       0.20941335, 0.31502702, 0.20472493, 0.23600778, 0.28276203,
       0.2050716 , 0.33889212, 0.44857318, 0.28044371, 0.33882104,
       0.24526206, 0.24421668, 0.25835342, 0.21339425, 0.19061498,
       0.25009143, 0.41370148, 0.39239902, 0.32543598, 0.33903065,
       0.43318115, 0.42340619, 0.22165985, 0.53722598, 0.45154781,
       0.38326654, 0.17037961, 0.21061661, 0.23774143, 0.47699112,
       0.31709306, 0.32851718, 0.26282185, 0.27241978, 0.26501491,
       0.28141751, 0.29833908, 0.31139778, 0.22010629, 0.69062294,
       0.34119979, 0.3422035 , 0.4525259 , 0.27833535, 0.27788647,
       0.30615307, 0.21178032, 0.31309786, 0.22054728, 0.38196631,
       0.35412216, 0.49414019, 0.27463856, 0.30055389, 0.30766

In [6]:
#Load data from Bollen et al., 2010
choice = pd.read_csv('/Users/amywinecoff/Documents/CITP/Research/Github/AgentChoiceSim/co1_wide.csv')  

#for now, remove the conditions with 5 options so I can figure out the code for a fixed set size
choice = choice[~choice['condition'].isin(['Top5', 'Top5_NR'])]

score_cols = [c for c in choice.columns if 'score' in c]
movie_cols = [c for c in choice.columns if 'movie' in c]
choice_set_vals = np.array(choice[score_cols]/10)

choice['chosen_num']=None
for idx, m in enumerate(movie_cols):
    choice['chosen_num'] = np.where(choice[m]==choice["choice"], idx, choice['chosen_num'])
chosen_vals = np.array(choice['chosen_num'].astype(int).values)

chosen = choice_set_vals[np.arange(len(choice_set_vals)), chosen_vals]


# Simulation of user choices
We simulate 100,000 trials of each of the 3 choice sets and use the values yielded by the `DivisiveNormalization` method + a random noise vector and check that the choice probabilities are roughly in line with the analytic probabilities from `calcPiProbitQuad`.

In [74]:
#v.shape


array([[ 1.18240827,  0.4750277 ,  0.49432965, ..., -0.84551732,
         0.74413817, -0.86944711],
       [ 0.74339641,  2.21254872, -0.32288456, ..., -0.30815154,
         1.89320935, -0.67915574],
       [ 0.37245565,  1.6431992 ,  1.34284343, ...,  1.10368014,
         2.22929224,  0.11528717],
       ...,
       [ 0.36520412,  0.97498967,  0.3999641 , ...,  1.41150469,
         0.39762537,  0.19062542],
       [ 0.61083784,  1.78475857,  0.37102898, ...,  0.63663223,
         2.27736603, -0.91682929],
       [-0.3418495 ,  0.45592201,  1.16147606, ..., -1.19606249,
         1.70533312,  1.23852719]])

In [90]:
v = DivisiveNormalization(theta=thetaDN, data=choice_set_vals)
ic, u = chose_item(theta=thetaDN, data=choice_set_vals, return_utility=True)
pi=calcPiChosen(theta=thetaDN, data=choice_set_vals, choices=ic)
print(np.argmax(v, axis=0))
print(np.argmax(u, axis=0))
print(sum(np.log(pi)))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]
[16  3 19 18  5  3  1 15  5  1  2  2  6  3 12 14  0 17 11 15  4 19 11  4
 15 18 10  8  1 16  5  3 10  7  9 17 18 11 19 11 11  2  7 16 15 16 14  0
 12  0 15 18 10 18 14  9 17  6  5  3 16 17  5  4 15  2  4  2 17  3  9  1
  5  6  1 12 18  0 12 15  3 16  2 19 12 16 15  1 12  2  7  6  6 13  8  4
 16 19  1  2  9  1 14  8 11 19 15 18 11  2  4  5  1 18 17 13  6 18 11 17
  8]
-360.98094776750617


In [91]:
#v = DivisiveNormalization(theta=thetaDNNull, data=choice_set_vals)
#ic, u = chose_item(theta=thetaDNNull, data=choice_set_vals, return_utility=True)
pi=calcPiChosen(theta=thetaDNNull, data=choice_set_vals, choices=ic)
#print(np.argmax(v, axis=0))
#print(np.argmax(u, axis=0))
print(sum(np.log(pi)))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]
[16  3 19 18  5  3  1 15  5  1  2  2  6  3 12 14  0 17 11 15  4 19 11  4
 15 18 10  8  1 16  5  3 10  7  9 17 18 11 19 11 11  2  7 16 15 16 14  0
 12  0 15 18 10 18 14  9 17  6  5  3 16 17  5  4 15  2  4  2 17  3  9  1
  5  6  1 12 18  0 12 15  3 16  2 19 12 16 15  1 12  2  7  6  6 13  8  4
 16 19  1  2  9  1 14  8 11 19 15 18 11  2  4  5  1 18 17 13  6 18 11 17
  8]
-2085.6292119782784


In [61]:
#def chose_item_dn(d, num_it=1000, theta = [0.0000, 0.2376, 0.9739])
theta=thetaDN
d = np.array([
             [4, 2.33, 1.875, 1.8, 1.5, 1.495, 1.335, 1.275, 1.125, 1.09, 1, 0.925],              
             [2.125, 2.125, 2.025, 2.0, 1.875, 1.495, 1.485, 1.335, 1.275, 1.075, 1.0, 0.625],
             [4.0, 2.17,  2.0, 2.0, 1.875, 1.875, 1.5, 1.485, 1.335, 1.275, 1.09, 1.075],
            ])
freq_chosen = np.array([0., 0., 0.])
num_it = 100000
v = DivisiveNormalization(theta=theta, data=d)
# the following covariance matrix has the structure
# [ 1    0.5    ...    0.5 ]
# [ 0.5    1    ...    0.5 ]
# [ 0.5   ...    1    0.5  ]
# [ 0.5   0.5   ...    1   ]
cov = np.ones((12, 12)) * 0.5
cov[np.arange(12), np.arange(12)] = 1
mean = np.zeros(12)
for i in range(num_it):
    eps = np.random.multivariate_normal(mean, cov, size=3).T
    u = v + eps
    item_chosen = (u.argmax(axis=0) == (y-1)).astype(float)
    freq_chosen += item_chosen / num_it
    
print(freq_chosen)

NameError: name 'y' is not defined

###Steps to Computing a Power Analysis Given an Experimental Design and value of theta
1. Read in scores into correct np array format
2. Chose the item given its normalized value 
3. Calculate the probability of the chosen item (based on u rather than strict probabilities for option values?)

In [None]:

d = choice_set_vals.values
#sigma, omega(w), beta
#theta_h1 = [0.0000, 0.2376, 0.9739]
probs=calcPiAll(theta=t, data=d)
print(probs)

In [99]:
def calcModelLL(data, theta, null_theta=None, value_form='utility'):
    """Calculates the log likelikihood given theta values for a DN model. If a null model is being tested,
    it will chose the item based on the alternative model, then calculate the probability of that choice, and the 
    log-likelihood given both the alternative model and the null model
    """
    #This is not really right. Need to figure out how to solve the probability issue since this is calculating based on the theoretical prob, which is not the same as the observeed prob
    ##TODO: Fix this so that it works on variable data size. Right now only running on 20-movie decisions
    #probably need to calculate this based on the calculated u, not on the theoretical probs
    probs=calcPiAll(theta=theta, data=data)
    #item_chosen, utility = chose_item(theta=theta, data=data, return_utility=True) 
    item_chosen = chose_item(theta=theta, data=data)
    
    #chosen_probs = calcPiChosen(theta=theta, data=data, item_chosen, compute_DN=False, v=utility)
    
    
    chosen_probs=probs[np.arange(len(probs)), item_chosen]
    #add epsilon to all values to prevent divide by zero error
    chosen_probs = chosen_probs + sys.float_info.epsilon
    LL = sum(np.log(chosen_probs))
    
    if null_theta:
        null_probs=calcPiAll(theta=null_theta, data=data)
        #add epsilon to all values to prevent divide by zero error
        null_probs = null_probs + sys.float_info.epsilon
        #print("calculating null theta")
        
        null_chosen_probs = null_probs[np.arange(len(null_probs)), item_chosen]
        #print("null chosen probs {}".format(null_chosen_probs))
        null_LL = sum(np.log(null_chosen_probs))
    else:
        null_LL = None
        #null_chosen_probs = None
    
    return LL, null_LL
    

In [None]:
def vuong_test(p1, p2):
    r"""
    https://gist.github.com/jseabold/6617976
    Vuong-test for non-nested models.
    Parameters
    ----------
    p1 : array-like
        f1(Y=y_i | x_i)
    p2 : array-like
        f2(Y=y_i | x_i)
    Notes
    -----
    This is hard-coded for testing Poisson vs. Zero-inflated. E.g.,
    it does not account for
    Let f_j(y_i|x_i) denote the predicted probability that random variable Y
    equals y_i under the assumption that the distribution is f_j(y_i|x_i) for
    j = 1,2. Let
    .. math::
       m_i = log(\frac{f_1(y_i|x_i)}{f_2(y_i|x_i)})
    The test statistic from Vuong to test the hypothesis of Model 1 vs.
    Model 2 is
    .. math::
       v = \frac{\sqrt{n}(1/n \sum_{i=1}^{n}m_i)}{\sqrt{1/n \sum_{i=1}^{n}(m_i - \bar{m})^2}}
    This statistic has a limiting standard normal distribution. Values of
    v greater than ~2, indicate that model 1 is preferred. Values of V
    less than ~-2 indicate the model 2 is preferred. Values of |V| < ~2 are
    inconclusive.
    References
    ----------
    Greene, W. Econometric Analysis.
    Vuong, Q.H. 1989 "Likelihood ratio tests for model selection and
        non-nested hypotheses." Econometrica. 57: 307-333.
    """
    m = np.log(p1) - np.log(p2)
    n = len(m)
    v = n ** .5 * m.mean() / m.std()
    return v, stats.norm.sf(np.abs(v))

In [98]:
2*(-7944.39+7969.81)

p=1 - chi2.cdf(2*(-7944.39+7969.81), 1)
p

1.0020873020266663e-12

In [56]:
def nestedLRT(LL, nullLL):
    
    df = len([ele for idx, ele in enumerate(thetaDN) if thetaDNNull[idx]!=ele])
    LR = 2*(LL-nullLL)
    #consider using chi2.sf since sometimes it is more accurate? https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html
    p=1 - chi2.cdf(LR, df)
  
    return LR, p

In [None]:
dnLL_preds = calc

In [130]:
thetaDN=[0.114, 0.0005, 1]#Webb 2020 sigma and omega only
thetaDNNull = [0.114, 0, 1]#Fix omega to 0 to test hypothesis that normalization occurrs
dnLL, nullLL, = calcModelLL(data=choice_set_vals, theta=thetaDN, null_theta=thetaDNNull)
print("dnLL = {}, nullLL= {}".format(dnLL, nullLL))
LR, p = nestedLRT(dnLL, nullLL)

print(LR, p)

dnLL = -194.3428242649589, nullLL= -195.96713139726944
3.2486142646210965 0.0714838688552959


In [127]:
thetaDNb=[0.012, 0.412, 25.74]
dnb_probs=calcPiAll(theta=thetaDNb, data=d)

In [None]:
print(LL, nullLL)
LR = 2*(LL-nullLL)
print(LR)
p=1 - chi2.cdf(LR, 2)
print(p)

In [None]:
#d = np.array([
 #            [4, 2.33, 1.875, 1.8, 1.5, 1.495, 1.335, 1.275, 1.125, 1.09, 1, 0.925],              
  #           [2.125, 2.125, 2.025, 2.0, 1.875, 1.495, 1.485, 1.335, 1.275, 1.075, 1.0, 0.625],
   #          [4.0, 2.17,  2.0, 2.0, 1.875, 1.875, 1.5, 1.485, 1.335, 1.275, 1.09, 1.075],
    #        ])
#omega allowed to vary. Set to value in Webb et al., 2020
theta_h1 = [1.0, 0.117, 1.0]
#This is the null model that tests that omega != 0
theta_h0 = [theta_h1[0], 0, theta_h1[2]]

LLs = calcModelLL(theta=theta_h1, data=d, null_theta=theta_h0)
#LL_h1 = calcModelLL(theta=theta_h1, data=d)

print("LL for H0 model: {}".format(LLs[1]))
print("LL for H1 model: {}".format(LLs[0]))
#print(LL_h1)#-362.68216377703664
LR = 2*(LLs[0]-LLs[1])
print(LR)

In [None]:
p=1 - chi2.cdf(LR, 1)
print(p)

In [None]:
d = choice_set_vals.values
d

In [None]:
d = choice_set_vals.values /20
#omega allowed to vary. Set to value in Webb et al., 2020
theta_h1 = [0.44, 0.0006, 1.0]
#This is the null model that tests that omega != 0
theta_h0 = [theta_h1[0], 0, theta_h1[2]]

LLs = calcModelLL(theta=theta_h1, data=d, null_theta=theta_h0)
#LL_h1 = calcModelLL(theta=theta_h1, data=d)

print("LL for H0 model: {}".format(LLs[1]))
print("LL for H1 model: {}".format(LLs[0]))
#print(LL_h1)#-362.68216377703664
LR = 2*(LLs[0]-LLs[1])
print(LR)
p=1 - chi2.cdf(LR, 1)
print(p)

In [None]:
null_probs=calcPiAll(theta=theta_h0, data=d)

In [None]:
null_probs_df= pd.DataFrame(null_probs)
null_probs_df.head(10)

In [None]:
choice_set_vals.head(10)

In [None]:
# save as Python
#!jupyter nbconvert --to script DivisiveNormalization.ipynb