In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm, chi2
import sys

In [2]:
def DivisiveNormalization(theta, data):
    denom = theta[0] + np.multiply(theta[1], np.linalg.norm(data, theta[2], 1))
    v=np.divide(data.T, denom)
    
    return v

In [3]:
def calcPiProbitQuad(Mi, v):
    
    MiT=np.transpose(Mi, axes=(0,2,1))
    T=v.shape[0]
    [x, w] = np.polynomial.hermite.hermgauss(100)

    #I honestly don't really know how tensordot works, but these lines of code return the correct values
    c = np.tensordot(MiT,v, axes=([1,0]))
    cT=np.transpose(c, axes=(0,2,1))
    vi = cT.diagonal() #This matches vi in MATLAB for s=1, trials 8,10,14
    
    #first part of equation in ProbaChoice.m, line 242
    z1=np.multiply(-2**0.5, vi)

    #second part of equation in ProbaChoice.m, line 242
    z2=np.multiply(-2**0.5, x)

    #These values have been validated
    zz = [z1-ele for ele in z2]

    aa=np.prod(norm.cdf(zz), axis=1)
    #Pi have been validated
    Pi=np.divide(np.sum(np.multiply(w.reshape(100,1), aa), axis=0), np.pi**0.5)
    
    return Pi
    

In [4]:
def calcPiChosen(v, choices):
    
    """v is values from DivisiveNormalization, choices is an array of containing the indices of the chosen options"""

    probs = np.empty((v.shape[1], v.shape[0]))#reverse shape from the data
    #get the size of the choice array. Choice arrays must be the same size
    Jm=v.shape[0]
    temp = np.identity(Jm-1)
    M = np.empty((Jm, Jm-1, Jm))


    for i in range(Jm):
        M[i] = np.concatenate((temp[:,0:i], -1*np.ones((Jm-1,1)), temp[:, i:]), axis=1)

    Mi=M[choices]
    pi = calcPiProbitQuad(Mi, v)
    
    return pi

In [5]:
def calcPiAll(v):
    
    probs = np.empty((v.shape[1], v.shape[0]))#reverse shape from the data
    #get the size of the choice array. Choice arrays must be the same size
    Jm=v.shape[0]
    temp = np.identity(Jm-1)
    M = np.empty((Jm, Jm-1, Jm))


    for i in range(Jm):
        M[i] = np.concatenate((temp[:,0:i], -1*np.ones((Jm-1,1)), temp[:, i:]), axis=1)
    
    for i in range(Jm):
        y=np.array([i]*v.shape[1])

        
        #Matrices for only the chosen options
        Mi=M[y]
        
        pi=calcPiProbitQuad(Mi,v)
        probs[:,i]=pi.T

    return probs

In [6]:
def choose_item(v):
    probs=calcPiAll(v)
    num_subj = v.shape[1]
    Jm = v.shape[0]


    cov = np.ones((Jm, Jm)) * 0.5
    cov[np.arange(Jm), np.arange(Jm)] = 1
    mean = np.zeros(Jm)
    #for i in range(num_it):
    eps = np.random.multivariate_normal(mean, cov, size=num_subj).T
    #print(eps)
    u = v + eps
    item_chosen = u.argmax(axis=0)
    
    return item_chosen


In [69]:
# thetaDN=[0.114, 0.177, 1]
# v=DivisiveNormalization(theta=thetaDN, data=choice_set_vals)
# all_pi = calcPiAll(v=v)
# chosen_pi = calcPiChosen(v=v, choices=chosen_vals)
# ic = choose_item(v)
#Load data from Bollen et al., 2010
#choice = pd.read_csv('/Users/amywinecoff/Documents/CITP/Research/Github/AgentChoiceSim/co1_wide.csv')
choice.shape

(121, 144)

In [68]:
#Load data from Bollen et al., 2010
choice = pd.read_csv('/Users/amywinecoff/Documents/CITP/Research/Github/AgentChoiceSim/co1_wide.csv')  

#for now, remove the conditions with 5 options so I can figure out the code for a fixed set size
choice = choice[~choice['condition'].isin(['Top5', 'Top5_NR'])]

score_cols = [c for c in choice.columns if 'score' in c]
movie_cols = [c for c in choice.columns if 'movie' in c]
choice_set_vals = np.array(choice[score_cols]/10)

choice['chosen_num']=None
for idx, m in enumerate(movie_cols):
    choice['chosen_num'] = np.where(choice[m]==choice["choice"], idx, choice['chosen_num'])
chosen_vals = np.array(choice['chosen_num'].astype(int).values)

chosen = choice_set_vals[np.arange(len(choice_set_vals)), chosen_vals]


In [66]:
choice['choice_set_variance']=np.var(choice_set_vals, axis=1)
choice['choice_set_min']=np.min(choice_set_vals, axis=1)
choice['choice_set_max']=np.max(choice_set_vals, axis=1)
choice['choice_set_mean']=np.mean(choice_set_vals, axis=1)

choice[['condition','choice_set_variance', 'choice_set_min', 'choice_set_max', 'choice_set_mean']].groupby('condition').mean().head()


Unnamed: 0_level_0,choice_set_variance,choice_set_min,choice_set_max,choice_set_mean
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Lin20,0.112596,3.071429,3.992857,3.451429
Lin20_NR,0.180422,3.063333,4.243333,3.5415
Top20,0.007317,3.734483,4.017241,3.821379
Top20_NR,0.007866,3.729412,3.997059,3.812059


### Steps to Computing a Power Analysis Given an Experimental Design and value of theta
1. Read in scores into correct np array format
2. Chose the item given its normalized value, assuming the alternative hypothesis is true 
3. Calculate the probability of the chosen item under the alternative and null hypothesis
4. Calcualte the LRT
5. Rise and repeat

In [36]:
def calcModelLL(data, theta, **kwargs):
    """Calculates the log likelikihood given theta values for a DN model. If a null model is being tested,
    it will chose the item based on the alternative model, then calculate the probability of that choice, and the 
    log-likelihood given both the alternative model and the null model
    """
    #This is not really right. Need to figure out how to solve the probability issue since this is calculating based on the theoretical prob, which is not the same as the observeed prob
    ##TODO: Fix this so that it works on variable data size. Right now only running on 20-movie decisions
    #probably need to calculate this based on the calculated u, not on the theoretical probs
    null_theta = kwargs.get("null_theta", None)
    sim_choice = kwargs.get("sim_choice", False)
    choices = kwargs.get("choices", None)
    
    if not sim_choice and choices is None:
        sim_choice = True
        print("Choice not specified. Defaulting to simulating choices based on alternative theta.")
    
    
    v=DivisiveNormalization(theta=theta, data=data)
    if sim_choice:
        item_chosen = choose_item(v)
    else:
        item_chosen = choices
        
    eps = sys.float_info.epsilon  
    #add epsilon to all values to prevent divide by zero error
    chosen_probs = calcPiChosen(v=v, choices=item_chosen) + eps  
    LL = sum(np.log(chosen_probs))
        
    
    if null_theta:
        v_null=DivisiveNormalization(theta=null_theta, data=data)
        null_chosen_probs = calcPiChosen(v=v_null, choices=item_chosen) + eps
        nullLL = sum(np.log(null_chosen_probs))
            
        return LL, nullLL
    
    return LL
    

In [31]:
def MCPowerSimulation(data, alt_theta, null_theta, dof, iterations=1000, alpha=0.05):
    
    simulation_stats = []
    
    for i in range(iterations):
        LL, nullLL = calcModelLL(data=choice_set_vals, theta=thetaDN, null_theta=thetaDNNull, sim_choice=True)
        
        LR = 2*(LL-nullLL)
        #consider using chi2.sf since sometimes it is more accurate? https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html
        p=1 - chi2.cdf(LR, dof)
        
        simulation_stats.append([i, LL, nullLL, LR, p])
    
    simulation_df = pd.DataFrame(simulation_stats,columns = ["iter","altLL", "nullLL", "LR", "p"])
    
    sig_iters = simulation_df[simulation_df["p"]< alpha]
    
    power = sig_iters.shape[0] / simulation_df.shape[0]
    
    return power, simulation_df

In [32]:
thetaDN3=[0.114, 0.177, 1]#Webb 2020 sigma and omega only for trinary choice
thetaDNNull3 = [thetaDN3[0] 0, 1]#Fix omega to 0 to test hypothesis that normalization occurrs
power3, df3 = MCPowerSimulation(data=choice_set_vals, alt_theta=thetaDN3, null_theta=thetaDNNull3, dof=1)

In [33]:
power3

0.992

In [34]:
df3.head()

Unnamed: 0,iter,altLL,nullLL,LR,p
0,0,-361.182312,-365.877711,9.390798,0.002180773
1,1,-363.120422,-383.588871,40.936898,1.572246e-10
2,2,-361.064398,-365.347611,8.566427,0.003424186
3,3,-362.546792,-377.078355,29.063125,7.005784e-08
4,4,-362.05509,-374.110021,24.109861,9.099314e-07


In [26]:
thetaDNSS=[0.985, 0.02, 1]#Webb 2020 sigma and omega only for set size choice
thetaDNNullSS = [thetaDNSS[0], 0, 1]#Fix omega to 0 to test hypothesis that normalization occurrs
powerSS, dfSS = MCPowerSimulation(data=choice_set_vals, alt_theta=thetaDNSS, null_theta=thetaDNNullSS, dof=1)

In [27]:
powerSS

0.985

In [70]:
thetaDNw=[0.001, 0.442, 18.85]
calcModelLL(data=choice_set_vals, theta=thetaDNSS, choices=chosen_vals)

-340.2286206602855

In [57]:
thetaDNwNull=[1.21, 0, 1]
calcModelLL(data=choice_set_vals, theta=thetaDNwNull, choices=chosen_vals)

-330.2544285156778

In [13]:
#Figure out how to adapt for the current situation, if needed for testing non-nested models
# def vuong_test(p1, p2):
#     r"""
#     https://gist.github.com/jseabold/6617976
#     Vuong-test for non-nested models.
#     Parameters
#     ----------
#     p1 : array-like
#         f1(Y=y_i | x_i)
#     p2 : array-like
#         f2(Y=y_i | x_i)
#     Notes
#     -----
#     This is hard-coded for testing Poisson vs. Zero-inflated. E.g.,
#     it does not account for
#     Let f_j(y_i|x_i) denote the predicted probability that random variable Y
#     equals y_i under the assumption that the distribution is f_j(y_i|x_i) for
#     j = 1,2. Let
#     .. math::
#        m_i = log(\frac{f_1(y_i|x_i)}{f_2(y_i|x_i)})
#     The test statistic from Vuong to test the hypothesis of Model 1 vs.
#     Model 2 is
#     .. math::
#        v = \frac{\sqrt{n}(1/n \sum_{i=1}^{n}m_i)}{\sqrt{1/n \sum_{i=1}^{n}(m_i - \bar{m})^2}}
#     This statistic has a limiting standard normal distribution. Values of
#     v greater than ~2, indicate that model 1 is preferred. Values of V
#     less than ~-2 indicate the model 2 is preferred. Values of |V| < ~2 are
#     inconclusive.
#     References
#     ----------
#     Greene, W. Econometric Analysis.
#     Vuong, Q.H. 1989 "Likelihood ratio tests for model selection and
#         non-nested hypotheses." Econometrica. 57: 307-333.
#     """
#     m = np.log(p1) - np.log(p2)
#     n = len(m)
#     v = n ** .5 * m.mean() / m.std()
#     return v, stats.norm.sf(np.abs(v))

In [14]:
# save as Python
#!jupyter nbconvert --to script DivisiveNormalization.ipynb