In [1]:
import numpy as np
from scipy import stats
import unittest
import csv
import gc
import sys
%run helper_fns.ipynb
%run probability_fns.ipynb
%run likelihood_fns.ipynb
%run sampling_fns.ipynb

...........
----------------------------------------------------------------------
Ran 11 tests in 0.009s

OK
...........
----------------------------------------------------------------------
Ran 11 tests in 0.009s

OK
......
----------------------------------------------------------------------
Ran 6 tests in 0.012s

OK
...........
----------------------------------------------------------------------
Ran 11 tests in 0.009s

OK
...........
----------------------------------------------------------------------
Ran 11 tests in 0.010s

OK
......
----------------------------------------------------------------------
Ran 6 tests in 0.012s

OK
..

end is lower than start; can happen and will return 0 which will cause neg infs later: [-inf, 0.0, -1.0, 1.0, inf]
end is lower than start; can happen and will return 0 which will cause neg infs later: [-inf, 0.0, -1.0, 1.0, inf]
A will cause overflow:        0.35529257637474965 (A1) -3.6897845409794487 (A2) -4.045077117354198 (A3) 0
some thetas out of order: [-inf, 1.5, 1.3, 3.5, inf]; test: [False  True False False  True]
np.inf returned; should only really happen with thetas out of order:[-inf, 1.5, 1.3, 3.5, inf]
LL: -inf
LP: -3.4768155996140178
A is -inf: A -inf; A1 -inf A2 -10.146213436622062 A3 -0.2255550003176261


  LLQ += counts * np.log(prob) # this means fewer calls to lookup table
  real_LP_1 = np.sum(np.log([max(0,
.
----------------------------------------------------------------------
Ran 3 tests in 0.018s

OK


## Likelihood functions and checks
This notebook is to be used as a submodule that contains wrappers for all the likelihood functions used by the ordinal probit model for survey data, and Metropolis-Hastings sampler. There is also an optional testing suite.

In [2]:
class OrdinalProbitSurveyData():
    """A class to fit ordinal probit models to survey data using MCMC. Meant to implement the model in 
    "Analyzing ordinal data with metric models: What could possibly go wrong?""
    Torrin M.Liddell and John K.Kruschke (2018)
    https://www.sciencedirect.com/science/article/abs/pii/S0022103117307746
    
    Initial implementation: Ruairidh McLennan Battleday
    
    Inputs
    ---------
    
    surveyResponses: matrix of ints or list or list of ints. Each row or list contains the counts of responses
                     to a particular question, where the answers form an ordinal list (ordered sequence)
                     and each entry is the counts for each answer. That is, entry 0 (0-based indexing) contains
                     the counts for answer 1 (1-based indexing) to that question. 
                     All rows or questions must have the same meaning and permissible responses
                     (e.g., similarity, 1=highly similar, 9=not at all similar, etc).
                     
                     
    k: int; the number of applicable responses. This will be used to define a range of possible responses
                    from 0:k-1.
                    
                    
    num_samples: number of MCMC samples after discarding burn in samples;
    burn_in: number of initial MCMC samples to discard;
    slice_frequency: how frequently to retain samples after burn in samples;
    print_frequency: how frequently to print.
    
    printing: whether to call subfunctions and methods with printing on (prints all intermediate values)
    debug: whether to run assertion statements within subfunctions.
    
    Structures
    --------------
    paramDict: a dictionary of variables and arguments for each parameter, which are used to conduct MCMC. 
                Should contain the parameter name as key, which returns a sub dictionary.
                Each subdictionary has the following key:value pairs
                "value" key that stores its value
                "proposal_function" key, which contains the proposal function
                "proposal" key, which stores a list of proposal function arguments
                "acceptance_function" key, which stores the acceptance function
                "acceptance" key, which stores the acceptance function arguments.
                
    Methods
    -------------
    Cycle function, which updates a single parameter, and then updates dependencies.
                    
    Outputs
    -----------
    Each of the below will be embedded into a matrix of K x S, where K is the size of the paramter vector in question,
    and S is the number of retained MCMC samples.
    
    mus: vector of floats. These are the mean parameters for the latent Gaussians underlying each question;
    sigmas: vector of floats from positive reals. These are the sd parameters for the latent Gaussians underlying each question;
    thetas: vector of floats from positive reals; ascending sequence. These define the quantiles used to calculate
            the mean response for a question / the response thresholds determining the probability of any given answer.
    """
    
    def __init__(self, surveyResponses, k, num_samples, burn_in, slice_frequency, print_frequency,
                printing=False, debug=False):
        
        self.surveyResponses = surveyResponses
        self.Q = None
        self.k = k
        self.num_samples = num_samples
        self.burn_in = burn_in
        self.slice_frequency = slice_frequency
        self.print_frequency = print_frequency
        self.printing = printing
        self.debug = debug
        self.paramDict = {}
        self.paramNames = []
        
        
        self.mu_0 = (self.k+1)/2
        self.sigma_0 = self.k
        self.sigma_prop = 1.0 # proposal function SD
        self.gamma_mean = 3.0 # mean of gamma prior on sigmas; note, original paper used 3.0 as mode, 
        # but I've changed this to mean for ease of conversion (was it a typo in original paper?)
        self.gamma_spread = 3.0 # sd of gamma prior on sigmas
        self.theta_shift = 0.5
        self.lower = 1.5
        self.upper = k-0.5
        
        self.initialize()

        
    def initialize(self):
        try:
            self.surveyResponses = np.int_(np.array(self.surveyResponses))
        except Exception as E:
            print(E)
            print("Survey responses are the wrong format: should be List of Lists of ints" + \
            "with each subList the same length; or, matrix of ints.")
        assert self.surveyResponses.shape[-1] == self.k, "mismatch between number of columns in surveyResponses and k {} vs {}".format(self.surveyResponses[-1],
                                                                                                                                 k)
        # first establish global and prior parameters; taken from original paper
        self.Q = self.surveyResponses.shape[0]
        self.current_values = {'mu': np.empty(self.Q), 'sigma': np.empty(self.Q), 'theta': np.empty(self.k+1)}
        self.current_values['theta'][0] = -np.inf
        self.current_values['theta'][1] = self.lower
        self.current_values['theta'][self.k] = np.inf
        self.current_values['theta'][self.k-1] = self.upper

        # fill in proposals; should these be functions, so they are recalled to generate a list?
        # or, should there me separate methods for that?
        # should have a stored copy of mus, sigmas, and thetas
        for q in np.arange(self.Q):
            self.paramDict["mu_{}".format(q)] = {"value": self.mu_0,
                                           "proposal_fn": mu_proposal,
                                            "proposal": self.refresh_mu_proposal,
                                            "acceptance_fn": mu_accept,
                                            "acceptance": self.refresh_mu_acceptance}
            
            self.paramDict["sigma_{}".format(q)] = {"value": self.gamma_mean,
                                           "proposal_fn": sigma_proposal,
                                            "proposal": self.refresh_sigma_proposal,
                                            "acceptance_fn": sigma_accept,
                                            "acceptance": self.refresh_sigma_acceptance}
            
        for k_prime in np.arange(2, self.k):
            # just watch this, in case it is redefining last slot
            self.paramDict["theta_{}".format(k_prime)] = {"value": k_prime+self.theta_shift,
                                           "proposal_fn": theta_proposal,
                                            "proposal": self.refresh_theta_proposal,
                                            "acceptance_fn": theta_accept,
                                            "acceptance": self.refresh_theta_acceptance}
            
        self.paramList = sorted(self.paramDict.keys())
        
        for param_name in self.paramList:
            self.update_cached_value(param_name, self.paramDict[param_name]['value'] )
            
    def update_cached_value(self, param_name, value):
        param, num = param_name.split('_')
        num = int(num)
        self.current_values[param][num] = self.paramDict[param_name]['value'] 
        
    def refresh_mu_proposal(self, param, num):
        return [self.current_values[param][num], self.sigma_prop]
    
    def refresh_mu_acceptance(self, param, num, proposal):
        return [self.current_values['mu'][num], proposal, self.surveyResponses[num],
               self.current_values['sigma'][num], self.current_values['theta'], 
               self.mu_0, self.sigma_0, self.sigma_prop]

    def refresh_sigma_proposal(self, param, num):
        return [self.current_values['sigma'][num], self.gamma_spread]
    
    def refresh_sigma_acceptance(self, param, num, proposal):        
        return [self.current_values[param][num], proposal, 
                self.surveyResponses[num], self.current_values['mu'][num], 
                self.current_values['theta'], self.gamma_mean, self.gamma_spread]
    
    def refresh_theta_proposal(self, param, num):
        return [self.current_values[param][num], self.sigma_prop, self.lower, self.upper]
    
    def refresh_theta_acceptance(self, param, num, proposal):
        return [self.current_values[param][num], proposal, self.current_values['theta'], 
                self.surveyResponses, self.current_values['mu'], self.current_values['sigma'],
                num, self.theta_shift, self.sigma_0, self.sigma_prop, self.lower, self.upper]
               
    def update_all_params(self, paramList, paramDict, printing=False, debug=False):
        """ParamDict is the key data structure here.

        Returns a copy of dict for safety: might need to rethink this later."""
        for param_name in sorted(paramList):
            gc.collect()
            gc.collect()
            gc.collect()
            print('on {}'.format(param_name))
            param, num = param_name.split('_')
            num = int(num)
            if printing:
                print('updating {}'.format(param_name))
            # here we need to refresh values of dependencies somehow
            proposalArguments = self.paramDict[param_name]["proposal"](param, num)
            proposal = self.paramDict[param_name]["proposal_fn"](*proposalArguments, debug=True)
            
            acceptanceArguments = self.paramDict[param_name]["acceptance"](param, num, proposal)
            accept = self.paramDict[param_name]["acceptance_fn"](*acceptanceArguments, debug=True)
            ru = np.random.uniform()
            
            if accept > ru:
                self.paramDict[param_name]["value"] = proposal
            
            # whatever happens, update cached value
            self.update_cached_value(param_name, paramDict[param_name]['value'])

        # at the end, refresh the values, which are stored as attributes for the class.    

        return paramDict.copy()
        
    

In [3]:
movieData = []

with open('data/MoviesData.csv') as c:
    read = csv.reader(c)
    for i, row in enumerate(read):
        if i < 2:
            continue
        movieData.append(np.int_(row[2:]))
        
cleanData = np.array(movieData)
np.save('data/MoviesData.npy', cleanData)
testClass = OrdinalProbitSurveyData(cleanData, 5, 1000, 100, 100, 100)

In [4]:
testClass.update_all_params(testClass.paramList, testClass.paramDict)

on mu_0
A will cause overflow:        23.40015031878761 (A1) -1161.645441415783 (A2) -1185.0455917345705 (A3) 0
on mu_1
A will cause overflow:        -63.175823792708115 (A1) -1034.2830503288963 (A2) -971.1072265361881 (A3) 0
on mu_10
A will cause overflow:        -73.60726966456855 (A1) -1117.305713516955 (A2) -1043.6984438523864 (A3) 0
on mu_11
A will cause overflow:        285.8148349133003 (A1) -1257.3027160393392 (A2) -1543.1175509526395 (A3) 0
on mu_12
A will cause overflow:        -416.4155801417769 (A1) -2218.9375422037906 (A2) -1802.5219620620137 (A3) 0
on mu_13
A will cause overflow:        -5119.847797778497 (A1) -23590.797234078836 (A2) -18470.94943630034 (A3) 0
on mu_14
A will cause overflow:        17.410591680705352 (A1) -872.7414166891828 (A2) -890.1520083698881 (A3) 0
on mu_15
A will cause overflow:        229.3189286720135 (A1) -7257.562615470153 (A2) -7486.881544142167 (A3) 0
on mu_16
A will cause overflow:        -45.852017207070276 (A1) -1685.5455480835312 (A2) -16

  LLQ += counts * np.log(prob) # this means fewer calls to lookup table


on sigma_16
on sigma_17
on sigma_18
on sigma_19


  return min(1, np.exp(A))


on sigma_2
on sigma_20
on sigma_21
on sigma_22
on sigma_23
on sigma_24
on sigma_25
on sigma_26
on sigma_27
on sigma_28
on sigma_29
on sigma_3
on sigma_30
on sigma_31
on sigma_32
on sigma_33
on sigma_34
on sigma_35
on sigma_4
on sigma_5
on sigma_6
on sigma_7
on sigma_8
on sigma_9
on theta_2


AssertionError: theta proposal out: 4.649908235397529

In [None]:
print(testClass.current_values)