In [1]:
import numpy as np
from scipy import stats
import unittest
import csv
import gc
import sys
%run helper_fns.ipynb
%run probability_fns.ipynb
%run likelihood_fns.ipynb
%run sampling_fns.ipynb

...........
----------------------------------------------------------------------
Ran 11 tests in 0.010s

OK
......
----------------------------------------------------------------------
Ran 6 tests in 0.010s

OK
  LLQ += counts * np.log(prob) # this means fewer calls to lookup table
  real_LP_1 = np.sum(np.log([max(0,
.

end is lower than start; can happen and will return 0 which will cause neg infs later: [-inf, 0.0, -1.0, 1.0, inf]
Some thetas out of order after proposal: previous: [-inf, 1.5, 1.3, 3.5, inf] proposed: [False  True False False  True]            These will throw -infs, but will always be rejected.
np.inf returned; should only really happen with thetas out of order:[-inf, 1.5, 1.3, 3.5, inf]
LL: -inf
LP: -3.4768155996140178
A is -inf: A -inf; A1 -inf A2 -6.92554331291155 A3 -0.2255550003176261



----------------------------------------------------------------------
Ran 3 tests in 0.016s

OK


## Ordinal probit class
This notebook contains the class for the ordinal probit model for survey data, along with methods for MCMC (Metropolis-Hastings). It draws on the functions in helper_fns, probability_fns, likelihood_fns, and sampling_fns. There is no test suite yet.

In [2]:
class OrdinalProbitSurveyData():
    """Fit ordinal probit models to survey data using MCMC (Metropolis-Hastings).

    Meant to implement the model in
    "Analyzing ordinal data with metric models: What could possibly go wrong?"
    Torrin M. Liddell and John K. Kruschke (2018)
    https://www.sciencedirect.com/science/article/abs/pii/S0022103117307746

    Initial implementation: Ruairidh McLennan Battleday

    The proposal / acceptance functions (``mu_proposal``, ``mu_accept``, ``sigma_proposal``,
    ``sigma_accept``, ``theta_proposal``, ``theta_accept``) and the log-likelihood ``LL`` are
    not defined in this class; they must already be in the notebook namespace (they are
    loaded by the ``%run`` lines at the top of the notebook).

    Inputs
    ---------
    surveyResponses: matrix of ints, or list of lists of ints. Each row (or sub-list) contains the
                     counts of responses to a particular question, where the answers form an ordinal
                     list (ordered sequence) and each entry is the count for one answer. That is,
                     entry 0 (0-based indexing) contains the counts for answer 1 (1-based indexing)
                     to that question.
                     All rows or questions must have the same meaning and permissible responses
                     (e.g., similarity, 1=highly similar, 9=not at all similar, etc).
    k: int; the number of applicable responses. Must equal the number of columns of surveyResponses.
    num_samples: number of MCMC samples in total (i.e., includes burn in);
    slice_frequency: how frequently to retain samples;
    print_frequency: how frequently to print.
    printing: whether to call subfunctions and methods with printing on (prints all intermediate values)
    debug: whether to run assertion statements within subfunctions.

    Attributes
    ---------------
    Q: int; number of questions (rows of surveyResponses);
    mu_0: scalar; prior mean for mu; set to (k+1)/2;
    sigma_0: scalar in positive Reals; prior sd for mu; set to k;
    sigma_prop: scalar in positive Reals; jump size for all proposal distributions;
    gamma_mean: scalar in positive Reals; mean of gamma prior on sigmas; note, original paper used 3.0
        as mode, but this was changed to mean for ease of conversion (was it a typo in original paper?)
    gamma_spread: scalar in positive Reals; sd of gamma prior on sigmas;
    theta_shift: scalar value in Reals; to add to prior means to center intervals; pretty much always 0.5;
    lower_sigma: scalar; lower boundary of trunc normal proposal for sigma (0);
    upper_sigma: scalar; upper boundary of trunc normal proposal for sigma (100);
    lower_theta: scalar; lower boundary of trunc normal proposal for theta (1.5);
    upper_theta: scalar; upper boundary of trunc normal proposal for theta (k-0.5);

    Structures
    --------------
    param_dict: a dictionary of variables and arguments for each parameter, which are used to conduct
                MCMC. Keys are parameter names of the form "<type>_<index>" (e.g. "mu_0", "theta_2").
                Each value is a sub-dictionary with the following key:value pairs
                "value": the parameter's current value
                "proposal_fn": the proposal function
                "proposal": a bound method that builds the list of proposal function arguments
                "acceptance_fn": the acceptance function
                "acceptance": a bound method that builds the list of acceptance function arguments.
    paramList: a list of the keys of param_dict; should always be saved sorted;
    current_values: a dictionary that contains a cached version of current parameter values.
                The keys are the parameter types ('mu', 'sigma', 'theta'), and the values are vectors
                indexed by question number (mu and sigma) or threshold number (theta; length k+1,
                with -inf and +inf at the two ends).
    return_values: a dictionary that contains all saved parameter value and log likelihood samples.
                The keys are 'mu', 'sigma', 'theta' and 'LL'. While sampling the values are lists of
                per-sample vectors; after cleanup_return_values they are matrices with rows indexed
                by question number (mu and sigma) or threshold number (theta), and columns indexed
                by stored sample number.

    Methods
    -------------
    initialize: uses values from paper and priors to initialize all parameters, and fills in
                remaining attributes and structures.
    refresh_*_proposal / refresh_*_acceptance: build the argument lists for the proposal and
                acceptance functions from the current state. These just replicate argument order
                and values from the corresponding functions in the sampling_fns and probability_fns
                notebooks.
    update_all_params: cycle function, which sequentially updates parameters using paramList.
                For each parameter a proposal is drawn, the acceptance value is computed, and the
                accepted or retained value is stored in param_dict and current_values.
    update_cached_value: helper function to make sure current_values is actually current;
    update_return_values: helper function to append the current state to the stored samples.
                Values are copied, as otherwise they would be overwritten later.
    cleanup_return_values: helper function to make each value of return_values a matrix of size
                (|param type| x number of stored samples).
    refresh_return_values: resets return_values to empty lists.
    MH_sample: sampler function that runs num_samples parameter updates, saves values according to
                the slice_frequency attribute and prints progress according to the print_frequency
                attribute. Importantly, this is the master of printing and debug, which it takes
                from the object's initialization. That is, its own printing/debug arguments are
                not used; the instance attributes are passed down instead.

    Outputs
    -----------
    MH_sample is the only method to return anything. It returns a (shallow) copy of return_values.
    """

    def __init__(self, surveyResponses, k, num_samples=1000, slice_frequency=100, print_frequency=100,
                 printing=False, debug=False):
        """Store the data and sampler settings, set prior/proposal constants, then call initialize()."""
        self.surveyResponses = surveyResponses
        self.Q = None  # set in initialize()
        self.k = k
        self.num_samples = num_samples
        self.slice_frequency = slice_frequency
        self.print_frequency = print_frequency
        self.printing = printing
        self.debug = debug
        self.param_dict = {}
        self.paramList = None  # initialized in initialize()

        self.mu_0 = (self.k + 1) / 2
        self.sigma_0 = self.k
        self.sigma_prop = 0.3  # proposal function SD
        self.gamma_mean = 3.0  # mean of gamma prior on sigmas; note, original paper used 3.0 as mode,
        # but I've changed this to mean for ease of conversion (was it a typo in original paper?)
        self.gamma_spread = 3.0  # sd of gamma prior on sigmas
        self.theta_shift = 0.5
        self.lower_sigma = 0
        self.upper_sigma = 100
        self.lower_theta = 1.5
        self.upper_theta = k - 0.5

        self.initialize()

    def initialize(self, printing=False, debug=False):
        """Validate the data and set every parameter to its starting value.

        Raises
        ------
        ValueError: if surveyResponses cannot be converted to an integer array.
        AssertionError: if the number of columns of surveyResponses differs from k.
        """
        try:
            self.surveyResponses = np.int_(np.array(self.surveyResponses))
        except Exception as E:
            # Fail here with a clear message instead of printing and then crashing later
            # on an unrelated attribute error.
            raise ValueError("Survey responses are the wrong format: should be List of Lists of ints "
                             "with each subList the same length; or, matrix of ints.") from E

        num_columns = self.surveyResponses.shape[-1]
        assert num_columns == self.k, \
            "mismatch between number of columns in surveyResponses and k {} vs {}".format(num_columns,
                                                                                          self.k)

        # first establish global and prior parameters; taken from original paper
        self.Q = self.surveyResponses.shape[0]
        self.current_values = {'mu': np.empty(self.Q), 'sigma': np.empty(self.Q),
                               'theta': np.empty(self.k + 1)}
        # The outermost thresholds are infinite and the next ones in are pinned to
        # lower_theta / upper_theta; none of these four get an entry in param_dict.
        self.current_values['theta'][0] = -np.inf
        self.current_values['theta'][1] = self.lower_theta
        self.current_values['theta'][self.k] = np.inf
        self.current_values['theta'][self.k - 1] = self.upper_theta
        self.refresh_return_values()

        for q in np.arange(self.Q):
            self.param_dict["mu_{}".format(q)] = {"value": self.mu_0,
                                                  "proposal_fn": mu_proposal,
                                                  "proposal": self.refresh_mu_proposal,
                                                  "acceptance_fn": mu_accept,
                                                  "acceptance": self.refresh_mu_acceptance}

            self.param_dict["sigma_{}".format(q)] = {"value": self.gamma_mean,
                                                     "proposal_fn": sigma_proposal,
                                                     "proposal": self.refresh_sigma_proposal,
                                                     "acceptance_fn": sigma_accept,
                                                     "acceptance": self.refresh_sigma_acceptance}

        # Only the interior thresholds 2 .. k-2 are sampled.
        for k_prime in np.arange(2, self.k - 1):
            self.param_dict["theta_{}".format(k_prime)] = {"value": k_prime + self.theta_shift,
                                                           "proposal_fn": theta_proposal,
                                                           "proposal": self.refresh_theta_proposal,
                                                           "acceptance_fn": theta_accept,
                                                           "acceptance": self.refresh_theta_acceptance}

        self.paramList = sorted(self.param_dict.keys())

        # Copy every starting value into the current_values cache.
        for param_name in self.paramList:
            param, num = param_name.split('_')
            num = int(num)
            self.update_cached_value(param, num, self.param_dict[param_name]['value'])

    def refresh_mu_proposal(self, param, num, printing=False, debug=False):
        """Return the positional arguments for mu_proposal: [current value, proposal sd]."""
        return [self.current_values[param][num], self.sigma_prop]

    def refresh_mu_acceptance(self, param, num, proposal, printing=False, debug=False):
        """Return the positional arguments for mu_accept for question `num` and a proposed mu."""
        return [self.current_values['mu'][num], proposal, self.surveyResponses[num],
                self.current_values['sigma'][num], self.current_values['theta'],
                self.mu_0, self.sigma_0, self.sigma_prop]

    def refresh_sigma_proposal(self, param, num, printing=False, debug=False):
        """Return the positional arguments for sigma_proposal: [current value, proposal sd, lower, upper]."""
        return [self.current_values['sigma'][num], self.sigma_prop, self.lower_sigma, self.upper_sigma]

    def refresh_sigma_acceptance(self, param, num, proposal, printing=False, debug=False):
        """Return the positional arguments for sigma_accept for question `num` and a proposed sigma."""
        return [self.current_values[param][num], proposal,
                self.surveyResponses[num], self.current_values['mu'][num],
                self.current_values['theta'], self.gamma_mean, self.gamma_spread, self.sigma_prop,
                self.lower_sigma, self.upper_sigma]

    def refresh_theta_proposal(self, param, num, printing=False, debug=False):
        """Return the positional arguments for theta_proposal: [current value, proposal sd, lower, upper]."""
        return [self.current_values[param][num], self.sigma_prop, self.lower_theta, self.upper_theta]

    def refresh_theta_acceptance(self, param, num, proposal, printing=False, debug=False):
        """Return the positional arguments for theta_accept for threshold `num` and a proposed theta.

        Unlike mu and sigma, the full response matrix and the full mu / sigma vectors are passed.
        """
        return [self.current_values[param][num], proposal, self.current_values['theta'],
                self.surveyResponses, self.current_values['mu'], self.current_values['sigma'],
                num, self.theta_shift, self.sigma_0, self.sigma_prop, self.lower_theta, self.upper_theta]

    def update_all_params(self, printing=False, debug=False):
        """Run one sweep over every parameter in sorted name order.

        self.param_dict is the key data structure here: for each parameter it supplies the
        proposal function, the acceptance function and the methods that build their arguments.
        A proposal is kept when the acceptance value exceeds a fresh uniform(0, 1) draw.
        """
        for param_name in sorted(self.paramList):
            param, num = param_name.split('_')
            num = int(num)
            entry = self.param_dict[param_name]

            proposalArguments = entry["proposal"](param, num)
            proposal = entry["proposal_fn"](*proposalArguments, debug=debug)

            acceptanceArguments = entry["acceptance"](param, num, proposal)
            accept = entry["acceptance_fn"](*acceptanceArguments, debug=debug)
            ru = np.random.uniform()

            if accept > ru:
                if printing:
                    print('updating {} from {} to {}'.format(param_name, entry["value"], proposal))
                entry["value"] = proposal
                if debug:
                    assert self.param_dict[param_name]["value"] == proposal, "param not updating properly"
                if printing:
                    print('updated {} to {}'.format(param_name, entry["value"]))

            # whatever happens, update cached value
            self.update_cached_value(param, num, entry['value'], printing=printing, debug=debug)

    def update_cached_value(self, param, num, value, printing=False, debug=False):
        """Copy the stored value of parameter "<param>_<num>" from param_dict into current_values.

        Note: `value` is accepted for interface compatibility, but the value written is always
        re-read from param_dict so the cache cannot drift from it.
        """
        if printing:
            print('{}_{} was {}'.format(param, num, self.current_values[param][num]))
        self.current_values[param][num] = self.param_dict['{}_{}'.format(param, num)]['value']

        if printing:
            print('{}_{} now {}'.format(param, num, self.current_values[param][num]))

    def update_return_values(self, printing=False, debug=False):
        """Append a copy of the current mu / sigma / theta vectors and the current LL to return_values."""
        for key, vector in self.current_values.items():
            # copy: current_values is mutated in place on every later sweep
            self.return_values[key].append(vector.copy())
            if printing:
                print('updating key {} with values {}'.format(key, vector))
                print('updated return values: {}'.format(self.return_values))
        # LL is wrapped in a list so that it becomes a 1 x n_samples matrix after cleanup.
        self.return_values['LL'].append([LL(self.surveyResponses, self.current_values['mu'],
                                            self.current_values['sigma'], self.current_values['theta'])])

    def cleanup_return_values(self, printing=False, debug=False):
        """Convert each list of stored samples into a matrix with variables as rows, samples as columns.

        With debug on, checks that the number of columns equals the number of draws MH_sample
        stores: one for every s in range(num_samples) with s % slice_frequency == 0, i.e.
        ceil(num_samples / slice_frequency), plus the final state.
        """
        expected_samples = -(-self.num_samples // self.slice_frequency) + 1 if debug else None
        for key in list(self.return_values):
            samples = self.return_values[key]
            newArray = np.array(samples).T
            self.return_values[key] = newArray  # want this to have variables as rows
            if debug:
                expected_shape = (len(samples[0]), expected_samples)
                assert newArray.shape == expected_shape, \
                    "return values incorrect: shape {} but should be {}".format(newArray.shape,
                                                                                 expected_shape)

    def refresh_return_values(self):
        """Reset return_values to empty sample lists for 'mu', 'sigma', 'theta' and 'LL'."""
        self.return_values = {'mu': [], 'sigma': [], 'theta': [], 'LL': []}

    def MH_sample(self, printing=False, debug=False):
        """Run the Metropolis-Hastings sampler and return the stored samples.

        The printing / debug arguments are not used: the instance attributes self.printing and
        self.debug are passed to every sub-call instead.

        Returns a shallow copy of return_values (the arrays themselves are shared with the instance).
        """
        print("Running sampler!")
        # Start from empty storage so the sampler can be called more than once; after a
        # previous run return_values holds arrays, which cannot be appended to.
        self.refresh_return_values()

        for s in range(self.num_samples):
            if s % self.slice_frequency == 0:  # get first sample / initialization
                self.update_return_values(printing=self.printing, debug=self.debug)
                gc.collect()

            self.update_all_params(printing=self.printing, debug=self.debug)

            if s % self.print_frequency == 0:
                print('')
                print('on sample {} / {}. \n last saved LL was: {}'.format(s, self.num_samples,
                                                                           self.return_values['LL'][-1]))

        self.update_return_values(printing=self.printing, debug=self.debug)  # get last sample
        self.cleanup_return_values(printing=self.printing, debug=self.debug)

        return self.return_values.copy()
        
    