In [1]:
import csv
import unittest

import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

## Helper functions
This notebook contains the helper and plotting functions used by all of the other notebooks. They are specific to the kind of data we expect to receive and clean, based on the Movies example in the main text.

In [2]:
# Shared matplotlib defaults: font sizes for axis labels, titles and tick labels.
# `params` stays a module-level name so it can still be inspected after the update.
params = {
    'axes.labelsize': 20,
    'axes.titlesize': 26,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
}
pylab.rcParams.update(params)


In [3]:
########################
# Helper functions
######################

def initialize_suite(TestCase):
    """Build a unittest suite from every test method of ``TestCase``.

    Convenience wrapper used for unit testing in the other notebooks.

    Input
    -----------
    TestCase: a ``unittest.TestCase`` subclass.

    Output
    -----------
    The suite returned by ``unittest.TestLoader.loadTestsFromTestCase``.
    """
    return unittest.TestLoader().loadTestsFromTestCase(TestCase)

def print_fn(nameAndVarList, n=2):
    """Debug-printing helper shared by the functions in the other notebooks.

    Those functions take an optional Boolean printing argument and call this
    wrapper when it is True.

    Inputs
    -----------
    nameAndVarList: A flat list alternating variable names and their values,
        e.g. ['var1', var1, 'var2', var2, ...]
    n: int; size of the chunks the list is cut into. Each chunk is unpacked into
        exactly (name, value), so only the default of 2 unpacks cleanly.

    Function
    -----------
    Walks the list chunk by chunk and prints each name with its value,
    each message preceded by a line break.

    Output
    -----------
    None

    """
    for start in range(0, len(nameAndVarList), n):
        # A chunk that does not hold exactly two items raises ValueError here.
        label, value = nameAndVarList[start:start + n]
        print('\n {} has value {}'.format(label, value))
        
def clean_data(load_path):
    """This cleans the movie data in data/MoviesData.csv, which was downloaded from
    https://osf.io/53ce9/.

    Input
    -----------
    load_path: string: local pathway to MoviesData.csv. This will be data/MoviesData.csv unless
        custom changes made.

    Function
    -----------
    Skips the first two (header) rows, drops the first two columns of every remaining
    row (the non-count columns, including the movie titles), ignores blank lines, and
    converts the remaining rating counts to integers.

    Output
    ----------
    cleanData: a matrix of integer counts; shape (q, k), where q is the number
        of movies / questions, and k is the number of possible ratings / answers.

    Raises
    ----------
    ValueError: if a rating cell cannot be parsed as an integer, or if the rows
        do not all have the same number of rating columns.
    """
    movieData = []
    # newline='' is what the csv module expects so it can handle line endings itself.
    with open(load_path, newline='') as csvFile:
        for i, row in enumerate(csv.reader(csvFile)):
            # Rows 0 and 1 are headers; csv.reader yields [] for blank lines.
            if i < 2 or not row:
                continue
            movieData.append([int(count) for count in row[2:]])

    cleanData = np.array(movieData, dtype=int)
    return cleanData
    
def save_cleaned_data(save_path, cleanData):
    """Thin wrapper around ``np.save``: writes ``cleanData`` to ``save_path``.

    Returns None.
    """
    np.save(file=save_path, arr=cleanData)
    
def load_cleaned_data(load_path):
    """Thin wrapper around ``np.load``: returns whatever is stored at ``load_path``."""
    cleanData = np.load(file=load_path)
    return cleanData

def posterior_predictive_distributions(samples):
    """Function to calculate posterior predictive distributions given by samples.


    Input
    ---------
    samples: dict containing param type as keys, MCMC samples of that parameter as values;
             each value will be p_i * ns, where p_i is the number of individual parameters of that type
             (e.g., means), and ns is the number of stored samples.
             Must contain the keys 'mu' and 'sigma' (one row per question) and 'theta'
             (one row per threshold; thresholds are shared by all questions).


    Function
    -----------
    For each question, get MCMC estimate of different answer probabilities, as explained in main paper:
    for every stored sample, the probability of answer k is the normal CDF (mean mu, standard
    deviation sigma) at theta[k+1] minus the CDF at theta[k]; these are averaged over samples.

    Output
    -----------
    postPreds: a matrix of probabilities; shape (q, k), where q is the number
        of movies / questions, and k = len(samples['theta']) - 1 is the number of possible
        ratings / answers. Rows sum to one when the outermost thresholds are -inf and +inf.

    Raises
    -----------
    KeyError: if 'mu', 'sigma' or 'theta' is missing from samples.
    ValueError: if the 'mu' and 'sigma' sample matrices differ in shape.

    """
    mu = np.asarray(samples['mu'], dtype=float)        # (q, ns)
    sigma = np.asarray(samples['sigma'], dtype=float)  # (q, ns)
    theta = np.asarray(samples['theta'], dtype=float)  # (n_thresholds, ns)

    if mu.shape != sigma.shape:
        raise ValueError("samples['mu'] and samples['sigma'] must have the same shape; "
                         "got {} and {}".format(mu.shape, sigma.shape))

    posts = []
    for mu_q, sigma_q in zip(mu, sigma):
        # One vectorised call per question: theta (n_thresholds, ns) broadcasts
        # against this question's per-sample mu and sigma (ns,).
        cdf = stats.norm.cdf(theta, mu_q, sigma_q)
        # Probability mass between consecutive thresholds, then the MC average over samples.
        posts.append((cdf[1:] - cdf[:-1]).mean(axis=1))

    postPreds = np.array(posts)
    return postPreds
    
def discard_burn_in(samples, burn_in=1000):
    """Drop the burn-in period from every parameter's stored samples.

    Input
    ---------
    samples: dict containing param type as keys, MCMC samples of that parameter as values;
             each value is indexed as [parameter, sample], i.e. p_i * ns, where p_i is the
             number of individual parameters of that type (e.g., means), and ns is the
             number of stored samples;
    burn_in: int; number of initial samples to discard

    Function
    ---------
    Slices each sample matrix along its second axis so that it starts at column burn_in.

    Output
    ----------
    clean_samples: new dict with the same keys as samples; each value is the matching
             sample matrix without its first burn_in columns, i.e. p_i * (ns-burn_in).
    """
    return {name: chain[:, burn_in:] for name, chain in samples.items()}

########################
# Plotting functions
######################
def plot_data(cleanData):
    """This is a plotting function to show all of the answer distributions together, and gauge how
    appropriate metric / ordinal models are.

    Input
    ---------------
    cleanData:  a matrix of integer counts; shape (q, k), where q is the number
        of movies / questions, and k is the number of possible ratings / answers.

    Function
    ---------------
    Prints question number and N in title; normalizes counts for each question,
    then plots all of these as subplots in square figure. Each subplot is a barplot,
    with the bar height at different possible ratings / values of k denoting the fraction
    of responses at that answer. The first column is labelled 'Proportion'; the bottom
    row is labelled 'Choices' and ticked at 1..k.

    Output
    ----------
    None
    """

    q, k = cleanData.shape
    square_len = int(np.ceil(np.sqrt(q)))
    # squeeze=False keeps `ax` two-dimensional even for a 1x1 grid (q == 1),
    # so ax[x, y] indexing below always works.
    fig, ax = plt.subplots(square_len, square_len, figsize=(square_len*8, square_len*4),
                           sharex=True, sharey=True, squeeze=False)
    choices = np.arange(1, k+1)

    for i, counts in enumerate(cleanData):
        x, y = np.unravel_index(i, (square_len, square_len))
        total = np.sum(counts)
        ax[x, y].bar(choices, counts/total)
        # int(...) so the title reads 'N=120' rather than 'N=120.0'
        ax[x, y].set_title('Question {}; N={}'.format(i, int(np.ceil(total))))
        if y == 0:
            ax[x, y].set_ylabel('Proportion')

    # Label the whole bottom row, whether or not the last row of the grid is full.
    for bottom_ax in ax[-1]:
        bottom_ax.set_xlabel('Choices')
        bottom_ax.set_xticks(choices)
    plt.subplots_adjust(hspace=1.0)


def sample_plot(samples, burn_in, slice_freq, fig_len=30, fig_height=30, save_path=None):
    """This function plots the MCMC sample trajectory for every entry of samples
    (the parameters and, if it is stored there, the log likelihood).


    Inputs
    --------
    samples: dict containing param type as keys, MCMC samples of that parameter as values;
             each value will be p_i * ns, where p_i is the number of individual parameters of that type
             (e.g., means), and ns is the number of stored samples;
    burn_in: int: where to draw the burn-in line (x position, in the units of the x-axis);
    slice_freq: int; how frequently samples were saved; sample indices are multiplied
             by this to obtain the x-axis values;
    fig_len, fig_height: figure width and height passed to matplotlib as figsize;
    save_path: string, where to save figure. Nothing is saved when None / empty.

    Function
    ----------
    Creates subplot for each type of parameter (keys in sorted order), then plots all sampled
    values of those parameters against the sample number, with a black vertical line at burn_in.

    Output
    ---------
    None

    Raises
    ---------
    ValueError: if samples is empty.

    """
    param_names = sorted(samples.keys())
    if not param_names:
        raise ValueError('samples must contain at least one parameter type')

    # squeeze=False keeps the axes in an array even when there is a single parameter type.
    fig, axes = plt.subplots(nrows=len(param_names), ncols=1, figsize=(fig_len, fig_height),
                             sharex=True, squeeze=False)
    panels = axes[:, 0]
    # x positions are taken from the first parameter type; every type is assumed
    # to store the same number of samples.
    xs = np.arange(0, len(samples[param_names[0]][0])) * slice_freq  # to scale xticklabels

    for panel, param in zip(panels, param_names):
        for var in samples[param]:
            panel.plot(xs, var)
        panel.set_title('{}'.format(param))
        panel.set_ylabel('Param values')
        # axvline spans the full height of the panel whatever the data range is,
        # including when the samples contain non-finite values.
        panel.axvline(x=burn_in, color='k')

    panels[-1].set_xlabel('Samples')

    if save_path:
        plt.savefig(save_path)
        
def overlay_data(cleanData, postPreds, save_path=None):
    """Function to overlay answer frequencies with posterior predictive probabilities,
    as a visual measure of how well fitted ordinal probit model reflects underlying data distribution.

    Inputs
    -------
    cleanData:  a matrix of integer counts; shape (q, k), where q is the number
        of movies / questions, and k is the number of possible ratings / answers.
    postPreds: a matrix of probabilities; shape (q, k), where q is the number
        of movies / questions, and k is the number of possible ratings / answers. Each row will sum to one.
    save_path: None or string; if not None / empty, where to save plot.

    Function
    ------------
    Very similar to plot_data, except overlays second matrix as dots on first:
    one barplot of normalized counts per question in a square grid of subplots,
    with row i of postPreds drawn as black dots on top of question i's bars.

    Outputs
    ----------
    None

    """

    q, k = cleanData.shape
    square_len = int(np.ceil(np.sqrt(q)))
    # squeeze=False keeps `ax` two-dimensional even for a 1x1 grid (q == 1),
    # so ax[x, y] indexing below always works.
    fig, ax = plt.subplots(square_len, square_len, figsize=(square_len*8, square_len*4),
                           sharex=True, sharey=True, squeeze=False)
    xs = np.arange(1, k+1)

    for i, counts in enumerate(cleanData):
        x, y = np.unravel_index(i, (square_len, square_len))
        total = np.sum(counts)
        ax[x, y].bar(xs, counts/total)
        ax[x, y].plot(xs, postPreds[i], 'ok')
        ax[x, y].set_title('Question {}; N: {}'.format(i, total))
        if y == 0:
            ax[x, y].set_ylabel('Proportion')

    # Label the whole bottom row, whether or not the last row of the grid is full.
    for bottom_ax in ax[-1]:
        bottom_ax.set_xlabel('Choices')
        bottom_ax.set_xticks(xs)
    plt.subplots_adjust(hspace=1.0)

    if save_path:
        plt.savefig(save_path)

    