In [1]:
#import numpy as np
import matplotlib.pyplot as plt
#import scipy as sc
import autograd.scipy as sc  # Thinly-wrapped scipy
import autograd.numpy as np  # Thinly-wrapped numpy
from autograd import grad
from sklearn.linear_model import LogisticRegression
import os
import pandas as pd
import itertools

In [2]:
# load joke data including features

# load data using pandas
df = pd.read_csv('../data/ratings.csv', sep=' ', header=None)
# uid = user id
# jid = joke id
df.columns = ['uid', 'jid', 'rating']


# Let N be the number of users, M be the number of jokes
# use +1 for easier referencing later
I = df.uid.max() + 1
J = df.jid.max() + 1

# for this problem, we do not need uid --> remove column!
#df = df[['rating', 'jid']]

# import joke features Jf
dfJ = pd.read_csv('../data/features.csv', sep=' ', header=None)
wrange = ['j'+str(w) for w in range(0, 151)]
dfJ.columns = wrange
dfJ.head()

# convert to numpy matrix Jf
Jf = dfJ.values


The loglikelihood is given as
$$ \mathcal{L}(\beta \; \vert \; \mathcal{D}, \theta) = \sum_{i=1}^n \log \left( F_\epsilon(\theta_{Y_i} + X_i^T \beta) - F_\epsilon(\theta_{Y_i-1} + X_i^T\beta) \right)$$

In [45]:
# the log-likelihood function
# Xrat array consisting of (rating, user, item)
# Xfeat array indexable by the item column of Xrat which delivers item features
# theta is the parameter vector
# theta = (u_1, ..., u_I, v_1, ..., v_J, a_1, ..., a_I, b_1, ..., b_J, g, beta_1, ..., beta_L)
# i.e. theta has length (I + J) * K + I + J + 1 + L
# with u_i, v_j \in \mathbb{R}^K
# b stores the buckets for R categories (i.e. we have R-1 elements in b)
# i.e. in b we store b_1, b_2, ..., b_{R-1} for R different categories
def loglikelihood(theta, Xrat, Xfeat, b, I, J, K, R):
    
    L = Xfeat.shape[1]
    
    # first implement the easy latent model using probit...
    llsum = 0
    for row in xrange(Xrat.shape[0]):
        rating = Xrat[row, 0]
        i = Xrat[row, 1]
        j = Xrat[row, 2]

        # asserts for the indices i, j & rating
        assert i < I and i >= 0
        assert j < J and j >= 0
        assert Xfeat.shape[0] == J
        assert rating > 0 and rating <= R

        # the model for the latent variable
        u_i = theta[K * i:K*(i+1)]
        v_j = theta[(I + j) * K:(I + j + 1) * K]
        a_i = theta[(I + J) * K + i]
        b_j = theta[(I + J) * K + I + j]
        g = theta[(I + J) * K + I + J]
        beta = theta[(I + J) * K + I + J + 1:]

        # some asserts for the sizes
        assert len(u_i) == K
        assert len(v_j) == K
        assert len(beta) == L

        # the full model (reduce if necessary)
        # model with features does not work yet...
        model = np.dot(u_i, v_j) + a_i + b_j + g + np.dot(Xfeat[j, :], beta) 

        # here are some other choices

        # easy model using features only
        # model = np.dot(Xfeat[j, :], beta)

        # model using features + biases
        # model = np.dot(Xfeat[j, :], beta) + a_i + b_j + g

        # model using latent factors for user/item
        # model = np.dot(u_i, v_j)

        # model using latent factors for user/item and biases
        # model = np.dot(u_i, v_j) + a_i + b_j + g

        # the ordinal regression part
        # if rating is 1 or R we have a special case
        # another possibility would be to use instead of the ifelse construction
        # dummy values like +/- 9999 for infty
        # note the additional -1 due to space saving!
#         if rating == R:
#             # note F(infty) = 0 (mathematically not rigourous, limit is more correct)
#             llsum += np.log(sc.stats.norm.cdf(b[rating - 2] + model))
#         elif rating == 1:
#             # note F(-infty) = 0
#             llsum += np.log(1 - sc.stats.norm.cdf(b[rating - 1] + model))
#         else:
#             llsum += np.log(sc.stats.norm.cdf(b[rating - 1] + model) - sc.stats.norm.cdf(b[rating - 2] + model))
            
        llsum += np.log(sc.stats.norm.cdf(b[rating] + model) - sc.stats.norm.cdf(b[rating - 1] + model))
    return llsum

In [46]:
# testing the gradient

# create sample df
dftouse = df[['rating', 'uid', 'jid']].head(200)

# as in the given dataset the indices are 1,...,I and 1, ..., J
# adjust jokes & uid s.t. they serve the 0, ..., I-1 and 0, ..., J-1 space 
dftouse['uid'] = dftouse['uid'] - 1
dftouse['jid'] = dftouse['jid'] - 1

# transform ratings to range 1, ..., R
rating_vals = np.sort(pd.unique(dftouse['rating'].values.ravel()))
minR = dftouse.rating.min()
dftouse['rating'] = dftouse['rating'] - minR + 1
R = len(rating_vals)

# create buckets as midpoints
buckets = 0.5 * (rating_vals[1:] + rating_vals[:-1])
# buckets per hand ( 4 needed)
# 999 is infty
infty = 999
buckets = np.array([-infty, 1.8, 2.2, 3.3, 4.5, infty])

# get length I, J
I = dftouse.uid.max() + 1
J = dftouse.jid.max() + 1

# define some K
K = 2

# convert to numpy data matrix
Xrat = np.array(dftouse)
Xfeat = Jf

# create dummy theta vector (all zeros)
L = Xfeat.shape[1]
theta = np.zeros((I + J) * K + I + J + 1 + L)

# init theta vector with some random values
theta = np.random.normal(size=theta.shape[0], loc=0., scale=0.1)

# compute log likelihood
loglikelihood(theta, Xrat, Xfeat, buckets, I, J, K, R)

-1380.2074667158931

In [47]:
# by hand derived gradient
def manual_grad(theta, Xrat, Xfeat, b, I, J, K, R):
    grad = np.zeros(theta.shape[0])
    
    # go over samples and make update according to them!
    for n in xrange(Xrat.shape[0]):
        
        rating = Xrat[n, 0]
        i = Xrat[n, 1]
        j = Xrat[n, 2]
        
        # the model for the latent variable
        u_i = theta[K * i:K*(i+1)]
        v_j = theta[(I + j) * K:(I + j + 1) * K]
        a_i = theta[(I + J) * K + i]
        b_j = theta[(I + J) * K + I + j]
        g = theta[(I + J) * K + I + J]
        beta = theta[(I + J) * K + I + J + 1:]

        # some asserts for the sizes
        assert len(u_i) == K
        assert len(v_j) == K
        assert len(beta) == L
        
        # compute q_n term
        # the full model (reduce if necessary)
        # model with features does not work yet...
        model = np.dot(u_i, v_j) + a_i + b_j + g + np.dot(Xfeat[j, :], beta) 
        q = 0
        
        # note that this works only for particular pdfs!!!! i.e. they are approaching 0!
        q = sc.stats.norm.pdf(b[rating] + model) - sc.stats.norm.pdf(b[rating - 1] + model) / \
            (sc.stats.norm.cdf(b[rating] + model) - sc.stats.norm.cdf(b[rating - 1] + model))
        
        # compute derivatives
        grad_u_i = q * v_j
        grad_v_j = q * u_i
        grad_a_i = q 
        grad_b_j = q
        grad_g = q
        grad_beta = q * Xfeat[j, :]
        
        grad[K * i:K*(i+1)] += grad_u_i
        grad[(I + j) * K:(I + j + 1) * K] += grad_v_j
        grad[(I + J) * K + i] += grad_a_i
        grad[(I + J) * K + I + j] += grad_b_j
        grad[(I + J) * K + I + J] += grad_g
        grad[(I + J) * K + I + J + 1:] += grad_beta
        
    return grad

In [48]:
gradll = grad(loglikelihood)

# check that gradient has the right length and output sample
#assert len(gradll(theta, Xrat, Xfeat, buckets, I, J, K, R)) == (I + J) * K + I + J + 1 + L
print gradll(theta, Xrat, Xfeat, buckets, I, J, K, R)
# init theta vector with some random values
theta = np.random.normal(size=theta.shape[0], loc=0., scale=0.1)
manual_grad(theta, Xrat, Xfeat, buckets, I, J, K, R)

[   0.            0.            0.         ...,  -65.13756974  -70.621929
 -612.09649219]


array([   0.        ,    0.        ,    0.        , ...,  -57.59335142,
        -74.76345567, -670.64776854])

In [74]:
# the rowlikelihood for sgd
def rowloglikelihood(theta, row, Xrat, Xfeat, b, I, J, K, R):
    L = Xfeat.shape[1]
    rating = Xrat[row, 0]
    i = Xrat[row, 1]
    j = Xrat[row, 2]

    # asserts for the indices i, j & rating
    assert i < I and i >= 0
    assert j < J and j >= 0
    assert Xfeat.shape[0] == J
    assert rating > 0 and rating <= R

    # the model for the latent variable
    u_i = theta[K * i:K*(i+1)]
    v_j = theta[(I + j) * K:(I + j + 1) * K]
    a_i = theta[(I + J) * K + i]
    b_j = theta[(I + J) * K + I + j]
    g = theta[(I + J) * K + I + J]
    beta = theta[(I + J) * K + I + J + 1:]

    # some asserts for the sizes
    assert len(u_i) == K
    assert len(v_j) == K
    assert len(beta) == L

    # the full model (reduce if necessary)
    # model with features does not work yet...
    model = np.dot(u_i, v_j) + a_i + b_j + g + np.dot(Xfeat[j, :], beta) 
    
    # here are some other choices
    
    # easy model using features only
    # model = np.dot(Xfeat[j, :], beta)
    
    # model using features + biases
    # model = np.dot(Xfeat[j, :], beta) + a_i + b_j + g
    
    # model using latent factors for user/item
    # model = np.dot(u_i, v_j)
    
    # model using latent factors for user/item and biases
    # model = np.dot(u_i, v_j) + a_i + b_j + g

    # the ordinal regression part
    # if rating is 1 or R we have a special case
    # another possibility would be to use instead of the ifelse construction
    # dummy values like +/- 9999 for infty
    # note the additional -1 due to space saving!
    if rating == R:
        # note F(infty) = 0 (mathematically not rigourous, limit is more correct)
        return np.log(sc.stats.norm.cdf(b[rating - 2] + model))
    elif rating == 1:
        # note F(-infty) = 0
        return np.log(1 - sc.stats.norm.cdf(b[rating - 1] + model))
    else:
        return np.log(sc.stats.norm.cdf(b[rating - 1] + model) - sc.stats.norm.cdf(b[rating - 2] + model))

In [75]:
# use autograd to get a gradient out of the rowlikelihood
gradrll = grad(rowloglikelihood)
gradll = grad(loglikelihood)

# check that gradient has the right length and output sample
assert len(gradrll(theta, 5, Xrat, Xfeat, buckets, I, J, K, R)) == (I + J) * K + I + J + 1 + L
gradrll(theta, 5, Xrat, Xfeat, buckets, I, J, K, R)

array([ 0.        ,  0.        ,  0.        , ..., -3.09819424,
        0.        , -3.09819424])

In [76]:
# test the row likelihood by comparing to the full one
llsum = 0.
for row in xrange(Xrat.shape[0]):
    llsum += rowloglikelihood(theta, row, Xrat, Xfeat, buckets, I, J, K, R)
print (llsum, loglikelihood(theta, Xrat, Xfeat, buckets, I, J, K, R))

# do analogously the gradient test...
gsum = 0.
for row in xrange(Xrat.shape[0]):
    gsum += gradrll(theta, row, Xrat, Xfeat, buckets, I, J, K, R)
print gsum
print gradll(theta, Xrat, Xfeat, buckets, I, J, K, R)

# do sgd approximation
M = 100 # choose M samples
gsum = 0.
for row in np.random.randint(Xrat.shape[0], size=M):
    # don't forget scaling!
    gsum += 1. * Xrat.shape[0] / M * gradrll(theta, row, Xrat, Xfeat, buckets, I, J, K, R)
print gsum

(-7906.5050053024661, -7906.5050053024661)
[  9.78137949e-02  -5.86879598e-02   0.00000000e+00 ...,  -2.60651665e+02
  -3.36704814e+02  -4.09320543e+03]
[  9.78137949e-02  -5.86879598e-02   0.00000000e+00 ...,  -2.60651665e+02
  -3.36704814e+02  -4.09320543e+03]
[  1.95627590e+00  -1.17375920e+00   0.00000000e+00 ...,  -1.34814650e+02
  -3.28475006e+02  -3.84881225e+03]


In [77]:
# one epoch of sgd!

theta0 = theta.copy()

# combined learning_rate * scale_factor
alpha = 0.05

print theta0
print loglikelihood(theta0, Xrat, Xfeat, buckets, I, J, K, R)

# shuffle data here!
for row in xrange(Xrat.shape[0]):
    # update theta0 according to current row
    theta0 += alpha * gradrll(theta0, row, Xrat, Xfeat, buckets, I, J, K, R)
    
print theta0
print loglikelihood(theta0, Xrat, Xfeat, buckets, I, J, K, R)

[-0.00648461 -0.18237909  0.0618506  ...,  0.05158269  0.05723133
  0.04121165]
-7906.5050053
[-0.00621295 -0.18254208  0.0618506  ...,  0.08054081  0.00643034
 -1.24587309]
-1956.72718667


### Predicting a class

In [78]:
# given a testsample we derive the class via the latent variable
def predict(theta, Xrat, Xfeat, buckets, I, J, K, R):
    
    y = np.zeros(Xrat.shape[0])
    for row in xrange(Xrat.shape[0]):
        L = Xfeat.shape[1]
        rating = Xrat[row, 0]
        i = Xrat[row, 1]
        j = Xrat[row, 2]
    
         # the model for the latent variable
        u_i = theta[K * i:K*(i+1)]
        v_j = theta[(I + j) * K:(I + j + 1) * K]
        a_i = theta[(I + J) * K + i]
        b_j = theta[(I + J) * K + I + j]
        g = theta[(I + J) * K + I + J]
        beta = theta[(I + J) * K + I + J + 1:]

        # some asserts for the sizes
        assert len(u_i) == K
        assert len(v_j) == K
        assert len(beta) == L

        # the full model (reduce if necessary)
        # model with features does not work yet...
        model = np.dot(u_i, v_j) + a_i + b_j + g + np.dot(Xfeat[j, :], beta) 
        Y = -model
        
        # predict y based on where it lies within the buckets
        y[row] = np.sum(Y > buckets) + 1
    return y

In [81]:
np.min(predict(theta0, Xrat, Xfeat, buckets, I, J, K, R))

1.0

In [62]:
# # Overall Todos:
# # add Documentation, code in MLib style

# # This shall be designed for modelling ratings
# # assume we are given n users i =1, ..., I
# # that rated m items j= 1, ..., J
# # Y_n ~ u_i^Tv_j + a_i + b_j + g + X_n * w
# # u_i, v_j are K-dimensional latent variable vectors
# # a_i, b_j, g are user/item/global wise biases (latent)
# # X_n represents the n-th data row with L features
# # w is the weight vector we want to train for
# # Thus, in total our model trains the parameter vector
# # theta = (u_1, ..., u_I, v_1, ..., v_J, a_1, ..., a_I, b_1, ..., b_J, g, w)
# class CumulativeModel:
    
    
#     def __init__(self, buckets = None, modelType = 'logistic', spark=False, numLatentFactors=0):
#         # add here check for correct model types
#         # logistic, probit, minev, maxev
#         self.mtype = modelType
#         self.useSpark = spark
#         self.K = numLatentFactors
#         self.buckets = buckets
        
#     # add here sgd parameters
#     def fit(self, X, y, ...):
#         # first set buckets if necessary
#         if self.buckets is None:
#             warning()
        
#     # make a prediction
#     def predict(self, X, y, ...):
        