In [26]:
#import numpy as np
import matplotlib.pyplot as plt
#import scipy as sc
import autograd.scipy as sci  # Thinly-wrapped scipy
import autograd.numpy as np  # Thinly-wrapped numpy
from autograd import grad
from sklearn.linear_model import LogisticRegression
import os
import pandas as pd
import itertools
import time


In [3]:
sc

<pyspark.context.SparkContext at 0x1047e48d0>

In [4]:
# load joke data including features

# load data using pandas
df = pd.read_csv('../data/ratings.csv', sep=' ', header=None)
# uid = user id
# jid = joke id
df.columns = ['uid', 'jid', 'rating']


# Let N be the number of users, M be the number of jokes
# use +1 for easier referencing later
I = df.uid.max() + 1
J = df.jid.max() + 1

# for this problem, we do not need uid --> remove column!
#df = df[['rating', 'jid']]

# import joke features Jf
dfJ = pd.read_csv('../data/features.csv', sep=' ', header=None)
wrange = ['j'+str(w) for w in range(151)]
dfJ.columns = wrange
dfJ.head()

# convert to numpy matrix Jf
Jf = dfJ.values


The loglikelihood is given as
$$ \mathcal{L}(\beta \; \vert \; \mathcal{D}, \theta) = \sum_{i=1}^n \log \left( F_\epsilon(\theta_{Y_i} + X_i^T \beta) - F_\epsilon(\theta_{Y_i-1} + X_i^T\beta) \right)$$

In [5]:
# the log-likelihood function
# Xrat array consisting of (rating, user, item)
# Xfeat array indexable by the item column of Xrat which delivers item features
# theta is the parameter vector
# theta = (u_1, ..., u_I, v_1, ..., v_J, a_1, ..., a_I, b_1, ..., b_J, g, beta_1, ..., beta_L)
# i.e. theta has length (I + J) * K + I + J + 1 + L
# with u_i, v_j \in \mathbb{R}^K
# b stores the buckets for R categories (i.e. we have R-1 elements in b)
# i.e. in b we store b_1, b_2, ..., b_{R-1} for R different categories
def loglikelihood(theta, Xrat, Xfeat, b, I, J, K, R):
    
    L = Xfeat.shape[1]
    
    # first implement the easy latent model using probit...
    llsum = 0
    for row in xrange(Xrat.shape[0]):
        rating = Xrat[row, 0]
        i = Xrat[row, 1]
        j = Xrat[row, 2]

        # asserts for the indices i, j & rating
        assert i < I and i >= 0
        assert j < J and j >= 0
        assert Xfeat.shape[0] == J
        assert rating > 0 and rating <= R

        # the model for the latent variable
        u_i = theta[K * i:K*(i+1)]
        v_j = theta[(I + j) * K:(I + j + 1) * K]
        a_i = theta[(I + J) * K + i]
        b_j = theta[(I + J) * K + I + j]
        g = theta[(I + J) * K + I + J]
        beta = theta[(I + J) * K + I + J + 1:]

        # some asserts for the sizes
        assert len(u_i) == K
        assert len(v_j) == K
        assert len(beta) == L

        # the full model (reduce if necessary)
        # model with features does not work yet...
        model = np.dot(u_i, v_j) + a_i + b_j + g + np.dot(Xfeat[j, :], beta) 

        # here are some other choices

        # easy model using features only
        # model = np.dot(Xfeat[j, :], beta)

        # model using features + biases
        # model = np.dot(Xfeat[j, :], beta) + a_i + b_j + g

        # model using latent factors for user/item
        # model = np.dot(u_i, v_j)

        # model using latent factors for user/item and biases
        # model = np.dot(u_i, v_j) + a_i + b_j + g

        # the ordinal regression part
        # if rating is 1 or R we have a special case
        # another possibility would be to use instead of the ifelse construction
        # dummy values like +/- 9999 for infty
        # note the additional -1 due to space saving!
        if rating == R:
            # note F(infty) = 0 (mathematically not rigourous, limit is more correct)
            llsum += np.log(sci.stats.norm.cdf(b[rating - 2] + model))
        elif rating == 1:
            # note F(-infty) = 0
            llsum += np.log(1 - sci.stats.norm.cdf(b[rating - 1] + model))
        else:
            llsum += np.log(sci.stats.norm.cdf(b[rating - 1] + model) - sci.stats.norm.cdf(b[rating - 2] + model))
    return llsum

In [41]:
# testing the gradient

# create sample df
dftouse = df[['rating', 'uid', 'jid']].head(2000)

# as in the given dataset the indices are 1,...,I and 1, ..., J
# adjust jokes & uid s.t. they serve the 0, ..., I-1 and 0, ..., J-1 space 
dftouse['uid'] = dftouse['uid'] - 1
dftouse['jid'] = dftouse['jid'] - 1

# transform ratings to range 1, ..., R
rating_vals = np.sort(pd.unique(dftouse['rating'].values.ravel()))
minR = dftouse.rating.min()
dftouse['rating'] = dftouse['rating'] - minR + 1
R = len(rating_vals)

# create buckets as midpoints
buckets = 0.5 * (rating_vals[1:] + rating_vals[:-1])

# get length I, J
I = dftouse.uid.max() + 1
J = dftouse.jid.max() + 1

# define some K
K = 2

# convert to numpy data matrix
Xrat = np.array(dftouse)
Xfeat = Jf

# create dummy theta vector (all zeros)
L = Xfeat.shape[1]
theta = np.zeros((I + J) * K + I + J + 1 + L)

# init theta vector with some random values
theta = np.random.normal(size=theta.shape[0], loc=0., scale=0.1)

# compute log likelihood
loglikelihood(theta, Xrat, Xfeat, buckets, I, J, K, R)

-8947.9186545983339

In [7]:
## the rowlikelihood for sgd
def rowloglikelihood(theta, row, Xfeat, b, I, J, K, R):
    L = Xfeat.shape[1]
    rating = row[0]
    i = row[1]
    j = row[2]

    # asserts for the indices i, j & rating
    assert i < I and i >= 0
    assert j < J and j >= 0
    assert Xfeat.shape[0] == J
    assert rating > 0 and rating <= R

    # the model for the latent variable
    u_i = theta[K * i:K*(i+1)]
    v_j = theta[(I + j) * K:(I + j + 1) * K]
    a_i = theta[(I + J) * K + i]
    b_j = theta[(I + J) * K + I + j]
    g = theta[(I + J) * K + I + J]
    beta = theta[(I + J) * K + I + J + 1:]

    # some asserts for the sizes
    assert len(u_i) == K
    assert len(v_j) == K
    assert len(beta) == L

    # the full model (reduce if necessary)
    # model with features does not work yet...
    model = np.dot(u_i, v_j) + a_i + b_j + g + np.dot(Xfeat[j, :], beta) 
    
    # here are some other choices
    
    # easy model using features only
    # model = np.dot(Xfeat[j, :], beta)
    
    # model using features + biases
    # model = np.dot(Xfeat[j, :], beta) + a_i + b_j + g
    
    # model using latent factors for user/item
    # model = np.dot(u_i, v_j)
    
    # model using latent factors for user/item and biases
    # model = np.dot(u_i, v_j) + a_i + b_j + g

    # the ordinal regression part
    # if rating is 1 or R we have a special case
    # another possibility would be to use instead of the ifelse construction
    # dummy values like +/- 9999 for infty
    # note the additional -1 due to space saving!
    if rating == R:
        # note F(infty) = 0 (mathematically not rigourous, limit is more correct)
        return np.log(sci.stats.norm.cdf(b[rating - 2] + model))
    elif rating == 1:
        # note F(-infty) = 0
        return np.log(1 - sci.stats.norm.cdf(b[rating - 1] + model))
    else:
        return np.log(sci.stats.norm.cdf(b[rating - 1] + model) - sci.stats.norm.cdf(b[rating - 2] + model))

In [8]:
# use autograd to get a gradient out of the rowlikelihood
gradrll = grad(rowloglikelihood)
gradll = grad(loglikelihood)

# check that gradient has the right length and output sample
assert len(gradrll(theta, Xrat[5], Xfeat, buckets, I, J, K, R)) == (I + J) * K + I + J + 1 + L
gradrll(theta, Xrat[5], Xfeat, buckets, I, J, K, R)

array([ 0.        ,  0.        ,  0.        , ..., -3.65815073,
        0.        , -3.65815073])

In [9]:
# test the row likelihood by comparing to the full one
llsum = 0.
for row in Xrat:
    llsum += rowloglikelihood(theta, row, Xfeat, buckets, I, J, K, R)
print (llsum, loglikelihood(theta, Xrat, Xfeat, buckets, I, J, K, R))

# do analogously the gradient test...
gsum = 0.
for row in Xrat:
    gsum += gradrll(theta, row, Xfeat, buckets, I, J, K, R)
print gsum
print gradll(theta, Xrat, Xfeat, buckets, I, J, K, R)

(-8065.0295793309624, -8065.0295793309624)
[ -9.38462676e-03  -1.12376626e-02   0.00000000e+00 ...,  -2.50382169e+02
  -3.44893040e+02  -4.16101994e+03]
[ -9.38462676e-03  -1.12376626e-02   0.00000000e+00 ...,  -2.50382169e+02
  -3.44893040e+02  -4.16101994e+03]


In [86]:
# do sgd approximation
M = 100 # choose M samples
gsum = 0.
for row in np.random.randint(Xrat.shape[0], size=M):
    # don't forget scaling!
    gsum += 1. * Xrat.shape[0] / M * gradrll(theta, Xrat[row], Xfeat, buckets, I, J, K, R)
print gsum

[    0.             0.             0.         ...,  -350.56791086
  -425.33674269 -4612.06896786]


In [106]:
# one epoch of sgd!

theta0 = theta.copy()

# combined learning_rate * scale_factor
alpha = 0.05
# shuffle data here!

t=time.time()
for row in Xrat:
    # update theta0 according to current row
    theta0 += alpha * gradrll(theta0, row, Xfeat, buckets, I, J, K, R)
print theta0

print "Time taken: %f s" % (time.time()-t)
print loglikelihood(theta0, Xrat, Xfeat, buckets, I, J, K, R)

[-0.0050993   0.06732006  0.08307213 ..., -0.01384602 -0.11538214
 -1.40406265]
Time taken: 7.128969 s
-1850.51150326


array([-0.0050993 ,  0.06732006,  0.08307213, ..., -0.01384602,
       -0.11538214, -1.40406265])

In [137]:
# Now to parallelize! Stick Xrat into an RDD
xrat_rdd=sc.parallelize(Xrat)

# Get a fresh theta
theta1=theta.copy()

#And then compute the gradient for each row individually, and the sum it
%time ptheta1=xrat_rdd.map(lambda x: alpha*gradrll(theta1,x,Xfeat, buckets, I, J, K, R)).mean()
n=xrat_rdd.count()
# subtract out (n-1)*theta1 to get final theta
loglikelihood(ptheta1, Xrat, Xfeat, buckets, I, J, K, R)

CPU times: user 66 ms, sys: 97 ms, total: 163 ms
Wall time: 9.55 s


-4875.4270370881022

That log likelihood is pretty awful though. Lets break it up into subarrays of size 5 and try again.

In [138]:
# A function we pass to the array to calculate the updated thetas from each subarray

def n_row_sgd(theta,subX, Xfeat, buckets, I, J, K, R):
    for row in subX:
        # update theta0 according to current row
        theta += alpha * gradrll(theta, row, Xfeat, buckets, I, J, K, R)
    return theta

In [142]:
size=5
#Split the array into subarrays of size n
split_xrat=np.split(Xrat,Xrat.shape[0]/size)

#And then parallelize it
split_xrat = sc.parallelize(split_xrat,5)

# Get a fresh theta
theta2=theta.copy()

# Run the sgd!
%time ptheta2=split_xrat.map(lambda subX:n_row_sgd(theta2, subX, Xfeat, buckets, I, J, K, R)).mean()

CPU times: user 75.3 ms, sys: 108 ms, total: 183 ms
Wall time: 6.68 s


In [141]:
loglikelihood(ptheta2, Xrat, Xfeat, buckets, I, J, K, R)

-1640.866923790534

In [104]:
theta2

array([-0.00481152,  0.06098107,  0.08307213, ..., -0.13811418,
       -0.09687869, -0.1490642 ])

In [None]:
# # Overall Todos:
# # add Documentation, code in MLib style

# # This shall be designed for modelling ratings
# # assume we are given n users i =1, ..., I
# # that rated m items j= 1, ..., J
# # Y_n ~ u_i^Tv_j + a_i + b_j + g + X_n * w
# # u_i, v_j are K-dimensional latent variable vectors
# # a_i, b_j, g are user/item/global wise biases (latent)
# # X_n represents the n-th data row with L features
# # w is the weight vector we want to train for
# # Thus, in total our model trains the parameter vector
# # theta = (u_1, ..., u_I, v_1, ..., v_J, a_1, ..., a_I, b_1, ..., b_J, g, w)
# class CumulativeModel:
    
    
#     def __init__(self, buckets = None, modelType = 'logistic', spark=False, numLatentFactors=0):
#         # add here check for correct model types
#         # logistic, probit, minev, maxev
#         self.mtype = modelType
#         self.useSpark = spark
#         self.K = numLatentFactors
#         self.buckets = buckets
        
#     # add here sgd parameters
#     def fit(self, X, y, ...):
#         # first set buckets if necessary
#         if self.buckets is None:
#             warning()
        
#     # make a prediction
#     def predict(self, X, y, ...):
        