# Recommendation System

Implement the collaborative filtering learning algorithm and apply it to a dataset of movie ratings.

In [52]:
import numpy as np
from scipy.io import loadmat
import scipy.optimize as opt
import matplotlib.pyplot as plt
%matplotlib inline

## 1 Load the data

In [53]:
# Read the movie ratings file.
data = loadmat('data/ex8_movies.mat')
# Y holds the user ratings of movies;
# R(i, j) = 1 if the i-th movie was rated by the j-th user.
Y, R = data['Y'], data['R']
# Show the dimensions of both matrices, one per line.
print(Y.shape, R.shape, sep='\n')

(1682, 943)
(1682, 943)


In [54]:
# Read the parameter file (its 'Theta' and 'X' entries are used below).
param = loadmat('data/ex8_movieParams.mat')
# m = number of rows of Y (movies), u = number of columns of Y (users).
m = Y.shape[0]
u = Y.shape[1]
(m, u)

(1682, 943)

In [55]:
# Pull the two parameter matrices out of the loaded file.
theta, X = param['Theta'], param['X']
# Show the dimensions of theta and X, one per line.
print(theta.shape, X.shape, sep='\n')

(943, 10)
(1682, 10)


## 2 Collaborative filtering learning algorithm

In [56]:
def serialize(X, theta):
    '''
    Flatten X and theta and join them into a single 1-D parameter vector.

    The entries of X come first (row by row), followed by the entries
    of theta (row by row).
    '''
    flat_X = X.flatten()
    flat_theta = theta.flatten()
    return np.concatenate((flat_X, flat_theta))

In [57]:
def deserialize(param, n, u, m):
    '''
    Split a serialized parameter vector back into X and theta.

    The first m * n entries are reshaped to X with shape (m, n); the
    remaining entries are reshaped to theta with shape (u, n).
    Returns the tuple (X, theta).
    '''
    split_at = m * n
    X = param[:split_at].reshape(m, n)
    theta = param[split_at:].reshape(u, n)
    return X, theta

In [58]:
def cost(param, Y, R, n):
    '''
    Unregularized collaborative filtering cost.

    param is the serialized (X, theta) vector, n the number of features.
    The prediction error X @ theta.T - Y is multiplied element-wise by R,
    so entries where R is 0 contribute nothing; the result is half the
    sum of the squared masked errors.
    '''
    n_movies, n_users = Y.shape
    X, theta = deserialize(param, n, n_users, n_movies)
    masked_err = R * (X @ theta.T - Y)
    return (masked_err ** 2).sum() / 2

In [59]:
# Evaluate the cost on a small slice of the data:
# 4 users, 5 movies and 3 features.
u_sub, m_sub, n_sub = 4, 5, 3

Y_sub = Y[:m_sub, :u_sub]
R_sub = R[:m_sub, :u_sub]
X_sub = X[:m_sub, :n_sub]
theta_sub = theta[:u_sub, :n_sub]
cost(serialize(X_sub, theta_sub), Y_sub, R_sub, n=n_sub)

22.224603725685675

In [60]:
def regularized_cost(param, Y, R, n, reg):
    '''
    Collaborative filtering cost plus an L2 penalty.

    The penalty is (reg / 2) times the sum of squares of every entry of
    the serialized parameter vector (both X and theta).
    '''
    squared_norm = (param ** 2).sum()
    penalty = (reg / 2) * squared_norm
    return cost(param, Y, R, n) + penalty

In [61]:
# Regularized cost on the same small slice, with reg = 1.5.
regularized_cost(
    param=serialize(X_sub, theta_sub),
    Y=Y_sub,
    R=R_sub,
    n=n_sub,
    reg=1.5,
)

31.34405624427422

In [62]:
def gradient(param, Y, R, n):
    '''
    Gradient of the unregularized cost with respect to the serialized
    parameters.

    Returns a vector laid out like param: the gradient for X first,
    then the gradient for theta.
    '''
    n_movies, n_users = Y.shape
    X, theta = deserialize(param, n, n_users, n_movies)
    # Prediction error, zeroed wherever R is 0.
    masked_err = R * (X @ theta.T - Y)
    grad_X = masked_err @ theta
    grad_theta = masked_err.T @ X
    return serialize(grad_X, grad_theta)

In [63]:
def regularized_gradient(param, Y, R, n, reg):
    '''
    Gradient of the regularized cost: the unregularized gradient plus
    reg times the parameter vector.
    '''
    base_grad = gradient(param, Y, R, n)
    return base_grad + reg * param

In [64]:
def random_init(n, u, m):
    '''
    Draw standard-normal starting values for X (m x n) and theta (u x n)
    from NumPy's global generator and return them as one serialized vector.
    '''
    # X is drawn before theta; the order fixes which random numbers each gets.
    movie_features = np.random.standard_normal(size=(m, n))
    user_params = np.random.standard_normal(size=(u, n))
    return serialize(movie_features, user_params)

In [65]:
def collaborative_filter(Y, R, n, reg):
    '''
    Fit X and theta by minimizing the regularized cost with TNC.

    Y is shifted by Y.mean() before fitting. That mean is taken over
    every entry of Y, regardless of R. Parameters start from random_init.
    Returns the result object of scipy.optimize.minimize; its x attribute
    holds the serialized (X, theta).
    '''
    n_movies, n_users = Y.shape
    x0 = random_init(n, n_users, n_movies)
    Y_centered = Y - Y.mean()

    return opt.minimize(
        regularized_cost,
        x0,
        args=(Y_centered, R, n, reg),
        method='TNC',
        jac=regularized_gradient,
    )

In [66]:
def predict(X, theta, Y, user_id):
    '''
    Predicted ratings of every movie for the selected user(s).

    X is the (movies x features) matrix, theta the (users x features)
    matrix. user_id indexes rows of theta; it may be a single index or
    an array/slice of indices. Y.mean() (the mean over all entries of Y)
    is added back to every prediction.

    Returns an array with one row per movie: shape (movies,) for a single
    user_id, (movies, k) for k selected users.
    '''
    # Multiply by the selected row(s) of theta only, instead of building
    # the full movies x users prediction matrix and slicing one column.
    return X @ theta[user_id].T + Y.mean()

## 3 Recommendation system

In [67]:
# Original rating provided
ratings = np.zeros(1682)

ratings[0] = 4
ratings[6] = 3
ratings[11] = 5
ratings[53] = 4
ratings[63] = 5
ratings[65] = 3
ratings[68] = 5
ratings[97] = 2
ratings[182] = 4
ratings[225] = 5
ratings[354] = 5

In [68]:
# Add new user and ratings
Y = np.insert(Y, 0, ratings, axis=1)
R = np.insert(R, 0, ratings != 0, axis=1)
print(Y.shape)
print(R.shape)

(1682, 944)
(1682, 944)


In [69]:
# Get movie list
movie_list = []

with open('data/movie_ids.txt', encoding='latin-1') as f:
    for line in f:
        tokens = line.strip().split(' ')
        movie_list.append(' '.join(tokens[1:]))

movie_list = np.array(movie_list)
movie_list.shape

(1682,)

In [70]:
# Seed NumPy's global generator so the random starting point drawn inside
# collaborative_filter, and therefore the fitted result, is repeatable
# across "Restart & Run All".
np.random.seed(0)
res = collaborative_filter(Y, R, n=50, reg=10)
res

     fun: 64721.4978150674
     jac: array([ 5.20494698e-07,  1.67992963e-06, -3.22015397e-07, ...,
       -1.10269041e-06, -3.48987402e-07,  7.62450979e-07])
 message: 'Converged (|f_n-f_(n-1)| ~= 0)'
    nfev: 2343
     nit: 73
  status: 1
 success: True
       x: array([ 0.34870462, -0.00431869,  0.28995785, ...,  0.08995447,
       -0.2101121 ,  0.85348229])

In [85]:
# Recover the learned matrices from the optimizer's solution vector.
# The user and movie counts are read from the Y that was used for training,
# so this cell does not depend on the notebook-level `m` and `u` still
# holding the original dataset dimensions. 50 is the feature count that was
# passed to collaborative_filter.
X_train, theta_train = deserialize(res.x, 50, Y.shape[1], Y.shape[0])
print(X_train.shape)
print(theta_train.shape)

(1682, 50)
(944, 50)


In [91]:
# Predicted ratings for the user in column 0 (the newly added user).
pred = predict(X=X_train, theta=theta_train, Y=Y, user_id=0)
# Movie indices ordered from highest to lowest predicted rating.
idx = pred.argsort()[::-1]
# The ten highest predicted ratings.
pred[idx][:10]

array([4.12535269, 4.04413158, 3.99324183, 3.91902783, 3.81690297,
       3.81556239, 3.76602646, 3.76322429, 3.75905   , 3.75078089])

In [95]:
# Print the ten best-predicted movies with their predicted rating.
# The loop variable is called `title` (not `m`) so that it does not
# overwrite the notebook-level `m` that holds the number of movies.
top10 = idx[:10]
for rank, (title, score) in enumerate(zip(movie_list[top10], pred[top10]), start=1):
    print("{:2d}. {:35} Rating: {:.2f}".format(rank, title, score))

 1. Titanic (1997)                      Rating: 4.13
 2. Star Wars (1977)                    Rating: 4.04
 3. Shawshank Redemption, The (1994)    Rating: 3.99
 4. Forrest Gump (1994)                 Rating: 3.92
 5. Raiders of the Lost Ark (1981)      Rating: 3.82
 6. Braveheart (1995)                   Rating: 3.82
 7. Return of the Jedi (1983)           Rating: 3.77
 8. Usual Suspects, The (1995)          Rating: 3.76
 9. Godfather, The (1972)               Rating: 3.76
10. Schindler's List (1993)             Rating: 3.75
