# 推荐系统

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="notebook", style="white", palette=sns.color_palette("RdBu"))
import numpy as np
import pandas as pd
import scipy.io as sio

# load data

In [2]:
movies_mat = sio.loadmat('./data/ex8_movies.mat')
# Index directly: a missing variable raises KeyError right here instead of
# silently producing None and failing later on `.shape`.
Y, R = movies_mat['Y'], movies_mat['R']
# Y: (num_movies, num_users) ratings matrix; one row per movie, one column per user.
# R: same shape as Y; R[i, j] = 1 if the i-th movie was rated by the j-th user.
Y.shape, R.shape

((1682, 943), (1682, 943))

In [3]:
# Number of features describing each movie.
n = 10

# Rows of Y are movies, columns are users.
m, u = Y.shape  # m = number of movies, u = number of users

In [4]:
param_mat = sio.loadmat('./data/ex8_movieParams.mat')
# Index directly: a missing variable raises KeyError right here instead of
# silently producing None and failing later on `.shape`.
theta, X = param_mat['Theta'], param_mat['X']

# theta: (num_users, num_features) - each user's preference for every movie feature.
# X:     (num_movies, num_features) - the feature vector of every movie.
theta.shape, X.shape

((943, 10), (1682, 10))

# cost
<img style="float: left;" src="../img/rcmd_cost.png">

In [5]:
def serialize(X, theta):
    """Flatten X and theta and join them into one 1-D parameter vector.

    The flattened entries of ``X`` (movie features) come first, followed by
    the flattened entries of ``theta`` (user preferences).
    """
    flat_movies = X.ravel()
    flat_users = theta.ravel()
    return np.concatenate([flat_movies, flat_users])


def deserialize(param, n_movie, n_user, n_features):
    """Split a flat parameter vector back into the ``(X, theta)`` matrices.

    The first ``n_movie * n_features`` entries are reshaped to
    ``(n_movie, n_features)``; all remaining entries are reshaped to
    ``(n_user, n_features)``.
    """
    split_at = n_movie * n_features
    X = param[:split_at].reshape(n_movie, n_features)
    theta = param[split_at:].reshape(n_user, n_features)
    return X, theta
           
def cost(param, Y, R, n_features):
    """Unregularized cost: half the sum of squared errors over rated entries.

    ``param`` is the flat vector produced by ``serialize``; the matrix sizes
    are recovered from ``Y.shape`` and ``n_features``.
    """
    n_movie, n_user = Y.shape
    X, theta = deserialize(param, n_movie, n_user, n_features)

    # Error of every (movie, user) prediction. Multiplying by R zeroes the
    # entries where R[i, j] == 0, so those pairs contribute nothing.
    masked_error = np.multiply(X @ theta.T - Y, R)

    squared = np.power(masked_error, 2)
    return squared.sum() / 2

def gradient(param, Y, R, n_features):
    """Gradient of ``cost`` with respect to the flat parameter vector.

    Returns the gradient for X followed by the gradient for theta, flattened
    with ``serialize`` so it lines up with ``param``.
    """
    n_movies, n_user = Y.shape
    X, theta = deserialize(param, n_movies, n_user, n_features)

    # (n_movies, n_user) prediction error, zeroed where R is 0.
    masked_error = np.multiply(X @ theta.T - Y, R)

    # masked_error @ theta has X's shape; masked_error.T @ X has theta's shape.
    return serialize(masked_error @ theta, masked_error.T @ X)

def regularized_cost(param, Y, R, n_features, l=1):
    """``cost`` plus an L2 penalty of ``(l / 2) * sum(param ** 2)``."""
    squared_norm = np.power(param, 2).sum()
    penalty = squared_norm * (l / 2)
    return cost(param, Y, R, n_features) + penalty

def regularized_gradient(param, Y, R, n_features, l=1):
    """``gradient`` plus the L2 penalty's gradient, ``l * param``."""
    return gradient(param, Y, R, n_features) + l * param

In [6]:
# Flat vector holding the parameters loaded from ex8_movieParams.mat.
param = serialize(X, theta)

# Cost of those loaded parameters on the full ratings matrix.
cost(param, Y, R, 10)

27918.64012454421

# gradient
<img style="float: left;" src="../img/rcmd_gradient.png">

In [10]:
n_movie, n_user = Y.shape

# Compute the flat gradient and split it back into its two matrices.
X_grad, theta_grad = deserialize(
    gradient(param, Y, R, 10),
    n_movie,
    n_user,
    10,
)
# `assert` raises AssertionError when its condition is False.
assert X.shape == X_grad.shape
assert theta.shape == theta_grad.shape

# regularized cost

In [12]:
# Total regularized cost of the loaded parameters with l = 0.5.
regularized_cost(param, Y, R, n_features=10, l=0.5)

30219.661287386883

# regularized gradient
<img style="float: left;" src="../img/rcmd_reg_grad.png">

In [13]:
n_movie, n_user = Y.shape

# Regularized gradient (default l=1), split back into its two matrices.
X_grad, theta_grad = deserialize(
    regularized_gradient(param, Y, R, 10),
    n_movie,
    n_user,
    10,
)

# Each gradient must have the same shape as the matrix it belongs to.
assert X.shape == X_grad.shape
assert theta.shape == theta_grad.shape

# examples

In [25]:
with open('./data/movie_ids.txt', encoding='latin-1') as f:
    # Drop the first space-separated token of every line (presumably the
    # numeric movie id) and keep the rest of the line as the title.
    movie_list = np.array([line.strip().partition(' ')[2] for line in f])

print(movie_list)
movie_list.shape

['Toy Story (1995)' 'GoldenEye (1995)' 'Four Rooms (1995)' ...
 'Sliding Doors (1998)' 'You So Crazy (1994)'
 'Scream of Stone (Schrei aus Stein) (1991)']


(1682,)

In [17]:
# Hand-entered ratings for a new user, indexed by 0-based movie position.
# Every movie not listed keeps the value 0.
ratings = np.zeros(1682)

rated_movies = [0, 6, 11, 53, 63, 65, 68, 97, 182, 225, 354]
ratings[rated_movies] = [4, 3, 5, 4, 5, 3, 5, 2, 4, 5, 5]
ratings

array([4., 0., 0., ..., 0., 0., 0.])

# prepare data

In [18]:
# Start again from the matrices stored in the loaded .mat contents.
Y = movies_mat.get('Y')
R = movies_mat.get('R')

# Prepend the hand-entered ratings as column 0, so the new user is user 0.
Y = np.insert(Y, 0, ratings, axis=1)
Y.shape

(1682, 944)

In [20]:
n_movie, n_user = Y.shape  # rows are movies, columns are users
n_features = 50            # number of features to learn per movie / user
l = 10                     # regularization strength used for training

In [19]:
# Build R from the mask stored in the loaded .mat contents rather than from
# the current value of R, so re-running this cell on its own never stacks a
# second "user 0" column. The new column is 1 wherever a rating was entered.
R = np.insert(movies_mat.get('R'), 0, ratings != 0, axis=1)
R.shape

(1682, 944)

In [21]:
# Seed NumPy's global generator so the random initialization is the same on
# every Restart & Run All.
np.random.seed(0)

# Random starting point for the movie features and the user preferences.
X = np.random.standard_normal((n_movie, n_features))
theta = np.random.standard_normal((n_user, n_features))

X.shape, theta.shape

((1682, 50), (944, 50))

In [22]:
# Flatten the randomly initialized X and theta into the single vector that
# is handed to the optimizer as its starting point.
param = serialize(X, theta)

# normalized ratings 

In [23]:
# Centre the ratings on the global mean of Y.
# NOTE(review): this mean is taken over every entry of Y, including the
# entries where R == 0. A mean over rated entries only may be what is
# intended; any change here must be mirrored where Y.mean() is added back to
# the predictions.
Y_norm = Y - np.mean(Y)
Y_norm.mean()

4.6862111343939375e-17

# training

In [24]:
import scipy.optimize as opt

# Minimize the regularized cost with the truncated Newton (TNC) method,
# supplying the analytic gradient. This step is slow.
res = opt.minimize(
    regularized_cost,
    param,
    args=(Y_norm, R, n_features, l),
    method='TNC',
    jac=regularized_gradient,
)

In [26]:
# Show the optimizer's result object (final cost, gradient, status, solution).
res

     fun: 64721.49781506649
     jac: array([-1.59612340e-06,  7.36853734e-08, -2.20209144e-07, ...,
       -7.74720767e-07,  1.87453345e-07, -2.86958654e-07])
 message: 'Converged (|f_n-f_(n-1)| ~= 0)'
    nfev: 2067
     nit: 63
  status: 1
 success: True
       x: array([-0.88899212,  0.33510132,  0.28911036, ...,  0.2481757 ,
       -0.45441441,  0.47895589])

In [27]:
# Split the optimized flat vector back into the learned movie-feature and
# user-preference matrices.
X_trained, theta_trained = deserialize(res.x, n_movie, n_user, n_features)
X_trained.shape, theta_trained.shape

((1682, 50), (944, 50))

In [28]:
# Predicted (mean-centred) rating for every (movie, user) pair.
prediction = np.matmul(X_trained, theta_trained.T)

In [29]:
# Column 0 belongs to the user that was prepended to Y. Add back the global
# mean that was subtracted when Y_norm was built.
my_preds = Y.mean() + prediction[:, 0]
my_preds

array([3.65723632, 2.52103952, 2.29350485, ..., 0.50708576, 0.57596449,
       0.65630735])

In [34]:
# Movie indices ordered from highest to lowest predicted rating.
idx = np.flip(np.argsort(my_preds))
print(idx)
# Highest and lowest predicted ratings.
my_preds[idx[0]], my_preds[idx[-1]]

[ 312   49   63 ... 1578 1579 1570]


(4.1253508787622675, 0.2730206376041848)

In [35]:
# Predicted ratings of the ten best-ranked movies.
my_preds[idx[:10]]

array([4.12535088, 4.0441375 , 3.99324291, 3.91902759, 3.81690614,
       3.81556165, 3.76602629, 3.76322551, 3.75905225, 3.75077927])

In [36]:
# Print the titles of the ten best-ranked movies. The loop variable is named
# `title` so it does not overwrite the global `m` (number of movies).
for title in movie_list[idx[:10]]:
    print(title)

Titanic (1997)
Star Wars (1977)
Shawshank Redemption, The (1994)
Forrest Gump (1994)
Raiders of the Lost Ark (1981)
Braveheart (1995)
Return of the Jedi (1983)
Usual Suspects, The (1995)
Godfather, The (1972)
Schindler's List (1993)
