In [1]:
import numpy as np
import pickle
import matplotlib.pyplot as plt

In [2]:
# NOTE: pickle.load can execute arbitrary code while deserialising; only
# load files that come from a trusted source.
with open('data/get_movies.pkl', 'rb') as f:
    t = pickle.load(f)

# The movies pickle holds (at least) two entries: a dict keyed by movie id
# and a dict keyed by genre name (see how both are indexed further down).
movie_dic, movie_genre_dic = t[0], t[1]

# Rating records; later cells unpack each entry as (user, movie, rating).
with open('data/get_ratings.pkl', 'rb') as f:
    lst = pickle.load(f)

In [3]:
def grad_U(Ui, Yij, Vj, y_mean, reg, eta):
    """
    Perform one SGD step on a single user vector.

    Ui is the ith row of U, Yij the observed rating, Vj the jth column of V,
    y_mean the offset subtracted from the rating before fitting, reg the
    regularization parameter lambda and eta the learning rate.

    Despite the name, the value returned is not a gradient: it is the
    *updated* Ui, i.e. Ui minus eta times the gradient of
    reg * ||Ui||^2 + (Yij - y_mean - Ui . Vj)^2 with respect to Ui.
    """
    # Prediction error of the current factors on this rating.
    residual = Yij - y_mean - np.dot(Ui, Vj)
    # Multiplicative shrinkage contributed by the L2 penalty.
    shrink = 1 - 2 * reg * eta
    return shrink * Ui + 2 * eta * Vj * residual

def grad_V(Vj, Yij, Ui, y_mean, reg, eta):
    """
    Perform one SGD step on a single movie vector.

    Vj is the jth column of V, Yij the observed rating, Ui the ith row of U,
    y_mean the offset subtracted from the rating before fitting, reg the
    regularization parameter lambda and eta the learning rate.

    Despite the name, the value returned is not a gradient: it is the
    *updated* Vj, i.e. Vj minus eta times the gradient of
    reg * ||Vj||^2 + (Yij - y_mean - Ui . Vj)^2 with respect to Vj.
    """
    # Prediction error of the current factors on this rating.
    residual = Yij - y_mean - np.dot(Ui, Vj)
    # Multiplicative shrinkage contributed by the L2 penalty.
    shrink = 1 - 2 * reg * eta
    return shrink * Vj + 2 * eta * Ui * residual

def get_err(U, V, Y, reg=0.0):
    """
    Mean regularized squared error of the factorization on Y.

    Y is a sequence of triples (i, j, Y_ij) where i is a 1-based user index,
    j is a 1-based movie index and Y_ij is user i's rating of movie j.
    U holds one row per user and V one column per movie.

    Each rating is predicted as mean(Y_ij over Y) + U[i-1] . V[:, j-1].
    The squared residuals are summed, reg * (||U||_F^2 + ||V||_F^2) is added
    when reg is nonzero, and the total is divided by len(Y).
    """
    # Offset removed from every rating before comparing with the dot product.
    offset = np.mean(np.array([triple[2] for triple in Y]))

    # Total squared error over all observed ratings.
    total = 0.0
    for user, movie, rating in Y:
        residual = rating - offset - np.dot(U[user - 1], V[:, movie - 1])
        total += residual ** 2

    # Frobenius-norm penalty on both factor matrices (U first, then V).
    if reg != 0:
        for factor in (U, V):
            total += reg * (np.linalg.norm(factor, ord='fro') ** 2)

    # Average over the number of ratings.
    return total / float(len(Y))

def get_msqe_err(U, V, Y, y_mean=None):
    """
    Root-mean-squared error (unregularized) of the factorization on Y.

    Y is a sequence (or 2-D array) of triples (i, j, Y_ij) with 1-based user
    index i and 1-based movie index j. Each rating is predicted as
    y_mean + U[i-1] . V[:, j-1].

    y_mean is the offset added to every prediction. When omitted it defaults
    to the mean rating of Y itself, which is the historical behaviour. When
    scoring held-out data, pass the mean of the *training* ratings so the
    evaluation uses the same offset the factors were fitted with.

    Returns sqrt(mean(residual^2)). Raises ValueError if Y is empty.
    """
    Y = np.asarray(Y)
    if Y.size == 0:
        raise ValueError("Y must contain at least one (i, j, Y_ij) triple")

    # Convert the 1-based ids stored in Y to 0-based row/column positions.
    user_idx = Y[:, 0].astype(int) - 1
    movie_idx = Y[:, 1].astype(int) - 1
    observed = Y[:, 2]

    if y_mean is None:
        y_mean = observed.mean()

    # Row-wise dot products U[i-1] . V[:, j-1] for all ratings at once.
    predicted = np.sum(U[user_idx] * V[:, movie_idx].T, axis=1)
    residuals = observed - y_mean - predicted

    return np.mean(residuals ** 2) ** 0.5

def train_model(M, N, K, eta, reg, Y, Y_test, max_epochs=301):
    """
    Fit a rank-K factorization of the ratings in Y with SGD and early stopping.

    Y and Y_test contain rows (i, j, Y_ij) where Y_ij is the rating of user i
    on movie j; both ids are 1-based (they are used as U[i-1] and V[:, j-1]).
    M is the number of users, N the number of movies. The model learns an
    M x K matrix U and a K x N matrix V such that
    Y_ij ~ mean(training ratings) + U[i-1] . V[:, j-1].

    Uses a learning rate of <eta> and regularization of <reg>. After every
    epoch the error on Y_test is measured; training stops at the first epoch
    that fails to improve it, and the factors from *before* that epoch are
    returned. At most <max_epochs> epochs are run.

    Returns a tuple (U, V, train_err, test_err, epochs) where both errors are
    the values produced by get_msqe_err for the returned factors and epochs
    is the number of epochs that produced them. The same 5-tuple shape is
    returned when the epoch budget runs out without a stop being triggered.
    """
    y_mean = np.mean(np.array([tup[2] for tup in Y]))

    # Initialize U, V uniformly in [-0.5, 0.5)
    U = np.random.random((M, K)) - 0.5
    V = np.random.random((K, N)) - 0.5

    best_test_error = float('inf')
    indices = np.arange(len(Y))
    epoch = 0

    while epoch < max_epochs:
        epoch += 1

        # Visit the training points in a fresh random order each epoch.
        np.random.shuffle(indices)

        # Snapshot the factors so the last improving state can be returned.
        prev_U = U.copy()
        prev_V = V.copy()

        for ind in indices:
            (i, j, Yij) = Y[ind]
            # Update U[i] first; V[j] is then updated against the new U[i].
            U[i-1] = grad_U(U[i-1], Yij, V[:, j-1], y_mean, reg, eta)
            V[:, j-1] = grad_V(V[:, j-1], Yij, U[i-1], y_mean, reg, eta)

        E_in = get_err(U, V, Y, reg)
        # NOTE(review): get_msqe_err centres Y_test on its own mean rather
        # than on y_mean (the training mean used in the updates above);
        # consider scoring held-out data with the training mean instead.
        E_out = get_msqe_err(U, V, Y_test)

        print("Epoch %s, E_in (regularized MSE): %s; E_out (MSQE): %s"%(epoch, E_in, E_out))

        if best_test_error > E_out:
            best_test_error = E_out
        else:
            # Test error stopped improving: roll back to the previous epoch.
            return (prev_U, prev_V, get_msqe_err(prev_U, prev_V, Y), get_msqe_err(prev_U, prev_V, Y_test), epoch - 1)

    # Epoch budget exhausted while still improving: return the current
    # factors in the same 5-tuple shape callers unpack.
    return (U, V, get_msqe_err(U, V, Y), get_msqe_err(U, V, Y_test), epoch)

In [4]:
# Array copy of the rating records; other cells unpack each entry of lst as
# a (user, movie, rating) triple.
ratings = np.array(lst)

# train/test

In [None]:
# Load the train/test splits and cast every value to int. The rows are
# consumed by train_model as (i, j, Y_ij) triples.
Y_train = np.loadtxt('data/train.txt').astype(int)
Y_test = np.loadtxt('data/test.txt').astype(int)

In [None]:
# Problem dimensions passed to train_model as (M, N, K).
num_movies = len(movie_dic)
num_users = 943
m = num_users
n = num_movies
k = 20

# Hyperparameter grid: every (eta, reg) combination is trained once.
etas = [0.005, 0.007, 0.009, 0.003, 0.001]
regs = [0.1, 0.12, 0.14, 0.08, 0.06]

best_pair = (1, 0.001)
best_e_out = float('inf')
best_run = None

# NOTE(review): Y_test drives both early stopping inside train_model and the
# choice of (eta, reg) here, so the reported test error is optimistic.
for eta in etas:
    for reg in regs:
        run = train_model(m, n, k, eta, reg, Y_train, Y_test)

        # Defensive: skip anything that is not the expected 5-tuple
        # (U, V, train_err, test_err, epochs) instead of failing on unpack.
        if not isinstance(run, tuple):
            print(eta, reg, "no result returned; skipped")
            continue

        final_test_err = run[3]
        print(eta, reg, final_test_err)

        if best_e_out > final_test_err:
            best_pair = (eta, reg)
            best_e_out = final_test_err
            best_run = run

            print("new record")

print("Best (eta, reg) pair: ", best_pair)

if best_run is None:
    raise RuntimeError("no (eta, reg) combination produced a trained model")

# Keep the winning model itself. Retraining with the best pair would start
# from a new random initialisation and yield a different model (and test
# error) than the one that won the search.
u, v, final_err, final_test_err, epoch = best_run

print(final_test_err, epoch)

Epoch 1, E_in (regularized MSE): 1.2683515392126055; E_out (MSQE): 1.1467184257810337
Epoch 2, E_in (regularized MSE): 1.21515972204921; E_out (MSQE): 1.1335590230081707
Epoch 3, E_in (regularized MSE): 1.172035594262741; E_out (MSQE): 1.1210308496305361
Epoch 4, E_in (regularized MSE): 1.1160989382816926; E_out (MSQE): 1.1021959310725873
Epoch 5, E_in (regularized MSE): 1.0457170763641035; E_out (MSQE): 1.0766063489157343
Epoch 6, E_in (regularized MSE): 0.9762149406503593; E_out (MSQE): 1.0510708435576046
Epoch 7, E_in (regularized MSE): 0.9183678281520177; E_out (MSQE): 1.0299434949140875
Epoch 8, E_in (regularized MSE): 0.871808813901724; E_out (MSQE): 1.0131169258489767
Epoch 9, E_in (regularized MSE): 0.8355780092238864; E_out (MSQE): 0.9997674030972631
Epoch 10, E_in (regularized MSE): 0.8068969904131255; E_out (MSQE): 0.989486220850158
Epoch 11, E_in (regularized MSE): 0.783914397299441; E_out (MSQE): 0.9814665566911536
Epoch 12, E_in (regularized MSE): 0.7645343967455785; E_ou

In [None]:
# Persist the movie factor matrix v; the file name records the winning
# hyperparameters, the epoch count and the test error. Only v is written
# here, u is not saved.
with open(
    "matrices/normal_best_model_eta_{}_reg_{}_epoch_{}_testerr_{}.matrix".format(
        *best_pair, epoch, final_test_err
    ),
    "wb+",
) as f:
    np.save(f, v)

# SVD visualization

In [None]:
from sklearn.decomposition import TruncatedSVD

# Fit a 2-component truncated SVD on the movie factor matrix v.
# NOTE(review): the projection returned by fit_transform is discarded, and
# the name `svd` is rebound by the numpy import in the following cell, so
# nothing in the visible cells uses this fit. Confirm whether it is needed.
svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
svd.fit_transform(v)
print("done")

In [None]:
from numpy.linalg import svd

# Full SVD of the movie factor matrix: v = a @ diag(sigma) @ b (with sigma
# zero-padded to v's shape). Only `a` is used for the 2-D projection below.
a, sigma, b = svd(v)

In [None]:
# Shape of the left singular vectors returned by the SVD of v.
a.shape

In [None]:
# Shape of the third SVD factor (right singular vectors, as rows).
b.shape

In [None]:
# sigma comes back as a 1-D vector of singular values; np.diag expands it
# into a square diagonal matrix.
np.diag(sigma).shape

In [None]:
# Shape of the user factor matrix returned by train_model.
u.shape

In [None]:
# Shape of the movie factor matrix returned by train_model.
v.shape

In [None]:
# Projection matrix: the first two left singular vectors of v, one per row.
transformer = a.T[:2]
print(transformer.shape)

In [None]:
# Project every movie column of v onto the two leading directions; after
# the transpose there is one row of 2-D coordinates per movie.
transformed_v = (transformer @ v).T
print(transformed_v.shape)

In [None]:
# Project every user row of u with the same matrix; after the transpose
# there is one row of 2-D coordinates per user.
transformed_u = (transformer @ u.T).T
print(transformed_u.shape)

# visualizations

In [None]:
# Pick 10 random rows of the projected movie matrix. The upper bound is taken
# from the array that is actually indexed instead of a hard-coded movie
# count. Sampling is with replacement, so duplicates are possible.
indices = np.random.randint(transformed_v.shape[0], size=10)

In [None]:
# Split the selected rows of the 2-D projection into x and y coordinates.
x = [transformed_v[idx, 0] for idx in indices]
y = [transformed_v[idx, 1] for idx in indices]

In [None]:
# Scatter the currently selected movies in the 2-D projection, labelled so
# the figure can be read on its own.
plt.scatter(x, y)
plt.title("Selected movies in the 2-D projection of V")
plt.xlabel("Component 1")
plt.ylabel("Component 2");

In [None]:
# Count how many ratings each movie received; slot k counts movie id k.
# NOTE(review): this indexes a list of len(movie_dic) slots directly with
# the movie id from lst. Confirm those ids are 0-based; training code
# elsewhere in this notebook treats ids as 1-based (V[:, j-1]).
num_ratings_vs_movies = [0] * len(movie_dic)
for _, movie_id, _ in lst:
    num_ratings_vs_movies[movie_id] += 1
num_ratings_vs_movies = np.array(num_ratings_vs_movies)

In [None]:
# Positions of the ten largest rating counts, most-rated first.
indices = np.argsort(num_ratings_vs_movies)[-10:][::-1]

In [None]:
# 2-D coordinates of the selected (most-rated) movies.
# NOTE(review): V is indexed with j-1 during training; confirm that these
# indices are 0-based row positions in transformed_v.
x = [transformed_v[idx, 0] for idx in indices]
y = [transformed_v[idx, 1] for idx in indices]
plt.scatter(x, y)
plt.title("10 most-rated movies in the 2-D projection of V")
plt.xlabel("Component 1")
plt.ylabel("Component 2");

In [None]:
# Collect every rating each movie received, keyed by movie id.
movie_rating_dic = {movie_key: [] for movie_key in movie_dic.keys()}

for rating in lst:
    _, movie_id, r = rating
    movie_rating_dic[movie_id].append(r)

# Average rating per movie. Movies that received no rating are skipped:
# averaging an empty list would raise ZeroDivisionError.
avg_ratings = {movie: sum(received) / len(received)
               for movie, received in movie_rating_dic.items() if received}

# get ten best movies that received highest average rating
top_10 = sorted(avg_ratings.items(), key=lambda tup: tup[1], reverse=True)[:10]
# Movie ids of those ten movies, best first.
indices = [entry[0] for entry in top_10]

In [None]:
# 2-D coordinates of the selected (highest average rating) movies.
# NOTE(review): these indices are movie ids taken from movie_dic; V is
# indexed with j-1 during training, so confirm the ids are 0-based row
# positions in transformed_v.
x = [transformed_v[idx, 0] for idx in indices]
y = [transformed_v[idx, 1] for idx in indices]
plt.scatter(x, y)
plt.title("10 movies with the highest average rating in the 2-D projection of V")
plt.xlabel("Component 1")
plt.ylabel("Component 2");

In [None]:
# First ten entries listed under the 'Fantasy' genre.
# NOTE(review): the entries are used directly as row positions in
# transformed_v; V is indexed with j-1 during training, so confirm they
# are 0-based.
fant_movies = movie_genre_dic['Fantasy']
indices = fant_movies[:10]
x = [transformed_v[idx, 0] for idx in indices]
y = [transformed_v[idx, 1] for idx in indices]
plt.scatter(x, y)
plt.title("Fantasy movies (first 10) in the 2-D projection of V")
plt.xlabel("Component 1")
plt.ylabel("Component 2");

In [None]:
# First ten entries listed under the 'Documentary' genre. The variable name
# fant_movies is kept for compatibility even though it no longer holds
# Fantasy titles here.
# NOTE(review): the entries are used directly as row positions in
# transformed_v; V is indexed with j-1 during training, so confirm they
# are 0-based.
fant_movies = movie_genre_dic['Documentary']
indices = fant_movies[:10]
x = [transformed_v[idx, 0] for idx in indices]
y = [transformed_v[idx, 1] for idx in indices]
plt.scatter(x, y)
plt.title("Documentary movies (first 10) in the 2-D projection of V")
plt.xlabel("Component 1")
plt.ylabel("Component 2");

In [None]:
# First ten entries listed under the "Children's" genre. The variable name
# fant_movies is kept for compatibility even though it no longer holds
# Fantasy titles here.
# NOTE(review): the entries are used directly as row positions in
# transformed_v; V is indexed with j-1 during training, so confirm they
# are 0-based.
fant_movies = movie_genre_dic["Children's"]
indices = fant_movies[:10]
x = [transformed_v[idx, 0] for idx in indices]
y = [transformed_v[idx, 1] for idx in indices]
plt.scatter(x, y)
plt.title("Children's movies (first 10) in the 2-D projection of V")
plt.xlabel("Component 1")
plt.ylabel("Component 2");