In [50]:
import implicit
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np
from statistics import mean

In [284]:
def implicitModel(movieLensDataTrainPath='../data/train_clean.txt', movieLensDataTestPath='../data/test_clean.txt',
                  factors=25, iterations=400, regularization=0.01):
    """
    Fit implicit's AlternatingLeastSquares on the training ratings and print the
    in-sample / out-of-sample error returned by get_err2.

    Parameters
    ----------
    movieLensDataTrainPath, movieLensDataTestPath : str
        Tab-separated files without a header row; the three columns are read as
        (user id, movie id, rating). Ids are treated as 1-based.
    factors, iterations, regularization
        Forwarded unchanged to implicit.als.AlternatingLeastSquares. The defaults
        are the values that were previously hard-coded in this function.

    Returns
    -------
    (model.item_factors, model.user_factors)
        The matrix handed to fit() has one row per user id and one column per
        movie id, so whichever factor matrix implicit associates with the ROWS
        holds the user vectors. The recorded runs in this notebook show the rows
        ending up in item_factors.
        NOTE(review): newer implicit releases changed which orientation fit()
        expects -- confirm against the installed version before relying on the
        order of the returned pair.

    Notes
    -----
    Raw ratings are passed as the confidence weights of an implicit-feedback
    model, while get_err2 compares the factor dot products against the ratings
    themselves. NOTE(review): the printed errors are therefore not a like-for-like
    explicit-feedback MSE -- confirm this is intended.
    """
    columnNames = ["User Id", "Movie Id", "Rating"]

    dfTrain = pd.read_csv(movieLensDataTrainPath, sep="\t", header=None)
    dfTrain.columns = columnNames

    dfTest = pd.read_csv(movieLensDataTestPath, sep="\t", header=None)
    dfTest.columns = columnNames

    test = dfTest.to_numpy()
    train1 = dfTrain.to_numpy()

    # The matrix must be large enough to index every id seen in either split.
    M = int(max(train1[:, 0].max(), test[:, 0].max()))
    N = int(max(train1[:, 1].max(), test[:, 1].max()))

    # Build the sparse matrix directly instead of filling a dense M x N array one
    # rating at a time. The COO-style constructor SUMS duplicate coordinates, so
    # keep only the last rating of each (user, movie) pair to retain the previous
    # "last assignment wins" result.
    unique = dfTrain.drop_duplicates(subset=["User Id", "Movie Id"], keep="last")
    rows = unique["User Id"].to_numpy().astype(int) - 1   # 1-based id -> 0-based row
    cols = unique["Movie Id"].to_numpy().astype(int) - 1  # 1-based id -> 0-based column
    vals = unique["Rating"].to_numpy().astype(np.float64)
    train = csr_matrix((vals, (rows, cols)), shape=(M, N))
    train.eliminate_zeros()  # a dense -> csr conversion never stores zero ratings

    # train the model on a sparse matrix of item/user/confidence weights
    model = implicit.als.AlternatingLeastSquares(factors=factors, iterations=iterations,
                                                 regularization=regularization)
    model.fit(train)

    # Row-side factors act as "U" (indexed by user id), column-side factors as "V".
    U = np.float64(model.item_factors)
    V = np.float64(model.user_factors)
    print("Insample, outofsample")
    print(get_err2(U, V, train1))
    print(get_err2(U, V, test))
    return model.item_factors, model.user_factors

In [216]:
def get_err2(U, V, Y, reg=0.0):
    """
    Mean regularized squared error of the predictions U[i] . V[j] on the ratings in Y.

    Parameters
    ----------
    U : array-like
        Row i - 1 is the latent vector used for the user with (1-based) id i.
    V : array-like
        Row j - 1 is the latent vector used for the movie with (1-based) id j.
        Rows of U and V must have the same length.
    Y : array-like of triples (i, j, Y_ij)
        i is a user id, j a movie id and Y_ij user i's rating of movie j.
        Ids are truncated to int before use, so float-typed id columns work.
    reg : float
        Weight of the squared Frobenius norms of U and V in the numerator.

    Returns
    -------
    float
        (reg * (||U||_F^2 + ||V||_F^2) + sum_ij (Y_ij - U[i-1] . V[j-1])^2) / (2 * len(Y)).
        NaN when Y contains no rows (the mean over zero ratings is undefined).
    """
    U = np.asarray(U)
    V = np.asarray(V)
    Y = np.asarray(Y)

    totalLength = len(Y)
    if totalLength == 0:
        # Previously this fell through to a 0 / 0 division; make the result explicit.
        return float("nan")

    # Convert the 1-based ids into 0-based row indices for all ratings at once.
    userRows = Y[:, 0].astype(int) - 1
    movieRows = Y[:, 1].astype(int) - 1

    # Row-wise dot product of the selected user and movie vectors.
    predictions = np.einsum("ij,ij->i", U[userRows], V[movieRows])
    residuals = Y[:, 2] - predictions
    sumOfSqs = np.dot(residuals, residuals)

    normSum = (np.linalg.norm(U, ord='fro') ** 2 + np.linalg.norm(V, ord='fro') ** 2)
    return ((reg * normSum) + sumOfSqs) / (2 * totalLength)


In [266]:
def Vtrain(M, N, K, eta, reg, Y, max_epochs=300):
    """
    Factorize the matrix Y with implicit's AlternatingLeastSquares.

    Y is converted to a CSR matrix and fitted with a model whose settings are
    fixed inside this function (factors=25, iterations=400, regularization=0.01).

    The arguments M, N, K, eta, reg and max_epochs are accepted but never read.
    NOTE(review): they look like leftovers from an SGD-style trainer signature;
    the values the caller passes for K and reg have no effect on the model.

    Prints the shape of the sparse input and of both learned factor matrices,
    then returns (model.item_factors, model.user_factors).
    """
    als = implicit.als.AlternatingLeastSquares(factors=25, iterations=400, regularization=0.01)

    sparse_Y = csr_matrix(np.array(Y))
    print(sparse_Y.shape)

    # train the model on a sparse matrix of item/user/confidence weights
    als.fit(sparse_Y)

    learned = (als.item_factors, als.user_factors)
    for factor_matrix in learned:
        print(factor_matrix.shape)
    return learned

In [218]:
def SVDofV(oldV):
    """
    Factorize oldV by delegating to Vtrain and return the two factor matrices.

    Despite the name, no singular value decomposition is computed here: the
    work is done by Vtrain, which fits an implicit ALS model on oldV.

    The printed message labels the first dimension of oldV "users" and the
    second "movies", whatever the caller actually passed in.
    """
    row_count = len(oldV)       # reported as "users"
    col_count = len(oldV[0])    # reported as "movies"
    print("Factorizing with ", row_count, " users, ", col_count, " movies.")

    # Settings forwarded to Vtrain: latent dimension, learning rate, regularization.
    latent_dim = 20
    learning_rate = 0.03
    penalty = 0.0

    left, right = Vtrain(row_count, col_count, latent_dim, learning_rate, penalty, oldV, max_epochs=300)
    return left, right

In [271]:
def tryThis(movieLensDataTestPath='../data/test_clean.txt'):
    """
    Train the ALS model, project its factors onto two dimensions and report errors.

    Steps:
      1. implicitModel() supplies two factor matrices, which are transposed so
         that each ROW is one latent dimension.
      2. Every row of both matrices is mean-centred.
      3. SVDofV factorizes the centred V; the first two columns of the
         transposed first factor are used as the projection basis.
      4. Both matrices are projected onto that basis and every projected row
         is divided by its own maximum.
      5. get_err2 on the test ratings is printed twice: first for the centred
         factors, then for the 2-D projections.

    Parameters
    ----------
    movieLensDataTestPath : str
        Tab-separated test ratings (user id, movie id, rating). It is also
        forwarded to implicitModel so both use the same test split.

    Returns
    -------
    (projU, projV) : the two rescaled projections, each with two rows.

    NOTE(review): rows are divided by their maximum, not their maximum absolute
    value, so a row whose maximum is zero or negative is blown up or sign-flipped
    -- confirm that is intended.
    NOTE(review): the projection only type-checks when the factor matrix
    returned by SVDofV is square -- verify if the ALS settings there change.
    """
    U, V = implicitModel(movieLensDataTestPath=movieLensDataTestPath)
    U = U.T
    V = V.T

    # Read the test ratings once; they are only needed for the error report below.
    dfTest = pd.read_csv(movieLensDataTestPath, sep="\t", header=None)
    dfTest.columns = ["User Id", "Movie Id", "Rating"]
    Y_test = dfTest.to_numpy()

    # Mean-centre each latent dimension (row). This builds new arrays rather than
    # writing into the arrays handed back by implicitModel.
    V = V - V.mean(axis=1, keepdims=True)
    U = U - U.mean(axis=1, keepdims=True)

    # "SVD" of V (actually an ALS factorization, see SVDofV).
    A, B = SVDofV(V)
    A = A.T
    # Use the first 2 cols for work
    Asub = A[:, :2]

    projU = np.dot(Asub.T, U)
    projV = np.dot(Asub.T, V)

    # Rescale dimensions to compress the image
    projV = projV / projV.max(axis=1, keepdims=True)
    projU = projU / projU.max(axis=1, keepdims=True)

    print(get_err2(U.T, V.T, Y_test))
    print(get_err2(projU.T, projV.T, Y_test))
    return projU, projV

In [214]:
# Recorded run: the output below shows 25 latent factors and 400 ALS iterations.
tryThis()

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


Factorizing with  25  users,  1682  movies.
(25, 1682)


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


(25, 25)
(1682, 25)
5.986126027193461
6.762353965648446


(array([[ 0.05268528, -0.01596341, -0.00939974, ...,  0.09213511,
          0.06527893, -0.31026629],
        [ 0.10317169, -0.05010444, -0.1843411 , ..., -0.01632538,
         -0.22638909,  0.55292897]]),
 array([[ 0.02320117, -0.11842777, -0.06163454, ..., -0.00236728,
          0.08143574, -0.0357781 ],
        [ 0.70187777,  0.61530984,  0.3870219 , ..., -0.12506976,
         -0.11006051, -0.11378403]]))

In [220]:
# try factors = 10 - 30
# factor = 10
tryThis()

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


Factorizing with  10  users,  1682  movies.
(10, 1682)


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


(10, 10)
(1682, 10)
6.02367142745021
6.710804176143008


(array([[-0.18773026,  0.02722864, -0.61324116, ...,  0.41145416,
          0.04937325,  0.2802474 ],
        [ 0.20595127,  0.1905625 ,  0.00386952, ...,  0.20186408,
         -0.45943988,  0.13527139]]),
 array([[ 0.27812148,  0.10102885, -0.23861826, ...,  0.17245929,
          0.17798938,  0.17160375],
        [ 0.42012342,  0.02981057,  0.06573513, ...,  0.08290199,
          0.08062281,  0.09543647]]))

In [224]:
# factor = 20
tryThis()

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


Factorizing with  20  users,  1682  movies.
(20, 1682)


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


(20, 20)
(1682, 20)
5.973503195163266
6.683976377432382


(array([[-0.34212088, -0.05473271, -0.01763584, ..., -0.41762962,
         -0.12816006,  0.28655833],
        [ 0.20344307, -0.00435654, -0.27315505, ...,  0.02260609,
         -0.25944322, -0.05902345]]),
 array([[ 0.71028516, -0.2020249 ,  0.0390325 , ..., -0.17089333,
         -0.18243513, -0.1698089 ],
        [ 0.91943676,  0.39897596,  0.0896046 , ..., -0.20103413,
         -0.20880517, -0.1826126 ]]))

In [227]:
# factors = 30 (the recorded output below reports 30 rows being factorized)
tryThis()

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


Factorizing with  30  users,  1682  movies.
(30, 1682)


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


(30, 30)
(1682, 30)
6.01786048579628
6.882924867683018


(array([[-0.13976212,  0.08158357, -0.07820875, ...,  0.43147188,
         -0.04931598, -0.38308099],
        [-0.04860368,  0.18700423, -0.02066827, ..., -0.06466594,
         -0.69803174,  0.42407291]]),
 array([[-0.18404948, -0.22185305, -0.28803219, ...,  0.15989776,
          0.15109004,  0.10136365],
        [-2.70986514, -0.01427051,  0.21092751, ...,  0.44964241,
          0.47512116,  0.43501239]]))

In [230]:
# iterations 100 - 400
# iterations: 100
tryThis()

HBox(children=(IntProgress(value=0), HTML(value='')))


Factorizing with  20  users,  1682  movies.
(20, 1682)


HBox(children=(IntProgress(value=0), HTML(value='')))


(20, 20)
(1682, 20)
5.972868458912404
6.790715096096255


(array([[-0.38496393, -0.21907763,  0.1031503 , ..., -0.09697943,
          0.18586857, -0.4902487 ],
        [ 0.49185578, -0.20401817, -0.17707963, ...,  0.30011438,
         -0.13499003, -0.29155747]]),
 array([[ 0.0039801 , -0.33779609,  0.13614662, ...,  0.04222423,
          0.05711166,  0.07634828],
        [-0.62891121, -0.25017245,  0.32843423, ...,  0.1360753 ,
          0.11710342,  0.15779714]]))

In [233]:
# iterations:200
tryThis()

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Factorizing with  20  users,  1682  movies.
(20, 1682)


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


(20, 20)
(1682, 20)
5.973474659833587
6.787159449866254


(array([[ 1.16180626e-01,  1.34208351e-01,  1.22407386e-02, ...,
         -3.08678152e-01,  1.15044143e-04,  7.87919329e-02],
        [-1.73709396e-02, -3.06361248e-02, -9.50952631e-02, ...,
         -8.72808711e-02, -7.20160317e-02, -1.28628194e-01]]),
 array([[ 0.44847868, -0.28895148,  0.19644355, ..., -0.16918855,
         -0.17517802, -0.15643743],
        [-0.40218023, -0.66541018, -0.1825704 , ...,  0.13507914,
          0.11260007,  0.1052276 ]]))

In [237]:
# iterations: 300
tryThis()

HBox(children=(IntProgress(value=0, max=300), HTML(value='')))


Factorizing with  20  users,  1682  movies.
(20, 1682)


HBox(children=(IntProgress(value=0, max=300), HTML(value='')))


(20, 20)
(1682, 20)
5.973451519872026
6.736709330483526


(array([[ 0.45116178,  0.38645561, -0.14566257, ...,  0.10805086,
         -0.32010844, -0.0573154 ],
        [ 0.10142944, -0.00461617,  0.24076497, ..., -0.07723325,
          0.04280729,  0.47136875]]),
 array([[ 0.62688051,  0.25374831,  0.230187  , ..., -0.08271771,
         -0.09922818, -0.07676293],
        [ 0.42525583,  0.31339303,  0.48834504, ..., -0.09443604,
         -0.06273913, -0.07405505]]))

In [240]:
# iterations: 400
tryThis()

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


Factorizing with  20  users,  1682  movies.
(20, 1682)


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


(20, 20)
(1682, 20)
5.9734925479193395
6.283465322642221


(array([[-0.22504067,  0.37916534,  0.02319564, ...,  0.11974443,
         -0.17384361,  0.39342908],
        [-0.43477646, -0.16778532,  0.05711982, ..., -0.11001323,
         -0.09593267, -0.46196447]]),
 array([[-1.53275005, -0.56527865, -0.44065113, ...,  0.56765532,
          0.46550347,  0.49697101],
        [-0.99376215, -1.49326465, -0.11199443, ...,  0.19913505,
          0.14469228,  0.18373081]]))

In [243]:
# iterations: 500
tryThis()

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Factorizing with  20  users,  1682  movies.
(20, 1682)


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


(20, 20)
(1682, 20)
5.973499330592791
6.7011427055624635


(array([[-0.32450772,  0.15831069,  0.38982431, ...,  0.06480249,
          0.37984955, -0.37472679],
        [ 0.00492199, -0.04634218, -0.13545356, ..., -0.07052355,
         -0.18113828,  0.24322996]]),
 array([[ 0.05786028, -0.13702554, -0.17607546, ...,  0.08877518,
          0.07285865,  0.05870955],
        [ 0.58459888,  0.68518994, -0.33224843, ..., -0.02709495,
         -0.04307124, -0.02770385]]))

In [248]:
# regularization 0
tryThis()

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


Factorizing with  20  users,  1682  movies.
(20, 1682)


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


(20, 20)
(1682, 20)
5.97352038703301
6.519403336630566


(array([[ 0.29248027,  0.03343299, -0.27955791, ..., -0.19730645,
          0.04024245,  0.18217228],
        [ 0.37102111, -0.03236087, -0.0853858 , ..., -0.13162544,
         -0.13891402,  0.17129119]]),
 array([[ 0.68332213,  0.1314203 ,  0.17966133, ..., -0.15454847,
         -0.1477241 , -0.13972041],
        [ 0.59034826,  0.15431865,  0.3578188 , ..., -0.20956752,
         -0.20630329, -0.17779913]]))

In [253]:
# regularization 0.01
tryThis()

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


Factorizing with  20  users,  1682  movies.
(20, 1682)


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


(20, 20)
(1682, 20)
5.973487137713239
6.668532934397548


(array([[ 0.29532711, -0.02029674, -0.14924882, ..., -0.2944268 ,
         -0.08266219,  0.35367771],
        [-0.36468868,  0.18091353, -0.08703954, ..., -0.02328216,
         -0.05006783,  0.35031682]]),
 array([[ 0.01477682,  0.27860298, -0.14683453, ..., -0.18120656,
         -0.18147622, -0.17812888],
        [ 0.37168907,  0.08023991,  0.16457713, ..., -0.08502641,
         -0.08027096, -0.09807211]]))

In [256]:
# regularization 0.02
tryThis()

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


Factorizing with  20  users,  1682  movies.
(20, 1682)


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


(20, 20)
(1682, 20)
5.973513044541484
6.721587950794641


(array([[-0.20076363, -0.01408545, -0.37824274, ...,  0.44381576,
         -0.05180978,  0.00281068],
        [-0.30067288, -0.27350234, -0.13196942, ...,  0.21092256,
         -0.12279952,  0.24304971]]),
 array([[ 0.84382896,  0.07667003,  0.11194925, ..., -0.04147984,
         -0.04614309, -0.04888779],
        [-0.87303388,  0.35951529, -0.11392617, ...,  0.3155167 ,
          0.27981877,  0.28089811]]))

In [259]:
# regularization 0.05
tryThis()

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


Factorizing with  20  users,  1682  movies.
(20, 1682)


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


(20, 20)
(1682, 20)
5.973566527775447
6.7050490608003965


(array([[ 0.20164737, -0.04043921, -0.15172862, ...,  0.27690511,
         -0.1594858 , -0.30149305],
        [-0.51342166,  0.07018362,  0.14884264, ...,  0.0429057 ,
          0.02676919,  0.34776087]]),
 array([[ 0.56404248,  0.28774096,  0.04800291, ..., -0.0822675 ,
         -0.06097277, -0.07162179],
        [-1.39234628,  0.49090086, -1.07015688, ...,  0.33987097,
          0.31976139,  0.31173988]]))

In [262]:
# regularization 0.1
tryThis()

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


Factorizing with  20  users,  1682  movies.
(20, 1682)


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


(20, 20)
(1682, 20)
5.973869039037238
6.804373491099524


(array([[ 0.15716904, -0.20166257,  0.16806437, ..., -0.04462776,
         -0.08706374, -0.06883067],
        [ 0.12440249, -0.15395602,  0.21830658, ..., -0.29955412,
         -0.04255784,  0.45278841]]),
 array([[ 0.60494486,  0.1286197 ,  0.30265511, ..., -0.15047205,
         -0.15530692, -0.15178372],
        [ 0.15663008,  0.2206094 ,  0.10353355, ..., -0.08279333,
         -0.08853967, -0.0796932 ]]))

In [267]:
# factors = 25
tryThis()

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


Factorizing with  25  users,  1682  movies.
(25, 1682)


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


(25, 25)
(1682, 25)
5.987086275813317
6.598190361018987


(array([[-0.09437341,  0.41999004,  0.14257841, ..., -0.01348876,
         -0.40054237,  0.55627769],
        [ 0.05622261,  0.56165251, -0.06689724, ...,  0.08837381,
          0.17068624,  0.02579182]]),
 array([[ 0.26627343,  0.15761198,  0.18690835, ...,  0.11153964,
          0.09119824,  0.11119005],
        [ 0.09457608,  0.00472292,  0.08013387, ..., -0.00575889,
          0.0035797 , -0.02392721]]))

In [285]:
# Re-run with factors = 25; this is the only recorded run whose output includes the
# "Insample, outofsample" errors printed by implicitModel.
tryThis()

HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


Insample, outofsample
4.637117367143353
4.962381218804235
Factorizing with  25  users,  1682  movies.
(25, 1682)


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


(25, 25)
(1682, 25)
5.98703244258325
6.748159214734561


(array([[-0.0685673 ,  0.18518588,  0.0167694 , ..., -0.11315601,
          0.0894184 ,  0.0339749 ],
        [ 0.05526666,  0.12772968, -0.00080178, ..., -0.11363699,
         -0.23556785, -0.20131168]]),
 array([[-1.55254431, -0.63647678, -0.59828789, ...,  0.17141748,
          0.19126394,  0.16369444],
        [-0.92706849, -0.08865029, -0.08549678, ...,  0.08191924,
          0.07359059,  0.06361702]]))