In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from time import time

In [2]:
df_ratings_contents = pd.read_table("data/u.data",
                                    names=["user", "movie", "rating", "timestamp"])

df_utility = pd.pivot_table(data=df_ratings_contents, 
                            values='rating', 
                            index='user', 
                            columns='movie', 
                            fill_value=0)

df_utility.head()

movie,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,3,4,3,3,5,4,1,5,3,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
df_utility.head()

movie,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,3,4,3,3,5,4,1,5,3,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df_utility.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 943 entries, 1 to 943
Columns: 1682 entries, 1 to 1682
dtypes: int64(1682)
memory usage: 12.1 MB


In [5]:
highest_user_id = df_ratings_contents.user.max()
highest_movie_id = df_ratings_contents.movie.max()
ratings_mat = sparse.lil_matrix((highest_user_id, highest_movie_id))

for _, row in df_ratings_contents.iterrows():
    # subtract 1 from id's due to match 0 indexing
    ratings_mat[row.user-1, row.movie-1] = row.rating

ratings_mat

<943x1682 sparse matrix of type '<type 'numpy.float64'>'
	with 100000 stored elements in LInked List format>

### NMF

In [6]:
from sklearn.decomposition import NMF

def fit_nmf(M,k):
    nmf = NMF(n_components=k)
    nmf.fit(M)
    W = nmf.transform(M);
    H = nmf.components_;
    err = nmf.reconstruction_err_
    return W,H,err

# decompose
W,H,err = fit_nmf(ratings_mat,200)
print err
print W.shape,H.shape

603.065970426
(943L, 200L) (200L, 1682L)


In [7]:
# reconstruct
ratings_mat_fitted = W.dot(H)
errs = np.array((ratings_mat-ratings_mat_fitted).flatten()).squeeze()
mask = np.array((ratings_mat.todense()).flatten()).squeeze()>0

mse = np.mean(errs[mask]**2)
average_abs_err = abs(errs[mask]).mean()
print mse
print average_abs_err

2.52300501249
1.09266948942


In [33]:
ratings_mat_fitted

array([[  4.99465434e+00,   3.06702900e+00,   4.03501457e+00, ...,
          1.36039702e-05,   4.57832509e-04,   2.38915623e-01],
       [  4.05043862e+00,   2.41608508e-02,   6.85436876e-02, ...,
          1.57833899e-02,   1.91892693e-03,   2.25212076e-03],
       [  1.87814404e-02,   2.32716457e-02,   1.00412197e-02, ...,
          4.63825792e-02,   5.33899506e-03,   0.00000000e+00],
       ..., 
       [  4.90307220e+00,   4.84641406e-02,   1.29646521e-01, ...,
          8.04079189e-04,   3.56989291e-04,   1.85059226e-03],
       [  7.08626009e-02,   3.93412355e-01,   1.04473693e-02, ...,
          1.08709243e-02,   1.79619835e-02,   1.76012854e-03],
       [  1.34614798e-01,   2.48662207e+00,   2.17392339e+00, ...,
          0.00000000e+00,   1.36314004e-02,   1.49114505e-03]])

In [38]:
# get recommendations for one user
user_id = 100
n = 10

pred_ratings = ratings_mat_fitted[user_id,:]
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))[::-1]

items_rated_by_this_user = ratings_mat[user_id].nonzero()[1]

unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                if item not in items_rated_by_this_user]

unrated_items_by_pred_rating[:n]

[475, 273, 684, 24, 590, 454, 290, 863, 147, 249]

In [20]:
### check errors
# truth
ratings_true = ratings_mat[user_id, items_rated_by_this_user].todense()
# prediction
ratings_pred = pred_ratings[items_rated_by_this_user]
print zip(np.array(ratings_true).squeeze(),ratings_pred)
err_one_user = ratings_true-ratings_pred
print err_one_user
print abs(err_one_user).mean()

[(3.0, 2.9629520029509417), (3.0, 2.7449877176627329), (4.0, 2.6050796973933052), (4.0, 3.8996882914595106), (2.0, 0.73972613585803748), (2.0, 2.5737815733534721), (4.0, 3.1301414904754834), (3.0, 3.6961044249957724), (4.0, 4.9076080279180143), (1.0, 0.52709324283716885), (2.0, 1.3858658717955363), (4.0, 3.7647118617696327), (4.0, 2.9838171525786881), (3.0, 3.0559630550344581), (4.0, 3.4755788588429195), (3.0, 3.0501635650609873), (3.0, 1.2457615708521204), (5.0, 4.6984927891751109), (3.0, 2.1267204307078376), (4.0, 2.6713107918015448), (4.0, 3.8899096814434833), (2.0, 1.2853338766942393), (3.0, 1.9556590165911498), (2.0, 1.6938319814790206), (3.0, 3.573331832423237), (4.0, 2.5305592651773914), (4.0, 2.8992234260805732), (3.0, 1.9580426556514785), (2.0, 1.0185592354432385), (2.0, 0.54912226142829279), (4.0, 3.6709778100162356), (2.0, 2.0497992332729336), (2.0, 0.8726701283656948), (3.0, 3.1361308386012809), (3.0, 2.5905343811392352), (4.0, 3.0865793166576232), (2.0, 1.4319961061545685)

### UVD/SVD

In [11]:
from sklearn.decomposition import TruncatedSVD

def fit_uvd(M,k):
    # use TruncatedSVD to realize UVD
    svd = TruncatedSVD(n_components=k, n_iter=7, random_state=0)
    svd.fit(M)

    V = svd.components_
    U = svd.transform(M) # effectively, it's doing: U = M.dot(V.T)
    # we can ignore svd.singular_values_ for our purpose
    
    # why we can do this?
    # recall: 
    # SVD start from u*s*v=M => u*s=M*v.T, where M*v.T is our transformation above to get U in UVD
    # so the above U is effectively u*s in SVD
    # that's why U*V = u*s*v = M our original matrix
    # there are many ways to understand it!
    # here we by-passed singular values.
    
    return U,V

# decompose
U,V = fit_uvd(ratings_mat,200)


In [13]:
print ratings_mat.shape, U.shape,V.shape

(943, 1682) (943L, 200L) (200L, 1682L)


In [14]:
# reconstruct
ratings_mat_fitted = U.dot(V) # U*V
# recall: U = M.dot(V.T), then this is M.dot(V.T).dot(V)
# original M is transformed to new space, then transformed back
# this is another way to understand it!

# calculate errs
errs = np.array((ratings_mat-ratings_mat_fitted).flatten()).squeeze()
mask = np.array((ratings_mat.todense()).flatten()).squeeze()>0

mse = np.mean(errs[mask]**2)
average_abs_err = abs(errs[mask]).mean()
print mse
print average_abs_err

1.03552547234
0.754982820003


In [15]:
# with the above "tranformed to the new space and back" language
# without the UV language, we can do:
M = ratings_mat
k = 200
svd = TruncatedSVD(n_components=k, n_iter=7, random_state=0)
svd.fit(M)
M_prime = svd.inverse_transform(svd.transform(M))
print sum(sum(U.dot(V) - M_prime))
# they are just equivalent!!

0.0


In [16]:
M_prime

array([[  5.10986304e+00,   3.21122582e+00,   2.79462833e+00, ...,
         -1.53418143e-02,  -4.48169325e-02,   1.63902493e-01],
       [  4.09627950e+00,   1.16269655e-01,   5.26546593e-01, ...,
          2.98694011e-03,  -1.66378827e-02,  -1.29433745e-02],
       [ -3.49348385e-01,  -5.75526384e-01,  -4.91766861e-01, ...,
          5.89932906e-02,  -3.81695359e-02,  -1.09241961e-02],
       ..., 
       [  4.16911033e+00,  -1.25975080e-01,   4.54718614e-01, ...,
         -3.32419824e-03,   1.48767976e-02,  -2.78277895e-02],
       [  2.10606916e-01,   3.16056252e-01,   5.56351259e-01, ...,
         -2.75769057e-02,  -5.01965386e-02,  -1.01546069e-01],
       [ -1.80260324e-02,   3.72262167e+00,   9.99008586e-01, ...,
         -2.31869146e-02,   2.81208494e-02,  -1.37596348e-02]])

In [18]:
# get recommendations for one user
user_id = 100
n = 10

pred_ratings = ratings_mat_fitted[user_id,:]
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))[::-1]

items_rated_by_this_user = ratings_mat[user_id].nonzero()[1]

unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                if item not in items_rated_by_this_user]

unrated_items_by_pred_rating[:n]

[475, 684, 409, 863, 454, 147, 24, 273, 239, 933]

In [19]:
### check errors
# truth
ratings_true = ratings_mat[user_id, items_rated_by_this_user].todense()
# prediction
ratings_pred = pred_ratings[items_rated_by_this_user]
print zip(np.array(ratings_true).squeeze(),ratings_pred)
err_one_user = ratings_true-ratings_pred
print err_one_user
print abs(err_one_user).mean()

[(3.0, 2.9629520029509417), (3.0, 2.7449877176627329), (4.0, 2.6050796973933052), (4.0, 3.8996882914595106), (2.0, 0.73972613585803748), (2.0, 2.5737815733534721), (4.0, 3.1301414904754834), (3.0, 3.6961044249957724), (4.0, 4.9076080279180143), (1.0, 0.52709324283716885), (2.0, 1.3858658717955363), (4.0, 3.7647118617696327), (4.0, 2.9838171525786881), (3.0, 3.0559630550344581), (4.0, 3.4755788588429195), (3.0, 3.0501635650609873), (3.0, 1.2457615708521204), (5.0, 4.6984927891751109), (3.0, 2.1267204307078376), (4.0, 2.6713107918015448), (4.0, 3.8899096814434833), (2.0, 1.2853338766942393), (3.0, 1.9556590165911498), (2.0, 1.6938319814790206), (3.0, 3.573331832423237), (4.0, 2.5305592651773914), (4.0, 2.8992234260805732), (3.0, 1.9580426556514785), (2.0, 1.0185592354432385), (2.0, 0.54912226142829279), (4.0, 3.6709778100162356), (2.0, 2.0497992332729336), (2.0, 0.8726701283656948), (3.0, 3.1361308386012809), (3.0, 2.5905343811392352), (4.0, 3.0865793166576232), (2.0, 1.4319961061545685)