In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('data/ratings.csv')[:100]
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [51]:
movie_data = pd.read_csv('data/movies.csv')
movie_data.head()

      movieId                                              title  \
0           1                                   Toy Story (1995)   
1           2                                     Jumanji (1995)   
2           3                            Grumpier Old Men (1995)   
3           4                           Waiting to Exhale (1995)   
4           5                 Father of the Bride Part II (1995)   
5           6                                        Heat (1995)   
6           7                                     Sabrina (1995)   
7           8                                Tom and Huck (1995)   
8           9                                Sudden Death (1995)   
9          10                                   GoldenEye (1995)   
10         11                     American President, The (1995)   
11         12                 Dracula: Dead and Loving It (1995)   
12         13                                       Balto (1995)   
13         14                                   

In [4]:
# Create rating matrix of shape (m x u) with rows as movies and columns as users
ratings_mat = np.ndarray(
    shape=(np.max(data.movieId.values), np.max(data.userId.values)),
    dtype=np.uint8)

ratings_mat[data.movieId.values - 1, data.userId.values - 1] = data.rating.values

In [5]:
# Normalize matrix (subtract mean off)
normalised_mat = ratings_mat - np.asarray([(np.mean(ratings_mat, 1))]).T
print(normalised_mat)

[[ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [-3.         -3.          6.        ]
 ..., 
 [-1.33333333 -1.33333333  2.66666667]
 [ 1.33333333 -0.66666667 -0.66666667]
 [ 2.         -1.         -1.        ]]


In [6]:
# Compute svd
num_movies = data.shape[0] - 1
A = normalised_mat.T / np.sqrt(num_movies)
U, S, V = np.linalg.svd(A)
print(U, S, V)

[[ 0.35654013  0.73453782  0.57735027]
 [-0.81439847 -0.0584961   0.57735027]
 [ 0.45785835 -0.67604172  0.57735027]] [  3.02876736e+02   2.74557212e+02   9.39216305e-13] [[ -2.98538890e-15   3.43800415e-16   1.36738286e-03 ...,   6.07725715e-04
    2.36621878e-04   3.54932818e-04]
 [ -1.06137294e-15  -5.59991992e-16  -2.22723248e-03 ...,  -9.89881101e-04
    5.37766447e-04   8.06649670e-04]
 [  9.89264167e-01   1.39802496e-01  -6.90506663e-06 ...,   9.05634998e-04
    1.39452279e-05   4.17808430e-06]
 ..., 
 [ -1.02167246e-03   7.72501302e-04  -1.62574570e-03 ...,   9.99997186e-01
    3.64475714e-07   5.73877716e-07]
 [ -2.02054491e-05   3.58568058e-05   5.71815199e-04 ...,   9.03861692e-07
    9.99999663e-01  -5.14639858e-07]
 [ -1.37527632e-05   5.61594187e-05   8.74435377e-04 ...,   1.37087751e-06
   -5.04834639e-07   9.99999228e-01]]


In [65]:
def top_cosine_similarity(data, movie_id, top_n=10):
    index = movie_id - 1
    movie_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

def print_similar_movies(movie_data, movie_id, top_indexes):
    print('Recommendations for {0}: \n'.format(movie_data[movie_data.movieId == movie_id].title.values[0]))
    for id in top_indexes + 1:
        if not movie_data[movie_data.movieId == id].empty:
            print(movie_data[movie_data.movieId == id].title.values[0])

In [67]:
k = 50
movie_id = 2
top_n = 10

sliced = V.T[:, :k]
indexes = top_cosine_similarity(sliced, movie_id, top_n)
print_similar_movies(movie_data, movie_id, indexes)

Recommendations for Jumanji (1995): 

Jumanji (1995)
Toy Story (1995)
Room at the Top (1959)
Virus (1999)
White Men Can't Jump (1992)
Dumb & Dumber (Dumb and Dumber) (1994)
French Kiss (1995)
I Saw What You Did (1965)


  """
