# Recommender system

Let's build a simple collaborative-filtering recommender system. We will use the MovieLens 1M dataset.

In [3]:
import numpy as np
import pandas as pd


In [21]:
# NOTE: these absolute paths are machine-specific; point them at your local
# copy of the MovieLens 1M files before running.
# Fields are separated by '::'. A separator longer than one character is
# treated as a regular expression and requires pandas' Python parsing engine.
ratings_df = pd.read_csv(
    'D:\\ml\\ml-1m\\ratings.dat',
    names=['user_id', 'movie_id', 'rating', 'time'],
    engine='python',
    delimiter='::',
)
# movies.dat contains accented titles stored as Latin-1; without an explicit
# encoding pandas falls back to UTF-8 and raises UnicodeDecodeError on them.
movies_df = pd.read_csv(
    'D:\\ml\\ml-1m\\movies.dat',
    names=['movie_id', 'title', 'genre'],
    engine='python',
    delimiter='::',
    encoding='latin-1',
)

In [22]:
# Preview the first rows of the ratings table to check the column layout.
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [23]:
# Preview the first rows of the movies table to check the column layout.
movies_df.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [24]:
# Dense (movies x users) rating matrix: entry [m, u] is the rating that user
# u + 1 gave to movie m + 1 (ids are 1-based, hence the -1 offsets below).
# np.zeros is required here: np.ndarray(shape=...) returns uninitialised
# memory, so cells without a rating would hold arbitrary values instead of 0.
ratings_mat = np.zeros(
    shape=(np.max(ratings_df.movie_id.values), np.max(ratings_df.user_id.values)),
    dtype=np.uint8,
)
ratings_mat[ratings_df.movie_id.values - 1, ratings_df.user_id.values - 1] = ratings_df.rating.values

In [25]:
# Display the (movies x users) matrix; numpy abbreviates large arrays.
ratings_mat

array([[5, 0, 0, ..., 0, 0, 3],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

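## Normalization

Center each movie's ratings by subtracting that movie's mean rating across all users (unrated entries count as 0).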

In [27]:
# Subtract each row's mean from that row. keepdims=True keeps the means as a
# column vector so they broadcast across the columns of ratings_mat.
normalised_mat = ratings_mat - ratings_mat.mean(axis=1, keepdims=True)

## SVD

On this dataset we can compute the singular value decomposition (SVD) of the normalized ratings matrix directly.

In [28]:
# Transpose the centred matrix and scale it by sqrt(n_rows - 1) of ratings_mat
# before decomposing.
A = np.divide(normalised_mat.T, np.sqrt(ratings_mat.shape[0] - 1))
# np.linalg.svd returns the right singular vectors already transposed, so the
# name V below actually holds V^T (one singular vector per row).
U, S, V = np.linalg.svd(A)

In [29]:
def top_cosine_similarity(data, movie_id, top_n=10):
    """Return the indexes of the rows of `data` most similar to one movie.

    Similarity is the cosine of the angle between the row belonging to
    `movie_id` and every row of `data`.

    Parameters
    ----------
    data : 2-D array
        One feature vector per row; row ``i`` belongs to movie id ``i + 1``.
    movie_id : int
        1-based id of the query movie.
    top_n : int, optional
        Maximum number of row indexes to return (default 10).

    Returns
    -------
    1-D integer array
        0-based row indexes ordered from most to least similar, truncated to
        `top_n` entries. The query row itself is included in the ranking.
    """
    row = movie_id - 1  # ids are 1-based, rows are 0-based
    # Euclidean length of every row (row-wise sum of squares, then sqrt).
    norms = np.sqrt(np.einsum('ij, ij -> i', data, data))
    dots = np.dot(data[row, :], data.T)
    scores = dots / (norms[row] * norms)
    # argsort is ascending, so rank the negated scores to get best-first order.
    ranking = np.argsort(-scores)
    return ranking[:top_n]

# Helper function to print top N similar movies
def print_similar_movies(movie_data, movie_id, top_indexes):
    """Print the title of a movie followed by the titles of its recommendations.

    Parameters
    ----------
    movie_data : pandas.DataFrame
        Must provide ``movie_id`` and ``title`` columns.
    movie_id : int
        1-based id of the movie the recommendations were computed for.
    top_indexes : array-like of int
        0-based row indexes of the recommended movies; each is converted back
        to a movie id by adding 1.

    Raises
    ------
    IndexError
        If `movie_id` itself has no row in `movie_data`.
    """
    def _title_of(wanted_id):
        # First matching title, or None when the id is absent from movie_data.
        titles = movie_data[movie_data.movie_id == wanted_id].title.values
        return titles[0] if len(titles) else None

    query_title = _title_of(movie_id)
    if query_title is None:
        raise IndexError('movie_id {0} not found in movie_data'.format(movie_id))
    print('Recommendations for {0}: \n'.format(query_title))

    # np.asarray lets callers pass a plain list as well as an ndarray.
    for similar_id in np.asarray(top_indexes) + 1:
        title = _title_of(similar_id)
        if title is None:
            # A row index can map to an id that has no entry in movie_data;
            # report it instead of aborting the whole listing.
            print('<unknown title for movie_id {0}>'.format(similar_id))
        else:
            print(title)

In [34]:
# Query settings: number of components to keep, movie to look up, list length.
k, movie_id, top_n = 100, 10, 10

# Taking the first k rows of V and transposing gives a matrix with k columns,
# i.e. one k-dimensional vector per row, which is what the similarity helper expects.
sliced = V[:k].T
indexes = top_cosine_similarity(data=sliced, movie_id=movie_id, top_n=top_n)
print_similar_movies(movies_df, movie_id, indexes)

Recommendations for GoldenEye (1995): 

GoldenEye (1995)
Tomorrow Never Dies (1997)
World Is Not Enough, The (1999)
Licence to Kill (1989)
Mask of Zorro, The (1998)
On Her Majesty's Secret Service (1969)
Thunderball (1965)
View to a Kill, A (1985)
Spy Who Loved Me, The (1977)
Man with the Golden Gun, The (1974)


  """


## PCA

In [36]:
# Centre each row on its mean. A plain ndarray with keepdims=True replaces the
# np.matrix class, which NumPy no longer recommends.
normalised_mat = ratings_mat - np.mean(ratings_mat, axis=1, keepdims=True)
cov_mat = np.cov(normalised_mat)
# A covariance matrix is symmetric, so use eigh: it returns real eigenvalues
# and eigenvectors, whereas eig gives no ordering guarantee and may return
# complex output. eigh sorts eigenvalues in ascending order, so reverse both
# results to make evecs[:, :k] the k leading principal directions.
evals, evecs = np.linalg.eigh(cov_mat)
evals, evecs = evals[::-1], evecs[:, ::-1]

In [37]:
# Query settings: number of components to keep, movie to look up, list length.
k, movie_id, top_n = 100, 10, 10

# Keep the first k eigenvector columns as the reduced representation,
# one k-dimensional vector per row.
sliced = evecs[:, :k]
top_indexes = top_cosine_similarity(data=sliced, movie_id=movie_id, top_n=top_n)
print_similar_movies(movies_df, movie_id, top_indexes)

Recommendations for GoldenEye (1995): 

GoldenEye (1995)
Tomorrow Never Dies (1997)
World Is Not Enough, The (1999)
Licence to Kill (1989)
Mask of Zorro, The (1998)
On Her Majesty's Secret Service (1969)
Thunderball (1965)
View to a Kill, A (1985)
Spy Who Loved Me, The (1977)
Man with the Golden Gun, The (1974)


  """
