# Recommender system

In [1]:
import csv
import numpy as np
from scipy.sparse import lil_matrix

### Data loader

Loading the data and representing it as a sparse utility matrix (review_data).

Note that indices in the dataset are not sequential. That is why we create separate functions for translating dataset indices into utility matrix (review_data) indices and back.

In [2]:
# Position i of each list holds the dataset id that is mapped to row
# (users) / column (movies) i of the utility matrix.
users_index = []
movies_index = []

# Reverse lookups: dataset id -> position in the list above. They turn the
# per-record lookup into an O(1) dict access instead of two linear scans of
# the list. They stay correct as long as the lists are only extended through
# get_user_index / get_movie_index.
_user_positions = {}
_movie_positions = {}

# Number of distinct ids registered so far.
num_users_processed = 0
num_movies_processed = 0


def get_user_index(dataset_user_id):
    """Return the utility-matrix index for a dataset user id.

    The first time an id is seen it is appended to ``users_index`` and
    ``num_users_processed`` is incremented; later calls with the same id
    return the same index.

    Parameters
    ----------
    dataset_user_id : hashable
        User id as it appears in the dataset.

    Returns
    -------
    int
        Position of ``dataset_user_id`` in ``users_index``.
    """
    global num_users_processed
    position = _user_positions.get(dataset_user_id)
    if position is None:
        position = len(users_index)
        users_index.append(dataset_user_id)
        _user_positions[dataset_user_id] = position
        num_users_processed += 1
    return position


def get_movie_index(dataset_movie_id):
    """Return the utility-matrix index for a dataset movie id.

    The first time an id is seen it is appended to ``movies_index`` and
    ``num_movies_processed`` is incremented; later calls with the same id
    return the same index.

    Parameters
    ----------
    dataset_movie_id : hashable
        Movie id as it appears in the dataset.

    Returns
    -------
    int
        Position of ``dataset_movie_id`` in ``movies_index``.
    """
    global num_movies_processed
    position = _movie_positions.get(dataset_movie_id)
    if position is None:
        position = len(movies_index)
        movies_index.append(dataset_movie_id)
        _movie_positions[dataset_movie_id] = position
        num_movies_processed += 1
    return position

In [3]:
# Dimensions of the utility matrix: one row per user, one column per movie.
num_movies = 9125
num_users = 671

review_data = lil_matrix((num_users, num_movies))

data_file = 'ratings.csv'

# newline='' is what the csv module asks for when a file object is handed
# to csv.reader.
with open(data_file, newline='') as ratings_file:
    rating_reader = csv.reader(ratings_file, delimiter=',')
    next(rating_reader)  # skip the header row

    # enumerate() does the counting; the previous counter was a variable
    # named `iter`, which shadowed the built-in iter() for every later cell.
    for records_read, record in enumerate(rating_reader, start=1):
        # The fourth column is not used here.
        user_id, movie_id, rating, _ = record
        # Translate dataset ids into matrix row / column positions.
        row = get_user_index(int(user_id))
        column = get_movie_index(int(movie_id))
        review_data[row, column] = float(rating)
        if records_read % 10000 == 0:
            print('Processed ', records_read, 'records out of 100k.')

Processed  10000 records out of 100k.
Processed  20000 records out of 100k.
Processed  30000 records out of 100k.
Processed  40000 records out of 100k.
Processed  50000 records out of 100k.
Processed  60000 records out of 100k.
Processed  70000 records out of 100k.
Processed  80000 records out of 100k.
Processed  90000 records out of 100k.
Processed  100000 records out of 100k.


In [4]:
# Maps dataset movie id (int) -> movie title, as read from movies.csv.
movies = {}

data_file = 'movies.csv'

# Titles contain non-ASCII characters (e.g. "Alien³"), and titles are later
# matched by exact string comparison, so decode explicitly instead of relying
# on the platform default encoding.
# NOTE(review): assumes movies.csv is UTF-8 encoded - confirm for your copy.
# newline='' lets the csv module handle line endings inside quoted fields.
with open(data_file, newline='', encoding='utf-8') as movie_file:
    movie_reader = csv.reader(movie_file, delimiter=',')
    next(movie_reader)  # skip the header row
    # Each record has three columns; the third one is not used here.
    for movie_id, title, _ in movie_reader:
        movies[int(movie_id)] = title

In [5]:
def get_movie_name_by_id(id):
    """Return the title of the movie stored at position ``id`` of ``movies_index``.

    ``movies_index`` translates the position into a dataset movie id, and
    ``movies`` translates that dataset id into the title.
    """
    return movies[movies_index[id]]

def get_movie_id_by_name(movie_name):
    """Return the position in ``movies_index`` of the movie titled ``movie_name``.

    Parameters
    ----------
    movie_name : str
        Exact title, as stored in ``movies``. If several movies share the
        title, the first one in ``movies`` iteration order is used.

    Returns
    -------
    int
        Position of the movie's dataset id in ``movies_index``.

    Raises
    ------
    ValueError
        If no movie has this title, or if the movie's dataset id is not
        present in ``movies_index``.
    """
    # Single pass over the mapping; stops at the first matching title instead
    # of building full lists of keys and values on every call.
    for dataset_movie_id, title in movies.items():
        if title == movie_name:
            break
    else:
        raise ValueError('{!r} is not a known movie title'.format(movie_name))
    return movies_index.index(int(dataset_movie_id))

In [6]:
# Sanity check: look up the movies_index position assigned to a known title.
print(get_movie_id_by_name('Matrix, The (1999)'))

402


### Item-item collaborative filtering

Computing item-item collaborative filtering from the utility matrix (review_data). The output of your computation should be pairwise similarities between all movies.


In [7]:
from scipy.sparse.linalg import norm

def cosine_similarity(m):
    """Return the pairwise cosine similarities between the columns of ``m``.

    Parameters
    ----------
    m : scipy.sparse matrix, shape (n_rows, n_cols)
        Sparse matrix whose columns are the vectors to compare.

    Returns
    -------
    numpy.ndarray, shape (n_cols, n_cols)
        Entry ``(i, j)`` is ``dot(col_i, col_j) / (|col_i| * |col_j|)``.
        Entries involving an all-zero column are 0.
    """
    # CSR is the format the sparse product works in; convert once up front.
    columns = m.tocsr()

    # Use the sparse matrix's own product: np.dot is not aware of
    # scipy.sparse containers.
    dot_products = columns.T.dot(columns)

    # Outer product of the column norms gives |col_i| * |col_j| for every pair.
    column_norms = norm(columns, axis=0)
    norm_products = np.outer(column_norms, column_norms)

    # Divide only where the denominator is non-zero; everything else keeps
    # the 0 it was initialised with in `out`.
    # https://stackoverflow.com/questions/26248654/numpy-return-0-with-divide-by-zero
    return np.divide(dot_products.toarray(), norm_products,
                     out=np.zeros(dot_products.shape),
                     where=norm_products != 0)
    
# Pairwise similarities between the columns (movies) of the utility matrix.
similarity = cosine_similarity(review_data)

### Finding most similar movies

Using the item-item similarity, find 5 movies you would recommend to someone who likes the following:
- Matrix, The (1999)
- Toy Story (1995)
- From Dusk Till Dawn (1996)
- Gone with the Wind (1939)
- Iron Man (2008)

In [10]:
movie_names = ['Matrix, The (1999)',
               'Toy Story (1995)',
               'From Dusk Till Dawn (1996)',
               'Gone with the Wind (1939)',
               'Iron Man (2008)']

# Title -> position of that movie in the similarity matrix.
movie_ids = {movie_name: get_movie_id_by_name(movie_name)
             for movie_name in movie_names}

num_of_similar = 5

# Title -> list of the titles most similar to it, best match first.
recommended = {}

for m_name, m_id in movie_ids.items():
    # All movie positions ordered from most to least similar to the query.
    ranked = similarity[m_id].argsort()[::-1]
    # Exclude the query movie by id rather than assuming it is ranked first:
    # another movie with an equal (or, through rounding, marginally larger)
    # similarity could take the top slot, which would leave the query movie
    # inside its own recommendations.
    most_similar = ranked[ranked != m_id][:num_of_similar]
    recommended[m_name] = [get_movie_name_by_id(s_id) for s_id in most_similar]

for m, rec in recommended.items():
    print('Movie:{} \n Similar:\n   {}'.format(m, ',\n   '.join(rec)))
    print()

Movie:From Dusk Till Dawn (1996) 
 Similar:
   Nightmare on Elm Street, A (1984),
   Sleepy Hollow (1999),
   Batman Returns (1992),
   Candyman (1992),
   Alien³ (a.k.a. Alien 3) (1992)

Movie:Matrix, The (1999) 
 Similar:
   Lord of the Rings: The Fellowship of the Ring, The (2001),
   Lord of the Rings: The Two Towers, The (2002),
   Fight Club (1999),
   Back to the Future (1985),
   Lord of the Rings: The Return of the King, The (2003)

Movie:Iron Man (2008) 
 Similar:
   Dark Knight, The (2008),
   Star Trek (2009),
   Batman Begins (2005),
   Avatar (2009),
   Avengers, The (2012)

Movie:Toy Story (1995) 
 Similar:
   Toy Story 2 (1999),
   Star Wars: Episode IV - A New Hope (1977),
   Forrest Gump (1994),
   Independence Day (a.k.a. ID4) (1996),
   Groundhog Day (1993)

Movie:Gone with the Wind (1939) 
 Similar:
   Casablanca (1942),
   It's a Wonderful Life (1946),
   Wizard of Oz, The (1939),
   African Queen, The (1951),
   North by Northwest (1959)

