In [2]:
import os
import time

import pandas as pd
import numpy as np
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import CoClustering, SlopeOne, NMF, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, NormalPredictor

import io  # needed because of weird encoding of u.item file

from surprise import BaselineOnly, SVD, KNNBaseline
from surprise import Reader
from surprise import get_dataset_dir
from surprise.model_selection import cross_validate
from surprise import dump

This notebook is based on [this guide](https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-recommendation-engine-python/) and [Surprise library docs](https://surprise.readthedocs.io/en/stable/FAQ.html).

Datasets available [here](https://grouplens.org/datasets/movielens/) and should be downloaded into `datasets/` directory.

In [118]:
# Reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('datasets/ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')

# Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('datasets/ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')

# Reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('datasets/ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

In [119]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [5]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


There are 100k ratings (it's a small version of 27million-rating dataset):

In [6]:
ratings.shape

(100000, 4)

In [7]:
items.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


Get train/test data:

In [8]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('datasets/ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('datasets/ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_train.shape, ratings_test.shape

((90570, 4), (9430, 4))

We will recommend movies based on user-user similarity and item-item similarity. For that, first we need to calculate the number of unique users and movies.

In [9]:
n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]
n_users, n_items

(943, 1682)

Now, we will create a user-item matrix which can be used to calculate the similarity between users and items.

In [32]:
# create a matrix filled with zeroes
data_matrix = np.zeros((n_users, n_items))

# iterate over the rating line by line and update data_matrix
# each line looks like this: 
# Pandas(Index=3, user_id=244, movie_id=51, rating=2, unix_timestamp=880606923)
for line in ratings.itertuples():
    data_matrix[line.user_id-1, line.movie_id-1] = line.rating

In [34]:
data_matrix.shape

(943, 1682)

In [36]:
data_matrix[:5]

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [4., 3., 0., ..., 0., 0., 0.]])

Now, we will calculate the similarity. We can use the pairwise_distance function from sklearn to calculate the cosine similarity.

In [37]:
from sklearn.metrics.pairwise import pairwise_distances 

user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

This gives us the item-item and user-user similarity in an array form. The next step is to make predictions based on these similarities. Let’s define a function to do just that.

In [38]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

In [42]:
item_prediction.shape, user_prediction.shape

((943, 1682), (943, 1682))

In [45]:
item_prediction[0]

array([0.44627765, 0.475473  , 0.50593755, ..., 0.58815455, 0.5731069 ,
       0.56669645])

The aforementioned calculations are just an example of what's happening under the hood of Surprise and apple/turicreate libraries

## How to get the k nearest neighbors of a user (or item)

You can use the get_neighbors() methods of the algorithm object. This is only relevant for algorithms that use a similarity measure, such as the [k-NN algorithms](https://surprise.readthedocs.io/en/stable/knn_inspired.html#pred-package-knn-inpired).

Here is an example where we retrieve the 10 nearest neighbors of the movie Toy Story from the MovieLens-100k dataset. The output should be as follows:
- The 10 nearest neighbors of Toy Story are:
- Beauty and the Beast (1991)
- Raiders of the Lost Ark (1981)
- That Thing You Do! (1996)
- Lion King, The (1994)
- Craft, The (1996)
- Liar Liar (1997)
- Aladdin (1992)
- Cool Hand Luke (1967)
- Winnie the Pooh and the Blustery Day (1968)
- Indiana Jones and the Last Crusade (1989)

In [47]:
import io  # needed because of weird encoding of u.item file

from surprise import KNNBaseline
from surprise import Dataset
from surprise import get_dataset_dir


def read_item_names():
    """Read the u.item file from MovieLens 100-k dataset and return two
    mappings to convert raw ids into movie names and movie names into raw ids.
    """

    file_name = 'datasets/ml-100k/u.item'
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            # line[0] means id and line[1] represents movie's name
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid


# First, train the algortihm to compute the similarities between items
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

# Retrieve inner id of the movie Toy Story
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid]
                       for rid in toy_story_neighbors)

print('\nThe 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /Users/admin/.surprise_data/ml-100k
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

The 10 nearest neighbors of Toy Story are:
Beauty and the Beast (1991)
Raiders of the Lost Ark (1981)
That Thing You Do! (1996)
Lion King, The (1994)
Craft, The (1996)
Liar Liar (1997)
Aladdin (1992)
Cool Hand Luke (1967)
Winnie the Pooh and the Blustery Day (1968)
Indiana Jones and the Last Crusade (1989)


### Now lets train the same algorighm on the ml-100k dataset (loaded manually)

In [10]:
ratings.shape

(100000, 4)

In [22]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [21]:
ratings.loc[ratings.unix_timestamp.isnull()]

Unnamed: 0,user_id,movie_id,rating,unix_timestamp


In [11]:
items.shape

(1682, 24)

In [37]:
items.columns

Index(['movie id', 'movie title', 'release date', 'video release date',
       'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children's',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'],
      dtype='object')

items dataset has messy column names so we should tweak them before proccessing

In [38]:
items.columns = items.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')

In [40]:
items.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,imdb_url,unknown,action,adventure,animation,children's,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


Lets build a dataset and train new KNN model for items

In [23]:
start = time.time()

print('Loading data...')
reader = Reader(rating_scale=(1, 5))

print('Building a dataset...')
data = Dataset.load_from_df(ratings.loc[:, ['user_id', 'movie_id', 'rating']], reader=reader)
trainset = data.build_full_trainset()

end = time.time()
print('Loading data and building the dataset took {} seconds'.format(end - start))

Loading data...
Building a dataset...
Loading data and building the dataset took 0.5182929039001465 seconds


In [24]:
start = time.time()

print('Training KNNBaseline...')
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

end = time.time()
print('Training took {} seconds'.format(end - start))

Training KNNBaseline...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Training took 5.110960006713867 seconds


These methods returns id of the movie or name of the movie by its id:

In [90]:
def get_id_by_name(name):
    return items.loc[items['movie_title'].str.contains(name), 'movie_id'].iloc[0]

def get_name_by_id(_id):
    return items.loc[items['movie_id']==_id, 'movie_title'].iloc[0]

In [91]:
get_id_by_name('Toy Story')

1

In [92]:
get_name_by_id(1)

'Toy Story (1995)'

Check a couple of movie ids from the ratings just to be sure that all those movies exist in our movies dataset (it wasn't the case for some datasets)

In [93]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [94]:
print(get_name_by_id(242))
print(get_name_by_id(302))
print(get_name_by_id(377))
print(get_name_by_id(51))
print(get_name_by_id(346))

Kolya (1996)
L.A. Confidential (1997)
Heavyweights (1994)
Legends of the Fall (1994)
Jackie Brown (1997)


In [50]:
# Retrieve inner id of the movie Toy Story
toy_story_raw_id = get_id_by_name('Toy Story')
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

toy_story_inner_id

24

In [52]:
# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (get_name_by_id(rid)
                       for rid in toy_story_neighbors)

print('\nThe 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)


The 10 nearest neighbors of Toy Story are:
Beauty and the Beast (1991)
Raiders of the Lost Ark (1981)
That Thing You Do! (1996)
Lion King, The (1994)
Craft, The (1996)
Liar Liar (1997)
Aladdin (1992)
Cool Hand Luke (1967)
Winnie the Pooh and the Blustery Day (1968)
Indiana Jones and the Last Crusade (1989)


Results are identical to those we got above, so this approach could be used to build basic title/title recommendation system with arbitrary data.

### Pack title/title recommendation logic into a function

In [95]:
def make_recommendations(title=None, _id=None, num_rec=10):
    '''Makes recommendation for a particular title.
    Returns a list of recommended titles.
    '''
    if not title and not _id:
        print(
            "I can make recommendations by _id or title."
            "For example: make_recommendations(\'Batman\')"
            "or make_recommendations(12))"
        )
        return

    if title:
        # Retrieve inner id of the movie
        try:
            raw_id = get_id_by_name(title)
        except IndexError:
            print('Error: there is no "{}" title in the dataset'.format(title))
            return
    else:
        raw_id = _id
    
    print('Found "{}" title'.format(get_name_by_id(raw_id)))
    inner_id = algo.trainset.to_inner_iid(raw_id)
    neighbors = algo.get_neighbors(inner_id, k=num_rec)

    # Convert inner ids of the neighbors into names.
    neighbors = (
        algo.trainset.to_raw_iid(inner_id)
        for inner_id in neighbors
    )
    return list(
        get_name_by_id(rid)
        for rid in neighbors
    )

### Lets test our recommendation engine :)

In [98]:
make_recommendations('Star Wars')

Found "Star Wars (1977)" title


['Empire Strikes Back, The (1980)',
 'Return of the Jedi (1983)',
 'Raiders of the Lost Ark (1981)',
 'Indiana Jones and the Last Crusade (1989)',
 'Sting, The (1973)',
 'L.A. Confidential (1997)',
 'Princess Bride, The (1987)',
 'E.T. the Extra-Terrestrial (1982)',
 'Terminator, The (1984)',
 'Get Shorty (1995)']

In [100]:
make_recommendations('Pulp Fiction')

Found "Pulp Fiction (1994)" title


['Trainspotting (1996)',
 'People vs. Larry Flynt, The (1996)',
 'GoodFellas (1990)',
 'Raising Arizona (1987)',
 'Reservoir Dogs (1992)',
 'Cable Guy, The (1996)',
 'True Romance (1993)',
 'Apocalypse Now (1979)',
 'Casino (1995)',
 'Ed Wood (1994)']

In [102]:
make_recommendations('Shawshank Redemption')

Found "Shawshank Redemption, The (1994)" title


['Glory (1989)',
 'Blues Brothers, The (1980)',
 "Schindler's List (1993)",
 'Usual Suspects, The (1995)',
 'Forrest Gump (1994)',
 'Braveheart (1995)',
 'Stand by Me (1986)',
 'Silence of the Lambs, The (1991)',
 'Apollo 13 (1995)',
 "It's a Wonderful Life (1946)"]

In [106]:
make_recommendations('Fargo')

Found "Fargo (1996)" title


['To Die For (1995)',
 'Lone Star (1996)',
 'Bullets Over Broadway (1994)',
 'Sling Blade (1996)',
 'People vs. Larry Flynt, The (1996)',
 'This Is Spinal Tap (1984)',
 'Quiz Show (1994)',
 'Mighty Aphrodite (1995)',
 '2001: A Space Odyssey (1968)',
 'Dolores Claiborne (1994)']

In [105]:
make_recommendations('Home Alone')

Found "Home Alone (1990)" title


['Maverick (1994)',
 'Back to the Future (1985)',
 'Raiders of the Lost Ark (1981)',
 'Speed (1994)',
 'Jurassic Park (1993)',
 'Groundhog Day (1993)',
 'Net, The (1995)',
 'Sound of Music, The (1965)',
 'E.T. the Extra-Terrestrial (1982)',
 'Blues Brothers, The (1980)']

In [110]:
make_recommendations('Clockwork Orange')

Found "Clockwork Orange, A (1971)" title


['Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)',
 '2001: A Space Odyssey (1968)',
 'People vs. Larry Flynt, The (1996)',
 'Apocalypse Now (1979)',
 'Full Metal Jacket (1987)',
 'Boogie Nights (1997)',
 'Quiet Man, The (1952)',
 'Grifters, The (1990)',
 'Twelve Monkeys (1995)',
 'Psycho (1960)']

In [109]:
make_recommendations('Willy Wonka and the Chocolate Factory')

Found "Willy Wonka and the Chocolate Factory (1971)" title


['James and the Giant Peach (1996)',
 'Winnie the Pooh and the Blustery Day (1968)',
 'Monty Python and the Holy Grail (1974)',
 'Very Brady Sequel, A (1996)',
 'Nikita (La Femme Nikita) (1990)',
 'Raising Arizona (1987)',
 'Boogie Nights (1997)',
 'Hudsucker Proxy, The (1994)',
 'Koyaanisqatsi (1983)',
 'Rosencrantz and Guildenstern Are Dead (1990)']

In [117]:
make_recommendations('Fifth Element')

Found "Fifth Element, The (1997)" title


['Beavis and Butt-head Do America (1996)',
 'Brazil (1985)',
 'Rumble in the Bronx (1995)',
 'Clerks (1994)',
 'Saint, The (1997)',
 'Nikita (La Femme Nikita) (1990)',
 'Addams Family Values (1993)',
 'Chasing Amy (1997)',
 'Mortal Kombat (1995)',
 'Glengarry Glen Ross (1992)']

Pretty cool, right? :) 