In [1]:
# Partially based on https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe

# Initialization
import os
import sys
import time

import warnings

# data science imports
import math
import numpy as np
import pandas as pd
import implicit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

# visualization imports
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Set data location.
# The directory can be overridden through the MOVIELENS_DATA_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original local path is used unchanged.
data_path = os.environ.get(
    'MOVIELENS_DATA_PATH',
    'C:/Users/IOLAP-USER/Documents/Movie-Recommendation-System/Data/Reviews-1M/')
# File names are appended to data_path by plain string concatenation, so the
# value must end with a path separator.
if not data_path.endswith(('/', '\\')):
    data_path += '/'

In [3]:
# Import movies data.
# '::' is a multi-character separator, which only the Python parser engine
# supports. Requesting that engine explicitly avoids the ParserWarning pandas
# emits when it has to fall back from the default C engine.
movies_df = pd.read_csv(data_path + 'movies.dat',
                        sep='::', header=None, engine='python')
movies_df.columns = ['itemId', 'title', 'genres']
# movies_df['genres'] = movies_df.genres.apply(lambda x: x.split('|'))

# Import ratings data (same '::'-separated layout, no header row).
ratings_df = pd.read_csv(data_path + 'ratings.dat',
                         sep='::', header=None, engine='python')
ratings_df.columns = ['userId', 'itemId', 'label', 'timestamp']
# The timestamp column is dropped here, so later cells only see
# userId / itemId / label. Rebinding (instead of inplace=True) keeps the
# cell safe to re-run.
ratings_df = ratings_df.drop(columns=['timestamp'])

  This is separate from the ipykernel package so we can avoid doing imports until
  # Remove the CWD from sys.path while we load stuff.


In [4]:
# Preview the first three rows of the movies table
movies_df.head(n=3)

Unnamed: 0,itemId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [5]:
# Preview the first three rows of the ratings table
ratings_df.head(n=3)

Unnamed: 0,userId,itemId,label
0,1,1193,5
1,1,661,3
2,1,914,3


In [6]:
# List every rating value that occurs in the 'label' column
print('Distinct values of ratings:')
print(ratings_df['label'].unique())

Distinct values of ratings:
[5 3 4 2 1]


In [7]:
# Smallest number of ratings given by any user (tmp1) and received by any
# movie (tmp2).
# groupby(...).size() returns one row count per group, so the minimum can be
# taken directly. The previous `.count().min()[0]` relied on positional
# integer indexing into a label-indexed Series, which recent pandas versions
# deprecate.
tmp1 = ratings_df.groupby('userId').size().min()
tmp2 = ratings_df.groupby('itemId').size().min()
print('For the users that rated movies and the movies that were rated:')
print('Minimum number of ratings per user is {}'.format(tmp1))
print('Minimum number of ratings per movie is {}'.format(tmp2))

For the users that rated movies and the movies that were rated:
Minimum number of ratings per user is 20
Minimum number of ratings per movie is 1


In [8]:
# Count the movies that received exactly one rating (tmp1) out of all movies
# that appear in the ratings table (tmp2).
# The comparison yields a boolean Series; its vectorised .sum() counts the
# True entries without iterating element by element in Python.
tmp1 = (ratings_df.groupby('itemId').size() == 1).sum()
tmp2 = len(ratings_df['itemId'].unique())
print('{} out of {} movies are rated by only one user'.format(tmp1, tmp2))

114 out of 3706 movies are rated by only one user


In [9]:
# Number of distinct movie ids listed in the movies table
tmp = len(movies_df['itemId'].unique())
print('We have a total of {} distinct movies in the data sets'.format(tmp))

We have a total of 3883 distinct movies in the data sets


In [10]:
# tmp1: distinct movie ids in the movies table
# tmp2: distinct movie ids that appear in the ratings table
# Their difference is the number of movies with no rating at all.
tmp1 = len(movies_df['itemId'].unique())
tmp2 = len(ratings_df['itemId'].unique())
print('We have a total of {} distinct movies that are rated by users in ratings table'.format(tmp2))
print('We have {} movies that are not rated yet'.format(tmp1 - tmp2))

We have a total of 3706 distinct movies that are rated by users in ratings table
We have 177 movies that are not rated yet


In [11]:
# Discard rows with missing values before building the matrices
ratings_df = ratings_df.dropna()

# Two sparse matrices are built from the same ratings:
#   - item-user (rows = itemId, columns = userId), used to fit the model
#   - user-item (rows = userId, columns = itemId), used for recommendations
# The raw ids are used directly as row/column indices.
rating_values = ratings_df['label'].astype(float)
item_index = ratings_df['itemId']
user_index = ratings_df['userId']

sparse_item_user = sparse.csr_matrix((rating_values, (item_index, user_index)))
sparse_user_item = sparse.csr_matrix((rating_values, (user_index, item_index)))

In [12]:
# Train the recommender with Alternating Least Squares from the Implicit package

# Hyper-parameters
num_factors = 10
num_iters = 10
reg_param = 0.1
alpha_val = 1

# Confidence matrix: the item-user ratings scaled by alpha, stored as doubles
data_conf = (sparse_item_user * alpha_val).astype('double')

# Configure the ALS model
model = implicit.als.AlternatingLeastSquares(
    factors=num_factors,
    regularization=reg_param,
    iterations=num_iters,
    use_cg=True,
    calculate_training_loss=True,
)

# Fit on the confidence matrix
model.fit(data_conf)

100%|███████████████████████████████████████████████████████████████████| 10.0/10 [00:09<00:00,  1.14it/s, loss=0.0548]


In [20]:
#---------------------
# FIND SIMILAR ITEMS
#---------------------

# Look up the 10 movies most similar to Soylent Green
itemId = 2009  # Soylent Green
n_similar = 10

# Ask the fitted model for the nearest items; each result unpacks to (id, score)
similar = model.similar_items(itemId, n_similar)

# Print the title of each similar movie
for idx, _score in similar:
    matching_titles = movies_df.loc[movies_df['itemId'] == idx, 'title']
    print(matching_titles.iloc[0])

Soylent Green (1973)
Omega Man, The (1971)
Logan's Run (1976)
Westworld (1973)
2010 (1984)
Barbarella (1968)
Forbidden Planet (1956)
War of the Worlds, The (1953)
Escape from the Planet of the Apes (1971)
Stepford Wives, The (1975)


In [25]:
#------------------------------
# CREATE USER RECOMMENDATIONS
#------------------------------

# Create recommendations for the user with id 100
userId = 100

# Use the implicit recommender; each result unpacks to (item id, score)
recommended = model.recommend(userId, sparse_user_item)

# Resolve each recommended item id to its movie title and keep the score
movie_recs = []
scores = []
for idx, score in recommended:
    matching_titles = movies_df.loc[movies_df['itemId'] == idx, 'title']
    movie_recs.append(matching_titles.iloc[0])
    scores.append(score)

# Combine movie titles and scores into one dataframe
recommendations = pd.DataFrame({'movies': movie_recs, 'score': scores})

print('User', userId, 'Movie Recommendations')
print(recommendations)

User 100 Movie Recommendations
                              movies     score
0  Terminator 2: Judgment Day (1991)  0.819821
1                Total Recall (1990)  0.781552
2               Fugitive, The (1993)  0.772758
3   Hunt for Red October, The (1990)  0.762520
4             Terminator, The (1984)  0.711448
5                       Speed (1994)  0.661089
6                    Die Hard (1988)  0.655873
7                   Rock, The (1996)  0.653317
8                       Alien (1979)  0.635274
9                    Face/Off (1997)  0.616831
