In [7]:
import pandas as pd
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import train_test_split

DATA_DIR = 'data/'
train_ratings = pd.read_csv(f'{DATA_DIR}train_ratings.csv')
train_ratings.head(1)

Unnamed: 0,userId,movieId,rating,timestamp
0,509,7347,3.0,1435994597


In [8]:
n_users = train_ratings['userId'].nunique()
n_movies = train_ratings['movieId'].nunique()
print(f'Number of users: {n_users}')
print(f'Number of movies: {n_movies}')

Number of users: 610
Number of movies: 8983


In [9]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_ratings[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.25)

In [10]:
model = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02)
model.fit(trainset)
predictions = model.test(testset)
accuracy.rmse(predictions)

RMSE: 0.8779


0.8779221720100647

In [191]:
import implicit
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split as ts

In [192]:
train_ratings_matrix = train_ratings.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)

In [193]:
train2, test2 = ts(train_ratings_matrix, train_size=0.8, random_state=42)

In [194]:
print(train_ratings_matrix.shape)
print(train2.shape)
print(test2.shape)

(610, 8983)
(488, 8983)
(122, 8983)


In [195]:
train_data = csr_matrix(train2)
test_data = csr_matrix(test2)

In [196]:
num_factors = 2

In [197]:
model = implicit.als.AlternatingLeastSquares(factors=num_factors)

In [198]:
model.fit(train_data)

  0%|          | 0/15 [00:00<?, ?it/s]

In [199]:
user_factors = model.user_factors
item_factors = model.item_factors

In [200]:
predictions = user_factors.dot(item_factors.T)
actual_values = test_data[test_data.nonzero()]
actual_values = actual_values.reshape(-1,1)
actual_values = np.ravel(actual_values)
predicted_values = predictions[test_data.nonzero()]

In [201]:
print(predictions.shape)
print(actual_values.shape)
print(predicted_values.shape)

(488, 8983)
(22134,)
(22134,)


In [202]:
rmse = mean_squared_error(actual_values, predicted_values, squared=False)

In [203]:
print(rmse)

3.4605218657147905
