In [2]:
import pandas as pd

from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate


# Load the scraped reviews. header=0 combined with names= means the first row
# of the file is consumed as a header and the labels below are used instead.
df1 = pd.read_csv(
    './scraper/sample_data/qq.txt',
    header=0,
    sep=",",
    names=["reviewerId", "stars", "artistId", "artistName", "albumId", "albumName"],
)

# Keep only the three columns Surprise consumes, already ordered as
# (user id, item id, rating).
df = df1.loc[:, ['reviewerId', 'albumId', 'stars']]

# A Reader is still required when loading from a DataFrame, but only its
# rating_scale parameter is needed.
reader = Reader(rating_scale=(1, 5))

# df already has exactly the expected columns in the expected order, so it is
# handed over as-is.
data = Dataset.load_from_df(df, reader)

# Evaluate NormalPredictor with 2-fold cross-validation. Kept as the last
# expression so the notebook displays the returned result dict.
cross_validate(NormalPredictor(), data, cv=2)

{'test_rmse': array([1.38790556, 1.38739696]),
 'test_mae': array([1.11077496, 1.11051809]),
 'fit_time': (0.694892168045044, 1.0136229991912842),
 'test_time': (4.37396502494812, 4.066109895706177)}

In [8]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n best-estimated items.

    Args:
        predictions: Iterable of 5-element records, each unpacked as
            (user id, item id, true rating, estimated rating, details).
            In this notebook they come from the ``test`` method of an
            algorithm.
        n(int): How many entries to keep per user (the first ``n`` after
            sorting). Default is 10.

    Returns:
        A defaultdict(list) whose keys are user ids and whose values are lists
        of (item id, estimated rating) tuples, ordered from the highest
        estimate to the lowest. Items with equal estimates keep the order in
        which they appeared in ``predictions``.
    """

    # Bucket every (item, estimate) pair under the user it was predicted for.
    top_n = defaultdict(list)
    for prediction in predictions:
        user, item, _true_rating, estimate, _details = prediction
        top_n[user].append((item, estimate))

    # Rank each bucket by estimate, best first, and keep the leading n entries.
    # Only existing keys are reassigned, so the dict never changes size while
    # it is being iterated.
    for user in top_n:
        ranked = sorted(top_n[user], key=lambda pair: pair[1], reverse=True)
        top_n[user] = ranked[:n]

    return top_n

In [3]:
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import KFold

# Cross-validation iterator that splits the dataset into three folds.
kf = KFold(n_splits=3)

algo = SVD()

# For every fold: fit on the training part, predict the held-out part and
# print that fold's Root Mean Squared Error.
for trainset, testset in kf.split(data):
    # fit() hands back the algorithm itself, so test() can be chained onto it.
    predictions = algo.fit(trainset).test(testset)
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.8339
RMSE: 0.8346
RMSE: 0.8340


In [4]:
from surprise import SVD
from surprise.model_selection import GridSearchCV

# Candidate values for each SVD hyper-parameter explored by the grid search.
param_grid = dict(
    n_epochs=[5, 10],
    lr_all=[0.002, 0.005],
    reg_all=[0.4, 0.6],
)
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Best RMSE score, then the parameter combination that produced it
# (one per line).
print(gs.best_score['rmse'], gs.best_params['rmse'], sep='\n')

# Take the estimator associated with the best RMSE and fit it on the full
# trainset. The fit call stays last so its return value is the cell's output.
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

0.8531499392639157
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x115f60e80>

In [7]:
# Raw user id and raw item id to look up, written as strings.
# NOTE(review): the ratings here were loaded from a pandas DataFrame, so raw
# ids presumably keep whatever dtype read_csv inferred. If these columns are
# numeric, string ids would not match any known user/item — confirm against
# df.dtypes before trusting the estimate.
uid = "5080"
iid = "7092"

# Get a prediction for this specific user and item. r_ui=3 supplies a reference
# rating that is shown next to the estimate; verbose=True prints the result.
pred = algo.predict(uid, iid, r_ui=3, verbose=True)

user: 5080       item: 7092       r_ui = 3.00   est = 3.70   {'was_impossible': False}


In [None]:
# Fit the algorithm on the full trainset built from the whole dataset.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
# NOTE(review): this list is potentially very large — confirm it fits in
# memory for this dataset.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Print, for each user, only the ids of the recommended items (the estimated
# ratings are dropped).
for uid, user_ratings in top_n.items():
    print(uid, [item_id for item_id, _estimate in user_ratings])