In [1]:
# https://github.com/NicolasHug/Surprise

In [2]:
import pandas as pd
from surprise import Dataset, Reader
from surprise import SVD, NMF
from surprise.model_selection import cross_validate

In [3]:
df = pd.read_pickle("../data/data.pkl")

In [4]:
df = df[["user_id", "name", "user_rating"]]

In [5]:
df.shape

(5957004, 3)

### Create my dataset

In [6]:
test_id = df["user_id"].max() + 1
print(test_id)

test_df = [
    [test_id, "Angel Beats!", 10],
    [test_id, "Ookami to Koushinryou", 10],
    [test_id, "Shinsekai yori", 8],
    [test_id, "Seikimatsu Occult Gakuin", 6],
    [test_id, "Futari wa Precure", 8],
    [test_id, "Shugo Chara!", 8],
    [test_id, "Uchuu Senkan Yamato 2199", 9],
]

test_df = pd.DataFrame(test_df, columns=df.columns)

73517


### Create dataset

In [7]:
# reduce data due to memory limitation
sample_data = df.sample(10000, random_state=42)
sample_data = pd.concat([sample_data, test_df], axis=0)

In [8]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(sample_data, reader)

In [9]:
del df, sample_data

### SVD (5-fold cross-validation)

In [10]:
# Use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4519  1.4475  1.5053  1.5090  1.5200  1.4867  0.0306  
MAE (testset)     1.1214  1.1183  1.1679  1.1635  1.1837  1.1510  0.0263  
Fit time          0.06    0.06    0.06    0.07    0.07    0.07    0.00    
Test time         0.03    0.01    0.01    0.01    0.01    0.01    0.01    


{'test_rmse': array([1.45192788, 1.4475261 , 1.50528692, 1.50895954, 1.52004329]),
 'test_mae': array([1.12143984, 1.11828161, 1.16787544, 1.16352523, 1.18369023]),
 'fit_time': (0.06277012825012207,
  0.06219673156738281,
  0.064300537109375,
  0.07235503196716309,
  0.06697344779968262),
 'test_time': (0.03257417678833008,
  0.00816488265991211,
  0.007932901382446289,
  0.008143424987792969,
  0.008212804794311523)}

In [11]:
del algo

### Non negative matrix factorization

In [12]:
# Non negative matrix factorization
trainset = data.build_full_trainset()
nmf = NMF()
nmf.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7f94e0498130>

In [13]:
del data

In [14]:
testset = trainset.build_anti_testset()

In [15]:
del trainset

In [16]:
pred = nmf.test(testset)

In [17]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

#### Prediction result

In [18]:
top_n = get_top_n(pred, n=10)

In [19]:
del pred

In [20]:
top_n[73517]

[('Hunter x Hunter OVA', 9.009031391122406),
 ('ef: A Tale of Melodies.', 8.804478493850281),
 ('Kara no Kyoukai 7: Satsujin Kousatsu (Kou)', 8.455577301611182),
 ('RD Sennou Chousashitsu', 8.262711284310521),
 ('Kuroshitsuji Picture Drama', 8.042635712454237),
 ('Kuroko no Basket: Tip Off', 7.948158120115213),
 ('Serial Experiments Lain', 7.931442639297254),
 ('Initial D First Stage', 7.8876203748266915),
 ('Higurashi no Naku Koro ni', 7.866073612615435),
 ('Rurouni Kenshin: Meiji Kenkaku Romantan - Ishinshishi e no Chinkonka',
  7.857234296938298)]

In [21]:
# test_df = [
#     [test_id, "Angel Beats!", 10],
#     [test_id, "Ookami to Koushinryou", 10],
#     [test_id, "Shinsekai yori", 8],
#     [test_id, "Seikimatsu Occult Gakuin", 6],
#     [test_id, "Futari wa Precure", 8],
#     [test_id, "Shugo Chara!", 8],
#     [test_id, "Uchuu Senkan Yamato 2199", 9],
# ]