-
Notifications
You must be signed in to change notification settings - Fork 611
/
movielens.py
146 lines (118 loc) · 4.58 KB
/
movielens.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
"""
An example based on the MovieLens 20M dataset.
This code will automatically download an HDF5 version of this
dataset when first run. The original dataset can be found here:
https://grouplens.org/datasets/movielens/.
Since this dataset contains explicit 5-star ratings, the ratings are
filtered down to positive reviews (4+ stars) to construct an implicit
dataset.
"""
from __future__ import print_function
import argparse
import codecs
import logging
import time
import numpy as np
import tqdm
from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.datasets.movielens import get_movielens
from implicit.lmf import LogisticMatrixFactorization
from implicit.nearest_neighbours import (
BM25Recommender,
CosineRecommender,
TFIDFRecommender,
bm25_weight,
)
# Module-level logger; all messages in this script are emitted under the "implicit" logger name.
log = logging.getLogger("implicit")
def calculate_similar_movies(output_filename, model_name="als", min_rating=4.0, variant="20m"):
    """Train a recommender on MovieLens and write similar movies to a TSV file.

    Ratings below ``min_rating`` are dropped and every remaining rating is
    replaced by 1.0, turning the explicit ratings into a binary implicit
    dataset. The selected model is then fitted, and for every movie that
    still has at least one rating the results of ``model.similar_items`` are
    written out, one ``title<TAB>other title<TAB>score`` line per result.

    Args:
        output_filename: Path of the UTF-8 encoded, tab-separated output file.
        model_name: One of "als", "bpr", "lmf", "tfidf", "cosine" or "bm25".
        min_rating: Ratings strictly below this value are discarded.
        variant: Dataset variant forwarded to ``get_movielens``.

    Raises:
        NotImplementedError: If ``model_name`` is not a supported model.
    """
    # Map each supported name to a zero-argument factory. The name is checked
    # before the dataset is loaded so that a typo fails immediately rather
    # than after the (potentially slow) data load.
    model_factories = {
        "als": AlternatingLeastSquares,
        "bpr": BayesianPersonalizedRanking,
        "lmf": LogisticMatrixFactorization,
        "tfidf": TFIDFRecommender,
        "cosine": CosineRecommender,
        "bm25": lambda: BM25Recommender(B=0.2),
    }
    if model_name not in model_factories:
        raise NotImplementedError(f"model {model_name} isn't implemented for this example")

    # read in the input data file
    start = time.time()
    titles, ratings = get_movielens(variant)

    # remove things < min_rating, and convert to implicit dataset
    # by considering ratings as a binary preference only
    ratings.data[ratings.data < min_rating] = 0
    ratings.eliminate_zeros()
    ratings.data = np.ones(len(ratings.data))

    log.info("read data file in %s", time.time() - start)

    # generate a recommender model based off the input params
    model = model_factories[model_name]()
    if model_name == "als":
        # lets weight these models by bm25weight.
        log.debug("weighting matrix by bm25_weight")
        ratings = (bm25_weight(ratings, B=0.9) * 5).tocsr()

    # The model is fitted on the transpose of the matrix that is indexed by
    # movie id below (presumably movies x users -> users x movies).
    user_ratings = ratings.T.tocsr()

    # train the model
    log.debug("training model %s", model_name)
    start = time.time()
    model.fit(user_ratings)
    log.debug("trained model '%s' in %s", model_name, time.time() - start)

    # Number of stored (i.e. positive) ratings per row; movies are processed
    # in descending order of that count.
    log.debug("calculating top movies")
    user_count = np.ediff1d(ratings.indptr)
    to_generate = sorted(np.arange(len(titles)), key=lambda x: -user_count[x])

    log.debug("calculating similar movies")
    batch_size = 1000
    # Built-in open() replaces the deprecated codecs.open(); newline="\n"
    # keeps codecs.open's behaviour of never translating line endings.
    with tqdm.tqdm(total=len(to_generate)) as progress, open(
        output_filename, "w", encoding="utf8", newline="\n"
    ) as o:
        for startidx in range(0, len(to_generate), batch_size):
            batch = to_generate[startidx : startidx + batch_size]
            ids, scores = model.similar_items(batch, 11)
            for i, movieid in enumerate(batch):
                # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
                # no ratings > 4 meaning we've filtered out all data for it.
                if ratings.indptr[movieid] == ratings.indptr[movieid + 1]:
                    continue

                title = titles[movieid]
                for other, score in zip(ids[i], scores[i]):
                    o.write(f"{title}\t{titles[other]}\t{score}\n")
            progress.update(len(batch))
if __name__ == "__main__":
    # Command-line entry point: parse the options, enable debug logging and
    # generate the similar-movies file.
    parser = argparse.ArgumentParser(
        description="Generates related movies from the MovieLens 20M "
        "dataset (https://grouplens.org/datasets/movielens/20m/)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--output",
        type=str,
        default="similar-movies.tsv",
        dest="outputfile",
        help="output file name",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="als",
        dest="model",
        # List every model name that calculate_similar_movies accepts.
        help="model to calculate (als/bpr/lmf/tfidf/cosine/bm25)",
    )
    parser.add_argument(
        "--variant",
        type=str,
        default="20m",
        dest="variant",
        help="Whether to use the 20m, 10m, 1m or 100k movielens dataset",
    )
    parser.add_argument(
        "--min_rating",
        type=float,
        default=4.0,
        dest="min_rating",
        help="Minimum rating to assume that a rating is positive",
    )
    args = parser.parse_args()

    # DEBUG level so the progress messages logged during the run are shown.
    logging.basicConfig(level=logging.DEBUG)

    calculate_similar_movies(
        args.outputfile, model_name=args.model, min_rating=args.min_rating, variant=args.variant
    )