In [10]:
import torch
import pandas as pd
import pickle
import numpy as np
from lenskit.algorithms.als import BiasedMF
from lenskit.batch import predict
from lenskit.metrics.predict import rmse

In [21]:
train = pd.read_pickle("../data/ml-20m-split/train.pkl")
val = pd.read_pickle("../data/ml-20m-split/val.pkl")
test = pd.read_pickle("../data/ml-20m-split/test.pkl")
ratings = pd.read_csv("../data/ml-20m/ratings.csv")
ratings = ratings.rename(columns={'userId': 'user', 'movieId': 'item'})

In [55]:
test.head()

Unnamed: 0,user,item,rating,timestamp
86,1,2683,3.5,1094785650
169,1,8482,3.5,1112485781
119,1,4306,4.0,1094785784
5,1,112,3.5,1094785740
0,1,2,3.5,1112486027


In [65]:
algo = BiasedMF(30)

In [66]:
algo_fit = algo.fit(train[:800])

In [72]:
algo_fit.user_features_.shape

(11, 30)

In [63]:
preds = predict(algo, val[:300])

In [64]:
preds["prediction"].values

array([3.4222084 , 3.84325577,        nan, 3.9314203 , 3.92816578,
       4.01567657,        nan,        nan,        nan, 3.71318274,
              nan, 4.02778894,        nan,        nan,        nan,
              nan, 3.66295553,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan, 3.60892355,        nan, 4.99295349,        nan,
              nan, 3.74438137, 3.52554412,        nan,        nan,
       4.66773143, 4.21865286,        nan, 3.74431611, 4.02768752,
       4.13513965,        nan,        nan,        nan,        nan,
       4.40863561, 4.00736817,        nan,        nan,        nan,
       4.13513965,        nan, 4.10395713, 4.06257091,        nan,
              nan,        nan,        nan, 4.00239079,        nan,
              nan, 4.13467334,        nan,        nan,        nan,
              nan,        nan,        nan, 4.05767454, 3.93919558,
       4.13117007, 3.99746117,        nan, 4.94484486,        

In [61]:
rmse(preds["prediction"], preds["rating"])

1.223269315010456

In [55]:
val[200:400]

Unnamed: 0,user,item,rating,timestamp
1324,11,31878,5.0,1230854021
1336,11,34319,4.0,1230788451
1345,11,37386,1.5,1230784056
1346,11,37830,5.0,1230788438
1355,11,43928,2.5,1230788917
...,...,...,...,...
2649,24,1073,2.0,1000098839
2653,24,1120,4.0,995096982
2655,24,1127,3.0,993972376
2666,24,1215,3.0,993972341


In [5]:
train.head()

Unnamed: 0,user,item,rating,timestamp
1,1,29,3.5,1112484676
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
6,1,151,4.0,1094785734
8,1,253,4.0,1112484940


In [23]:
freqs_series = train.groupby("user").size()

In [24]:
freqs = freqs_series.reset_index().rename(columns={0: "num_items"})

In [39]:
freqs

Unnamed: 0,user,num_items
0,1,123
1,2,43
2,3,131
3,4,20
4,5,46
...,...,...
138488,138489,27
138489,138490,106
138490,138491,15
138491,138492,57


In [32]:
freqs2 = freqs.groupby("num_items").size().reset_index().rename(columns={0: "num_users"})

In [36]:
freqs2 = freqs2.sort_values(by=["num_users"], ascending=False)

In [53]:
freqs2[freqs2["num_users"] < 32].sort_values(by=["num_items"])["num_users"].sum()

6401

In [52]:
freqs2[:60]

Unnamed: 0,num_items,num_users
1,15,5936
3,17,4764
0,14,4485
6,20,3938
8,22,3487
11,25,3000
2,16,2641
13,27,2629
15,29,2520
17,31,2251


In [12]:
user_ratings = train.groupby("user").size()

In [13]:
np.median(user_ratings.values)

48.0

In [14]:
user_ratings

user
1         123
2          43
3         131
4          20
5          46
         ... 
138489     27
138490    106
138491     15
138492     57
138493    261
Length: 138493, dtype: int64

In [None]:
item_ratings = train.groupby("user").size()

In [19]:
ratings.groupby("userId").size()

userId
1         175
2          61
3         187
4          28
5          66
         ... 
138489     38
138490    151
138491     22
138492     82
138493    373
Length: 138493, dtype: int64

In [20]:
train.groupby("user").size()

user
1         123
2          43
3         131
4          20
5          46
         ... 
138489     27
138490    106
138491     15
138492     57
138493    261
Length: 138493, dtype: int64

In [26]:
ratings

Unnamed: 0,user,item,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
...,...,...,...,...
20000258,138493,68954,4.5,1258126920
20000259,138493,69526,4.5,1259865108
20000260,138493,69644,3.0,1260209457
20000261,138493,70286,5.0,1258126944


In [23]:
with open("../data/item_dict.pkl", "rb") as f:
  item_dict = pickle.load(f)

In [24]:
item_indices = [item_dict[item] for item in ratings["item"].values]

In [25]:
item_indices

[2,
 29,
 32,
 47,
 50,
 112,
 151,
 222,
 252,
 259,
 292,
 295,
 317,
 335,
 365,
 539,
 585,
 589,
 647,
 904,
 909,
 992,
 1019,
 1059,
 1060,
 1069,
 1070,
 1077,
 1115,
 1171,
 1173,
 1175,
 1177,
 1178,
 1184,
 1190,
 1191,
 1193,
 1195,
 1198,
 1214,
 1217,
 1220,
 1223,
 1232,
 1233,
 1235,
 1236,
 1240,
 1252,
 1265,
 1277,
 1294,
 1306,
 1320,
 1322,
 1330,
 1341,
 1345,
 1358,
 1480,
 1534,
 1688,
 1767,
 1838,
 1885,
 1912,
 1915,
 1939,
 2018,
 2036,
 2056,
 2058,
 2061,
 2091,
 2092,
 2111,
 2112,
 2170,
 2205,
 2208,
 2459,
 2545,
 2561,
 2564,
 2580,
 2599,
 2608,
 2632,
 2677,
 2678,
 2720,
 2788,
 2834,
 2860,
 2863,
 2875,
 2884,
 2916,
 2945,
 2952,
 2996,
 3068,
 3180,
 3351,
 3389,
 3392,
 3401,
 3411,
 3798,
 3841,
 3904,
 3905,
 3919,
 3935,
 4013,
 4036,
 4041,
 4134,
 4214,
 4354,
 4375,
 4479,
 4628,
 4661,
 4785,
 4803,
 4818,
 4822,
 4848,
 4887,
 4900,
 4933,
 4946,
 4947,
 5053,
 5078,
 5446,
 5583,
 5701,
 5720,
 5802,
 5856,
 5903,
 5997,
 6146,
 6237,