In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [5]:
data = pd.read_csv(
    "ml-100k/u.data",
    sep="\t",
    names=["user_id", "item_id", "rating", "timestamp"]
)

data = data.drop("timestamp", axis=1)
data.head()


Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [13]:
user_ids = data["user_id"].unique()
item_ids = data["item_id"].unique()
print(user_ids, len(user_ids))


[196 186  22 244 166 298 115 253 305   6  62 286 200 210 224 303 122 194
 291 234 119 167 299 308  95  38 102  63 160  50 301 225 290  97 157 181
 278 276   7  10 284 201 287 246 242 249  99 178 251  81 260  25  59  72
  87  42 292  20  13 138  60  57 223 189 243  92 241 254 293 127 222 267
  11   8 162 279 145  28 135  32  90 216 250 271 265 198 168 110  58 237
  94 128  44 264  41  82 262 174  43  84 269 259  85 213 121  49 155  68
 172  19 268   5  80  66  18  26 130 256   1  56  15 207 232  52 161 148
 125  83 272 151  54  16  91 294 229  36  70  14 295 233 214 192 100 307
 297 193 113 275 219 218 123 158 302  23 296  33 154  77 270 187 170 101
 184 112 133 215  69 104 240 144 191  61 142 177 203  21 197 134 180 236
 263 109  64 114 239 117  65 137 257 111 285  96 116  73 221 235 164 281
 182 129  45 131 230 126 231 280 288 152 217  79  75 245 282  78 118 283
 171 107 226 306 173 185 150 274 188  48 311 165 208   2 205 248  93 159
 146  29 156  37 141 195 108  47 255  89 140 190  2

In [12]:
print(item_ids, len(item_ids))

[ 242  302  377 ... 1637 1630 1641] 1682


In [None]:
# embedding要求index从0开始编号
user2idx = {u: i for i, u in enumerate(user_ids)}
item2idx = {i: j for j, i in enumerate(item_ids)}
print(user2idx, len(user2idx))

{np.int64(196): 0, np.int64(186): 1, np.int64(22): 2, np.int64(244): 3, np.int64(166): 4, np.int64(298): 5, np.int64(115): 6, np.int64(253): 7, np.int64(305): 8, np.int64(6): 9, np.int64(62): 10, np.int64(286): 11, np.int64(200): 12, np.int64(210): 13, np.int64(224): 14, np.int64(303): 15, np.int64(122): 16, np.int64(194): 17, np.int64(291): 18, np.int64(234): 19, np.int64(119): 20, np.int64(167): 21, np.int64(299): 22, np.int64(308): 23, np.int64(95): 24, np.int64(38): 25, np.int64(102): 26, np.int64(63): 27, np.int64(160): 28, np.int64(50): 29, np.int64(301): 30, np.int64(225): 31, np.int64(290): 32, np.int64(97): 33, np.int64(157): 34, np.int64(181): 35, np.int64(278): 36, np.int64(276): 37, np.int64(7): 38, np.int64(10): 39, np.int64(284): 40, np.int64(201): 41, np.int64(287): 42, np.int64(246): 43, np.int64(242): 44, np.int64(249): 45, np.int64(99): 46, np.int64(178): 47, np.int64(251): 48, np.int64(81): 49, np.int64(260): 50, np.int64(25): 51, np.int64(59): 52, np.int64(72): 53, 

In [16]:
print(item2idx, len(item2idx))

{np.int64(242): 0, np.int64(302): 1, np.int64(377): 2, np.int64(51): 3, np.int64(346): 4, np.int64(474): 5, np.int64(265): 6, np.int64(465): 7, np.int64(451): 8, np.int64(86): 9, np.int64(257): 10, np.int64(1014): 11, np.int64(222): 12, np.int64(40): 13, np.int64(29): 14, np.int64(785): 15, np.int64(387): 16, np.int64(274): 17, np.int64(1042): 18, np.int64(1184): 19, np.int64(392): 20, np.int64(486): 21, np.int64(144): 22, np.int64(118): 23, np.int64(1): 24, np.int64(546): 25, np.int64(95): 26, np.int64(768): 27, np.int64(277): 28, np.int64(234): 29, np.int64(246): 30, np.int64(98): 31, np.int64(193): 32, np.int64(88): 33, np.int64(194): 34, np.int64(1081): 35, np.int64(603): 36, np.int64(796): 37, np.int64(32): 38, np.int64(16): 39, np.int64(304): 40, np.int64(979): 41, np.int64(564): 42, np.int64(327): 43, np.int64(201): 44, np.int64(1137): 45, np.int64(241): 46, np.int64(4): 47, np.int64(332): 48, np.int64(100): 49, np.int64(432): 50, np.int64(322): 51, np.int64(181): 52, np.int64(1

In [17]:
data["user_id"] = data["user_id"].map(user2idx)
data["item_id"] = data["item_id"].map(item2idx)

In [18]:
data.head()

Unnamed: 0,user_id,item_id,rating
0,0,0,3
1,1,1,3
2,2,2,1
3,3,3,2
4,4,4,1


In [20]:
num_users = len(user2idx)
num_items = len(item2idx)
print(num_users, num_items)

943 1682


In [21]:
# 划分训练集和测试集
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

In [22]:
train_df.head()

Unnamed: 0,user_id,item_id,rating
75220,804,901,1
48955,467,488,5
44966,465,139,4
13568,321,289,4
92727,618,261,4


In [23]:
test_df.head()

Unnamed: 0,user_id,item_id,rating
75721,873,377,4
80184,808,601,3
19864,90,354,4
76699,409,570,2
92991,496,356,2


In [25]:
train_df["user_id"].values

array([804, 467, 465, ..., 434,  40,  70], shape=(80000,))

In [27]:
# Dataset解决的是第i个样本是什么，每次通过geitem返回
class RatingDataset(Dataset):
    def __init__(self, df):
        # 一连串的tensor序列
        self.users = torch.tensor(df["user_id"].values, dtype=torch.long)
        self.items = torch.tensor(df["item_id"].values, dtype=torch.long)
        self.ratings = torch.tensor(df["rating"].values, dtype=torch.float)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        # 找到第idx个用户的用户名称，评价的电影和评分
        return self.users[idx], self.items[idx], self.ratings[idx]


In [28]:
train_dataset = RatingDataset(train_df)
test_dataset = RatingDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

In [None]:
class FunkSVD(nn.Module):
    def __init__(self, num_users, num_items, k):
        super().__init__()
        # 要把每个用户用k维向量表示，所以embedding矩阵的维度就是用户数量，k维
        # 查表操作就是找到用户的idx，然后变成k维向量
        self.user_emb = nn.Embedding(num_users, k)
        self.item_emb = nn.Embedding(num_items, k)

        # 初始化
        # 这里的意思就是给需要学习的embedding矩阵的每个元素的值初始化
        # 从正态分布N(0, 0.01的平方)随机生成数值
        nn.init.normal_(self.user_emb.weight, std=0.01)
        nn.init.normal_(self.item_emb.weight, std=0.01)

    def forward(self, user, item):
        # 前向传播找评分，然后降维列
        u = self.user_emb(user)
        i = self.item_emb(item)
        # 沿着列维度相加，相当于求和操作，因为矩阵分解的内积公式就是逐元素相乘之后按照维度相加
        return (u * i).sum(dim=1)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = FunkSVD(num_users, num_items, k=50).to(device)

criterion = nn.MSELoss()
# weight_decay就是L2正则化，L2正则化的效果就是，在更新参数的时候额外减去lanmadtheta，所以直接写在优化器里面
# 优化器就是更新参数的一种方法，这里选择了Adam这种优化器。
# 正则化本质是让梯度多一项与参数大小成正比的惩罚项
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=1e-4)

In [None]:
epochs = 20

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for users, items, ratings in train_loader:
        users = users.to(device)
        items = items.to(device)
        ratings = ratings.to(device)

        preds = model(users, items)
        # loss是每个batch的平均损失
        # 我们想看到每个epoch的平均损失
        loss = criterion(preds, ratings)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


Epoch 1, Loss: 9.4552
Epoch 2, Loss: 1.1598
Epoch 3, Loss: 0.9331
Epoch 4, Loss: 0.8973
Epoch 5, Loss: 0.8670
Epoch 6, Loss: 0.8368
Epoch 7, Loss: 0.8148
Epoch 8, Loss: 0.7927
Epoch 9, Loss: 0.7699
Epoch 10, Loss: 0.7464
Epoch 11, Loss: 0.7172
Epoch 12, Loss: 0.6863
Epoch 13, Loss: 0.6566
Epoch 14, Loss: 0.6271
Epoch 15, Loss: 0.5996
Epoch 16, Loss: 0.5681
Epoch 17, Loss: 0.5407
Epoch 18, Loss: 0.5102
Epoch 19, Loss: 0.4793
Epoch 20, Loss: 0.4527


In [None]:
model.eval()
preds_list = []
ratings_list = []

with torch.no_grad():
    for users, items, ratings in test_loader:
        users = users.to(device)
        items = items.to(device)

        preds = model(users, items)

        preds_list.extend(preds.cpu().numpy())
        ratings_list.extend(ratings.numpy())

# mean_squared_error是MSE，开根号就是RMSE
rmse = np.sqrt(mean_squared_error(ratings_list, preds_list))
print("Test RMSE:", rmse)


Test RMSE: 0.9533322071195511


In [None]:
# 训练好模型后：
# 保存模型参数
# 加载模型
# 给定用户 id
# 计算和所有物品向量的内积
# 取 Top-K
# 这就是完整召回流程。

In [44]:
mu = train_df["rating"].mean()
mu

np.float64(3.5312625)

In [None]:
# BaisSVD
class BiasSVD(nn.Module):
    def __init__(self, num_users, num_items, k):
        super().__init__()

        self.user_emb = nn.Embedding(num_users, k)
        self.item_emb = nn.Embedding(num_items, k)

        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)

        nn.init.normal_(self.user_emb.weight, std=0.01)
        nn.init.normal_(self.item_emb.weight, std=0.01)

        # 两个常数初始化维0
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)

        # 这里相当于把这个常数也当成一个参数去学习了
        # 如果要这样子的话，必须给出一个合理的初始值
        self.global_bias = nn.Parameter(torch.zeros(1))

    def forward(self, user, item):
        u = self.user_emb(user)
        i = self.item_emb(item)

        dot = (u * i).sum(dim=1)

        # 删除维度为1的轴，这样子才和上面（4）维度正确，原来是（4，1）
        b_u = self.user_bias(user).squeeze()
        b_i = self.item_bias(item).squeeze()

        return self.global_bias + b_u + b_i + dot


In [46]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)


In [47]:
model = BiasSVD(num_users, num_items, k=50).to(device)
model.global_bias.data.fill_(mu)

tensor([3.5313], device='cuda:0')

In [48]:
epochs = 20

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for users, items, ratings in train_loader:
        users = users.to(device)
        items = items.to(device)
        ratings = ratings.to(device)

        preds = model(users, items)
        # loss是每个batch的平均损失
        # 我们想看到每个epoch的平均损失
        loss = criterion(preds, ratings)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


Epoch 1, Loss: 1.2683
Epoch 2, Loss: 1.2681
Epoch 3, Loss: 1.2688
Epoch 4, Loss: 1.2697
Epoch 5, Loss: 1.2707
Epoch 6, Loss: 1.2659
Epoch 7, Loss: 1.2701
Epoch 8, Loss: 1.2703
Epoch 9, Loss: 1.2665
Epoch 10, Loss: 1.2681
Epoch 11, Loss: 1.2662
Epoch 12, Loss: 1.2665
Epoch 13, Loss: 1.2671
Epoch 14, Loss: 1.2665
Epoch 15, Loss: 1.2683
Epoch 16, Loss: 1.2670
Epoch 17, Loss: 1.2672
Epoch 18, Loss: 1.2726
Epoch 19, Loss: 1.2695
Epoch 20, Loss: 1.2704


In [49]:
model.eval()
preds_list = []
ratings_list = []

with torch.no_grad():
    for users, items, ratings in test_loader:
        users = users.to(device)
        items = items.to(device)

        preds = model(users, items)

        preds_list.extend(preds.cpu().numpy())
        ratings_list.extend(ratings.numpy())

# mean_squared_error是MSE，开根号就是RMSE
rmse = np.sqrt(mean_squared_error(ratings_list, preds_list))
print("Test RMSE:", rmse)


Test RMSE: 1.1238636724234883
