In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [2]:
data = pd.read_csv(
    "ml-100k/u.data",
    sep="\t",
    names=["user_id", "item_id", "rating", "timestamp"]
)

data = data.drop("timestamp", axis=1)
data.head()


Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [4]:
# 只保留评分>0的交互
df = data.copy()

# 按时间排序（推荐这么做）
# 如果你已经drop timestamp，就不排序了
# 如果有timestamp最好按时间排序

user_sequences = (
    df.groupby("user_id")["item_id"]
      .apply(list)
      .tolist()
)
len(user_sequences)

943

In [5]:
user_sequences[1]

[292,
 251,
 50,
 314,
 297,
 290,
 312,
 281,
 13,
 280,
 303,
 308,
 307,
 257,
 316,
 315,
 301,
 313,
 279,
 299,
 298,
 19,
 277,
 282,
 111,
 258,
 295,
 242,
 283,
 276,
 1,
 305,
 14,
 287,
 291,
 293,
 294,
 310,
 309,
 306,
 25,
 273,
 10,
 311,
 269,
 255,
 284,
 274,
 237,
 300,
 100,
 127,
 285,
 289,
 304,
 272,
 278,
 288,
 286,
 275,
 302,
 296]

In [6]:
item2idx = {item: idx for idx, item in enumerate(df["item_id"].unique())}
idx2item = {idx: item for item, idx in item2idx.items()}

# 替换序列
encoded_sequences = [
    [item2idx[i] for i in seq]
    for seq in user_sequences
]

num_items = len(item2idx)


In [7]:
def generate_skipgram_pairs(sequences, window_size=2):
    pairs = []
    for seq in sequences:
        for i, center in enumerate(seq):
            for j in range(-window_size, window_size+1):
                if j == 0:
                    continue
                if 0 <= i+j < len(seq):
                    context = seq[i+j]
                    pairs.append((center, context))
    return pairs

pairs = generate_skipgram_pairs(encoded_sequences, window_size=2)
print("样本数量:", len(pairs))


样本数量: 394342


In [8]:
pairs[0]

(169, 236)

In [9]:
import random

neg_sample_num = 5

def negative_sampling(pair, num_items, neg_sample_num):
    center, pos = pair
    negatives = []
    while len(negatives) < neg_sample_num:
        neg = random.randint(0, num_items-1)
        if neg != pos:
            negatives.append(neg)
    return center, pos, negatives


In [13]:
import torch
import torch.nn as nn

class Item2Vec(nn.Module):
    def __init__(self, num_items, embed_dim):
        super().__init__()
        self.center_emb = nn.Embedding(num_items, embed_dim)
        self.context_emb = nn.Embedding(num_items, embed_dim)
        
        nn.init.normal_(self.center_emb.weight, std=0.01)
        nn.init.normal_(self.context_emb.weight, std=0.01)

    def forward(self, center, pos, neg):
        center_vec = self.center_emb(center)      # [B,K]
        pos_vec = self.context_emb(pos)          # [B,K]
        neg_vec = self.context_emb(neg)          # [B,neg,K]

        pos_score = torch.sum(center_vec * pos_vec, dim=1)
        neg_score = torch.bmm(neg_vec, center_vec.unsqueeze(2)).squeeze(2)

        return pos_score, neg_score


In [14]:
def loss_fn(pos_score, neg_score):
    pos_loss = torch.log(torch.sigmoid(pos_score))
    neg_loss = torch.log(torch.sigmoid(-neg_score)).sum(dim=1)
    return -(pos_loss + neg_loss).mean()


In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Item2Vec(num_items=num_items, embed_dim=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

epochs = 5

for epoch in range(epochs):
    total_loss = 0
    
    for pair in pairs:
        center, pos, negs = negative_sampling(pair, num_items, neg_sample_num)
        
        center = torch.tensor([center]).to(device)
        pos = torch.tensor([pos]).to(device)
        negs = torch.tensor([negs]).to(device)

        pos_score, neg_score = model(center, pos, negs)
        loss = loss_fn(pos_score, neg_score)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(pairs):.4f}")


Epoch 1, Loss: 3.3418
Epoch 2, Loss: 3.5521
Epoch 3, Loss: 3.6097


KeyboardInterrupt: 

In [None]:
import torch.nn.functional as F

def most_similar(item_id, topk=10):
    idx = item2idx[item_id]
    target = item_embedding[idx]
    
    sims = F.cosine_similarity(target.unsqueeze(0), item_embedding)
    values, indices = torch.topk(sims, topk+1)
    
    result = [idx2item[i.item()] for i in indices[1:]]
    return result


In [None]:
most_similar(242)

In [16]:
# EGES - 加强版Item2Vec
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 商品总数
num_items = 6

# 每个商品的类别
item_category = {
    0: 0,
    1: 0,
    2: 1,
    3: 1,
    4: 2,
    5: 2
}

# 每个商品的品牌
item_brand = {
    0: 0,
    1: 1,
    2: 0,
    3: 1,
    4: 0,
    5: 1
}

num_categories = 3
num_brands = 2


In [17]:
# 每个用户的一次会话
sessions = [
    [0, 1, 2, 3],
    [2, 3, 4],
    [1, 0, 2],
    [4, 5, 3],
]


In [18]:
from collections import defaultdict

item_graph = defaultdict(lambda: defaultdict(int))

# 在同一会话里两两建立边
for session in sessions:
    for i in range(len(session)):
        for j in range(i+1, len(session)):
            item_graph[session[i]][session[j]] += 1
            item_graph[session[j]][session[i]] += 1

item_graph

defaultdict(<function __main__.<lambda>()>,
            {0: defaultdict(int, {1: 2, 2: 2, 3: 1}),
             1: defaultdict(int, {0: 2, 2: 2, 3: 1}),
             2: defaultdict(int, {0: 2, 1: 2, 3: 2, 4: 1}),
             3: defaultdict(int, {0: 1, 1: 1, 2: 2, 4: 2, 5: 1}),
             4: defaultdict(int, {2: 1, 3: 2, 5: 1}),
             5: defaultdict(int, {4: 1, 3: 1})})

In [None]:
def random_walk(start, walk_length=5):
    walk = [start]
    current = start
    
    for _ in range(walk_length - 1):
        neighbors = list(item_graph[current].keys())
        if not neighbors:
            break
        current = random.choice(neighbors)
        walk.append(current)
    return walk

# 生成训练序列
walks = []
for item in range(num_items):
    for _ in range(5):  # 每个item走5次
        walks.append(random_walk(item))

print(walks)


[[0, 3, 2, 1, 3], [0, 3, 0, 3, 2], [0, 1, 2, 0, 2], [0, 1, 0, 2, 4], [0, 1, 2, 3, 5], [1, 3, 1, 0, 3], [1, 0, 3, 2, 3], [1, 0, 1, 3, 0], [1, 3, 1, 0, 2], [1, 0, 1, 0, 3], [2, 0, 3, 1, 3], [2, 0, 2, 0, 2], [2, 3, 2, 3, 0], [2, 3, 1, 3, 2], [2, 0, 1, 0, 3], [3, 2, 0, 3, 4], [3, 4, 2, 4, 5], [3, 4, 3, 0, 2], [3, 2, 3, 2, 4], [3, 0, 2, 0, 3], [4, 3, 2, 3, 2], [4, 2, 4, 3, 0], [4, 2, 3, 0, 3], [4, 5, 3, 4, 5], [4, 5, 4, 5, 3], [5, 4, 3, 5, 3], [5, 3, 4, 2, 3], [5, 4, 3, 4, 2], [5, 4, 2, 4, 3], [5, 3, 2, 3, 1]]


In [20]:
print(len(walks))

30


In [21]:
window_size = 2
pairs = []

for walk in walks:
    for i, center in enumerate(walk):
        for j in range(-window_size, window_size+1):
            if j == 0:
                continue
            if 0 <= i+j < len(walk):
                context = walk[i+j]
                pairs.append((center, context))

print("训练样本数量:", len(pairs))

训练样本数量: 420


In [22]:
pairs[0]

(0, 3)

In [24]:
class EGESDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        center, context = self.pairs[idx]
        return torch.tensor(center), torch.tensor(context)

dataset = EGESDataset(pairs)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [25]:
class EGES(nn.Module):
    def __init__(self, num_items, num_categories, num_brands, emb_dim):
        super(EGES, self).__init__()
        
        self.emb_dim = emb_dim
        
        # 商品ID embedding
        self.item_emb = nn.Embedding(num_items, emb_dim)
        
        # 类别 embedding
        self.category_emb = nn.Embedding(num_categories, emb_dim)
        
        # 品牌 embedding
        self.brand_emb = nn.Embedding(num_brands, emb_dim)
        
        # 商品特定注意力参数 α
        # 每个商品有 3 个权重（ID + category + brand）
        self.attention = nn.Embedding(num_items, 3)
        
        # 输出embedding（用于skipgram）
        self.output_emb = nn.Embedding(num_items, emb_dim)

        self.init_weights()

    def init_weights(self):
        nn.init.normal_(self.item_emb.weight, std=0.01)
        nn.init.normal_(self.category_emb.weight, std=0.01)
        nn.init.normal_(self.brand_emb.weight, std=0.01)
        nn.init.normal_(self.output_emb.weight, std=0.01)
        nn.init.zeros_(self.attention.weight)

    def get_item_representation(self, item_ids):
        """
        计算融合后的商品向量 H_v
        """
        # 取出各个embedding
        id_emb = self.item_emb(item_ids)
        cat_ids = torch.tensor([item_category[i.item()] for i in item_ids]).to(device)
        brand_ids = torch.tensor([item_brand[i.item()] for i in item_ids]).to(device)
        
        cat_emb = self.category_emb(cat_ids)
        brand_emb = self.brand_emb(brand_ids)
        
        # 拼接 [batch, 3, emb_dim]
        stacked = torch.stack([id_emb, cat_emb, brand_emb], dim=1)
        
        # 取出对应商品的注意力权重
        alpha = self.attention(item_ids)
        alpha = torch.softmax(alpha, dim=1)  # softmax归一化
        
        alpha = alpha.unsqueeze(-1)  # [batch, 3, 1]
        
        # 加权求和
        h = torch.sum(stacked * alpha, dim=1)
        
        return h

    def forward(self, center_ids, context_ids):
        center_vec = self.get_item_representation(center_ids)
        context_vec = self.output_emb(context_ids)
        
        score = torch.sum(center_vec * context_vec, dim=1)
        return score


In [26]:
model = EGES(num_items, num_categories, num_brands, emb_dim=16).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.BCEWithLogitsLoss()

epochs = 10
neg_samples = 3

for epoch in range(epochs):
    total_loss = 0
    
    for center, context in dataloader:
        center = center.to(device)
        context = context.to(device)
        
        # 正样本
        pos_score = model(center, context)
        pos_label = torch.ones_like(pos_score)
        
        # 负采样
        neg_context = torch.randint(0, num_items, context.shape).to(device)
        neg_score = model(center, neg_context)
        neg_label = torch.zeros_like(neg_score)
        
        # 拼接
        scores = torch.cat([pos_score, neg_score])
        labels = torch.cat([pos_label, neg_label])
        
        loss = loss_fn(scores, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1, Loss: 9.6716
Epoch 2, Loss: 9.5172
Epoch 3, Loss: 9.3811
Epoch 4, Loss: 9.0642
Epoch 5, Loss: 9.2175
Epoch 6, Loss: 9.3063
Epoch 7, Loss: 8.8088
Epoch 8, Loss: 8.7373
Epoch 9, Loss: 8.9273
Epoch 10, Loss: 8.9564


In [27]:
item_ids = torch.arange(num_items).to(device)
embeddings = model.get_item_representation(item_ids)

print(embeddings.shape)  # [num_items, emb_dim]


torch.Size([6, 16])
