In [1]:
import torch
import pickle
import numpy as np
import pandas as pd
from torch import nn
from tqdm import tqdm
from utils import collate_fn
from graph_rec_model import GraphRec
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [2]:
# Read the saved ratings CSV into `loaded_data`.
# A forward-slash path is used because it resolves on Windows and on POSIX systems,
# whereas a backslash-separated path is only valid on Windows.
loaded_data = pd.read_csv('data/book_score.csv')

# Show the loaded data
print(loaded_data)


           User     Book  Rate                       Time         Tag
0       1398478  1467022     0  2011-03-29T12:48:35+08:00         NaN
1       1398478  1777823     0  2011-02-02T21:58:55+08:00         NaN
2       1398478  1902628     0  2011-01-31T15:57:58+08:00         NaN
3       1398478  1878708     0  2011-01-26T11:27:59+08:00         NaN
4       1398478  4238362     0  2011-01-21T13:04:15+08:00         NaN
...         ...      ...   ...                        ...         ...
637249  4507957  1125186     4  2009-07-04T08:02:13+08:00  张爱玲,半生缘,爱情
637250  4507957  1002299     5  2009-07-04T08:01:28+08:00  金庸,武侠,笑傲江湖
637251  4507957  1001136     4  2009-07-04T07:55:17+08:00     彼得・潘,童话
637252  4507957  1021615     5  2009-07-04T07:53:54+08:00   小王子,童话,经典
637253  4507957  1962929     5  2009-06-29T22:13:37+08:00          爱情

[637254 rows x 5 columns]


### 根据 GraphRec 构建数据表，分为以下几个列表：
+ u_items_list：用户交互过的所有item以及评分
+ i_users_list：item交互过的所有用户以及评分
+ u_users_list：用户的社交网络
+ u_users_items_list：用户的社交关系的交互物品

In [3]:
def create_id_mapping(id_list):
    """Build forward and reverse lookups between raw ids and contiguous indices.

    Duplicates in ``id_list`` are dropped and the remaining ids are sorted in
    ascending order; the smallest id receives index 1, the next index 2, and
    so on (index 0 is never assigned).

    Args:
        id_list: iterable of hashable, mutually comparable raw ids.

    Returns:
        A tuple ``(id_to_idx, idx_to_id)`` of dictionaries: raw id -> index and
        index -> raw id.
    """
    ordered_ids = list(set(id_list))
    ordered_ids.sort()

    # Pair each id with its 1-based position in the sorted order.
    id_to_idx = dict(zip(ordered_ids, range(1, len(ordered_ids) + 1)))

    # Invert the mapping so an index can be translated back to the raw id.
    idx_to_id = {position: raw_id for raw_id, position in id_to_idx.items()}

    return id_to_idx, idx_to_id

In [4]:
# Distinct raw user ids and book ids that occur in the ratings table.
user_ids, book_ids = (loaded_data[column].unique() for column in ('User', 'Book'))

# Forward (raw id -> index) and reverse (index -> raw id) lookups for each entity.
user_to_idx, idx_to_user = create_id_mapping(user_ids)
book_to_idx, idx_to_book = create_id_mapping(book_ids)

In [5]:
# Position 0 of each list is a placeholder so that list position == mapped index
# (mapped indices start at 1).
# NOTE(review): the placeholder is a bare (0, 0) tuple while every real entry is a
# list of tuples — confirm nothing downstream reads position 0.
u_items_list, i_users_list = [(0, 0)], [(0, 0)]

# Attach the contiguous user / book indices to every rating row.
loaded_data['user_map'] = loaded_data['User'].map(user_to_idx)
loaded_data['book_map'] = loaded_data['Book'].map(book_to_idx)

# Group the rows by mapped user index and by mapped book index.
grouped_user = loaded_data.groupby('user_map')
grouped_book = loaded_data.groupby('book_map')

# One entry per user: the (book, rate) pairs of that user's rows.
for user, group in tqdm(grouped_user):
    books = group['book_map'].tolist()
    rates = group['Rate'].tolist()
    u_items_list.append(list(zip(books, rates)))

# One entry per book: the (user, rate) pairs of that book's rows.
for book, group in tqdm(grouped_book):
    users = group['user_map'].tolist()
    rates = group['Rate'].tolist()
    i_users_list.append(list(zip(users, rates)))

100%|██████████| 4419/4419 [00:00<00:00, 18040.36it/s]
100%|██████████| 1200/1200 [00:00<00:00, 9353.69it/s]


In [6]:
# 初始化一个空字典来存储社交关系
contact = {}

# 打开文件并读取内容
with open('data\Contacts.txt', 'r') as f:
    for line in f:
        # 分割每一行的内容
        user, friends = line.strip().split(':')
        # 将朋友列表转换为整数列表
        if int(user) in user_to_idx:
            friends_list = [user_to_idx[int(friend)] for friend in friends.split(',') if int(friend) in user_to_idx]
            # 将朋友列表添加到字典中
            contact[user_to_idx[int(user)]] = friends_list

contact_sorted = {k: v for k, v in sorted(contact.items())}
# 打印字典的内容
print(contact_sorted)


{1: [1038, 398, 192, 2173, 967, 1237, 52, 331, 716, 24, 3, 40, 270, 574, 422, 738, 1597, 238, 514, 1659, 160, 818, 1199, 271, 488, 709, 256, 27, 86, 11, 20, 2], 2: [270, 534, 2390, 2547, 178, 11, 2576, 2413, 524, 4213, 3121, 130, 1711, 2227, 1550, 3584, 982, 1699, 2011, 2720, 1325, 56, 1924, 69, 1748, 2163, 1111, 291, 386, 570, 441, 244, 213, 13, 3, 1], 3: [4327, 3327, 177, 1767, 3144, 238, 10, 56, 1486, 170, 16, 15, 410, 532, 25, 546, 18, 134, 236, 514, 291, 1, 698, 696, 271, 41, 198, 332, 174, 20, 250, 257], 4: [3987, 4029, 3278, 2946, 193, 2680, 2573, 16, 1649, 1408, 909, 213, 455, 118, 8], 5: [269, 3327, 2321, 1355, 1689, 1975, 533, 622, 441, 1811, 270, 2859, 41, 47], 6: [2356, 2031, 2054, 2688, 834, 25, 626, 1700, 1129, 793, 88, 8, 3], 7: [3104, 1533, 156, 137, 1880, 4270, 1924, 3346, 3576, 543, 2418, 3, 2433, 1536, 238, 41, 531, 3463, 1762, 1118, 1541, 139], 8: [818, 37, 3, 2, 682, 6, 4], 9: [2574, 291, 15, 2356, 458, 1392, 2289, 4370, 3738, 3564, 230, 1736, 2903, 2547, 3151, 323

In [7]:
# Position 0 of each list is a placeholder so that list position == mapped user index.
# NOTE(review): the placeholders (0 and [(0, 1)]) do not have the same nesting as the
# real entries — confirm nothing downstream reads position 0.
u_users_list, u_users_items_list = [0], [[(0, 1)]]

# Walk every mapped user index (1..N) rather than only the users present in the
# contacts file: if a user had no line there, iterating the dictionary would shift
# all later entries by one and position i would no longer belong to user i.
for user in tqdm(range(1, len(user_to_idx) + 1)):
    friends = contact_sorted.get(user, [])
    u_users_list.append(friends)
    # For each friend, the (book, rate) pairs that friend interacted with.
    u_users_items_list.append([u_items_list[uid] for uid in friends])

100%|██████████| 4419/4419 [00:00<00:00, 733783.18it/s]


In [8]:
class BookRatingDataset(Dataset):
    """Map-style dataset yielding one rating row plus its graph context.

    Each sample is looked up by position in ``data`` and combined with the
    precomputed neighbourhood lists of the row's user and book.

    Args:
        data: DataFrame with at least the columns ``'User'``, ``'Book'`` and ``'Rate'``.
        user_to_idx: mapping raw user id -> user index.
        book_to_idx: mapping raw book id -> book index.
        u_items_list: sequence indexed by user index.
        u_users_list: sequence indexed by user index.
        u_users_items_list: sequence indexed by user index.
        i_users_list: sequence indexed by book index.
    """

    def __init__(self, data, user_to_idx, book_to_idx, u_items_list, u_users_list, u_users_items_list, i_users_list):
        self.data = data
        self.user_to_idx = user_to_idx
        self.book_to_idx = book_to_idx
        self.u_items_list = u_items_list
        self.u_users_list = u_users_list
        self.u_users_items_list = u_users_items_list
        self.i_users_list = i_users_list

    def __getitem__(self, index):
        """Return ``((user, book, rating), u_items, u_users, u_users_items, i_users)``.

        ``user`` and ``book`` are mapped indices, ``rating`` is a ``np.float32``
        scalar, and the remaining four values are the entries of the
        corresponding lists for that user / book.
        """
        row = self.data.iloc[index]
        user = self.user_to_idx[row['User']]
        book = self.book_to_idx[row['Book']]
        # np.float32(...) accepts NumPy scalars and plain Python numbers alike;
        # calling .astype on the value fails when pandas hands back a Python number
        # (e.g. when the 'Rate' column has object dtype).
        rating = np.float32(row['Rate'])
        u_items = self.u_items_list[user]
        u_users = self.u_users_list[user]
        u_users_items = self.u_users_items_list[user]
        i_users = self.i_users_list[book]

        return (user, book, rating), u_items, u_users, u_users_items, i_users

    def __len__(self):
        """Number of rating rows in ``data``."""
        return len(self.data)

In [9]:
# Compute NDCG for one user's group of predictions
def compute_ndcg(group, k=50):
    """Return NDCG@k for one group of rows.

    Args:
        group: DataFrame with a ``'true'`` column (ground-truth ratings) and a
            ``'pred'`` column (predicted ratings).
        k: rank cut-off forwarded to ``ndcg_score``; defaults to 50.

    Returns:
        The NDCG score as a float, or ``nan`` when the group holds fewer than
        two rows. A ranking of a single item carries no information, and recent
        scikit-learn releases reject such input — TODO confirm against the
        installed version. ``nan`` results are skipped by a pandas ``.mean()``.
    """
    true_ratings = group['true'].tolist()
    pred_ratings = group['pred'].tolist()
    if len(true_ratings) < 2:
        return float('nan')
    return ndcg_score([true_ratings], [pred_ratings], k=k)

In [10]:
# Split the rating rows into a training half and a test half (fixed seed).
train_data, test_data = train_test_split(loaded_data, test_size=0.5, random_state=42)

# Wrap both splits in dataset objects that share the same graph lookup lists.
train_dataset = BookRatingDataset(train_data, user_to_idx, book_to_idx, u_items_list, u_users_list, u_users_items_list, i_users_list)
test_dataset = BookRatingDataset(test_data, user_to_idx, book_to_idx, u_items_list, u_users_list, u_users_items_list, i_users_list)

# Data loaders for both splits.
train_dataloader = DataLoader(train_dataset, batch_size=4096, shuffle=True, collate_fn=collate_fn, drop_last=True)
# Keep the final partial batch at evaluation time: with drop_last=True the last
# (len(test_dataset) % 4096) held-out rows would never be scored.
test_dataloader = DataLoader(test_dataset, batch_size=4096, shuffle=False, collate_fn=collate_fn, drop_last=False)

num_users = loaded_data['User'].nunique()  # number of distinct users in the data
num_books = loaded_data['Book'].nunique()  # number of distinct books in the data
embedding_dim = 32

# The "+ 1" accounts for mapped indices starting at 1 (index 0 is never assigned).
# NOTE(review): 7 is GraphRec's third positional argument — presumably the number of
# rating levels; confirm against graph_rec_model.
model = GraphRec(num_users + 1, num_books + 1, 7, embedding_dim).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

### 训练

In [11]:
num_epochs = 20

for epoch in range(num_epochs):
    # ---- training pass -------------------------------------------------------
    model.train()
    total_loss_train, total_loss_test = 0.0, 0.0
    train_batches, test_batches = 0, 0

    # Batch variables are named batch_* so they do not overwrite the notebook-level
    # `user_ids` / `book_ids` arrays and cannot leak into the evaluation pass.
    for batch_users, batch_books, batch_ratings, u_items, u_users, u_users_items, i_users in tqdm(train_dataloader):
        optimizer.zero_grad()

        predictions = model(batch_users.to(device), batch_books.to(device), u_items.to(device), u_users.to(device), u_users_items.to(device), i_users.to(device))
        loss = criterion(predictions.squeeze(1), batch_ratings.to(device))

        loss.backward()
        optimizer.step()

        total_loss_train += loss.item()
        train_batches += 1

    # Mean training loss per batch (guarded against an empty loader).
    output_loss_train = total_loss_train / max(train_batches, 1)

    # ---- evaluation pass -----------------------------------------------------
    results = []
    model.eval()

    with torch.no_grad():
        for batch_users, batch_books, true_ratings, u_items, u_users, u_users_items, i_users in test_dataloader:
            # Score the books of THIS test batch. The earlier code passed the book
            # ids left over from the last training batch.
            pred_ratings = model(batch_users.to(device), batch_books.to(device), u_items.to(device), u_users.to(device), u_users_items.to(device), i_users.to(device))

            # Compare against this batch's ground truth, not the stale training ratings.
            loss = criterion(pred_ratings.squeeze(1), true_ratings.to(device))
            total_loss_test += loss.item()
            test_batches += 1

            # Convert the batch to column vectors on the CPU.
            user_ids_np = batch_users.long().cpu().numpy().reshape(-1, 1)
            pred_ratings_np = pred_ratings.cpu().numpy().reshape(-1, 1)
            true_ratings_np = true_ratings.cpu().numpy().reshape(-1, 1)

            # One row per sample: (user, pred, true).
            batch_results = np.column_stack((user_ids_np, pred_ratings_np, true_ratings_np))
            results.append(batch_results)

    # Stack all batches into one table and compute NDCG per user.
    results = np.vstack(results)
    results_df = pd.DataFrame(results, columns=['user', 'pred', 'true'])
    results_df['user'] = results_df['user'].astype(int)

    ndcg_scores = results_df.groupby('user').apply(compute_ndcg)

    # Average NDCG over users.
    avg_ndcg = ndcg_scores.mean()
    print(f'Epoch {epoch}, Loss: {output_loss_train}, MSE loss:, {total_loss_test / max(test_batches, 1)}, Average NDCG: {avg_ndcg}')

19it [00:49,  2.63s/it]


KeyboardInterrupt: 