In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer


# Data loading
# Read the movie table and turn the pipe-separated genre string into a list per row.
movies_df = pd.read_csv('../data/ml-25m-reduced/movies.csv').assign(
    genres=lambda frame: frame['genres'].str.split('|')
)
embeddings = np.load('save/ml-25m-reduced-unweight/items_embeddings.npy')  # N x 64

# Multi-hot encode the genre lists. The fitted binarizer is kept in `mlb`
# because later cells reuse it to encode new genre lists.
mlb = MultiLabelBinarizer()
genres_matrix = mlb.fit_transform(movies_df['genres'])  # N x C
genre_labels = mlb.classes_  # genre names, in column order of genres_matrix
num_genres = len(genre_labels)
print(genre_labels)

# Convert both NumPy arrays to float32 tensors:
#   movie_embeddings: N x 64, genre_matrix: N x C
movie_embeddings, genre_matrix = (
    torch.tensor(array, dtype=torch.float32)
    for array in (embeddings, genres_matrix)
)

['(no genres listed)' 'Action' 'Adventure' 'Animation' 'Children' 'Comedy'
 'Crime' 'Documentary' 'Drama' 'Fantasy' 'Film-Noir' 'Horror' 'IMAX'
 'Musical' 'Mystery' 'Romance' 'Sci-Fi' 'Thriller' 'War' 'Western']


In [6]:


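# NOTE: every line of this cell is commented out. It is the training code that
# fits one embedding per genre and saves it to 'genre_embeddings.pt'.
# NOTE(review): the next cell loads 'save/ml-25m-reduced-unweight/genre_embeddings.pt',
# a different path from the one saved here — confirm the file was moved there.
# # Hyperparameters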
# embedding_dim = embeddings.shape[1]  # 64
# batch_size = 128
# num_epochs = 30
# learning_rate = 0.001

# # 自定义 Dataset
# class MovieDataset(Dataset):
#     def __init__(self, embeddings, genre_matrix):
#         self.embeddings = embeddings
#         self.genre_matrix = genre_matrix

#     def __len__(self):
#         return self.embeddings.shape[0]

#     def __getitem__(self, idx):
#         return self.embeddings[idx], self.genre_matrix[idx]

# # 加载数据
# dataset = MovieDataset(movie_embeddings, genre_matrix)
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# # 模型定义
# class GenreEmbeddingModel(nn.Module):
#     def __init__(self, num_genres, embedding_dim):
#         super(GenreEmbeddingModel, self).__init__()
#         # 类别嵌入矩阵
#         self.genre_embeddings = nn.Parameter(torch.randn(num_genres, embedding_dim))

#     def forward(self, genre_matrix):
#         # 根据类别矩阵计算加权嵌入
#         weighted_embeddings = torch.matmul(genre_matrix, self.genre_embeddings) / torch.sum(genre_matrix, dim=1, keepdim=True)
#         return weighted_embeddings

# # 初始化模型
# model = GenreEmbeddingModel(num_genres=num_genres, embedding_dim=embedding_dim)
# criterion = nn.MSELoss()
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# # 训练模型
# for epoch in range(num_epochs):
#     model.train()
#     epoch_loss = 0.0
#     for batch_embeddings, batch_genres in dataloader:
#         optimizer.zero_grad()
#         # 预测
#         predicted_embeddings = model(batch_genres)
#         # 计算损失
#         loss = criterion(predicted_embeddings, batch_embeddings)
#         loss.backward()
#         optimizer.step()
#         epoch_loss += loss.item()

#     print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

# # 保存类别嵌入
# torch.save(model.genre_embeddings.data, 'genre_embeddings.pt')
# print("训练完成，类别嵌入已保存为 genre_embeddings.pt")


In [None]:
# 测试模型结果
def get_movie_embedding(genres, genre_embeddings, mlb):
    """
    Build a movie embedding as the mean of the embeddings of its genres.

    Args:
        genres: iterable of genre names for a single movie, e.g. ['Comedy'].
        genre_embeddings: tensor with one embedding row per class of `mlb`
            (C x D).
        mlb: fitted binarizer exposing `transform`; it converts `genres` into
            a 1 x C multi-hot row.

    Returns:
        1-D tensor of length D: the average of the selected genre rows.

    Raises:
        ValueError: if `genres` selects no known genre (empty list, or only
            labels the binarizer does not know). Without this check the
            average would be 0/0 and the result would be all NaN.
    """
    # Multi-hot encode the genres (1 x C). Use the dtype/device of the
    # embedding table so the matmul below cannot fail on a mismatch.
    genre_vector = torch.tensor(
        mlb.transform([genres]),
        dtype=genre_embeddings.dtype,
        device=genre_embeddings.device,
    )

    num_active = genre_vector.sum()
    if num_active.item() == 0:
        raise ValueError(
            f"None of the given genres {list(genres)!r} is known to the binarizer; "
            "cannot average zero genre embeddings."
        )

    # Sum of the selected genre rows divided by how many were selected.
    movie_embedding = torch.matmul(genre_vector, genre_embeddings) / num_active
    return movie_embedding.squeeze(0)  # drop the leading batch dimension

def get_top_k_similar_movies(movie_embedding, movie_embeddings, k=5):
    """
    Rank existing movies by cosine similarity to a query embedding.

    Args:
        movie_embedding: 1-D query tensor of length D.
        movie_embeddings: 2-D tensor (N x D) of candidate movie embeddings.
        k: number of neighbours to return. Values larger than N are clamped
           to N instead of making `torch.topk` raise.

    Returns:
        (indices, similarities): row indices into `movie_embeddings` of the
        most similar movies, and their cosine similarities, both ordered from
        most to least similar.
    """
    # Broadcast the query (1 x D) against every candidate row -> N similarities.
    similarity = torch.nn.functional.cosine_similarity(movie_embedding.unsqueeze(0), movie_embeddings)
    # torch.topk errors when k exceeds the number of elements, so cap it.
    k = min(k, similarity.shape[0])
    top = torch.topk(similarity, k=k)
    return top.indices, top.values


# Load the trained genre embeddings.
# map_location='cpu' keeps them on the same device as `movie_embeddings`
# (built from a NumPy array with torch.tensor), even if the checkpoint was
# saved from a GPU run.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
loaded_genre_embeddings = torch.load(
    'save/ml-25m-reduced-unweight/genre_embeddings.pt', map_location='cpu'
)


# new_movie_genres = ['Children', 'Adventure', 'Animation','Comedy', 'Fantasy']
new_movie_genres = ['Comedy']
top_k = 5  # number of neighbours to retrieve and report

# 1. Compute the embedding of the new movie from its genres
new_movie_embedding = get_movie_embedding(new_movie_genres, loaded_genre_embeddings, mlb)
# 2. Get the indices of the top-k most similar existing movies
top_k_indices, similarities = get_top_k_similar_movies(new_movie_embedding, movie_embeddings, k=top_k)

# Report the results
print(f"新电影类别: {new_movie_genres}")
print(f"Top-{top_k} 相似电影索引: {top_k_indices.tolist()}")
print(f"相似度: {similarities.tolist()}")

# NOTE(review): positional lookup assumes row i of movies_df corresponds to
# row i of the embedding matrix — confirm the two files are aligned.
top_k_movies = movies_df.iloc[top_k_indices.tolist()]
print(f"\nTop-{top_k} 相似电影:")
print(top_k_movies[['title', 'genres']])

新电影类别: ['Comedy']
Top-5 相似电影索引: [328, 14324, 16212, 28046, 43996]
相似度: [0.6367900967597961, 0.6337140202522278, 0.6267668604850769, 0.618573009967804, 0.6138240098953247]

Top-5 相似电影:
                                       title                       genres
328                  Flintstones, The (1994)  [Children, Comedy, Fantasy]
14324                Pixar Story, The (2007)                [Documentary]
16212                      Wanderlust (2012)                     [Comedy]
28046             The Propaganda Game (2015)                [Documentary]
43996  Making Fun: The Story of Funko (2018)                [Documentary]
