In [10]:
import pandas as pd
from collections import defaultdict
from scipy.spatial.distance import cosine
import networkx as nx
from tqdm import tqdm
import random
import csv
import numpy as np

In [2]:
# 读取数据
ratings = pd.read_csv(
    'data/ml-25m-reduced/ratings.csv', 
    names=['userId', 'movieId', 'rating', 'timestamp'],
    low_memory=False
    )

In [3]:
movies = pd.read_csv(
    'data/ml-25m-reduced/movies.csv', 
    names=['movieId', 'title', 'genres'],
    low_memory=False
)

In [4]:
# 为每个电影分配类别
movie_genres = {row['movieId']: set(row['genres'].split('|')) for _, row in movies.iterrows()}

In [5]:
# 构建用户-电影评分的稀疏矩阵，使用字典来存储
user_ratings = defaultdict(dict)
for _, row in tqdm(ratings.iterrows(), total=len(ratings), desc="Building user-movie sparse matrix"):
    user_ratings[row['userId']][row['movieId']] = row['rating']

Building user-movie sparse matrix: 100%|██████████| 2500010/2500010 [00:59<00:00, 42037.73it/s]


In [6]:
# 构建电影到用户的倒排索引
movie_to_users = defaultdict(set)
for user, movies in user_ratings.items():
    for movie in movies:
        movie_to_users[movie].add(user)

In [21]:
# 计算电影相似度
def calculate_movie_similarity(movie1, movie2):
    # 获取电影的评分信息
    users1 = movie_to_users[movie1]
    users2 = movie_to_users[movie2]
    
    # 找到共同评分的用户
    common_users = users1.intersection(users2)
    if len(common_users) == 0:
        return 0
    
    # 获取共同评分用户的评分
    ratings1_common = [user_ratings[user][movie1] for user in common_users]
    ratings2_common = [user_ratings[user][movie2] for user in common_users]
    
    # 确保将评分转换为 numpy 数组并且是浮动类型
    ratings1_common = np.array(ratings1_common, dtype=float)
    ratings2_common = np.array(ratings2_common, dtype=float)
    
    # 计算评分的余弦相似度
    similarity = 1 - cosine(ratings1_common, ratings2_common)
    
    # 计算类别相似度：使用Jaccard相似度
    genres1 = movie_genres.get(movie1, set())
    genres2 = movie_genres.get(movie2, set())
    genre_similarity = len(genres1.intersection(genres2)) / len(genres1.union(genres2)) if len(genres1.union(genres2)) > 0 else 0
    
    # 综合考虑共同评分用户的数量和类别相似度
    weight = len(common_users) / (len(common_users) + 1)
    total_similarity = similarity * genre_similarity * weight
    
    return total_similarity

In [26]:
# 随机选择电影
random_movies = random.sample(list(movie_to_users.keys()), 1000)

In [None]:
# 创建图
G_movies = nx.Graph()

# 计算电影之间的相似度并添加到图中
for i, movie1 in tqdm(enumerate(random_movies), total=len(random_movies), desc="Calculating movie similarities"):
    for movie2 in random_movies[i+1:]:
        similarity = calculate_movie_similarity(movie1, movie2)
        if similarity > 0:
            G_movies.add_edge(movie1, movie2, weight=similarity)

Calculating movie similarities: 100%|██████████| 1000/1000 [00:02<00:00, 428.62it/s]


In [28]:
# 保存电影边信息到 graph_with_movie_similarities.csv
edges_output_file = 'vis-data/movie_graph_with_similarities.csv'
with open(edges_output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['source', 'target', 'type', 'weight'])
    
    for source, target, data in G_movies.edges(data=True):  # 遍历图中的边
        weight = data.get('weight', 1)  # 获取边的权重
        writer.writerow([source, target, 'undirected', weight])  # 假设边为无向边

print(f"Edges saved to {edges_output_file}")

Edges saved to vis-data/movie_graph_with_similarities.csv


In [29]:
# 保存电影节点信息到 nodes_with_categories.csv
nodes_output_file = 'vis-data/nodes_with_categories.csv'
with open(nodes_output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['id', 'label'])
    
    for movie in random_movies:  # 只保存随机选择的电影
        label = movie  # label 与 id 相同
        # category = movie_genres.get(movie, set())  # 获取电影的类别信息
        writer.writerow([movie, label])

print(f"Nodes saved to {nodes_output_file}")

Nodes saved to vis-data/nodes_with_categories.csv
