In [1]:
import pandas as pd
from collections import defaultdict
from scipy.spatial.distance import cosine
import networkx as nx
from tqdm import tqdm
import pickle
import gc
import random
import community as community_louvain
import csv
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# 读取数据
ratings = pd.read_csv(
    'data/ml-25m-reduced/ratings.csv', 
    names=['userId', 'movieId', 'rating', 'timestamp'],
    low_memory=False
    )

In [3]:
# 构建用户-电影评分的稀疏矩阵，使用字典来存储
user_ratings = defaultdict(dict)
for _, row in tqdm(ratings.iterrows(), total=len(ratings), desc="Building user-movie sparse matrix"):
    user_ratings[row['userId']][row['movieId']] = row['rating']

# 找出所有用户ID
all_users = list(user_ratings.keys())

Building user-movie sparse matrix: 100%|██████████| 2500010/2500010 [01:01<00:00, 40845.12it/s]


In [4]:
# 构建电影到用户的倒排索引
movie_to_users = defaultdict(set)
for user, movies in user_ratings.items():
    for movie in movies:
        movie_to_users[movie].add(user)

In [5]:
# 找到与某用户可能有相似兴趣的候选用户
def get_candidate_users(user):
    candidate_users = set()
    for movie in user_ratings[user]:
        candidate_users.update(movie_to_users[movie])
    candidate_users.discard(user)
    return candidate_users

In [13]:
# 计算用户相似度
def calculate_similarity(user1, user2):
    ratings1 = user_ratings[user1]
    ratings2 = user_ratings[user2]
    
    # 找到共同评分的电影，如果没有共同评分的电影，则返回相似度为 0
    common_movies = set(ratings1.keys()).intersection(set(ratings2.keys()))
    if len(common_movies) == 0:
        return 0
    
    # 获取这些共同电影的评分
    ratings1_common = [ratings1[movie] for movie in common_movies]
    ratings2_common = [ratings2[movie] for movie in common_movies]
    ratings1_common = np.array([float(rating) for rating in ratings1_common])
    ratings2_common = np.array([float(rating) for rating in ratings2_common])

    # 计算余弦相似度
    # 注意这个函数的使用方式
    similarity = 1 - cosine(ratings1_common, ratings2_common)
    
    # 计算加权：共同评分电影越多，权重越大
    weight = len(common_movies) / (len(common_movies) + 1)
    
    return similarity * weight

In [14]:
# 计算相似度并使用缓存
similarity_cache = {}

def calculate_similarity_with_cache(user1, user2):
    if (user1, user2) in similarity_cache:
        return similarity_cache[(user1, user2)]
    if (user2, user1) in similarity_cache:
        return similarity_cache[(user2, user1)]
    
    sim = calculate_similarity(user1, user2)
    similarity_cache[(user1, user2)] = sim
    return sim

In [26]:
# 随机选取用户
users = random.sample(all_users, 100)

graph_file = 'vis-data/incremental_user_similarity_graph.pkl'
progress_file = 'vis-data/processed_users.pkl'

# 加载已经保存的图和进度
try:
    with open(graph_file, 'rb') as f:
        G = pickle.load(f)
    with open(progress_file, 'rb') as f:
        processed_users = pickle.load(f)
    print("Resuming from saved progress...")
except FileNotFoundError:
    G = nx.Graph()
    processed_users = set()
    print("No saved progress found, starting fresh...")

# 找到尚未处理的用户
remaining_users = [user for user in users if user not in processed_users]

No saved progress found, starting fresh...


In [None]:
# # 处理用户并保存进度，每 batch_size 个用户保存一次
# batch_size = 500
# batch_counter = 0

for i, user1 in tqdm(enumerate(remaining_users), total=len(remaining_users), desc="Calculating similarities"):
    candidate_users = get_candidate_users(user1)  # 候选用户
    for user2 in candidate_users:
        dis = calculate_similarity(user1, user2)
        # print(f"Similarity between {user1} and {user2}: {dis}")
        if dis >= 0.9:  # 阈值
            G.add_edge(user1, user2, weight=dis)
#     # 记录已处理的用户
#     processed_users.add(user1)
#     batch_counter += 1

#     # 每处理完一批用户后保存进度
#     if batch_counter >= batch_size:
#         # 保存进度
#         with open(graph_file, 'wb') as graph_f, open(progress_file, 'wb') as progress_f:
#             pickle.dump(G, graph_f)
#             pickle.dump(processed_users, progress_f)
        
#         # 清除相似度缓存
#         similarity_cache.clear()
#         gc.collect()  # 手动触发垃圾回收释放内存
        
#         # print(f"Saved progress for {len(processed_users)} users. Cache cleared.")
#         batch_counter = 0  # 重置计数器

# # 在最后确保保存所有进度
# with open(graph_file, 'wb') as graph_f, open(progress_file, 'wb') as progress_f:
#     pickle.dump(G, graph_f)
#     pickle.dump(processed_users, progress_f)
# print(f"Processing complete. Graph saved to {graph_file}, progress saved to {progress_file}")

Calculating similarities: 100%|██████████| 100/100 [00:45<00:00,  2.19it/s]


Processing complete. Graph saved to vis-data/incremental_user_similarity_graph.pkl, progress saved to vis-data/processed_users.pkl


In [23]:
# 设置文件名
graph_file = 'vis-data/incremental_user_similarity_graph.pkl'
progress_file = 'vis-data/processed_users.pkl'

# 加载图和进度
with open(graph_file, 'rb') as graph_f, open(progress_file, 'rb') as progress_f:
    G = pickle.load(graph_f)
    processed_users = pickle.load(progress_f)

print(f"Graph and progress loaded. Processed {len(processed_users)} users.")

Graph and progress loaded. Processed 100 users.


In [24]:
# 对图进行社区分割
print("Performing community detection using Louvain method...")
partition = community_louvain.best_partition(G)  # Louvain算法分割社区

Performing community detection using Louvain method...


In [25]:
# 文件路径
edges_output_file = 'vis-data/graph_with_communities.csv'
nodes_output_file = 'vis-data/nodes_with_communities.csv'

# 保存边信息到 graph_with_communities.csv
with open(edges_output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['source', 'target', 'type', 'weight'])
    
    for source, target, data in G.edges(data=True):  # 遍历图中的边
        if source in users and target in users:  # 只保存随机选择的用户相关的边
            weight = data.get('weight', 1)  # 获取边的权重，默认值为 1
            writer.writerow([source, target, 'undirected', weight])  # 假设边为无向边

print(f"Edges saved to {edges_output_file}")

# 保存节点信息到 nodes_with_communities.csv
with open(nodes_output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['id', 'label', 'category'])
    
    for user in users:  # 只保存随机选择的用户
        label = user  # label 与 id 相同
        category = partition.get(user, -1)  # 获取社区标签作为分类信息
        writer.writerow([user, label, category])

print(f"Nodes saved to {nodes_output_file}")

Edges saved to vis-data/graph_with_communities.csv
Nodes saved to vis-data/nodes_with_communities.csv
