In [1]:
import pandas as pd
import numpy as np

# User profile table.
# `zip` is pinned to str so leading zeros (e.g. "02460") are kept regardless of
# which rows are present; without it pandas infers the dtype from the data and
# an all-numeric column would be parsed as integers.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
user_df = pd.read_csv('../../data/ml-1m/users.dat',
                      sep='::',
                      header=None,
                      names=unames,
                      dtype={'zip': str},
                      engine='python')

# Movie table; genres are a '|'-separated string (split later when the graph is built).
mnames = ['movie_id', 'title', 'genres']
movies_df = pd.read_csv('../../data/ml-1m/movies.dat',
                        sep='::',
                        header=None,
                        names=mnames,
                        engine='python',
                        encoding='ISO-8859-1')

# Rating table.
# NOTE(review): the third column is named 'imdbId', but later cells read it as
# the rating value (`rating = row['imdbId']`). The name is kept unchanged here
# because those cells index the frame by it.
rnames = ['user_id', 'movie_id', 'imdbId', 'timestamp']
ratings_df = pd.read_csv('../../data/ml-1m/ratings.dat',
                         sep='::',
                         header=None,
                         engine='python',
                         names=rnames)


In [2]:
# Preview the user table (the bare last expression is rendered by the notebook).
user_df

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [3]:
# Preview the movie table (the bare last expression is rendered by the notebook).
movies_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [4]:
# Preview the rating table; the column named 'imdbId' is the one later cells read as the rating.
ratings_df

Unnamed: 0,user_id,movie_id,imdbId,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [5]:
import networkx as nx

# Start from an empty undirected graph.
G = nx.Graph()

# ============ User nodes and user-attribute nodes ============
# itertuples() replaces iterrows(): iterrows materialises a Series per row,
# which is needlessly slow for frames of this size.
user_columns = ['user_id', 'gender', 'age', 'occupation', 'zip']
for user_id, gender, age, occupation, zip_code in user_df[user_columns].itertuples(index=False, name=None):
    user_node = f"user_{user_id}"
    G.add_node(user_node, type="user")

    # One shared node per distinct attribute value, linked to the user.
    # add_node() on an existing node only re-applies the same attributes, so no
    # has_node() guard is required.
    for kind, value in (("gender", gender),
                        ("age", age),
                        ("occupation", occupation),
                        ("zip", zip_code)):
        attr_node = f"{kind}_{value}"
        G.add_node(attr_node, type=kind)
        G.add_edge(user_node, attr_node, relation=f"has_{kind}")

# ============ Movie nodes and their genres ============
for movie_id, genres in movies_df[['movie_id', 'genres']].itertuples(index=False, name=None):
    movie_node = f"movie_{movie_id}"
    G.add_node(movie_node, type="movie")

    # Genres are '|'-separated; each genre gets one shared node.
    for genre in genres.split('|'):
        genre_node = f"genre_{genre}"
        G.add_node(genre_node, type="genre")
        G.add_edge(movie_node, genre_node, relation="has_genre")

# ============ User-movie rating edges (high ratings only) ============
# Keep ratings of 4 or higher (the threshold is inclusive). The rating value
# lives in the column named 'imdbId'. Filtering is done vectorised up front so
# the Python loop only touches rows that produce an edge.
liked = ratings_df.loc[ratings_df['imdbId'] >= 4, ['user_id', 'movie_id', 'imdbId']]
for user_id, movie_id, rating in liked.itertuples(index=False, name=None):
    # Store the rating as a native int so the attribute is not a NumPy scalar.
    G.add_edge(f"user_{user_id}", f"movie_{movie_id}", relation="rated", rating=int(rating))

# Print basic information about the graph.
print(f"图中的节点数：{G.number_of_nodes()}")
print(f"图中的边数：{G.number_of_edges()}")
print(f"节点示例：{list(G.nodes(data=True))[:10]}")
print(f"边示例：{list(G.edges(data=True))[:10]}")


图中的节点数：13410
图中的边数：605849
节点示例：[('user_1', {'type': 'user'}), ('gender_F', {'type': 'gender'}), ('age_1', {'type': 'age'}), ('occupation_10', {'type': 'occupation'}), ('zip_48067', {'type': 'zip'}), ('user_2', {'type': 'user'}), ('gender_M', {'type': 'gender'}), ('age_56', {'type': 'age'}), ('occupation_16', {'type': 'occupation'}), ('zip_70072', {'type': 'zip'})]
边示例：[('user_1', 'gender_F', {'relation': 'has_gender'}), ('user_1', 'age_1', {'relation': 'has_age'}), ('user_1', 'occupation_10', {'relation': 'has_occupation'}), ('user_1', 'zip_48067', {'relation': 'has_zip'}), ('user_1', 'movie_1193', {'relation': 'rated', 'rating': 5}), ('user_1', 'movie_3408', {'relation': 'rated', 'rating': 4}), ('user_1', 'movie_2355', {'relation': 'rated', 'rating': 5}), ('user_1', 'movie_1287', {'relation': 'rated', 'rating': 5}), ('user_1', 'movie_2804', {'relation': 'rated', 'rating': 5}), ('user_1', 'movie_594', {'relation': 'rated', 'rating': 4})]


In [None]:
# Convert NumPy scalar attribute values on nodes and edges to native Python
# scalars before handing the graph to pyvis. Any NumPy scalar type
# (np.generic) is handled, not only np.int64, so e.g. np.int32 or np.float64
# values are converted as well.
# NOTE(review): presumably required because pyvis cannot serialise NumPy scalars - confirm.
for _node, node_attrs in G.nodes(data=True):
    for key, value in node_attrs.items():
        if isinstance(value, np.generic):
            node_attrs[key] = value.item()

for _u, _v, edge_attrs in G.edges(data=True):
    for key, value in edge_attrs.items():
        if isinstance(value, np.generic):
            edge_attrs[key] = value.item()

# pyvis rendering
from pyvis.network import Network

# Create the Network object
net = Network(notebook=True, cdn_resources='in_line')

# Import the data from the networkx graph
net.from_nx(G)

# Save as an HTML file
net.save_graph("knowledge_graph.html")

# Or use show() directly (called with the same file name as save_graph above)
net.show("knowledge_graph.html")


**筛选数据操作**

In [7]:
# Load the user IDs to keep: one entry per line, surrounding whitespace removed,
# lines that are not purely digits are skipped.
with open('../../data/movielensmini/user_list.txt', 'r') as f:
    user_list = [
        int(token)
        for token in (line.strip() for line in f.read().splitlines())
        if token.isdigit()
    ]

# Load the movie IDs to keep, using the same parsing rules.
with open('../../data/movielensmini/item_list.txt', 'r') as f:
    item_list = [
        int(token)
        for token in (line.strip() for line in f.read().splitlines())
        if token.isdigit()
    ]

# Users whose ID is in the user list.
filtered_user_df = user_df.loc[user_df['user_id'].isin(user_list)]

# Movies whose ID is in the item list.
filtered_movies_df = movies_df.loc[movies_df['movie_id'].isin(item_list)]

# Ratings where both the user and the movie were kept.
filtered_ratings_df = ratings_df.loc[
    ratings_df['user_id'].isin(user_list)
    & ratings_df['movie_id'].isin(item_list)
]

def save_with_double_colon(df, filename, encoding='ISO-8859-1'):
    """Write `df` to `filename` as text with '::' between fields.

    No header row and no index column are written.

    Args:
        df: DataFrame to serialise.
        filename: Destination path; an existing file is overwritten.
        encoding: Text encoding of the output file. Defaults to ISO-8859-1,
            the encoding this notebook uses when it reads the movie files, so
            non-ASCII titles survive a write/read round trip. Pure-ASCII
            content is unaffected by the choice.

    Raises:
        UnicodeEncodeError: if the data contains characters that `encoding`
            cannot represent.
    """
    # to_csv only accepts a single-character separator, so serialise with tabs
    # first and then swap every tab for '::'.
    content = df.to_csv(sep='\t', index=False, header=False)
    content = content.replace('\t', '::')
    # newline='' writes the line endings produced by to_csv unchanged; text-mode
    # translation would otherwise turn '\r\n' into '\r\r\n' on Windows.
    with open(filename, 'w', encoding=encoding, newline='') as f:
        f.write(content)

# Persist the filtered frames as '::'-separated files: users, then movies, then ratings.
for frame, target in (
    (filtered_user_df, 'filtered_users.dat'),
    (filtered_movies_df, 'filtered_movies.dat'),
    (filtered_ratings_df, 'filtered_ratings.dat'),
):
    save_with_double_colon(frame, target)

print("数据筛选和保存完成！")

数据筛选和保存完成！


**筛选后进行图谱的重新构建**

In [8]:
import pandas as pd
import numpy as np

# Filtered user profile table.
# `zip` is pinned to str: dtype inference depends on which rows survived the
# filtering, and an all-numeric zip column would be parsed as integers, turning
# a code such as "02460" into 2460 and changing the resulting node names.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
user_df = pd.read_csv('filtered_users.dat',
                      sep='::',
                      header=None,
                      names=unames,
                      dtype={'zip': str},
                      engine='python')

# Filtered movie table.
# NOTE(review): the encoding given here has to match the one used when
# filtered_movies.dat was written - confirm against the saving cell.
mnames = ['movie_id', 'title', 'genres']
movies_df = pd.read_csv('filtered_movies.dat',
                        sep='::',
                        header=None,
                        names=mnames,
                        engine='python',
                        encoding='ISO-8859-1')

# Filtered rating table.
# NOTE(review): the third column is named 'imdbId', but the graph-building cell
# reads it as the rating value; the name is kept because that cell indexes by it.
rnames = ['user_id', 'movie_id', 'imdbId', 'timestamp']
ratings_df = pd.read_csv('filtered_ratings.dat',
                         sep='::',
                         header=None,
                         engine='python',
                         names=rnames)


In [9]:
import networkx as nx

# Start from an empty undirected graph.
G = nx.Graph()

# ============ User nodes and user-attribute nodes ============
# itertuples() replaces iterrows(): iterrows materialises a Series per row,
# which is needlessly slow.
user_columns = ['user_id', 'gender', 'age', 'occupation', 'zip']
for user_id, gender, age, occupation, zip_code in user_df[user_columns].itertuples(index=False, name=None):
    user_node = f"user_{user_id}"
    G.add_node(user_node, type="user")

    # One shared node per distinct attribute value, linked to the user.
    # add_node() on an existing node only re-applies the same attributes, so no
    # has_node() guard is required.
    for kind, value in (("gender", gender),
                        ("age", age),
                        ("occupation", occupation),
                        ("zip", zip_code)):
        attr_node = f"{kind}_{value}"
        G.add_node(attr_node, type=kind)
        G.add_edge(user_node, attr_node, relation=f"has_{kind}")

# ============ Movie nodes and their genres ============
for movie_id, genres in movies_df[['movie_id', 'genres']].itertuples(index=False, name=None):
    movie_node = f"movie_{movie_id}"
    G.add_node(movie_node, type="movie")

    # Genres are '|'-separated; each genre gets one shared node.
    for genre in genres.split('|'):
        genre_node = f"genre_{genre}"
        G.add_node(genre_node, type="genre")
        G.add_edge(movie_node, genre_node, relation="has_genre")

# ============ User-movie rating edges (high ratings only) ============
# Keep ratings of 4 or higher (the threshold is inclusive). The rating value
# lives in the column named 'imdbId'. Filtering is done vectorised up front so
# the Python loop only touches rows that produce an edge.
liked = ratings_df.loc[ratings_df['imdbId'] >= 4, ['user_id', 'movie_id', 'imdbId']]
for user_id, movie_id, rating in liked.itertuples(index=False, name=None):
    # Store the rating as a native int so the attribute is not a NumPy scalar.
    G.add_edge(f"user_{user_id}", f"movie_{movie_id}", relation="rated", rating=int(rating))

# Print basic information about the graph.
print(f"图中的节点数：{G.number_of_nodes()}")
print(f"图中的边数：{G.number_of_edges()}")
print(f"节点示例：{list(G.nodes(data=True))[:10]}")
print(f"边示例：{list(G.edges(data=True))[:10]}")

def generate_triples(G):
    """Return one (u, relation, v) triple per edge of `G`.

    Node attributes are deliberately not read: the triple only needs the two
    endpoint names and the edge's 'relation' attribute, and looking up a node
    'type' would raise KeyError for nodes that were created implicitly by
    add_edge() and therefore carry no attributes.

    Args:
        G: Graph whose `edges(data=True)` yields (u, v, attr_dict) items.

    Returns:
        list of (u, relation, v) tuples, in the iteration order of
        `G.edges(data=True)`.

    Raises:
        KeyError: if an edge has no 'relation' attribute.
    """
    return [(u, data['relation'], v) for u, v, data in G.edges(data=True)]

# Build the triples for the graph G constructed above
triples = generate_triples(G)

# Report how many triples were produced
print(f"图中的三元组个数：{len(triples)}")

图中的节点数：3469
图中的边数：30343
节点示例：[('user_2', {'type': 'user'}), ('gender_M', {'type': 'gender'}), ('age_56', {'type': 'age'}), ('occupation_16', {'type': 'occupation'}), ('zip_70072', {'type': 'zip'}), ('user_3', {'type': 'user'}), ('age_25', {'type': 'age'}), ('occupation_15', {'type': 'occupation'}), ('zip_55117', {'type': 'zip'}), ('user_4', {'type': 'user'})]
边示例：[('user_2', 'gender_M', {'relation': 'has_gender'}), ('user_2', 'age_56', {'relation': 'has_age'}), ('user_2', 'occupation_16', {'relation': 'has_occupation'}), ('user_2', 'zip_70072', {'relation': 'has_zip'}), ('user_2', 'movie_1357', {'relation': 'rated', 'rating': 5}), ('user_2', 'movie_3068', {'relation': 'rated', 'rating': 4}), ('user_2', 'movie_1537', {'relation': 'rated', 'rating': 4}), ('user_2', 'movie_2194', {'relation': 'rated', 'rating': 4}), ('user_2', 'movie_648', {'relation': 'rated', 'rating': 4}), ('user_2', 'movie_2268', {'relation': 'rated', 'rating': 5})]
图中的三元组个数：30343


In [None]:
# Convert NumPy scalar attribute values on nodes and edges to native Python
# scalars before handing the graph to pyvis. Any NumPy scalar type
# (np.generic) is handled, not only np.int64, so e.g. np.int32 or np.float64
# values are converted as well.
# NOTE(review): presumably required because pyvis cannot serialise NumPy scalars - confirm.
for _node, node_attrs in G.nodes(data=True):
    for key, value in node_attrs.items():
        if isinstance(value, np.generic):
            node_attrs[key] = value.item()

for _u, _v, edge_attrs in G.edges(data=True):
    for key, value in edge_attrs.items():
        if isinstance(value, np.generic):
            edge_attrs[key] = value.item()

# pyvis rendering
from pyvis.network import Network

# Create the Network object
net = Network(notebook=True, cdn_resources='in_line')

# Import the data from the networkx graph
net.from_nx(G)

# Save as an HTML file
net.save_graph("filtered_knowledge_graph.html")

# Alternatively, render directly with show():
# net.show("filtered_knowledge_graph.html")

In [None]:
# Imported here as well so this cell does not depend on an earlier cell having run.
from pyvis.network import Network

# Convert NumPy scalar attribute values on nodes and edges to native Python
# scalars. Any NumPy scalar type (np.generic) is handled, not only np.int64.
for _node, node_attrs in G.nodes(data=True):
    for key, value in node_attrs.items():
        if isinstance(value, np.generic):
            node_attrs[key] = value.item()

for _u, _v, edge_attrs in G.edges(data=True):
    for key, value in edge_attrs.items():
        if isinstance(value, np.generic):
            edge_attrs[key] = value.item()

# Take only the first 200 nodes (in graph insertion order) so the preview stays small.
SUBGRAPH_NODE_LIMIT = 200
subgraph_nodes = list(G.nodes)[:SUBGRAPH_NODE_LIMIT]
subgraph = G.subgraph(subgraph_nodes)

# NOTE(review): `net` is built but neither saved nor shown in this cell -
# confirm whether a save_graph()/show() call is missing.
net = Network(notebook=True, cdn_resources='remote')
net.from_nx(subgraph)

filtered_knowledge_graph.html


In [None]:
from node2vec import Node2Vec
import pickle

# Run node2vec random walks over the graph.
# NOTE(review): no random seed is passed, so the walks (and therefore the
# embeddings) presumably differ between runs - confirm whether reproducibility matters.
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=300, p=1, q=1, workers=4)

# Train the embedding model on the generated walks.
model = node2vec.fit(window=10, min_count=1, batch_words=4)

# Collect the embedding vector of every node, keyed by the node name as a string.
embeddings = {str(node): model.wv[str(node)] for node in G.nodes()}

# Show one user embedding as a sanity check. The node is taken from the
# embeddings themselves, so the printed label always matches the vector shown
# and the cell does not fail when a particular user ID is absent from the graph.
sample_user = next((name for name in embeddings if name.startswith("user_")), None)
if sample_user is not None:
    print(f"{sample_user}的嵌入：", embeddings[sample_user])

model.save("filtered_node2vec_model.model")
print("模型已保存为 filtered_node2vec_model.model")

with open("filtered_node_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings, f)
print("嵌入向量已保存为 filtered_node_embeddings.pkl")

Computing transition probabilities:   0%|          | 0/3469 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 75/75 [01:08<00:00,  1.09it/s]
Generating walks (CPU: 2): 100%|██████████| 75/75 [01:09<00:00,  1.09it/s]
Generating walks (CPU: 3): 100%|██████████| 75/75 [01:08<00:00,  1.09it/s]
Generating walks (CPU: 4): 100%|██████████| 75/75 [01:09<00:00,  1.08it/s]


user_1的嵌入： [-0.21829024 -0.20502831 -0.28292495  0.30108285 -0.18112746  0.28546846
 -0.02008447  0.690925    0.35797045 -0.24112792  0.16165027  0.13438416
 -0.62900114  0.24921234 -0.527734    0.42101336  0.2168916  -0.09771825
 -0.16147716  0.23127635  0.03425938 -0.4523417  -0.03188534 -0.21496822
  0.50563776 -0.12922755  0.2971205  -0.22516245 -0.22467275 -0.18568021
  0.3218033  -0.07609871 -0.27661598  0.37453273 -0.26181808 -0.17390099
 -0.12330624 -0.04805288  0.10648239 -0.27843267 -0.2409914  -0.436408
  0.15334843  0.16550386 -0.4091108   0.11950924  0.17806439 -0.30314696
 -0.13799234 -0.40604308  0.03557025 -0.07746519 -0.11777267  0.34700608
  0.02564448 -0.242588    0.0848164   0.11483973  0.12703444 -0.09937646
  0.39338794 -0.06685573  0.15133713  0.00825506]
模型已保存为 filtered_node2vec_model.model
嵌入向量已保存为 filtered_node_embeddings.pkl


In [None]:
import json
from gensim.models import Word2Vec

# Load the trained model from disk.
model = Word2Vec.load("filtered_node2vec_model.model")
print("模型加载成功")

# Embeddings keyed by the new consecutive IDs, and the node-name -> new-ID maps.
user_embeddings, item_embeddings = {}, {}
user_id_map, item_id_map = {}, {}

# New IDs are handed out in the iteration order of model.wv.key_to_index,
# separately for "user_" and "movie_" nodes; every other node is ignored.
for node in model.wv.key_to_index:
    vector = model.wv[node].tolist()
    if node.startswith("user_"):
        id_map, store = user_id_map, user_embeddings
    elif node.startswith("movie_"):
        id_map, store = item_id_map, item_embeddings
    else:
        continue
    # The next free ID equals the number of nodes already mapped; a node that
    # is already mapped keeps its existing ID.
    new_id = id_map.setdefault(node, len(id_map))
    store[new_id] = vector

# Number of IDs assigned so far for each kind.
user_counter = len(user_id_map)
item_counter = len(item_id_map)

# Write the user embeddings as JSON.
with open("renumbered_user_embeddings.json", "w") as user_file:
    user_file.write(json.dumps(user_embeddings, indent=4))
print("重新编号的用户嵌入已保存为 renumbered_user_embeddings.json")

# Write the item embeddings as JSON.
with open("renumbered_item_embeddings.json", "w") as item_file:
    item_file.write(json.dumps(item_embeddings, indent=4))
print("重新编号的物品嵌入已保存为 renumbered_item_embeddings.json")

# Write the node-name -> new-ID maps for users and items.
with open("user_id_map.json", "w") as user_map_file:
    user_map_file.write(json.dumps(dict(user_id_map), indent=4))
with open("item_id_map.json", "w") as item_map_file:
    item_map_file.write(json.dumps(dict(item_id_map), indent=4))
print("用户和物品的编号映射已分别保存为 user_id_map.json 和 item_id_map.json")
