In [None]:
# Install the pinned dependencies: CPU-only PyTorch, matplotlib and PyTorch Geometric.
# The extra index URL is the PyTorch wheel index that hosts the `+cpu` builds.
# NOTE(review): `redkg`, which later cells import, is not installed by this command
# and must already be available in the environment.
!pip install torch==1.12.0+cpu matplotlib==3.8.2 torch-geometric==2.4.0 --extra-index-url https://download.pytorch.org/whl/cpu

In [None]:
# Download the large graph dataset (jd_data2.json) from the ZhongTr0n/JD_Analysis
# GitHub repository. A later cell opens the file by this name via a relative path.
!wget https://raw.githubusercontent.com/ZhongTr0n/JD_Analysis/main/jd_data2.json 

In [None]:
import json
import torch
import random
import os

from torch_geometric.data import Data
from torch.optim import Adam
import torch.nn.functional as F

from redkg.models.gcn import GCN
from redkg.models.gat import GAT
from redkg.models.graphsage import GraphSAGE
import numpy as np

# Anomaly detection makes autograd report the forward operation that produced a
# failing backward pass; it is a debugging aid and slows training down.
torch.autograd.set_detect_anomaly(True)

# Seed every random number generator the notebook draws from. The standard-library
# `random` module is used for subgraph sampling and negative sampling, so it has to
# be seeded as well for runs to be reproducible.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

In [None]:
# Load dataset from file.
# JSON is UTF-8 by specification, so the encoding is pinned instead of relying on
# the platform default (which is not UTF-8 on every operating system).
with open('jd_data2.json', 'r', encoding='utf-8') as f:
    graph_data = json.load(f)

# Node ids in file order, plus lookups in both directions:
#   node_mapping: node id   -> row index
#   node_index:   row index -> node id
node_list = [node['id'] for node in graph_data['nodes']]
node_mapping = {node_id: i for i, node_id in enumerate(node_list)}
node_index = {index: node for node, index in node_mapping.items()}

# Edges in PyTorch Geometric layout: the list of [source, target] index pairs is
# transposed into a (2, num_edges) long tensor. Weights stay a plain Python list.
edge_index = [[node_mapping[link['source']], node_mapping[link['target']]] for link in graph_data['links']]
edge_weights = [link['value'] for link in graph_data['links']]
edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

# One random scalar feature per node; the label of a node is its own row index.
features = torch.randn(len(node_list), 1)
labels = torch.arange(len(graph_data['nodes']), dtype=torch.long)

large_dataset = Data(x=features, edge_index=edge_index, y=labels, node_mapping=node_mapping, node_index=node_index)
# Optional: persist the dataset / move it to the GPU.
#torch.save(large_dataset, 'large_dataset.pth')
#large_dataset.cuda()

In [None]:
from torch import tensor
from redkg.visualization.contracts.graph_contract import GraphContract
from redkg.visualization.contracts.graph_visualization_contract import GraphVisualizationContract
from redkg.visualization.graph_visualization import GraphVisualizer

# Edge list as (source index, target index) tuples, one tuple per link.
edge_list = list(map(tuple, edge_index.t().tolist()))

# Describe the whole graph for the redkg visualizer.
graph_contract: GraphContract = GraphContract(
    vertex_num=len(graph_data['nodes']),
    edge_list=(edge_list, edge_weights),
    edge_num=len(graph_data['links']),
    # `edge_weights * 2` repeats the Python list (it does not scale the values);
    # presumably one weight per edge direction -- TODO confirm against GraphContract.
    edge_weights=list(tensor(edge_weights * 2)),
)

# Rendering options: small font, vertices labelled with their node ids.
vis_contract: GraphVisualizationContract = GraphVisualizationContract(
    graph=graph_contract,
    font_size=4.0,
    vertex_label=node_list
)

# Draw the graph and show the resulting figure.
vis: GraphVisualizer = GraphVisualizer(vis_contract)
fig = vis.draw()
fig.show()

In [None]:
# Generate multiple subgraphs
def generate_subgraphs(dataset, num_subgraphs=5, min_nodes=2, max_nodes=5):
    """Sample connected subgraphs by growing them from a random start node.

    Each subgraph starts from a uniformly chosen node and repeatedly adds a
    node adjacent to the current selection. A neighbour that is reachable
    through several links is proportionally more likely to be picked.

    The target size of a subgraph is drawn once, uniformly from
    ``[min_nodes, max_nodes]``. If the selection has no unselected neighbour
    left (isolated start node, or a connected component smaller than the
    target), growth stops early and the smaller subgraph is returned instead
    of failing or looping forever.

    Args:
        dataset: dict with ``'nodes'`` (list of dicts holding an ``'id'``) and
            ``'links'`` (list of dicts holding ``'source'`` and ``'target'``).
        num_subgraphs: number of subgraphs to sample.
        min_nodes: smallest target size (inclusive).
        max_nodes: largest target size (inclusive).

    Returns:
        A list of dicts ``{'nodes': [{'id': ...}, ...], 'links': [...]}``;
        ``'links'`` holds the original link dicts whose two endpoints were
        both selected.

    Raises:
        IndexError: if a node has to be drawn but ``dataset['nodes']`` is empty.
    """
    # Build the undirected adjacency lists once instead of rescanning every
    # link on each growth step. Parallel links are kept as repeated entries.
    adjacency = {}
    for link in dataset['links']:
        adjacency.setdefault(link['source'], []).append(link['target'])
        adjacency.setdefault(link['target'], []).append(link['source'])

    subgraphs = []
    for _ in range(num_subgraphs):
        # Draw the size once; re-drawing it on every loop test would bias the
        # result towards small subgraphs.
        target_size = random.randint(min_nodes, max_nodes)
        selected_ids = []
        selected_set = set()
        while len(selected_ids) < target_size:
            if selected_ids:
                frontier = [
                    neighbor
                    for node_id in selected_ids
                    for neighbor in adjacency.get(node_id, ())
                    if neighbor not in selected_set
                ]
                if not frontier:
                    # Nothing left to grow into: keep what we have.
                    break
                new_node = random.choice(frontier)
            else:
                new_node = random.choice(dataset['nodes'])['id']
            selected_ids.append(new_node)
            selected_set.add(new_node)

        selected_links = [link for link in dataset['links'] if
                          link['source'] in selected_set and link['target'] in selected_set]
        subgraphs.append({'nodes': [{'id': node_id} for node_id in selected_ids],
                          'links': selected_links})
    return subgraphs


# Reuse the cached subgraphs when 'subgraphs.json' exists; otherwise sample them
# from the dataset once and write the cache for the next run.
if os.path.isfile('subgraphs.json'):
    with open('subgraphs.json', 'r') as f:
        subgraphs = json.load(f)
else:
    # Generate subgraphs based on the dataset
    subgraphs = generate_subgraphs(graph_data, num_subgraphs=1000, min_nodes=3, max_nodes=15)
    with open('subgraphs.json', 'w') as f:
        json.dump(subgraphs, f)

In [None]:
# Generate dataset from all subgraphs.
# Every example keeps the node count of the large graph: features of nodes outside
# the subgraph are zeroed out, and edges use the large graph's row indices.
dataset = []
for subgraph in subgraphs:
    # Keep an edge only if both of its endpoints are known to the large graph.
    user_edge_index = [
        [node_mapping[link['source']], node_mapping[link['target']]]
        for link in subgraph['links']
        if link['source'] in node_mapping and link['target'] in node_mapping
    ]
    # reshape(-1, 2) keeps the (2, num_edges) layout even when there are no edges
    # (a bare torch.tensor([]) would be one-dimensional).
    user_edge_index = torch.tensor(user_edge_index, dtype=torch.long).reshape(-1, 2).t().contiguous()

    # Row indices of the subgraph's nodes; ids missing from the large graph are skipped.
    user_node_index = [
        node_mapping[node['id']]
        for node in subgraph['nodes']
        if node['id'] in node_mapping
    ]

    # Make a mask for the subgraph nodes: 1 on their rows, 0 everywhere else.
    user_mask = torch.zeros_like(large_dataset.x)
    user_mask[torch.tensor(user_node_index, dtype=torch.long)] = 1
    masked_features = large_dataset.x * user_mask

    # Create a dataset from the subgraph using the same features and labels as the original dataset
    user_data = Data(x=masked_features, edge_index=user_edge_index, y=labels)

    dataset.append(user_data)

In [None]:
# Create a model object.
# The GCN and GAT variants are kept commented out for quick switching; they take the
# same three arguments as GraphSAGE.
# model = GCN(large_dataset.num_node_features, 64, large_dataset.num_node_features)
# model = GAT(large_dataset.num_node_features, 64, large_dataset.num_node_features)
# Arguments are presumably (input size, hidden size, output size) -- TODO confirm against redkg.
model = GraphSAGE(large_dataset.num_node_features, 64, large_dataset.num_node_features)
# Uncomment to move the model to the GPU.
#model.cuda()
model.train()

# Init optimizer: Adam over all model parameters with a small learning rate and
# L2 weight decay.
optimizer = Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)

In [None]:
def common_neighbors(edge_index, num_nodes):
    """Build an undirected adjacency map from an edge index tensor.

    Args:
        edge_index: tensor whose transpose lists one ``[source, target]`` pair
            per edge.
        num_nodes: number of nodes; the result has one key per index in
            ``range(num_nodes)``.

    Returns:
        dict mapping every node index to the set of its neighbours, with each
        edge recorded in both directions.
    """
    # Start with an empty neighbour set for every node
    adjacency = dict((node, set()) for node in range(num_nodes))
    for src, dst in edge_index.t().tolist():
        adjacency[src].add(dst)
        adjacency[dst].add(src)

    return adjacency

In [None]:
def generate_negative_samples(edge_index, num_nodes, num_neg_samples, max_attempts=1000):
    """Sample node pairs that are not connected but share a neighbour.

    Pairs are drawn by rejection sampling. Endpoints are drawn only from nodes
    that have at least one neighbour: a node without neighbours can never share
    one, so drawing it would only waste the attempt budget. The accepted pairs
    are still uniformly distributed over all valid ordered pairs.

    Args:
        edge_index: tensor whose transpose lists one ``[source, target]`` pair
            per edge.
        num_nodes: number of nodes in the graph.
        num_neg_samples: number of pairs requested.
        max_attempts: upper bound on the number of random draws.

    Returns:
        A list of ``[node1, node2]`` pairs. It may hold fewer than
        ``num_neg_samples`` entries (possibly none) when the attempt budget
        runs out, and the same pair may appear more than once.
    """
    neighbors = common_neighbors(edge_index, num_nodes)
    negative_samples = []

    candidates = [node for node in range(num_nodes) if neighbors[node]]
    # A valid pair needs two endpoints plus a distinct shared neighbour, so
    # fewer than three connected nodes can never produce one. This also covers
    # empty graphs, where there is nothing to draw from.
    if len(candidates) < 3:
        return negative_samples

    attempts = 0
    while len(negative_samples) < num_neg_samples and attempts < max_attempts:
        attempts += 1
        node1 = random.choice(candidates)
        node2 = random.choice(candidates)

        # Reject identical nodes and pairs that are already connected
        if node1 == node2 or node2 in neighbors[node1]:
            continue
        # Keep the pair only if the two nodes share at least one neighbour.
        # The condition can be relaxed by lowering the required overlap.
        if neighbors[node1] & neighbors[node2]:
            negative_samples.append([node1, node2])

    return negative_samples

In [None]:
# Updated training function
def train(model, optimizer, subgraph, positive_edges, negative_edges):
    """Run one optimisation step of link prediction on a single subgraph.

    Node embeddings are computed with ``model``; the embeddings of the two
    endpoints of every edge are concatenated and scored by
    ``model.edge_predictor``. Positive edges are labelled 1, negative edges 0.

    Args:
        model: module called as ``model(x, edge_index)`` that also exposes an
            ``edge_predictor`` producing one score per edge.
        optimizer: optimizer over the model's parameters.
        subgraph: object with ``x`` (node features) and ``edge_index``.
        positive_edges: sequence of ``[source, target]`` pairs that exist.
        negative_edges: sequence of ``[source, target]`` pairs that do not.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    device = subgraph.x.device

    # Node embeddings for the whole subgraph
    node_embeddings = model(subgraph.x, subgraph.edge_index)

    # reshape(-1, 2) gives an empty list a well-defined (0, 2) long tensor, so
    # concatenation works even when one of the two lists is empty.
    positive = torch.as_tensor(positive_edges, dtype=torch.long).reshape(-1, 2)
    negative = torch.as_tensor(negative_edges, dtype=torch.long).reshape(-1, 2)
    edges = torch.cat([positive, negative], dim=0).to(device)

    # Targets: ones for the positive edges followed by zeros for the negative ones
    labels = torch.cat([torch.ones(positive.size(0)), torch.zeros(negative.size(0))], dim=0).to(device)

    # Edge embedding = source embedding concatenated with target embedding
    edge_embeddings = torch.cat([node_embeddings[edges[:, 0]], node_embeddings[edges[:, 1]]], dim=1)

    # reshape(-1) keeps a 1-D vector even for a single edge, where squeeze()
    # would collapse to a scalar and no longer match the label shape.
    logits = model.edge_predictor(edge_embeddings).reshape(-1)

    # Same objective as sigmoid followed by binary cross-entropy, computed in
    # the numerically stable logits form.
    loss = F.binary_cross_entropy_with_logits(logits, labels)
    loss.backward()
    optimizer.step()
    return loss.item()

In [None]:
# Model training: one optimisation step per subgraph, for 2 passes over the dataset.
# loss_values collects the loss of every step in execution order.
loss_values = []
for epoch in range(2):
    for subgraph in dataset:
        # Every edge of the subgraph is a positive example.
        positive_edges = subgraph.edge_index.t().tolist()
        # Ask for as many negative examples as there are positive ones.
        negative_edges = generate_negative_samples(subgraph.edge_index, subgraph.num_nodes, len(positive_edges))
        # Skip subgraphs for which no negative example was found.
        if len(negative_edges) == 0:
            continue
        loss = train(model, optimizer, subgraph, positive_edges, negative_edges)
        loss_values.append(loss)
        print(f"Epoch: {epoch}, Loss: {loss}")

# Save model to file
torch.save(model.state_dict(), 'model.pth')

In [None]:
import matplotlib.pyplot as plt

# Plot the per-step training loss collected in `loss_values`.
plt.figure(figsize=(10, 5))
plt.plot(loss_values, label='Training Loss')
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.title('Training Loss Over Time')
plt.legend()
plt.grid(True)
# Uncomment to also write the figure to disk.
#plt.savefig('training_loss.png')

In [None]:
# Test prediction: a small hand-written user graph, described with node ids.
user_graph_data = {
    "nodes": [
        {"id": "node.js"},
        {"id": "react.js"},
        {"id": "javascript"},
        {"id": "angularjs"}
    ],
    "links": [
        {"source": "node.js", "target": "react.js"},
        {"source": "javascript", "target": "node.js"},
        {"source": "angularjs", "target": "javascript"},
        {"source": "angularjs", "target": "react.js"}
    ]
}

# Convert the user's edges to row indices of the large graph.
# An edge is kept only if both of its endpoints exist in the large graph.
user_edge_index = [
    [node_mapping[link['source']], node_mapping[link['target']]]
    for link in user_graph_data['links']
    if link['source'] in node_mapping and link['target'] in node_mapping
]

# Convert to PyTorch Geometric format. reshape(-1, 2) keeps the (2, num_edges)
# layout even if no user edge could be mapped (a bare torch.tensor([]) would be
# one-dimensional).
user_edge_index = torch.tensor(user_edge_index, dtype=torch.long).reshape(-1, 2).t().contiguous()

# Convert the user's nodes to row indices; ids missing from the large graph are skipped.
user_node_index = [
    node_mapping[node['id']]
    for node in user_graph_data['nodes']
    if node['id'] in node_mapping
]

# Make a mask for the subgraph nodes: 1 on their rows, 0 everywhere else.
user_mask = torch.zeros_like(large_dataset.x)
user_mask[torch.tensor(user_node_index, dtype=torch.long)] = 1
masked_features = large_dataset.x * user_mask

# Create a dataset from the subgraph using the same features and labels as the original dataset
user_data = Data(x=masked_features, edge_index=user_edge_index, y=large_dataset.y)
# Uncomment to move the data to the GPU.
#user_data.cuda()
#user_data.cuda()

In [None]:
from torch import tensor
from redkg.visualization.contracts.graph_contract import GraphContract
from redkg.visualization.contracts.graph_visualization_contract import GraphVisualizationContract
from redkg.visualization.graph_visualization import GraphVisualizer

# Earlier variant, kept for reference: it derived the edge list from the
# large-graph indices instead of the user's own node order.
# user_edge_list = list(map(tuple, user_edge_index.t().tolist()))
# user_edge_list = [(node_index[edge[0]], node_index[edge[1]]) for edge in user_edge_list]

# Number the user's nodes locally (0..n-1 in declaration order) and express the
# user's links with those local numbers.
user_node_list = [node['id'] for node in user_graph_data['nodes']]
user_index = {id: number for number, id in enumerate(user_node_list)}
user_edge_list = [
    (user_index[edge['source']], user_index[edge['target']])
    for edge in user_graph_data['links']
]
# The user graph carries no weights, so every edge gets weight 1.0.
user_edge_weights = [1.0 for _ in range(len(user_edge_list))]

# Describe the user graph for the redkg visualizer.
graph_contract: GraphContract = GraphContract(
    vertex_num=len(user_graph_data['nodes']),
    edge_list=(user_edge_list, user_edge_weights),
    edge_num=len(user_graph_data['links']),
    # `user_edge_weights * 2` repeats the Python list (it does not scale the values);
    # presumably one weight per edge direction -- TODO confirm against GraphContract.
    edge_weights=list(tensor(user_edge_weights * 2)),
)

# Rendering options: vertices labelled with their node ids; the font size override
# used for the large graph is disabled here.
vis_contract: GraphVisualizationContract = GraphVisualizationContract(
    graph=graph_contract,
    #font_size=4.0,
    vertex_label=user_node_list
)

# Draw the graph and show the resulting figure.
vis: GraphVisualizer = GraphVisualizer(vis_contract)
fig = vis.draw()
fig.show()

In [None]:
def find_neighbors(edge_index, node_idx):
    """Collect the nodes that share an edge with ``node_idx``.

    Args:
        edge_index: tensor whose transpose lists one ``[source, target]`` pair
            per edge.
        node_idx: index of the node whose neighbours are wanted.

    Returns:
        The set of node indices found at the other end of every edge that
        touches ``node_idx``, regardless of edge direction.
    """
    pairs = edge_index.t().tolist()
    # For each edge touching the node, take the endpoint on the opposite side.
    return {
        dst if src == node_idx else src
        for src, dst in pairs
        if node_idx in (src, dst)
    }

In [None]:
# Compute link probabilities
def predict_edges(model, data, edge_candidates):
    """Score candidate edges with the model's edge predictor.

    Node embeddings are computed once; all candidates are then scored in a
    single batched call instead of one predictor call per edge.

    Args:
        model: module called as ``model(x, edge_index)`` that also exposes an
            ``edge_predictor`` producing one score per edge.
        data: object with ``x`` (node features) and ``edge_index``.
        edge_candidates: iterable of ``[source, target]`` node index pairs.

    Returns:
        A list of ``(edge, probability)`` tuples in the order of
        ``edge_candidates``; ``probability`` is a Python float.
    """
    edge_candidates = list(edge_candidates)
    if not edge_candidates:
        return []

    # Inference only: no gradients are needed
    with torch.no_grad():
        node_embeddings = model(data.x, data.edge_index)
        pairs = torch.tensor(edge_candidates, dtype=torch.long, device=node_embeddings.device).reshape(-1, 2)
        # One row per candidate: source embedding followed by target embedding
        edge_features = torch.cat([node_embeddings[pairs[:, 0]], node_embeddings[pairs[:, 1]]], dim=1)
        probabilities = torch.sigmoid(model.edge_predictor(edge_features)).reshape(-1).tolist()

    return list(zip(edge_candidates, probabilities))

In [None]:
# Generate the candidate edges.
# Each set below stores undirected edges as sorted (lower index, higher index) tuples.
user_existing_edges = set(tuple(sorted((e[0].item(), e[1].item()))) for e in user_edge_index.t())
user_node_pairs = set(tuple(sorted((node1, node2))) for node1 in user_node_index for node2 in user_node_index if node1 != node2)
possible_large_graph_edges = set(tuple(sorted((e[0].item(), e[1].item()))) for e in large_dataset.edge_index.t())

# Filter possible_edges: keep edges of the large graph that are neither an existing
# user edge nor a pair of two user nodes.
# NOTE(review): this also keeps edges that touch no user node at all -- confirm
# that this is intended.
possible_edges = [list(edge) for edge in possible_large_graph_edges if
                  edge not in user_existing_edges and edge not in user_node_pairs]

# Switch model to evaluation state
model.eval()

# Compute the link probabilities and sort them, most probable first
edge_probabilities = predict_edges(model, user_data, possible_edges)
edge_probabilities.sort(key=lambda x: x[1], reverse=True)

# Print the 10 most probable edges, with node indices and node ids
for i, (edge, prob) in enumerate(edge_probabilities[:10]):
    nodes = [node_index[edge[0]], node_index[edge[1]]]
    #print(f"| [{edge[0]}] {nodes[0]} | [{edge[1]}] {nodes[1]} | {prob} |")
    print(f"Edge: [{edge[0]:3}: {nodes[0]:15}] <=> [{edge[1]:3}: {nodes[1]:15}] with probability {prob}")