# GCN: Обучение и предсказание на новых данных

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# === Загрузка обучающих данных ===
nodes_df = pd.read_csv("data/nodes4.csv")
edges_df = pd.read_csv("data/edges4.csv")
resources_df = pd.read_csv("data/resources4.csv")
# Убедимся, что timestamp — это datetime
edges_df['timestamp'] = pd.to_datetime(edges_df['timestamp'], errors='coerce')

In [2]:
# Кодирование признаков узлов
role_encoder = LabelEncoder()
resource_encoder = LabelEncoder()

In [3]:
# === Подготовка признаков пользователей ===
nodes_df['role_encoded'] = role_encoder.fit_transform(nodes_df['role'])
nodes_df['last_login_time'] = pd.to_datetime(nodes_df['last_login_time'])
ref_date = pd.Timestamp("2025-05-18")
nodes_df['days_since_login'] = (ref_date - nodes_df['last_login_time']).dt.days


In [4]:
# Признаки пользователей
x_users = nodes_df[['role_encoded', 'login_count', 'access_level', 'session_time', 'var_activity', 'days_since_login']].to_numpy()
y_users = (nodes_df['session_time'] > 0).astype(int).to_numpy()

In [5]:
# === Подготовка признаков ресурсов ===
#resource_encoder = LabelEncoder().fit(resources_df['resource_type'])
#resources_df['resource_type_encoded'] = resource_encoder.transform(resources_df['resource_type'])
# Обработка ресурсов ---
resources_df['resource_type_encoded'] = resource_encoder.fit_transform(resources_df['resource_type'])
# Реальные признаки ресурсов
x_resources = resources_df[['resource_type_encoded', 'access_count_last_month', 'importance', 'access_level']].to_numpy()
x_resources_padded = np.hstack([x_resources, np.zeros((x_resources.shape[0], x_users.shape[1] - x_resources.shape[1]))])

x_combined = np.vstack([x_users, x_resources_padded])
num_users, num_resources = x_users.shape[0], x_resources.shape[0]

x = torch.tensor(x_combined, dtype=torch.float)

edges_df['target_resource'] += num_users
edge_index = torch.tensor([edges_df['source_user'].values, edges_df['target_resource'].values], dtype=torch.long)

y = torch.tensor(nodes_df['role_encoded'].values, dtype=torch.long)
y_full = np.concatenate([y, -1 * np.ones(num_resources)])

data = Data(x=x, edge_index=edge_index, y=torch.tensor(y_full, dtype=torch.long))

  edge_index = torch.tensor([edges_df['source_user'].values, edges_df['target_resource'].values], dtype=torch.long)


In [6]:
y_full

array([ 4.,  0.,  0.,  3.,  4.,  2.,  0.,  4.,  4.,  1.,  4.,  0.,  3.,
        1.,  2.,  0.,  0.,  0.,  3.,  1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1.])

In [7]:
y

tensor([4, 0, 0, 3, 4, 2, 0, 4, 4, 1, 4, 0, 3, 1, 2, 0, 0, 0, 3, 1])

In [8]:
x

tensor([[4.0000e+00, 8.4500e+02, 2.0000e+00, 2.2084e+03, 2.6500e+00, 1.5000e+01],
        [0.0000e+00, 6.4700e+02, 3.0000e+00, 1.2712e+03, 6.5600e+00, 2.1000e+01],
        [0.0000e+00, 7.1700e+02, 2.0000e+00, 2.5870e+03, 5.8000e+00, 2.7000e+01],
        [3.0000e+00, 6.0000e+02, 4.0000e+00, 1.4892e+03, 8.6000e+00, 2.1000e+01],
        [4.0000e+00, 8.4600e+02, 3.0000e+00, 2.8841e+03, 8.5100e+00, 3.1000e+01],
        [2.0000e+00, 8.8200e+02, 3.0000e+00, 2.0276e+03, 9.3400e+00, 2.8000e+01],
        [0.0000e+00, 1.9700e+02, 3.0000e+00, 1.2862e+03, 8.6400e+00, 1.5000e+01],
        [4.0000e+00, 1.3400e+02, 1.0000e+00, 2.4533e+03, 6.3000e+00, 1.5000e+01],
        [4.0000e+00, 7.8100e+02, 4.0000e+00, 3.5924e+03, 7.0400e+00, 1.5000e+01],
        [1.0000e+00, 7.8000e+01, 2.0000e+00, 2.7371e+03, 9.2500e+00, 1.4000e+01],
        [4.0000e+00, 9.6000e+02, 3.0000e+00, 2.8141e+03, 6.2900e+00, 3.5000e+01],
        [0.0000e+00, 7.3000e+01, 3.0000e+00, 2.8894e+03, 9.0900e+00, 1.0000e+01],
        [3.0000e

In [9]:
edge_index

tensor([[16,  3, 17, 10, 19, 10,  6, 16,  4, 11, 19,  4, 12,  3,  8, 14, 10, 15,
         18,  9,  0,  7, 13, 17,  9, 14, 11, 17, 16,  3, 16,  1,  9, 19,  1,  6,
          7, 17, 17,  1,  2,  1,  3, 13, 15,  4,  6,  2,  2,  1,  0, 13,  4, 10,
          3, 10, 18,  2,  9, 10,  8,  5, 14,  3,  1,  2, 15, 18, 16,  3,  5, 16,
          9, 17,  3, 18, 15,  3,  0, 15,  7, 16,  9,  9, 11, 12,  9,  1, 13,  7,
          4, 17,  6,  0, 14, 14, 18,  4,  7, 18],
        [20, 23, 25, 30, 22, 31, 21, 25, 30, 29, 24, 31, 24, 23, 30, 24, 31, 25,
         24, 27, 30, 31, 28, 25, 22, 24, 26, 29, 25, 23, 29, 20, 24, 27, 21, 28,
         30, 21, 29, 25, 21, 29, 24, 28, 26, 31, 20, 25, 21, 21, 30, 28, 31, 30,
         23, 31, 24, 29, 24, 30, 31, 24, 24, 23, 29, 26, 26, 24, 28, 23, 24, 25,
         24, 26, 24, 23, 29, 24, 30, 25, 31, 25, 23, 22, 26, 23, 27, 29, 24, 30,
         31, 21, 25, 30, 24, 24, 24, 31, 30, 23]])

In [None]:
data

In [142]:
# === Обучающая маска ===
user_indices = np.arange(num_users)
train_idx, test_idx = train_test_split(user_indices, test_size=1, random_state=42)
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[train_idx] = True
data.train_mask = train_mask

In [143]:
# === Модель GCN ===
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x

In [144]:
model = GCN(input_dim=6, hidden_dim=16, output_dim=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(1, 101):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d}, Loss: {loss.item():.4f}")

# === Загрузка тестовых данных ===
nodes_test = pd.read_csv("data/nodes_test.csv")
edges_test = pd.read_csv("data/edges_test.csv")
resources_test = pd.read_csv("data/resources_test.csv")

nodes_test['role_encoded'] = role_encoder.transform(nodes_test['role'])
nodes_test['last_login_time'] = pd.to_datetime(nodes_test['last_login_time'])
nodes_test['days_since_login'] = (ref_date - nodes_test['last_login_time']).dt.days

x_test_users = nodes_test[['role_encoded', 'login_count', 'access_level', 'session_time', 'var_activity', 'days_since_login']].to_numpy()
resources_test['resource_type_encoded'] = resource_encoder.transform(resources_test['resource_type'])
x_test_resources = resources_test[['resource_type_encoded', 'importance', 'access_level']].to_numpy()
x_test_resources_padded = np.hstack([x_test_resources, np.zeros((x_test_resources.shape[0], x_test_users.shape[1] - x_test_resources.shape[1]))])

x_test_combined = np.vstack([x_test_users, x_test_resources_padded])
num_test_users = x_test_users.shape[0]
edges_test['target_resource'] += num_test_users
edge_index_test = torch.tensor([edges_test['source_user'].values, edges_test['target_resource'].values], dtype=torch.long)

test_data = Data(x=torch.tensor(x_test_combined, dtype=torch.float), edge_index=edge_index_test)

# === Предсказание на новых данных ===
model.eval()
with torch.no_grad():
    out = model(test_data)
    preds = out.argmax(dim=1)
    test_preds = preds[:num_test_users]  # только для пользователей
    print("\n🔎 Test Predictions (0 - низкая активность, 1 - высокая):")
    print(test_preds.cpu().numpy())
    print(preds)


IndexError: Target 3 is out of bounds.

In [128]:
test_data

Data(x=[32, 6], edge_index=[2, 100])

In [129]:
data.y

tensor([ 4,  0,  0,  3,  4,  2,  0,  4,  4,  1,  4,  0,  3,  1,  2,  0,  0,  0,
         3,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])