In [1]:

import pandas as pd
import torch
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder


In [2]:

# Single place to repoint the notebook at a different export of the graph tables.
DATA_DIR = "data/cert4.2/out"

# nodes.csv: one row per user, first CSV column becomes the index.
nodes_df = pd.read_csv(f"{DATA_DIR}/nodes.csv", index_col=0)
# resources.csv: one row per resource.
resources_df = pd.read_csv(f"{DATA_DIR}/resources.csv")
# edges.csv: one row per user -> resource interaction.
edges_df = pd.read_csv(f"{DATA_DIR}/edges.csv")

# Fail fast, with a readable message, if a column the following cells read is
# missing, instead of hitting a bare KeyError further down the notebook.
_required_columns = {
    "nodes.csv": (nodes_df, {"login_count"}),
    "resources.csv": (resources_df, {"resource_id"}),
    "edges.csv": (edges_df, {"source_user", "target_resource"}),
}
for _table_name, (_frame, _expected) in _required_columns.items():
    _missing = _expected - set(_frame.columns)
    if _missing:
        raise KeyError(f"{_table_name} is missing required column(s): {sorted(_missing)}")


In [3]:

# Map raw user / resource identifiers to contiguous integer node ids.
# LabelEncoder numbers the distinct labels in sorted order, which is not
# necessarily the row order of the CSV files.
user_encoder = LabelEncoder()
resource_encoder = LabelEncoder()

nodes_df['encoded_id'] = user_encoder.fit_transform(nodes_df.index)
resources_df['encoded_id'] = resource_encoder.fit_transform(resources_df['resource_id'])

# The edge list addresses users by encoded id, so row i of the feature matrix
# must describe the user whose encoded id is i. Duplicate user ids make that
# impossible; stop here rather than build a silently misaligned graph.
if not nodes_df['encoded_id'].is_unique:
    raise ValueError(
        "nodes.csv contains duplicate user ids; cannot align feature rows with node ids"
    )

# Put the rows in encoded-id order so that row position == node id.
# This is a no-op when the CSV is already sorted by user id, and it keeps
# anything later derived from nodes_df in row order aligned with the edges.
nodes_df = nodes_df.sort_values('encoded_id', kind='stable')

# NOTE(review): 'encoded_id' is still a column of nodes_df at this point, so it
# becomes one of the feature columns. It is kept to preserve the existing
# feature width; confirm whether the node id should really be a model input.
user_features = torch.tensor(
    nodes_df.drop(columns=['user', 'employee_name'], errors='ignore').values,
    dtype=torch.float,
)

# Resources have no attributes of their own: give them all-zero rows with the
# same width as the user features so both node types fit in one matrix.
resource_features = torch.zeros((len(resources_df), user_features.shape[1]))  # padding zeros

# Node layout: users first (ids 0 .. len(nodes_df) - 1), then resources.
x = torch.cat([user_features, resource_features], dim=0)


In [4]:

# Node-id layout: users occupy [0, n_users); resource ids are shifted so they
# start right after the last user.
n_users = len(nodes_df)
n_resources = len(resources_df)

# Translate the raw endpoints of every edge into node ids.
# (transform raises if an edge mentions a label the encoder was not fitted on.)
edges_df['source_encoded'] = user_encoder.transform(edges_df['source_user'])
edges_df['target_encoded'] = n_users + resource_encoder.transform(edges_df['target_resource'])

# Shape [2, num_edges]: row 0 holds source users, row 1 holds target resources.
edge_pairs = edges_df[['source_encoded', 'target_encoded']].values
edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()

# Binary label per user: 1 when login_count is strictly above the median.
# NOTE(review): login_count is not dropped when the feature matrix is built,
# so the label looks derivable from the inputs; confirm this is intended.
login_counts = nodes_df['login_count']
y_raw = login_counts.gt(login_counts.median()).astype(int)

# Resources get the placeholder label -1; they are excluded from train_mask.
y = torch.full((n_users + n_resources,), -1, dtype=torch.long)
y[:n_users] = torch.tensor(y_raw.values, dtype=torch.long)

# Every user node is marked for training; no resource node is.
# NOTE(review): no validation/test split is created in this cell.
train_mask = torch.arange(x.shape[0]) < n_users

data = Data(x=x, edge_index=edge_index, y=y, train_mask=train_mask)
data


Data(x=[6228, 16], edge_index=[2, 488451], y=[6228], train_mask=[6228])