# Setup

In [207]:
import torch
import random
import numpy as np
import os

In [208]:
import warnings
warnings.filterwarnings("ignore")

# Load .pt Data

In [209]:
#TODO:change the root path
folder = '../SEAL_OGB/dataset/movie_actor'
filename = 'movie.pt'
data = torch.load(os.path.join(folder, filename))

In [213]:
# overview
print(data, '\n')

# actors' features (gender and popularity)
print("Features:\n", data.x, '\n')

# actors' features + graph stats (PageRank, Betweenness, Closeness)
print("Features & Graph Stats:\n", data.x_stat, '\n')

# convert tensor to numpy array
print("Tensor to Numpy:\n", data.x.numpy(), '\n')

Data(x=[1942, 2], edge_index=[2, 5068], name=[1942], gender=[1942], popularity=[1942], Date of Birth=[1942], Birth City=[1942], Birth Country=[1942], Height (Inches)=[1942], Ethnicity=[1942], NetWorth=[1942], Age=[1942], x_stat=[1942, 5], category=[5068]) 

Features:
 tensor([[ 2.0000, 11.2530],
        [ 2.0000,  4.9780],
        [ 1.0000,  2.0030],
        ...,
        [ 2.0000, 18.0720],
        [ 1.0000, 10.1630],
        [ 2.0000, 12.1320]]) 

Features & Graph Stats:
 tensor([[2.0000e+00, 1.1253e+01, 9.2644e-04, 2.6557e-06, 1.6486e-03],
        [2.0000e+00, 4.9780e+00, 6.2816e-04, 1.5934e-06, 1.3739e-03],
        [1.0000e+00, 2.0030e+00, 3.3613e-04, 0.0000e+00, 1.0304e-03],
        ...,
        [2.0000e+00, 1.8072e+01, 5.1493e-04, 0.0000e+00, 5.1520e-04],
        [1.0000e+00, 1.0163e+01, 5.1493e-04, 0.0000e+00, 5.1520e-04],
        [2.0000e+00, 1.2132e+01, 2.4927e-04, 0.0000e+00, 1.1634e-01]]) 

Tensor to Numpy:
 [[ 2.    11.253]
 [ 2.     4.978]
 [ 1.     2.003]
 ...
 [ 2.    18.

# Train Test Split

In [214]:
### edge info
print(f"# of nodes: {data.num_nodes}")
print(f"# of edges: {data.num_edges // 2}")
print("------------------------------------")

print(f"# of success(1): {data.edge_index[0][data.category == 1].size(0) // 2}")
print(f"# of non-success(0): {data.edge_index[0][data.category == 0].size(0) // 2}")
print(f"# of failure(-1): {data.edge_index[0][data.category == -1].size(0) // 2}")


# of nodes: 1942
# of edges: 2534
------------------------------------
# of success(1): 1313
# of non-success(0): 570
# of failure(-1): 651


In [215]:
from torch_geometric.utils import (negative_sampling, add_self_loops,
                                   train_test_split_edges)

In [216]:
def do_custom_edge_split(data, custom_split=True, val_ratio=0.1, test_ratio=0.1, is_undirected=True):
    random.seed(234)
    torch.manual_seed(234)
    # remove duplicate links (undirected)
    if is_undirected:
        mask = data.edge_index[0] < data.edge_index[1]
        data.category = data.category[mask]
        data.edge_index = torch.stack(
                [data.edge_index[0][mask], 
                data.edge_index[1][mask]],
                dim=0
            )

    if not custom_split:
        data = train_test_split_edges(data, val_ratio, test_ratio)
        edge_index, _ = add_self_loops(data.train_pos_edge_index)
        data.train_neg_edge_index = negative_sampling(
            edge_index, num_nodes=data.num_nodes,
            num_neg_samples=data.train_pos_edge_index.size(1))
    else:
        # make failure links(-1) as negative edges
        failure_edge_mask = (data.category == -1)
        data.train_neg_edge_index = torch.stack(
            [data.edge_index[0][failure_edge_mask], 
             data.edge_index[1][failure_edge_mask]], 
            dim=0
        )
        # remove unsuccessful edges from edge_index (-1, 0)
        success_edge_mask = (data.category == 1)
        data.edge_index = torch.stack(
            [data.edge_index[0][success_edge_mask], 
             data.edge_index[1][success_edge_mask]], 
            dim=0
        )
        data.category = data.category[success_edge_mask]
        data = train_test_split_edges(data, val_ratio, test_ratio)

        mask = data.train_pos_edge_index[0] < data.train_pos_edge_index[1]
        data.train_pos_edge_index = torch.stack(
            [data.train_pos_edge_index[0][mask], 
            data.train_pos_edge_index[1][mask]],
            dim=0
        )

        # conplement to sufficient data count
        if data.train_neg_edge_index.size(1) < data.train_pos_edge_index.size(1):
            num_neg_sample = data.train_pos_edge_index.size(1) - data.train_neg_edge_index.size(1)
            neg_index = negative_sampling(
                data.train_neg_adj_mask.nonzero().t(), 
                num_nodes=data.num_nodes,
                num_neg_samples=num_neg_sample
            )
            data.train_neg_edge_index = torch.cat(
                [data.train_neg_edge_index,
                 neg_index],
                dim=1
            )
        del data.category

    split_edge = {'train': {}, 'valid': {}, 'test': {}}
    split_edge['train']['edge'] = data.train_pos_edge_index.t()
    split_edge['train']['edge_neg'] = data.train_neg_edge_index.t()
    split_edge['valid']['edge'] = data.val_pos_edge_index.t()
    split_edge['valid']['edge_neg'] = data.val_neg_edge_index.t()
    split_edge['test']['edge'] = data.test_pos_edge_index.t()
    split_edge['test']['edge_neg'] = data.test_neg_edge_index.t()
    return split_edge

In [217]:
split_edge = do_custom_edge_split(data)

In [218]:
print("train pos edge:", data.train_pos_edge_index.size(1))
print("train neg edge:", data.train_neg_edge_index.size(1))
print('-------------------------')
print("valid pos edge:", data.val_pos_edge_index.size(1))
print("valid neg edge:", data.val_neg_edge_index.size(1))
print('-------------------------')
print("test pos edge:", data.test_pos_edge_index.size(1))
print("test neg edge:", data.test_neg_edge_index.size(1))

train pos edge: 1051
train neg edge: 1051
-------------------------
valid pos edge: 131
valid neg edge: 131
-------------------------
test pos edge: 131
test neg edge: 131


In [219]:
# node num pairs of edges
print(split_edge['train'])

# you can access features of the specified node like this
node_num1 = 1
node_num1 = 28
x1 = data.x[50, :].numpy()
x2 = data.x[50, :].numpy()
x = np.concatenate([x1, x2])
x

{'edge': tensor([[   0,    2],
        [   0,   16],
        [   1,   28],
        ...,
        [1899, 1900],
        [1933, 1934],
        [1937, 1938]]), 'edge_neg': tensor([[   0,    1],
        [   5,   14],
        [  14,   19],
        ...,
        [1015,  498],
        [1883, 1747],
        [1494,  656]])}


array([ 2.   , 14.531,  2.   , 14.531], dtype=float32)

In [220]:
# del attributes
del data['name']
del data['gender']
del data['popularity']
del data['Date of Birth']
del data['Birth City']
del data['Birth Country']
del data['Height (Inches)']
del data['Ethnicity']
del data['NetWorth']
del data['Age']


# Save File

In [165]:
to_path = '../SEAL_OGB/dataset/movie_actor'
torch.save(data, os.path.join(to_path, 'movie_transformed.pt'))