In [1]:
import torch
import torch_geometric.transforms as T
from torch_geometric.loader import DataLoader
from torch_geometric.datasets import SNAPDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

## Homogeneous, Social Network

In [3]:
# Link-prediction split for the ego networks: 5% of the edges are held out for
# validation and 10% for testing. Negative training edges are not pre-sampled.
ego_link_split = T.RandomLinkSplit(
    num_val=0.05,
    num_test=0.1,
    is_undirected=True,
    add_negative_train_samples=False,
)

# Per-graph pipeline, applied in this order: move to `device`, drop isolated
# nodes, then split the edges into (train, val, test) views.
transform = T.Compose([T.ToDevice(device), T.RemoveIsolatedNodes(), ego_link_split])

In [4]:
dataset = SNAPDataset(
    root="./data/SNAPDataset", name="ego-facebook", transform=transform
)

# The transform (including the *random* link split) is re-applied every time a
# graph is read from the dataset. Iterating the dataset once per split would
# therefore draw three unrelated random splits per graph, letting held-out
# validation/test edges reappear among the training message-passing edges.
# Read every ego network exactly once and keep its (train, val, test) triple
# together so the three views always come from the same split.
split_triples = list(dataset)

# Collate each split across all ego networks into a single batch. The batch
# size is the number of graphs, so one batch always holds the whole split
# (the recorded run shows 10 graphs: ptr=[11]).
train_data, val_data, test_data = (
    next(iter(DataLoader(list(graphs), batch_size=len(split_triples))))
    for graphs in zip(*split_triples)
)

In [5]:
# Inspect the collated test split: attribute names and tensor shapes.
print(test_data)

DataBatch(x=[4167, 1406], edge_index=[2, 160324], circle=[4233], circle_batch=[4233], edge_label=[17800], edge_label_index=[2, 17800], batch=[4167], ptr=[11])


In [6]:
PERSON_EDGE_TYPE = ("person", "to", "person")


def to_person_hetero(homo_data):
    """Convert a batched homogeneous ego-network split into a one-type HeteroData.

    The graph is re-expressed with a single node type ``"person"`` and a single
    edge type ``("person", "to", "person")``. After the conversion the
    link-prediction supervision tensors (``edge_label`` and
    ``edge_label_index``) are read from the ``"person"`` node store and moved
    onto the edge store. The ``circle`` and ``circle_batch`` attributes are
    dropped.

    Args:
        homo_data: Homogeneous batch carrying ``edge_label``,
            ``edge_label_index``, ``circle`` and ``circle_batch``.

    Returns:
        The converted heterogeneous graph.
    """
    hetero = homo_data.to_heterogeneous(
        node_type_names=["person"], edge_type_names=[PERSON_EDGE_TYPE]
    )
    person = hetero["person"]

    # Move the supervision tensors from the node store to the edge store.
    for key in ("edge_label", "edge_label_index"):
        hetero[PERSON_EDGE_TYPE][key] = getattr(person, key)
        delattr(person, key)

    # Circle membership is not used for link prediction.
    for key in ("circle", "circle_batch"):
        delattr(person, key)

    return hetero


# NOTE: this rebinds the split variables, so the cell is meant to run once per
# kernel session (a second run would receive already-converted objects).
train_data = to_person_hetero(train_data)
val_data = to_person_hetero(val_data)
test_data = to_person_hetero(test_data)

In [7]:
# Print the converted splits in train, val, test order.
for split in (train_data, val_data, test_data):
    print(split)

HeteroData(
  [1mperson[0m={
    x=[4167, 1406],
    batch=[4167],
    ptr=[11]
  },
  [1m(person, to, person)[0m={
    edge_index=[2, 151430],
    edge_label=[75715],
    edge_label_index=[2, 75715]
  }
)
HeteroData(
  [1mperson[0m={
    x=[4167, 1406],
    batch=[4167],
    ptr=[11]
  },
  [1m(person, to, person)[0m={
    edge_index=[2, 151430],
    edge_label=[8894],
    edge_label_index=[2, 8894]
  }
)
HeteroData(
  [1mperson[0m={
    x=[4167, 1406],
    batch=[4167],
    ptr=[11]
  },
  [1m(person, to, person)[0m={
    edge_index=[2, 160324],
    edge_label=[17800],
    edge_label_index=[2, 17800]
  }
)


## Heterogeneous, Recommender System

### DBLP

In [8]:
from torch_geometric.datasets import DBLP

In [9]:
# Link-prediction split on the paper -> author relation: 5% of those edges are
# held out for validation and 10% for testing, with no pre-sampled negative
# training edges.
# NOTE(review): PyG documents `is_undirected` as ignored for bipartite edge
# types; confirm whether it has any effect here.
dblp_link_split = T.RandomLinkSplit(
    num_val=0.05,
    num_test=0.1,
    is_undirected=True,
    add_negative_train_samples=False,
    edge_types=[("paper", "to", "author")],
)

# Pipeline order: move to `device`, drop isolated nodes, split the links, then
# add the reverse ("rev_to") edge types.
transform = T.Compose(
    [
        T.ToDevice(device),
        T.RemoveIsolatedNodes(),
        dblp_link_split,
        T.ToUndirected(),
    ]
)

In [10]:
dataset = DBLP(root="../data/DBLP", transform=transform)

# A single access yields the (train, val, test) views of the one DBLP graph.
train_data, val_data, test_data = dataset[0]

for data in (train_data, val_data, test_data):
    # Drop the "term" node type together with every relation that touches it.
    del data["term"]
    del data[("paper", "to", "term")]
    del data[("term", "to", "paper")]
    del data[("term", "rev_to", "paper")]
    del data[("paper", "rev_to", "term")]

    # Drop the dataset's original reverse relations and the mirrors generated
    # from them, leaving the split ("paper", "to", ...) relations and their
    # own "rev_to" mirrors.
    del data[("author", "to", "paper")]
    del data[("conference", "to", "paper")]
    del data[("paper", "rev_to", "author")]
    del data[("paper", "rev_to", "conference")]

    # Node-classification targets and masks are not needed for link prediction.
    del data["author"].train_mask
    del data["author"].val_mask
    del data["author"].test_mask
    del data["author"].y

    # Conference nodes have no features: give each one a constant 1-d feature.
    # Size it from the store's own node count instead of a hard-coded 20, and
    # create it on the same device as the other features (the transform has
    # already moved them) so that GPU runs do not mix CPU and CUDA tensors.
    num_conferences = data["conference"].num_nodes
    data["conference"].x = torch.ones(
        (num_conferences, 1), device=data["paper"].x.device
    )
    # With `x` set, the explicit node count is redundant.
    del data["conference"].num_nodes

In [11]:
# DBLP training split after pruning: remaining stores and tensor shapes.
train_data

HeteroData(
  [1mauthor[0m={ x=[4057, 334] },
  [1mpaper[0m={ x=[14328, 4231] },
  [1mconference[0m={ x=[20, 1] },
  [1m(paper, to, author)[0m={
    edge_index=[2, 16699],
    edge_label=[16699],
    edge_label_index=[2, 16699]
  },
  [1m(paper, to, conference)[0m={ edge_index=[2, 14328] },
  [1m(author, rev_to, paper)[0m={
    edge_index=[2, 16699],
    edge_label=[16699]
  },
  [1m(conference, rev_to, paper)[0m={ edge_index=[2, 14328] }
)

In [12]:
# DBLP validation split after pruning: remaining stores and tensor shapes.
val_data

HeteroData(
  [1mauthor[0m={ x=[4057, 334] },
  [1mpaper[0m={ x=[14328, 4231] },
  [1mconference[0m={ x=[20, 1] },
  [1m(paper, to, author)[0m={
    edge_index=[2, 16699],
    edge_label=[1964],
    edge_label_index=[2, 1964]
  },
  [1m(paper, to, conference)[0m={ edge_index=[2, 14328] },
  [1m(author, rev_to, paper)[0m={ edge_index=[2, 16699] },
  [1m(conference, rev_to, paper)[0m={ edge_index=[2, 14328] }
)

In [13]:
# DBLP test split after pruning: remaining stores and tensor shapes.
test_data

HeteroData(
  [1mauthor[0m={ x=[4057, 334] },
  [1mpaper[0m={ x=[14328, 4231] },
  [1mconference[0m={ x=[20, 1] },
  [1m(paper, to, author)[0m={
    edge_index=[2, 17681],
    edge_label=[3928],
    edge_label_index=[2, 3928]
  },
  [1m(paper, to, conference)[0m={ edge_index=[2, 14328] },
  [1m(author, rev_to, paper)[0m={ edge_index=[2, 17681] },
  [1m(conference, rev_to, paper)[0m={ edge_index=[2, 14328] }
)

### IMDB

In [14]:
from torch_geometric.datasets import IMDB

In [15]:
# Link-prediction split on the movie -> actor relation: 5% of those edges are
# held out for validation and 10% for testing, with no pre-sampled negative
# training edges.
# NOTE(review): PyG documents `is_undirected` as ignored for bipartite edge
# types; confirm whether it has any effect here.
imdb_link_split = T.RandomLinkSplit(
    num_val=0.05,
    num_test=0.1,
    is_undirected=True,
    add_negative_train_samples=False,
    edge_types=[("movie", "to", "actor")],
)

# Pipeline order: move to `device`, drop isolated nodes, split the links, then
# add the reverse ("rev_to") edge types.
transform = T.Compose(
    [
        T.ToDevice(device),
        T.RemoveIsolatedNodes(),
        imdb_link_split,
        T.ToUndirected(),
    ]
)

In [16]:
dataset = IMDB(root="../data/IMDB", transform=transform)

# A single access yields the (train, val, test) views of the one IMDB graph.
train_data, val_data, test_data = dataset[0]

# Relations removed from every split: the dataset's original reverse relations
# and the mirrors generated from them.
unused_edge_types = (
    ("director", "to", "movie"),
    ("actor", "to", "movie"),
    ("movie", "rev_to", "director"),
    ("movie", "rev_to", "actor"),
)
# Node-classification leftovers on the movie store, unused for link prediction.
unused_movie_attrs = ("train_mask", "val_mask", "test_mask", "y")

for data in (train_data, val_data, test_data):
    for edge_type in unused_edge_types:
        del data[edge_type]
    for attr_name in unused_movie_attrs:
        delattr(data["movie"], attr_name)

In [17]:
# IMDB training split after pruning: remaining stores and tensor shapes.
train_data

HeteroData(
  [1mmovie[0m={ x=[4278, 3066] },
  [1mdirector[0m={ x=[2081, 3066] },
  [1mactor[0m={ x=[5257, 3066] },
  [1m(movie, to, director)[0m={ edge_index=[2, 4278] },
  [1m(movie, to, actor)[0m={
    edge_index=[2, 10905],
    edge_label=[10905],
    edge_label_index=[2, 10905]
  },
  [1m(director, rev_to, movie)[0m={ edge_index=[2, 4278] },
  [1m(actor, rev_to, movie)[0m={
    edge_index=[2, 10905],
    edge_label=[10905]
  }
)

In [18]:
# IMDB validation split after pruning: remaining stores and tensor shapes.
val_data

HeteroData(
  [1mmovie[0m={ x=[4278, 3066] },
  [1mdirector[0m={ x=[2081, 3066] },
  [1mactor[0m={ x=[5257, 3066] },
  [1m(movie, to, director)[0m={ edge_index=[2, 4278] },
  [1m(movie, to, actor)[0m={
    edge_index=[2, 10905],
    edge_label=[1282],
    edge_label_index=[2, 1282]
  },
  [1m(director, rev_to, movie)[0m={ edge_index=[2, 4278] },
  [1m(actor, rev_to, movie)[0m={ edge_index=[2, 10905] }
)

In [19]:
# IMDB test split after pruning: remaining stores and tensor shapes.
test_data

HeteroData(
  [1mmovie[0m={ x=[4278, 3066] },
  [1mdirector[0m={ x=[2081, 3066] },
  [1mactor[0m={ x=[5257, 3066] },
  [1m(movie, to, director)[0m={ edge_index=[2, 4278] },
  [1m(movie, to, actor)[0m={
    edge_index=[2, 11546],
    edge_label=[2564],
    edge_label_index=[2, 2564]
  },
  [1m(director, rev_to, movie)[0m={ edge_index=[2, 4278] },
  [1m(actor, rev_to, movie)[0m={ edge_index=[2, 11546] }
)