- The kernel on macOS is named `torosx`.
- Next step: work through [3] to make better use of the random link split.


[1]: https://antoniolonga.github.io/Pytorch_geometric_tutorials/posts/post6.html
[2]: https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.transforms.RandomLinkSplit.html#torch_geometric.transforms.RandomLinkSplit
[3]: https://github.com/pyg-team/pytorch_geometric/blob/master/examples/autoencoder.py 

In [30]:
import time
import torch
import pandas as pd
from torch_geometric.datasets import Planetoid
from torch_geometric import transforms as T
from torch_geometric.nn import GCNConv
from torch_geometric.utils import train_test_split_edges
from torch_geometric.nn import GAE, VGAE, GCNConv

def download_citeseer_dataset():
    """Build the CiteSeer ``Planetoid`` dataset rooted at ``./Data``.

    ``T.NormalizeFeatures()`` is attached as the dataset transform.

    Returns:
        The ``Planetoid`` dataset object.
    """
    return Planetoid(
        root="./Data",
        name="CiteSeer",
        transform=T.NormalizeFeatures(),
    )

# dataset = download_citeseer_dataset()
# data = dataset[0]
# data.train_mask = data.val_mask =data.test_mask = None
# print(data)
# data = train_test_split_edges(data)
# print(data)
# Choose the compute backend in order of preference: CUDA, then MPS, then CPU.
# The hasattr guard keeps this working on torch builds that do not expose
# `torch.backends.mps`.
device = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
    else 'cpu'
)



In [31]:
from torch_geometric.transforms import RandomLinkSplit

# (Data(x=[3327, 3703], edge_index=[2, 6374], y=[3327], train_mask=[3327],val_mask=[3327], test_mask=[3327], edge_label=[6374], edge_label_index=[2, 6374]), 
# Data(x=[3327, 3703], edge_index=[2, 6374], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327], edge_label=[910], edge_label_index=[2, 910]), 
# Data(x=[3327, 3703], edge_index=[2, 7284], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327], edge_label=[1820], edge_label_index=[2, 1820]))

# transform = RandomLinkSplit(is_undirected=True)

# # data.train_mask = data.val_mask =data.test_mask = None
# # data = transform(data)

# train_data, val_data, test_data = transform(data)
# train_data.train_mask = val_data.val_mask =test_data.test_mask = None

# Dataset transform: normalize the features, move the graph to `device`, then
# split the links with 5% held out for validation and 10% for testing.
# `split_labels=True` is what yields the separate `pos_edge_label*` attributes
# seen in the printed `Data` object; no negative training edges are added here.
transform = T.Compose(
    [
        T.NormalizeFeatures(),
        T.ToDevice(device),
        T.RandomLinkSplit(
            num_val=0.05,
            num_test=0.1,
            is_undirected=True,
            split_labels=True,
            add_negative_train_samples=False,
        ),
    ]
)

dataset = Planetoid(root="./Data", name="CiteSeer", transform=transform)

# With the link split in the transform, the single graph unpacks into the
# train / validation / test views.
train_data, val_data, test_data = dataset[0]

print(train_data)

Data(x=[3327, 3703], edge_index=[2, 7740], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327], pos_edge_label=[3870], pos_edge_label_index=[2, 3870])


In [32]:
# Class distribution of the node labels. The dataset transform moves every
# tensor to `device`, and pandas/NumPy cannot read accelerator-resident tensors,
# so bring the labels back to host memory before building the Series.
pd.Series(train_data.y.cpu().numpy()).value_counts()

3    701
2    668
4    596
1    590
5    508
0    264
dtype: int64

In [13]:
class GCNEncoder(torch.nn.Module):
    """Two-layer GCN encoder: in_channels -> 2 * out_channels -> out_channels."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        hidden_channels = 2 * out_channels
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Apply conv1 followed by ReLU, then return the output of conv2."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)


class VariationalGCNEncoder(torch.nn.Module):
    """GCN encoder with a shared first layer and separate mu / logstd heads."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        hidden_channels = 2 * out_channels
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv_mu = GCNConv(hidden_channels, out_channels)
        self.conv_logstd = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the pair ``(conv_mu output, conv_logstd output)``."""
        hidden = torch.relu(self.conv1(x, edge_index))
        mu = self.conv_mu(hidden, edge_index)
        logstd = self.conv_logstd(hidden, edge_index)
        return mu, logstd


class LinearEncoder(torch.nn.Module):
    """Encoder made of a single GCNConv layer with no activation."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the output of the single convolution."""
        embedding = self.conv(x, edge_index)
        return embedding


class VariationalLinearEncoder(torch.nn.Module):
    """Variational encoder with one GCNConv each for mu and logstd."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv_mu = GCNConv(in_channels, out_channels)
        self.conv_logstd = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the pair ``(conv_mu output, conv_logstd output)``."""
        mu = self.conv_mu(x, edge_index)
        logstd = self.conv_logstd(x, edge_index)
        return mu, logstd


In [37]:
# Input width comes from the dataset; the embedding size is fixed at 6.
in_channels = dataset.num_features
out_channels = 6

# The variational GCN auto-encoder is the variant in use. The other
# combinations defined in this notebook would be built as:
#   GAE(GCNEncoder(in_channels, out_channels))
#   GAE(LinearEncoder(in_channels, out_channels))
#   VGAE(VariationalLinearEncoder(in_channels, out_channels))
model = VGAE(VariationalGCNEncoder(in_channels, out_channels)).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


In [38]:
# When true, train() adds the KL term to the reconstruction loss.
args_variational = True


def train():
    """Run one optimization step on ``train_data`` and return the loss as a float.

    Relies on the module-level ``model``, ``optimizer``, ``train_data`` and
    ``args_variational``.
    """
    model.train()
    optimizer.zero_grad()

    embeddings = model.encode(train_data.x, train_data.edge_index)
    total_loss = model.recon_loss(embeddings, train_data.pos_edge_label_index)
    if args_variational:
        # KL term scaled by the inverse of the node count.
        kl_weight = 1 / train_data.num_nodes
        total_loss = total_loss + kl_weight * model.kl_loss()

    total_loss.backward()
    optimizer.step()
    return float(total_loss)


def test(data):
    """Evaluate the module-level ``model`` on one split, without gradients.

    Args:
        data: split providing ``x``, ``edge_index``, ``pos_edge_label_index``
            and ``neg_edge_label_index``.

    Returns:
        Whatever ``model.test`` returns; callers unpack it as ``(auc, ap)``.
    """
    with torch.no_grad():
        model.eval()
        embeddings = model.encode(data.x, data.edge_index)
        return model.test(
            embeddings,
            data.pos_edge_label_index,
            data.neg_edge_label_index,
        )


# Train for 100 epochs, scoring the test split after every step and recording
# the wall-clock time per epoch.
times = []
for epoch in range(1, 101):
    start = time.time()
    loss = train()
    auc, ap = test(test_data)
    print(f'Epoch: {epoch:03d}, AUC: {auc:.4f}, AP: {ap:.4f}')
    # The measured span covers training, evaluation and the print above.
    elapsed = time.time() - start
    times.append(elapsed)

median_time = torch.tensor(times).median()
print(f"Median time per epoch: {median_time:.4f}s")

Epoch: 001, AUC: 0.6423, AP: 0.6687
Epoch: 002, AUC: 0.6488, AP: 0.6721
Epoch: 003, AUC: 0.6506, AP: 0.6724
Epoch: 004, AUC: 0.6524, AP: 0.6736
Epoch: 005, AUC: 0.6525, AP: 0.6740
Epoch: 006, AUC: 0.6524, AP: 0.6740
Epoch: 007, AUC: 0.6522, AP: 0.6740
Epoch: 008, AUC: 0.6518, AP: 0.6738
Epoch: 009, AUC: 0.6515, AP: 0.6736
Epoch: 010, AUC: 0.6510, AP: 0.6732
Epoch: 011, AUC: 0.6505, AP: 0.6725
Epoch: 012, AUC: 0.6495, AP: 0.6714
Epoch: 013, AUC: 0.6480, AP: 0.6703
Epoch: 014, AUC: 0.6456, AP: 0.6687
Epoch: 015, AUC: 0.6419, AP: 0.6664
Epoch: 016, AUC: 0.6394, AP: 0.6657
Epoch: 017, AUC: 0.6376, AP: 0.6649
Epoch: 018, AUC: 0.6374, AP: 0.6651
Epoch: 019, AUC: 0.6387, AP: 0.6656
Epoch: 020, AUC: 0.6405, AP: 0.6667
Epoch: 021, AUC: 0.6429, AP: 0.6679
Epoch: 022, AUC: 0.6447, AP: 0.6692
Epoch: 023, AUC: 0.6464, AP: 0.6703
Epoch: 024, AUC: 0.6479, AP: 0.6714
Epoch: 025, AUC: 0.6492, AP: 0.6723
Epoch: 026, AUC: 0.6500, AP: 0.6730
Epoch: 027, AUC: 0.6507, AP: 0.6736
Epoch: 028, AUC: 0.6512, AP:

In [None]:
# data = dataset[0]
# data.train_mask = data.val_mask =data.test_mask = None
# print(data)

# # print(tr)


In [None]:
# df_train = data.dataset.iloc[data.indices]
# # # or
# # df_train = df.iloc[data.indices]
# df_train.head()
import pandas as pd 

# Preview the first rows of a DataFrame built directly from `train_data`.
# NOTE(review): `train_data` prints as a graph `Data` object rather than a
# table; confirm pandas builds a meaningful frame from it before relying on this.
pd.DataFrame(train_data).head()

In [None]:
# px = pd.DataFrame(x.numpy())


# Preview the node labels. `data` is only ever assigned in commented-out code,
# so read the labels from `train_data` instead. `.cpu()` is required because the
# dataset transform moves tensors to `device` and `.numpy()` only works on CPU
# tensors.
pd.DataFrame(train_data.y.cpu().numpy()).head()

In [None]:
# data.edge_index[:10]

In [None]:
print("## Define the Encoder")


class GCNEncoder(torch.nn.Module):
    """Two-layer GCN encoder whose convolutions are built with ``cached=True``."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        hidden_channels = 2 * out_channels
        # cached=True is only appropriate for transductive learning.
        self.conv1 = GCNConv(in_channels, hidden_channels, cached=True)
        self.conv2 = GCNConv(hidden_channels, out_channels, cached=True)

    def forward(self, x, edge_index):
        """Apply conv1 followed by ReLU, then return the output of conv2."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)


print("## Define the Autoencoder")

from torch_geometric.nn import GAE

# parameters
out_channels = 2
num_features = dataset.num_features
epochs = 100
print({'num_features': num_features})

# model
model = GAE(GCNEncoder(num_features, out_channels))

# Select the device with the same CUDA -> MPS -> CPU preference used when the
# dataset transform was built. A CUDA/CPU-only choice here would put the model
# on a different device than the splits produced by that transform on machines
# where MPS is the available accelerator.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

model = model.to(device)
x = train_data.x.to(device)
# The training graph's edges are used both for message passing and as the
# positive edges in the reconstruction loss.
train_pos_edge_index = train_data.edge_index.to(device)


In [None]:
# Initialize the optimizer: Adam over all model parameters, learning rate 0.01.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)



In [None]:
def train():
    """Run one optimization step on the training graph and return the loss.

    Relies on the module-level ``model``, ``optimizer``, ``x`` and
    ``train_pos_edge_index``. Only the reconstruction loss is optimized; no KL
    term is added in this version.
    """
    model.train()
    optimizer.zero_grad()

    embeddings = model.encode(x, train_pos_edge_index)
    recon = model.recon_loss(embeddings, train_pos_edge_index)

    recon.backward()
    optimizer.step()
    return float(recon)


def test(pos_edge_index, neg_edge_index):
    """Score the given positive and negative edges with the current model.

    Embeddings are computed without gradients from the module-level ``x`` and
    ``train_pos_edge_index``.

    Args:
        pos_edge_index: edges passed to ``model.test`` as positives.
        neg_edge_index: edges passed to ``model.test`` as negatives.

    Returns:
        Whatever ``model.test`` returns; callers unpack it as ``(auc, ap)``.
    """
    model.eval()
    with torch.no_grad():
        embeddings = model.encode(x, train_pos_edge_index)
    scores = model.test(embeddings, pos_edge_index, neg_edge_index)
    return scores

In [None]:
for epoch in range(1, epochs + 1):
    loss = train()

    # Evaluate on the held-out test edges: positives and negatives must be
    # different edge sets, otherwise the ranking metrics are meaningless.
    # The link split was built with split_labels=True, so the test view carries
    # them as `pos_edge_label_index` / `neg_edge_label_index`. They are moved to
    # the device of `x` so they match the embeddings computed inside test().
    auc, ap = test(
        test_data.pos_edge_label_index.to(x.device),
        test_data.neg_edge_label_index.to(x.device),
    )
    print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))


# Final node embeddings for inspection.
Z = model.encode(x, train_pos_edge_index)
print(Z)

In [43]:
## rely on tensorboard 

from torch.utils.tensorboard import SummaryWriter

# parameters
out_channels = 2
num_features = dataset.num_features
epochs = 200

# model
# NOTE(review): the model is not rebuilt here, so `out_channels = 2` above and
# the "2d" in the run name below do not necessarily describe the model that is
# actually trained; confirm which model is live before comparing runs.
# model = GAE(GCNEncoder(num_features, out_channels))

# Select the device with the same CUDA -> MPS -> CPU preference used when the
# dataset transform was built. A CUDA/CPU-only choice would move the model to
# the CPU while the splits stay on MPS on machines where MPS is available.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
model = model.to(device)
# x = data.x.to(device)
# train_pos_edge_index = train_data.train_pos_edge_index.to(device)

# initialize the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

writer = SummaryWriter('runs/GAE1_experiment_'+'2d_200_epochs')


# times = []
# for epoch in range(1, 100 + 1):
#     start = time.time()
#     loss = train()
#     auc, ap = test(test_data)
#     print(f'Epoch: {epoch:03d}, AUC: {auc:.4f}, AP: {ap:.4f}')
#     times.append(time.time() - start)

# NOTE(review): this loop calls the single-argument `test(data)` helper; if the
# two-argument `test(pos_edge_index, neg_edge_index)` was defined more recently,
# re-run the cell that defines the single-argument version first.
try:
    for epoch in range(1, epochs + 1):
        loss = train()
        auc, ap = test(test_data)
        print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

        # These metrics are computed on `test_data`, so they are tagged as test
        # metrics rather than training metrics.
        writer.add_scalar('auc test', auc, epoch)
        writer.add_scalar('ap test', ap, epoch)
finally:
    # Flush and close the event file even if training is interrupted, so the
    # scalars logged so far are written to disk.
    writer.close()

Epoch: 001, AUC: 0.7833, AP: 0.7920
Epoch: 002, AUC: 0.7761, AP: 0.7887
Epoch: 003, AUC: 0.7629, AP: 0.7748
Epoch: 004, AUC: 0.7636, AP: 0.7759
Epoch: 005, AUC: 0.7713, AP: 0.7854
Epoch: 006, AUC: 0.7782, AP: 0.7916
Epoch: 007, AUC: 0.7816, AP: 0.7938
Epoch: 008, AUC: 0.7822, AP: 0.7944
Epoch: 009, AUC: 0.7816, AP: 0.7944
Epoch: 010, AUC: 0.7792, AP: 0.7931
Epoch: 011, AUC: 0.7758, AP: 0.7903
Epoch: 012, AUC: 0.7730, AP: 0.7874
Epoch: 013, AUC: 0.7731, AP: 0.7874
Epoch: 014, AUC: 0.7754, AP: 0.7897
Epoch: 015, AUC: 0.7791, AP: 0.7933
Epoch: 016, AUC: 0.7818, AP: 0.7957
Epoch: 017, AUC: 0.7835, AP: 0.7971
Epoch: 018, AUC: 0.7841, AP: 0.7978
Epoch: 019, AUC: 0.7838, AP: 0.7978
Epoch: 020, AUC: 0.7825, AP: 0.7973
Epoch: 021, AUC: 0.7810, AP: 0.7964
Epoch: 022, AUC: 0.7791, AP: 0.7945
Epoch: 023, AUC: 0.7789, AP: 0.7943
Epoch: 024, AUC: 0.7810, AP: 0.7965
Epoch: 025, AUC: 0.7835, AP: 0.7992
Epoch: 026, AUC: 0.7851, AP: 0.8006
Epoch: 027, AUC: 0.7865, AP: 0.8019
Epoch: 028, AUC: 0.7875, AP: