In [1]:
import os
import numpy as np
import pandas as pd
import pickle

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

from torch_geometric.utils import train_test_split_edges
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import Node2Vec
from torch_geometric.nn import GCNConv, GAE, VGAE
from torch_geometric.data import Data

%matplotlib inline

In [2]:
# Work from the parent directory: later cells import `utils.*` and read
# `./datasets/...` relative to it.
# The starting directory is remembered on the first run so that re-executing
# this cell lands in the same place instead of climbing one more level.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, ".."))

In [3]:
from utils.table2graph import create_corpus, shuffle_vocabulary, build_graph_edges, build_node_features
from utils.vectorize import generate_table_vectors
from utils.evaluation import evaluate_model


## 0) Dataset loading & Model Configuration

In [4]:
dataset_path="./datasets/benchmarks/rossman200.pickle"

In [5]:
# NOTE: unpickling can execute arbitrary code; only load benchmark files that
# come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open(dataset_path, "rb") as dataset_file:
    dataset = pickle.load(dataset_file)

In [6]:
print(len(dataset),len(dataset[0]["table"].columns))

200 17


### General settings:

In [7]:
# Settings shared by the cells below; each comment names where the key is read.
CONF = {
    "add_attr": True,             # forwarded to create_corpus(include_attr=...)
    "shuffle_vocab": True,        # when set, shuffle_vocabulary() is applied
    "add_columns": False,         # not read by any cell visible in this notebook
    "vector_size": 50,            # becomes `channels`, the encoder output width
    "row_edges_sample": 0.5,      # sample_frac for build_graph_edges(columns=False)
    "column_edges_sample": 0.05,  # sample_frac for build_graph_edges(columns=True)
    "epoch_num": 10,              # number of iterations of the training loop
}

### Specific model settings:

In [8]:
# Extra `n2v_*` entries added to the shared settings dict.
# NOTE(review): presumably meant for the Node2Vec model imported at the top;
# no cell visible in this notebook reads them — confirm they are still needed.
CONF.update({
    "n2v_walk_length": 20,
    "n2v_context_size": 10,
    "n2v_walks_per_node": 10,
})

## 1) Build Table graph

### Tables tokenization

In [9]:
# Build the corpus once; keep the whole return value as `corpus_tuple` and
# also unpack its four components for the following cells.
corpus_tuple = create_corpus(dataset, include_attr=CONF["add_attr"])
tokenized_tables, vocabulary, cell_dict, reversed_dictionary = corpus_tuple

In [10]:
# Shuffle the vocabulary only when the setting is enabled; otherwise None is
# handed to the later helpers through their `s_vocab` argument.
shuffled_vocab = shuffle_vocabulary(vocabulary) if CONF["shuffle_vocab"] else None

In [11]:
# Node features come from the vocabulary; edges are built twice from the
# tokenized tables — once with columns=False and once with columns=True —
# each with its own sampling fraction from CONF.
nodes = build_node_features(vocabulary)
row_edges_index, row_edges_weights = build_graph_edges(
    tokenized_tables,
    s_vocab=shuffled_vocab,
    sample_frac=CONF["row_edges_sample"],
    columns=False,
)
col_edges_index, col_edges_weights = build_graph_edges(
    tokenized_tables,
    s_vocab=shuffled_vocab,
    sample_frac=CONF["column_edges_sample"],
    columns=True,
)

In [12]:
len(nodes)

54596

In [13]:
row_edges_index.shape

torch.Size([2, 928840])

In [14]:
col_edges_index.shape

torch.Size([2, 351724])

In [15]:
# Merge both edge sets into a single graph. Indices are joined along dim 1 and
# weights along dim 0, row edges first in both, so the i-th weight still
# belongs to the i-th edge.
edges = torch.cat([row_edges_index, col_edges_index], dim=1)
weights = torch.cat([row_edges_weights, col_edges_weights], dim=0)
graph_data = Data(
    x=nodes,
    edge_index=edges,
    edge_attr=weights,
)

## 2) Run Table Auto-Encoder Model:

In [16]:
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [17]:
#for VAE we split the edges to train/test
def split_edges(graph_data):
    """Split the graph's edges and return the training inputs on `device`.

    The mask and label attributes are reset to None, then
    `train_test_split_edges` is applied to the graph. Only the node features
    and the positive training edges are returned; the training loop later
    reads the test edges from the same `graph_data` object that was passed in.

    Args:
        graph_data: graph object exposing `x` and accepted by
            `train_test_split_edges`.

    Returns:
        Tuple `(x, train_pos_edge_index)`, both moved to the module-level
        `device`.
    """
    for attr_name in ("train_mask", "val_mask", "test_mask", "y"):
        setattr(graph_data, attr_name, None)

    split_graph = train_test_split_edges(graph_data)
    features = split_graph.x.to(device)
    train_edges = split_graph.train_pos_edge_index.to(device)
    return features, train_edges

In [18]:
x, train_pos_edge_index = split_edges(graph_data)

In [19]:
class Encoder(torch.nn.Module):
    """Graph-convolutional encoder with one shared layer and two output heads.

    A first GCNConv widens the input to `2 * out_channels` and is followed by
    ReLU; two further GCNConv layers (`conv_mu`, `conv_logvar`) each map that
    hidden representation to `out_channels`. The pair of outputs is what the
    VGAE wrapper built from this encoder consumes.

    NOTE(review): `cached=True` is understood to make GCNConv reuse the
    normalised adjacency from its first call. That fits this notebook, where
    every encode() call receives the same `train_pos_edge_index` — confirm
    against the installed PyG version.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        hidden_channels = 2 * out_channels
        self.conv1 = GCNConv(in_channels, hidden_channels, cached=True)
        self.conv_mu = GCNConv(hidden_channels, out_channels, cached=True)
        self.conv_logvar = GCNConv(hidden_channels, out_channels, cached=True)

    def forward(self, x, edge_index):
        """Return `(conv_mu output, conv_logvar output)` for the given graph."""
        hidden = F.relu(self.conv1(x, edge_index))
        mu = self.conv_mu(hidden, edge_index)
        logvar = self.conv_logvar(hidden, edge_index)
        return mu, logvar


In [20]:
channels=CONF["vector_size"]

In [21]:
# Wrap the encoder in a VGAE, place it on `device`, and optimise all of its
# parameters with Adam at a fixed learning rate of 0.01.
enc = Encoder(graph_data.num_features, channels)
model = VGAE(enc).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [22]:
def train(model, optimizer, x, train_pos_edge_index):
    """Run one optimisation step on the training edges and return its loss.

    Args:
        model: module exposing `encode(x, edge_index)` and
            `recon_loss(z, edge_index)`.
        optimizer: optimiser bound to `model`'s parameters.
        x: node feature tensor passed to `model.encode`.
        train_pos_edge_index: edge index used both for encoding and as the
            positive edges of the reconstruction loss.

    Returns:
        The reconstruction loss as a 0-dim tensor detached from autograd, so
        callers can keep one value per epoch without also keeping a
        reference to that step's computation graph.
    """
    model.train()
    optimizer.zero_grad()
    z = model.encode(x, train_pos_edge_index)
    loss = model.recon_loss(z, train_pos_edge_index)
    # NOTE(review): the KL term (model.kl_loss()) is not added, so only the
    # reconstruction loss is optimised — confirm this is intended for a VGAE.

    loss.backward()
    optimizer.step()
    return loss.detach()


def test(model,pos_edge_index, neg_edge_index,x, train_pos_edge_index):
    model.eval()
    with torch.no_grad():
        z = model.encode(x, train_pos_edge_index)
    return model.test(z, pos_edge_index, neg_edge_index)

In [23]:
# Train for CONF["epoch_num"] epochs. After every step the loss is recorded
# and the model is scored on the held-out test edges of `graph_data`.
losses = []
for epoch in range(CONF["epoch_num"]):
    loss = train(model, optimizer, x, train_pos_edge_index)
    losses.append(loss)
    auc, ap = test(
        model,
        graph_data.test_pos_edge_index,
        graph_data.test_neg_edge_index,
        x,
        train_pos_edge_index,
    )
    print('Epoch: {:03d}, Loss{:.4f} AUC: {:.4f}, AP: {:.4f}'.format(epoch, loss, auc, ap))

Epoch: 000, Loss10.2076 AUC: 0.8832, AP: 0.9208
Epoch: 001, Loss6.0351 AUC: 0.8832, AP: 0.9208
Epoch: 002, Loss4.6263 AUC: 0.8832, AP: 0.9205
Epoch: 003, Loss4.3347 AUC: 0.8831, AP: 0.9204
Epoch: 004, Loss4.1167 AUC: 0.8831, AP: 0.9205
Epoch: 005, Loss3.8160 AUC: 0.8832, AP: 0.9207
Epoch: 006, Loss3.5145 AUC: 0.8832, AP: 0.9207
Epoch: 007, Loss3.2026 AUC: 0.8831, AP: 0.9208
Epoch: 008, Loss2.9556 AUC: 0.8830, AP: 0.9208
Epoch: 009, Loss2.7360 AUC: 0.8830, AP: 0.9207


## 3) Extract the latent cell vectors, generate table vectors:

In [24]:
def get_cell_vectors(model, x, train_pos_edge_index):
    """Encode the graph and return the embeddings as tensor and NumPy array.

    Args:
        model: module exposing `encode(x, edge_index)`.
        x: node feature tensor passed to `model.encode`.
        train_pos_edge_index: edge index passed to `model.encode`.

    Returns:
        Tuple `(z, cell_vectors)`: `z` is the tensor returned by
        `model.encode` (left on whatever device the model uses), and
        `cell_vectors` is the same data as a NumPy array on the host.
    """
    model.eval()
    with torch.no_grad():
        z = model.encode(x, train_pos_edge_index)
    # Tensor.numpy() only accepts CPU tensors, and this notebook selects
    # 'cuda' when available; .cpu() is a no-op for tensors already on the CPU.
    cell_vectors = z.detach().cpu().numpy()
    return z, cell_vectors


In [25]:
z,cell_vectors = get_cell_vectors(model,x,train_pos_edge_index)

In [26]:
vec_list=generate_table_vectors(cell_vectors,tokenized_tables,s_vocab=shuffled_vocab)

## 4) Evaluate the model

In [27]:
result_score=evaluate_model(dataset,vec_list,k=5)

In [28]:
result_score

0.41625

In [29]:
graph_data

Data(edge_attr=[1280564], test_neg_edge_index=[2, 64028], test_pos_edge_index=[2, 64028], train_neg_adj_mask=[54596, 54596], train_pos_edge_index=[2, 1088480], val_neg_edge_index=[2, 32014], val_pos_edge_index=[2, 32014], x=[54596, 1])