In [None]:
import os
import numpy as np
import pandas as pd
import pickle

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

from torch_geometric.datasets import Planetoid
from torch_geometric.nn import Node2Vec
from torch_geometric.nn import GCNConv, GAE, VGAE
from torch_geometric.data import Data

from utils.table2graph import create_corpus, shuffle_vocabulary, build_graph_edges, build_node_features
from utils.vectorize import generate_table_vectors
from utils.evaluation import evaluate_model
%matplotlib inline

## 0) Dataset loading & Model Configuration

In [None]:
os.chdir("..")

In [None]:
dataset_path="./datasets/benchmarks/airlines50.pickle"


In [None]:
dataset= pickle.load(open(dataset_path,"rb"))

In [None]:
print(len(dataset),len(dataset[0]["table"].columns))

### General settings:

In [13]:
CONF = {
    "add_attr":True,
    "shuffle_vocab": True,
    "add_columns":False,
    "vector_size":50,
    "row_edges_sample":1.0,
    "column_edges_sample":0.1,
    "epoch_num":10
}

### Specific model settings:

In [14]:
CONF["n2v_walk_length"] = 20
CONF["n2v_context_size"] = 10
CONF["n2v_walks_per_node"] = 10

## 1) Build Table graph

### Tables tokenization

In [18]:
tokenized_tables, vocabulary, cell_dict, reversed_dictionary = corpus_tuple = create_corpus(dataset,include_attr=CONF["add_attr"])

In [21]:
if CONF["shuffle_vocab"] == True:
    shuffled_vocab = shuffle_vocabulary(vocabulary)
else:
    shuffled_vocab = None

In [29]:
nodes = build_node_features(vocabulary)
row_edges_index, row_edges_weights = build_graph_edges(tokenized_tables,s_vocab=shuffled_vocab,sample_frac=CONF["row_edges_sample"],columns=False)
col_edges_index, col_edges_weights = build_graph_edges(tokenized_tables,s_vocab=shuffled_vocab,sample_frac=CONF["column_edges_sample"],columns=True)

In [30]:
edges = torch.cat((row_edges_index,col_edges_index),dim=1)
weights= torch.cat((row_edges_weights,col_edges_weights),dim=0)
graph_data = Data(x=nodes,edge_index=edges,edge_attr=weights)

## 2 ) Run Table Auto-Encoder Model:

In [31]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
#for VAE we split the edges to train/test
def split_edges(graph_data):
    graph_data.train_mask = graph_data.val_mask = graph_data.test_mask = graph_data.y = None
    graph_data = train_test_split_edges(graph_data)
    x, train_pos_edge_index = graph_data.x.to(device), graph_data.train_pos_edge_index.to(device)
    return x, train_pos_edge_index

In [None]:
x, train_pos_edge_index = split_edges(graph_data)

In [33]:
class Encoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Encoder, self).__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels, cached=True)
        self.conv_mu = GCNConv(2 * out_channels, out_channels, cached=True)
        self.conv_logvar = GCNConv(
            2 * out_channels, out_channels, cached=True)
    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        return self.conv_mu(x, edge_index), self.conv_logvar(x, edge_index)


In [None]:
channels=CONF["vector_size"]

In [34]:
enc = Encoder(graph_data.num_features, channels)
model = VGAE(enc)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [None]:
def train(model,optimizer,x, train_pos_edge_index):
    model.train()
    optimizer.zero_grad()
    z = model.encode(x, train_pos_edge_index)
    #loss = model.recon_loss(z, train_pos_edge_index)
    loss = model.kl_loss()

    loss.backward()
    optimizer.step()
    return loss


def test(model,pos_edge_index, neg_edge_index,x, train_pos_edge_index):
    model.eval()
    with torch.no_grad():
        z = model.encode(x, train_pos_edge_index)
    return model.test(z, pos_edge_index, neg_edge_index)

In [59]:
losses=[]
for epoch in range(CONF["epoch_num"]):
    loss = train(model,optimizer,x,train_pos_edge_index)
    losses.append(loss)
    auc, ap = test(model,graph_data.test_pos_edge_index, graph_data.test_neg_edge_index,x,train_pos_edge_index)
    print('Epoch: {:03d},Loss{:03d} AUC: {:.4f}, AP: {:.4f}'.format(epoch,loss, auc, ap))

Epoch: 00, Loss: 1.2726


KeyboardInterrupt: 

### 3) Extract the latent cell vectors, generate table vectors:

In [None]:
def get_cell_vectors(model,x,train_pos_edge_index):
    model.eval()
    with torch.no_grad():
        z = model.encode(x, train_pos_edge_index)
        cell_vectors = z.numpy()
    return z,cell_vectors


In [None]:
z,cell_vectors = get_cell_vectors(model,x,train_pos_edge_index)

In [42]:
vec_list=generate_table_vectors(cell_vectors,tokenized_tables,s_vocab=shuffled_vocab)

## 3) Evaluate the model

In [57]:
result_score=evaluate_model(dataset,vec_list,k=5)

In [58]:
result_score

0.792