In [1]:
import os
import numpy as np
import pandas as pd
import pickle

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

from torch_geometric.utils import train_test_split_edges
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import Node2Vec
from torch_geometric.nn import GCNConv, GAE, VGAE
from torch_geometric.data import Data

%matplotlib inline

In [2]:
os.chdir("..")

In [3]:
from utils.table2graph import create_corpus, shuffle_vocabulary, build_graph_edges, build_node_features
from utils.vectorize import generate_table_vectors
from utils.evaluation import evaluate_model


In [4]:
from torch_geometric.utils import (negative_sampling, remove_self_loops,
                                   add_self_loops)


## 0) Dataset loading & Model Configuration

In [5]:
dataset_path="./datasets/benchmarks/rossman200.pickle"

In [6]:
dataset= pickle.load(open(dataset_path,"rb"))

In [7]:
print(len(dataset),len(dataset[0]["table"].columns))

200 17


### General settings:

In [8]:
CONF = {
    "add_attr":True,
    "shuffle_vocab": True,
    "add_columns":False,
    "vector_size":16,
    "row_edges_sample":1.0,
    "column_edges_sample":0.1,
    "epoch_num":20
}

### Specific model settings:

In [9]:
CONF["n2v_walk_length"] = 20
CONF["n2v_context_size"] = 10
CONF["n2v_walks_per_node"] = 10

## 1) Build Table graph

### Tables tokenization

In [10]:
tokenized_tables, vocabulary, cell_dict, reversed_dictionary = corpus_tuple = create_corpus(dataset,include_attr=CONF["add_attr"])

In [11]:
if CONF["shuffle_vocab"] == True:
    shuffled_vocab = shuffle_vocabulary(vocabulary)
else:
    shuffled_vocab = None

In [12]:
nodes = build_node_features(vocabulary)
row_edges_index, row_edges_weights = build_graph_edges(tokenized_tables,s_vocab=shuffled_vocab,sample_frac=CONF["row_edges_sample"],columns=False)
col_edges_index, col_edges_weights = build_graph_edges(tokenized_tables,s_vocab=shuffled_vocab,sample_frac=CONF["column_edges_sample"],columns=True)

In [13]:
all_row_edges_index, all_row_edges_weights = build_graph_edges(tokenized_tables,s_vocab=shuffled_vocab,sample_frac=1.0,columns=False)
all_col_edges_index, all_col_edges_weights = build_graph_edges(tokenized_tables,s_vocab=shuffled_vocab,sample_frac=1.0,columns=True)

In [14]:
all_possible_edges= torch.cat((all_row_edges_index,all_col_edges_index),dim=1)

In [15]:
print(dataset_path)

./datasets/benchmarks/rossman200.pickle


In [16]:
print(len(nodes),all_row_edges_index.shape[1],all_col_edges_index.shape[1],all_possible_edges.shape[1])

54596 1857682 7034472 8892154


In [17]:
edges = torch.cat((row_edges_index,col_edges_index),dim=1)
weights= torch.cat((row_edges_weights,col_edges_weights),dim=0)
graph_data = Data(x=nodes,edge_index=edges,edge_attr=weights)

In [18]:
from sklearn import preprocessing
def get_column_by_node(node,r_dict):
    return r_dict[node][0]

def get_column_ids(vocab,s_vocab=None):
    if s_vocab is not None:
        node_columns = [get_column_by_node(n,reversed_dictionary) for n in s_vocab]
    else:
        node_columns = [get_column_by_node(n,reversed_dictionary) for n in range(vocab)]
    le = preprocessing.LabelEncoder()
    node_columns_ids = le.fit_transform(node_columns)
    return torch.tensor(node_columns_ids,dtype=torch.int)


In [19]:
cols=get_column_ids(vocabulary,s_vocab=shuffled_vocab)

In [20]:
graph_data.cols = cols

## 2 ) Run Table Auto-Encoder Model:

In [21]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [22]:
#x, train_pos_edge_index = split_edges(graph_data)
x, train_pos_edge_index = nodes,edges

In [None]:
EPS = 1e-15
MAX_LOGVAR = 10

class TVGAE(GAE):
    r"""The Variational Graph Auto-Encoder model from the
    `"Variational Graph Auto-Encoders" <https://arxiv.org/abs/1611.07308>`_
    paper.

    Args:
        encoder (Module): The encoder module to compute :math:`\mu` and
            :math:`\log\sigma^2`.
        decoder (Module, optional): The decoder module. If set to :obj:`None`,
            will default to the
            :class:`torch_geometric.nn.models.InnerProductDecoder`.
            (default: :obj:`None`)
    """
    def __init__(self, encoder, decoder=None):
        super(TVGAE, self).__init__(encoder, decoder)

    def reparametrize(self, mu, logvar):
        if self.training:
            return mu + torch.randn_like(logvar) * torch.exp(logvar)
        else:
            return mu


    def encode(self, *args, **kwargs):
        """"""
        self.__rmu__, self.__rlogvar__,self.__cmu__, self.__clogvar__ = self.encoder(*args, **kwargs)
        self.__rlogvar__ = self.__rlogvar__.clamp(max=MAX_LOGVAR)
        self.__clogvar__ = self.__clogvar__.clamp(max=MAX_LOGVAR)
        zr = self.reparametrize(self.__rmu__, self.__rlogvar__)
        zc = self.reparametrize(self.__cmu__, self.__clogvar__)
        z=torch.cat((zr,zc),0)
        return z


    def kl_loss(self):

        rmu = self.__rmu__ 
        rlogvar = self.__rlogvar__ 

        cmu = self.__cmu__ 
        clogvar = self.__clogvar__ 
        
        rkl= -0.5 * torch.mean(
            torch.sum(1 + rlogvar - rmu**2 - rlogvar.exp(), dim=1))
        ckl= -0.5 * torch.mean(
            torch.sum(1 + clogvar - rmu**2 - clogvar.exp(), dim=1))
        return(rkl,ckl)

        
    def recon_loss(self,z, pos_edge_index,all_possible_edges):
        EPS = 1e-15
        MAX_LOGVAR = 10

        pos_loss = -torch.log(
            model.decoder(z, pos_edge_index, sigmoid=True) + EPS).mean()

        # Do not include self-loops in negative samples
        pos_edge_index, _ = remove_self_loops(pos_edge_index)
        pos_edge_index, _ = add_self_loops(pos_edge_index)

        neg_edge_index = negative_sampling(all_possible_edges, z.size(0))
        neg_loss = -torch.log(1 -
                              model.decoder(z, neg_edge_index, sigmoid=True) +
                              EPS).mean()

        return pos_loss + neg_loss


In [24]:
class Encoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Encoder, self).__init__()
        self.conv_rows = GCNConv(in_channels, 2 * out_channels, cached=True)
        self.conv_cols = GCNConv(in_channels, 2 * out_channels, cached=True)
        
        self.conv_rmu = GCNConv(2 * out_channels, out_channels, cached=True)
        self.conv_rlogvar = GCNConv(2 * out_channels, out_channels, cached=True)

        self.conv_cmu = GCNConv(2 * out_channels, out_channels, cached=True)
        self.conv_clogvar = GCNConv(2 * out_channels, out_channels, cached=True)
             
        
    def forward(self, x, row_edge_index,col_edge_index):
        xr = F.relu(self.conv_rows(x, row_edge_index))
        xc = F.relu(self.conv_cols(x, col_edge_index))
        return self.conv_rmu(xr, row_edge_index),\
            self.conv_rlogvar(xr, row_edge_index),\
            self.conv_cmu(xc, col_edge_index),\
            self.conv_clogvar(xc, col_edge_index)
    
            
        


In [25]:
channels=CONF["vector_size"]

In [26]:
channels

16

In [27]:
enc = Encoder(graph_data.num_features, channels)
model = TVGAE(enc)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [28]:


def train(model,optimizer,x, row_edges,col_edges):
    model.train()
    optimizer.zero_grad()
    z = model.encode(x, row_edges,col_edges)
    mid = int(len(z)/2)
    zr=z[:mid]
    zc=z[mid:]
    
    #recon loss:
    #row_loss = model.recon_loss(zr,row_edges,all_possible_edges)
    #col_loss = model.recon_loss(zc,col_edges,all_possible_edges)
    #loss = row_loss+col_loss
    
    rkl,ckl = model.kl_loss()
    loss = rkl+ckl
    

    
    loss.backward()
    optimizer.step()
    #return loss,row_loss,col_loss
    return loss,rkl,ckl
    



In [None]:
losses=[]
for epoch in range(CONF["epoch_num"]):
    #loss,row_loss,col_loss = train(model,optimizer,x,row_edges_index,col_edges_index)
    loss = train(model,optimizer,x,row_edges_index,col_edges_index)

    losses.append(loss)

    #print('Epoch: {:03d}, Row Loss{:.4f} Col Loss: {:.4f}, Loss: {:.4f}'.format(epoch,row_loss,col_loss,loss))
    print(epoch,loss)

0 (tensor(14.3631, grad_fn=<AddBackward0>), tensor(10.2363, grad_fn=<AddBackward0>), tensor(4.1268, grad_fn=<AddBackward0>))
1 (tensor(9.3202, grad_fn=<AddBackward0>), tensor(5.5652, grad_fn=<AddBackward0>), tensor(3.7550, grad_fn=<AddBackward0>))
2 (tensor(6.9965, grad_fn=<AddBackward0>), tensor(3.5404, grad_fn=<AddBackward0>), tensor(3.4561, grad_fn=<AddBackward0>))
3 (tensor(6.0272, grad_fn=<AddBackward0>), tensor(3.0189, grad_fn=<AddBackward0>), tensor(3.0083, grad_fn=<AddBackward0>))
4 (tensor(5.5722, grad_fn=<AddBackward0>), tensor(2.7613, grad_fn=<AddBackward0>), tensor(2.8108, grad_fn=<AddBackward0>))
5 (tensor(5.2403, grad_fn=<AddBackward0>), tensor(2.6031, grad_fn=<AddBackward0>), tensor(2.6372, grad_fn=<AddBackward0>))
6 (tensor(4.9703, grad_fn=<AddBackward0>), tensor(2.4791, grad_fn=<AddBackward0>), tensor(2.4912, grad_fn=<AddBackward0>))
7 (tensor(4.7448, grad_fn=<AddBackward0>), tensor(2.3871, grad_fn=<AddBackward0>), tensor(2.3577, grad_fn=<AddBackward0>))
8 (tensor(4.52

### 3) Extract the latent cell vectors, generate table vectors:

In [None]:
def get_cell_vectors(model,x,row_edges_index,col_edges_index):
    model.eval()
    with torch.no_grad():
        z = model.encode(x, row_edges_index,col_edges_index)
        cell_vectors = z.numpy()
    return z,cell_vectors


In [None]:
z,cell_vectors = get_cell_vectors(model,x,row_edges_index,col_edges_index)

In [None]:
vec_list=generate_table_vectors(cell_vectors,tokenized_tables,s_vocab=shuffled_vocab)

## 3) Evaluate the model

In [None]:
result_score=evaluate_model(dataset,vec_list,k=5)

In [None]:
result_score