In [None]:
import os
import numpy as np
import pandas as pd
import pickle

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

from torch_geometric.utils import train_test_split_edges
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import Node2Vec
from torch_geometric.nn import GCNConv, GAE, VGAE
from torch_geometric.data import Data

%matplotlib inline

In [None]:
os.chdir("..")

In [None]:
from utils.table2graph import create_corpus, shuffle_vocabulary, build_graph_edges, build_node_features
from utils.vectorize import generate_table_vectors
from utils.evaluation import evaluate_model


In [None]:
from torch_geometric.utils import (negative_sampling, remove_self_loops,
                                   add_self_loops)


## 0) Dataset loading & Model Configuration

In [5]:
dataset_path="./datasets/benchmarks/airlines50.pickle"

In [6]:
dataset= pickle.load(open(dataset_path,"rb"))

In [7]:
print(len(dataset),len(dataset[0]["table"].columns))

50 19


### General settings:

In [8]:
CONF = {
    "add_attr":True,
    "shuffle_vocab": True,
    "add_columns":False,
    "vector_size":16,
    "row_edges_sample":0.5,
    "column_edges_sample":0.05,
    "epoch_num":10
}

### Specific model settings:

In [9]:
CONF["n2v_walk_length"] = 20
CONF["n2v_context_size"] = 10
CONF["n2v_walks_per_node"] = 10

## 1) Build Table graph

### Tables tokenization

In [10]:
tokenized_tables, vocabulary, cell_dict, reversed_dictionary = corpus_tuple = create_corpus(dataset,include_attr=CONF["add_attr"])

In [11]:
if CONF["shuffle_vocab"] == True:
    shuffled_vocab = shuffle_vocabulary(vocabulary)
else:
    shuffled_vocab = None

In [12]:
nodes = build_node_features(vocabulary)
row_edges_index, row_edges_weights = build_graph_edges(tokenized_tables,s_vocab=shuffled_vocab,sample_frac=CONF["row_edges_sample"],columns=False)
col_edges_index, col_edges_weights = build_graph_edges(tokenized_tables,s_vocab=shuffled_vocab,sample_frac=CONF["column_edges_sample"],columns=True)

In [13]:
all_row_edges_index, all_row_edges_weights = build_graph_edges(tokenized_tables,s_vocab=shuffled_vocab,sample_frac=1.0,columns=False)
all_col_edges_index, all_col_edges_weights = build_graph_edges(tokenized_tables,s_vocab=shuffled_vocab,sample_frac=1.0,columns=True)

In [14]:
all_possible_edges= torch.cat((all_row_edges_index,all_col_edges_index),dim=1)

In [15]:
print(dataset_path)

./datasets/benchmarks/airlines50.pickle


In [16]:
print(len(nodes),all_row_edges_index.shape[1],all_col_edges_index.shape[1],all_possible_edges.shape[1])

13063 1001452 2273208 3274660


In [18]:
edges = torch.cat((row_edges_index,col_edges_index),dim=1)
weights= torch.cat((row_edges_weights,col_edges_weights),dim=0)
graph_data = Data(x=nodes,edge_index=edges,edge_attr=weights)

In [19]:
from sklearn import preprocessing
def get_column_by_node(node,r_dict):
    return r_dict[node][0]

def get_column_ids(vocab,s_vocab=None):
    if s_vocab is not None:
        node_columns = [get_column_by_node(n,reversed_dictionary) for n in s_vocab]
    else:
        node_columns = [get_column_by_node(n,reversed_dictionary) for n in range(vocab)]
    le = preprocessing.LabelEncoder()
    node_columns_ids = le.fit_transform(node_columns)
    return torch.tensor(node_columns_ids,dtype=torch.int)


In [20]:
cols=get_column_ids(vocabulary,s_vocab=shuffled_vocab)

In [21]:
graph_data.cols = cols

## 2 ) Run Table Auto-Encoder Model:

In [22]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [23]:
#for VAE we split the edges to train/test
def split_edges(graph_data):
    graph_data.train_mask = graph_data.val_mask = graph_data.test_mask = graph_data.y = None
    graph_data = train_test_split_edges(graph_data)
    x, train_pos_edge_index = graph_data.x.to(device), graph_data.train_pos_edge_index.to(device)
    return x, train_pos_edge_index

In [24]:
#x, train_pos_edge_index = split_edges(graph_data)
x, train_pos_edge_index = nodes,edges

In [25]:
class Encoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Encoder, self).__init__()
        self.conv_rows = GCNConv(in_channels, 2 * out_channels, cached=True)
        self.conv_cols = GCNConv(in_channels, 2 * out_channels, cached=True)
        self.conv2 = GCNConv(4 * out_channels, out_channels, cached=True)

    def forward(self, x, row_edge_index,col_edge_index):
        x1 = F.relu(self.conv_rows(x, row_edge_index))
        x2 = F.relu(self.conv_cols(x, col_edge_index))
        x_all = torch.cat((x1, x2), 1)
        #print(x1.shape,x2.shape,x_all.shape)
        edges = torch.cat((row_edges_index,col_edges_index),dim=1)
        return self.conv2(x_all, edges)


In [26]:
channels=CONF["vector_size"]

In [27]:
channels

16

In [28]:
enc = Encoder(graph_data.num_features, channels)
model = GAE(enc)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [29]:
def pairwise_distances(x, y=None):
    '''
    Input: x is a Nxd matrix
           y is an optional Mxd matirx
    Output: dist is a NxM matrix where dist[i,j] is the square norm between x[i,:] and y[j,:]
            if y is not given then use 'y=x'.
    i.e. dist[i,j] = ||x[i,:]-y[j,:]||^2
    '''
    x_norm = (x**2).sum(1).view(-1, 1)
    if y is not None:
        y_norm = (y**2).sum(1).view(1, -1)
    else:
        y = x
        y_norm = x_norm.view(1, -1)

    dist = x_norm + y_norm - 2.0 * torch.mm(x, torch.transpose(y, 0, 1))
    return dist
    
def get_column_loss(z):
    avg_list = []
    for i in range(len(dataset[0]["table"].columns)):
        mask = (graph_data.cols == i)
        avg_list.append(z[mask].mean(axis=0))
    at= torch.stack(avg_list)
    #distsum = euclidean_distances(at).sum()
    distsum= pairwise_distances(at).sum()
    #print(distsum)
    return 20-torch.log(distsum)

def get_column_loss2(z):
    distsum=0
    for i in range(len(dataset[0]["table"].columns)):
        mask = (graph_data.cols == i)
        colsum = pairwise_distances(z[mask]).sum()
        #print(colsum)
        distsum+=colsum
    return torch.log(distsum)


def recon_loss2(model,z, pos_edge_index,all_possible_edges):
    r"""Given latent variables :obj:`z`, computes the binary cross
    entropy loss for positive edges :obj:`pos_edge_index` and negative
    sampled edges.

    Args:
        z (Tensor): The latent space :math:`\mathbf{Z}`.
        pos_edge_index (LongTensor): The positive edges to train against.
    """
    EPS = 1e-15
    MAX_LOGVAR = 10

    pos_loss = -torch.log(
        model.decoder(z, pos_edge_index, sigmoid=True) + EPS).mean()

    # Do not include self-loops in negative samples
    pos_edge_index, _ = remove_self_loops(pos_edge_index)
    pos_edge_index, _ = add_self_loops(pos_edge_index)

    neg_edge_index = negative_sampling(all_possible_edges, z.size(0))
    neg_loss = -torch.log(1 -
                          model.decoder(z, neg_edge_index, sigmoid=True) +
                          EPS).mean()

    return pos_loss + neg_loss


def train(model,optimizer,x, row_edges,col_edges):
    model.train()
    optimizer.zero_grad()
    z = model.encode(x, row_edges,col_edges)
    r1 = model.recon_loss(z, row_edges)
    r2 = model.recon_loss(z,col_edges)
    
    edges = torch.cat((row_edges_index,col_edges_index),dim=1)
    rl2 =recon_loss2(z,edges,all_possible_edges)
    #row_loss = recon_loss2(model,z,row_edges,all_possible_edges)
    #col_loss = recon_loss2(model,z,col_edges,all_possible_edges)
    #kl = model.kl_loss()
    #cl1 = get_column_loss(z)
    #cl2 = get_column_loss2(z)
    #loss=rl+cl2+kl
    #loss = row_loss+col_loss
    loss = rl2
    
    
    loss.backward()
    optimizer.step()
    #return loss,row_loss,col_loss
    return loss


def test(model,pos_edge_index, neg_edge_index,x, train_pos_edge_index):
    model.eval()
    with torch.no_grad():
        z = model.encode(x, train_pos_edge_index)
    return model.test(z, pos_edge_index, neg_edge_index)

In [30]:
losses=[]
for epoch in range(CONF["epoch_num"]):
    #loss,row_loss,col_loss = train(model,optimizer,x,row_edges_index,col_edges_index)
    loss = train(model,optimizer,x,row_edges_index,col_edges_index)

    losses.append(loss)

    #print('Epoch: {:03d}, Row Loss{:.4f} Col Loss: {:.4f}, Loss: {:.4f}'.format(epoch,row_loss,col_loss,loss))
    print(epoch,loss)

AttributeError: 'GAE' object has no attribute 'recon_loss2'

### 3) Extract the latent cell vectors, generate table vectors:

In [None]:
def get_cell_vectors(model,x,row_edges_index,col_edges_index):
    model.eval()
    with torch.no_grad():
        z = model.encode(x, row_edges_index,col_edges_index)
        cell_vectors = z.numpy()
    return z,cell_vectors


In [None]:
z,cell_vectors = get_cell_vectors(model,x,row_edges_index,col_edges_index)

In [None]:
vec_list=generate_table_vectors(cell_vectors,tokenized_tables,s_vocab=shuffled_vocab)

## 3) Evaluate the model

In [None]:
result_score=evaluate_model(dataset,vec_list,k=5)

In [None]:
result_score