In [14]:
import timeit
import torch
from tqdm import tqdm
import numpy as np
import logging
from hype.sn import Embedding 
from hype import train
from hype.graph import load_edge_list, eval_reconstruction
from hype.rsgd import RiemannianSGD
from hype.Poincare import PoincareManifold
import sys
import json
import torch.multiprocessing as mp
from hype.graph_dataset import BatchedDataset

# All tensors and the model are placed on the CPU in this notebook.
device = torch.device('cpu')
# Use float64 as the default floating-point dtype.
# torch.set_default_tensor_type('torch.DoubleTensor') is deprecated in recent
# PyTorch releases; set_default_dtype is the supported way to request the same
# default precision.
torch.set_default_dtype(torch.float64)
# For single precision instead: torch.set_default_dtype(torch.float32)


In [2]:
## parameters; these are global in the notebook!
# One assignment per line. The original cell assigned opt_com_n twice (2, then 1);
# only the last value (1) ever took effect, so the dead assignment is dropped.

# --- manifold / model ---
opt_manifold = "Poincare"      # only used to build the model name string
opt_dim = 2                    # embedding dimension handed to Embedding
opt_com_n = 1                  # forwarded as com_n to PoincareManifold and Embedding
opt_maxnorm = 500000           # forwarded as max_norm to PoincareManifold
opt_sparse = True              # forwarded as sparse to Embedding

# --- dataset / sampling (forwarded to BatchedDataset) ---
opt_negs = 50                  # presumably negatives per positive edge -- confirm in BatchedDataset
opt_batchsize = 10
opt_ndproc = 4                 # also used as the worker count for eval_reconstruction
opt_dampening = 0.75
opt_neg_multiplier = 1.0       # assigned to data.neg_multiplier before training

# --- optimisation schedule ---
opt_lr = 0.3                   # base learning rate
opt_burnin = 20                # epochs below this index train with the reduced learning rate
opt_burnin_multiplier = 0.01   # learning-rate factor applied during burn-in
opt_eval_each = 20             # NOTE(review): not referenced by any visible cell
opt_epochs = 1000              # number of training epochs

# --- diagnostics ---
opt_debug = False              # selects DEBUG logging and is forwarded to the manifold

### Initializing logging and data loading

In [3]:
# Verbosity follows the notebook-wide opt_debug flag.
if opt_debug:
    log_level = logging.DEBUG
else:
    log_level = logging.INFO

# Named logger used by the training and evaluation cells.
log = logging.getLogger('Poincare')
# Bare messages (no level/timestamp prefix), written to stdout so they show up
# in the cell output.
logging.basicConfig(
    level=log_level,
    format='%(message)s',
    stream=sys.stdout,
)
log.info('Using edge list dataloader')

# Load the edge list that the rest of the notebook trains on and evaluates against.
idx, objects, weights = load_edge_list("wordnet/mammal_closure.csv", False)

Using edge list dataloader


### Initializing model

In [4]:
def init_model(manifold, idx, objects, weights, sparse=True):
    """Build the embedding model and the batched dataset it is trained on.

    Reads the notebook-level ``opt_*`` globals for every hyper-parameter that is
    not passed in explicitly.

    Args:
        manifold: manifold object handed to ``Embedding``.
        idx, objects, weights: edge-list data as returned by ``load_edge_list``.
        sparse: forwarded to ``Embedding`` as ``sparse``.

    Returns:
        Tuple ``(model, data, mname, conf)`` where ``mname`` is a descriptive
        model name and ``conf`` is an (empty) list.
    """
    conf = []
    # The previous format string was '%s_dim%d%com_n': the '%c' in it consumed
    # opt_com_n as a character code, so the name ended in a control character
    # followed by the literal text 'om_n'. Spell the field out explicitly.
    model_name = '%s_dim%d_com_n%d'
    mname = model_name % (opt_manifold, opt_dim, opt_com_n)
    data = BatchedDataset(idx, objects, weights, opt_negs, opt_batchsize,
        opt_ndproc, opt_burnin > 0, opt_dampening)
    # The embedding table is sized from the dataset's own object list ...
    model = Embedding(len(data.objects), opt_dim, manifold, sparse=sparse, com_n=opt_com_n)
    # ... which is then replaced by the caller-supplied object list.
    data.objects = objects
    return model, data, mname, conf

def adj_matrix(data):
    """Collect an adjacency mapping from the batches produced by ``data``.

    Each item of ``data`` is an ``(inputs, targets)`` pair; only ``inputs`` is
    used. For every row of ``inputs`` the first entry is taken as the source
    node and the second as its neighbour; any further entries are ignored.

    Returns:
        dict mapping each source node id to the set of neighbour ids seen for it.
    """
    neighbours = {}
    for batch, _ in data:
        for entry in batch:
            src = entry[0].item()
            dst = entry[1].item()
            # setdefault creates the set the first time a source node is seen
            neighbours.setdefault(src, set()).add(dst)
    return neighbours

### Training

In [21]:
def data_loader_lr(data, epoch, progress=False, base_lr=None,
                   burnin_epochs=None, burnin_multiplier=None):
    """Select the burn-in mode and learning rate for one epoch.

    During the first ``burnin_epochs`` epochs the dataset is switched into
    burn-in mode and the learning rate is scaled by ``burnin_multiplier``;
    afterwards burn-in is switched off and the base learning rate is used.

    Args:
        data: dataset object; its ``burnin`` attribute is overwritten.
        epoch: zero-based epoch index.
        progress: wrap ``data`` in a ``tqdm`` progress bar when true.
        base_lr: base learning rate; defaults to the global ``opt_lr``.
        burnin_epochs: number of burn-in epochs; defaults to ``opt_burnin``.
        burnin_multiplier: learning-rate factor during burn-in; defaults to
            ``opt_burnin_multiplier``.

    Returns:
        Tuple ``(loader_iter, lr)``: the iterable to draw batches from and the
        learning rate to use for this epoch.
    """
    if base_lr is None:
        base_lr = opt_lr
    if burnin_epochs is None:
        burnin_epochs = opt_burnin
    if burnin_multiplier is None:
        burnin_multiplier = opt_burnin_multiplier

    in_burnin = epoch < burnin_epochs
    # Previously data.burnin was set to True unconditionally, so the dataset
    # never left burn-in mode; it must be False once the burn-in epochs are over.
    data.burnin = in_burnin
    lr = base_lr * burnin_multiplier if in_burnin else base_lr

    loader_iter = tqdm(data) if progress else data
    return loader_iter, lr

In [28]:

def train(device, model, data, optimizer, progress=False):
    """Train ``model`` for ``opt_epochs`` epochs and return the loss history.

    Note: this definition rebinds the name ``train`` that the import cell
    brings in via ``from hype import train``.

    Args:
        device: torch device the batches are moved to.
        model: embedding model; must expose ``lt.weight``, ``loss`` and be callable.
        data: iterable of ``(inputs, targets)`` batches supporting ``len()``.
        optimizer: optimizer whose ``step`` accepts an ``lr`` keyword.
        progress: show a per-epoch progress bar (forwarded to ``data_loader_lr``).

    Returns:
        numpy array of length ``opt_epochs`` holding the mean batch loss of
        each epoch. (It used to be computed and then discarded.)
    """
    # One slot per batch; reset at the start of every epoch.
    epoch_loss = torch.zeros(len(data))
    LOSS = np.zeros(opt_epochs)

    for epoch in range(opt_epochs):
        largest_weight_emb = round(torch.abs(model.lt.weight.data).max().item(), 6)
        print(largest_weight_emb, "is the largest absolute weight in the embedding")

        epoch_loss.fill_(0)
        t_start = timeit.default_timer()
        # handling burnin, get loader_iter and learning rate
        loader_iter, lr = data_loader_lr(data, epoch, progress=progress)

        for i_batch, (inputs, targets) in enumerate(loader_iter):
            inputs = inputs.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            preds = model(inputs)
            loss = model.loss(preds, targets, size_average=True)
            loss.backward()
            optimizer.step(lr=lr)
            epoch_loss[i_batch] = loss.item()

        # Measure after the last batch: sampling the timer at the top of each
        # iteration left out the final batch and left `elapsed` undefined when
        # the loader produced no batches at all.
        elapsed = timeit.default_timer() - t_start
        LOSS[epoch] = torch.mean(epoch_loss).item()
        # since only one thread is used:
        # NOTE(review): the trailing ', ' before '}' is not strict JSON; left as-is
        # in case downstream log parsing relies on the current format.
        log.info('json_stats: {' f'"epoch": {epoch}, ' \
        f'"elapsed": {elapsed}, ' f'"loss": {LOSS[epoch]}, ' '}')
    return LOSS

# Training embedding

In [30]:
# --- model -----------------------------------------------------------------
# Manifold and embedding are configured from the notebook-level opt_* globals.
manifold = PoincareManifold(
    debug=opt_debug,
    max_norm=opt_maxnorm,
    com_n=opt_com_n,
)
model, data, model_name, conf = init_model(
    manifold,
    idx,
    objects,
    weights,
    sparse=opt_sparse,
)
data.neg_multiplier = opt_neg_multiplier
model = model.to(device)
print('the total dimension', model.lt.weight.data.size(-1), 'com_n', opt_com_n)

# --- optimizer -------------------------------------------------------------
optimizer = RiemannianSGD(model.optim_params(manifold), lr=opt_lr)

# --- adjacency (consumed later by the evaluation cell) ---------------------
adj = adj_matrix(data)

# --- training, timed end to end --------------------------------------------
start_time = timeit.default_timer()
train(device, model, data, optimizer, progress=False)
total_training_time = timeit.default_timer() - start_time
print("Total training time is:", total_training_time)


>>>>>> The size of embedding: 1180 and 2 ;and sparse: True
the total dimension 2 com_n 1
0.0001 is the largest absolute weight in the embedding
json_stats: {"epoch": 0, "elapsed": 0.8434281080008077, "loss": 3.93100562588228, }
0.004062 is the largest absolute weight in the embedding
json_stats: {"epoch": 1, "elapsed": 0.8734509070018248, "loss": 3.9290048422196553, }
0.007878 is the largest absolute weight in the embedding
json_stats: {"epoch": 2, "elapsed": 0.854821629000071, "loss": 3.9269469856998818, }
0.011601 is the largest absolute weight in the embedding
json_stats: {"epoch": 3, "elapsed": 0.8515651689995138, "loss": 3.9248984785878283, }
0.015237 is the largest absolute weight in the embedding
json_stats: {"epoch": 4, "elapsed": 0.8563593250000849, "loss": 3.9228398701244616, }
0.018724 is the largest absolute weight in the embedding
json_stats: {"epoch": 5, "elapsed": 0.8582052629972168, "loss": 3.920792682067316, }
0.022219 is the largest absolute weight in the embedding
js

KeyboardInterrupt: 

# Evaluate embedding

In [38]:
# Score how well the learned embedding reconstructs the adjacency structure.
# A clone of the weights is passed so the evaluator never touches the live table.
meanrank, maprank = eval_reconstruction(
    adj,
    model.lt.weight.data.clone(),
    manifold.distance,
    workers=opt_ndproc,
)
# Per-embedding norms from the manifold (presumably squared norms, going by the
# log keys below -- confirm against PoincareManifold.pnorm).
sqnorms = manifold.pnorm(model.lt.weight.data.clone())

# Assemble the summary line first, then log it in one call.
final_stats_msg = ''.join([
    'json_stats final test: \n{',
    f'"sqnorm_min": {round(sqnorms.min().item(),6)}, ',
    f'"sqnorm_avg": {round(sqnorms.mean().item(),6)}, ',
    f'"sqnorm_max": {round(sqnorms.max().item(),6)}, \n',
    f'"mean_rank": {round(meanrank,6)}, ',
    f'"map": {round(maprank,6)}, ',
    '}',
])
log.info(final_stats_msg)

json_stats final test: 
{"sqnorm_min": 0.049918, "sqnorm_avg": 0.87429, "sqnorm_max": 0.999998, 
"mean_rank": 434.161774, "map": 0.016397, }
