In [1]:
import os
import shutil
import json
import h5py
import torch
import numpy as np
from pathlib import Path
from torchbiggraph.config import parse_config
from torchbiggraph.converters.importers import TSVEdgelistReader, convert_input_data
from torchbiggraph.train import train
from torchbiggraph.util import SubprocessInitializer, setup_logging

In [2]:
DATA_DIR = 'data/example_2'
GRAPH_PATH = DATA_DIR + '/kb.tsv'
TRAINING_PATH = DATA_DIR + '/training.tsv'
TEST_PATH = DATA_DIR + '/test.tsv'
MODEL_DIR = 'model_2'

config = dict(
    # I/O data
    entity_path=DATA_DIR,
    edge_paths=[
        DATA_DIR + '/edge_path'
    ],
    checkpoint_path=MODEL_DIR,
    # Graph structure
    entities={"all": {"num_partitions": 1}},
    relations=[
       {
            "name": "all_edges",
            "lhs": "all",
            "rhs": "all",
            "operator": "translation",
        }
    ],

    dynamic_relations=True,
    dimension=400,  
    global_emb=False,
    comparator="dot",
    num_epochs=200,
    num_uniform_negs=1000,
    loss_fn="softmax",
    lr=0.1,
    regularization_coef=1e-3,
    eval_fraction=0.2,
)

In [3]:
# =================================================
# 2. TRANSFORM GRAPH TO A BIGGRAPH-FRIENDLY FORMAT
# This step generates the following metadata files:

# data/example_2/entity_count_director_0.txt
# data/example_2/entity_count_director_0.json

# and this file with data:
# data/example_2/edges_partitioned/edges_0_0.h5
# =================================================
setup_logging()
config = parse_config(config)
subprocess_init = SubprocessInitializer()
input_edge_paths = [Path(GRAPH_PATH)]

convert_input_data(
    config.entities,
    config.relations,
    config.entity_path,
    config.edge_paths,
    input_edge_paths,
    TSVEdgelistReader(lhs_col=0, rel_col=1, rhs_col=2),
    dynamic_relations=config.dynamic_relations,
)


Found some files that indicate that the input data has already been preprocessed, not doing it again.
These files are in: data/example_2, data/example_2/edge_path


In [4]:
# ===============================================
# 3. TRAIN THE EMBEDDINGS
# files generated in this step:
#
# checkpoint_version.txt
# config.json
# embeddings_all_0.v7.h5
# model.v7.h5
# training_stats.json
# ===============================================

train(config, subprocess_init=subprocess_init)

2021-04-23 04:14:45,894   [Trainer-0] Loading entity counts...
2021-04-23 04:14:46,162   [Trainer-0] Creating workers...
2021-04-23 04:14:46,276   [Trainer-0] Initializing global model...
2021-04-23 04:14:46,324   [Trainer-0] Starting epoch 58 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 04:14:46,326   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 04:14:46,330   [Trainer-0] still in queue: 0
2021-04-23 04:14:46,341   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 04:14:46,354   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 04:14:48,259   [Trainer-0] ( 0 , 0 ): Stats before training: loss:  12.9771 , pos_rank:  242.612 , mrr:  0.26206 , r1:  0.206249 , r10:  0.357021 , r50:  0.501187 , auc:  0.865537 , count:  26948
2021-04-23 04:15:40,194   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.13256 , reg:  0.0862826 , violators_lhs:  5.07421 , violators_rhs:  0.102363 , count:  107793
2021-04-23 04:15:42,354   [Trainer-0] ( 

2021-04-23 04:17:54,559   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 04:17:54,559   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 04:17:54,639   [Trainer-0] Finished epoch 62 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 04:17:54,640   [Trainer-0] Writing the metadata
2021-04-23 04:17:54,668   [Trainer-0] Writing the training stats
2021-04-23 04:17:54,670   [Trainer-0] Writing the checkpoint
2021-04-23 04:17:54,677   [Trainer-0] Switching to the new checkpoint version
2021-04-23 04:17:54,693   [Trainer-0] Starting epoch 63 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 04:17:54,698   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 04:17:54,698   [Trainer-0] still in queue: 0
2021-04-23 04:17:54,700   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 04:17:54,700   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 04:17:55,933   [Trainer-0] ( 0 , 0 ): Stats before training: loss

2021-04-23 04:20:25,607   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 37.35 s ( 0.0029 M/sec ); Eval 2*26948 edges in 3.44 s ( 0.016 M/sec ); io: 0.13 s for 72,408,184 bytes ( 546.26 MB/sec )
2021-04-23 04:20:25,613   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 04:20:25,613   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 04:20:25,714   [Trainer-0] Finished epoch 67 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 04:20:25,715   [Trainer-0] Writing the metadata
2021-04-23 04:20:25,729   [Trainer-0] Writing the training stats
2021-04-23 04:20:25,734   [Trainer-0] Writing the checkpoint
2021-04-23 04:20:25,735   [Trainer-0] Switching to the new checkpoint version
2021-04-23 04:20:25,737   [Trainer-0] Starting epoch 68 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 04:20:25,739   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 04:20:25,740   [Trainer-0] still in queue: 0
2021-04-23 04:20:25,740   [Trainer-0

2021-04-23 04:24:44,731   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.11674 , reg:  0.0840364 , violators_lhs:  5.05563 , violators_rhs:  0.103569 , count:  107793
2021-04-23 04:24:46,991   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.9198 , pos_rank:  239.469 , mrr:  0.26507 , r1:  0.208494 , r10:  0.363793 , r50:  0.506086 , auc:  0.864832 , count:  26948
2021-04-23 04:24:46,994   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 29.08 s ( 0.0037 M/sec ); Eval 2*26948 edges in 3.49 s ( 0.015 M/sec ); io: 0.36 s for 72,408,184 bytes ( 201.12 MB/sec )
2021-04-23 04:24:46,997   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 04:24:46,998   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 04:24:47,165   [Trainer-0] Finished epoch 72 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 04:24:47,166   [Trainer-0] Writing the metadata
2021-04-23 04:24:47,189   [Trainer-0] Writing the training stats
2021-04-23 04:24:47,194   [T

2021-04-23 04:56:10,777   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 04:56:10,777   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 04:56:12,597   [Trainer-0] ( 0 , 0 ): Stats before training: loss:  12.9051 , pos_rank:  239.227 , mrr:  0.26566 , r1:  0.209106 , r10:  0.363385 , r50:  0.506234 , auc:  0.8655 , count:  26948
2021-04-23 04:56:39,650   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.11572 , reg:  0.0833403 , violators_lhs:  5.0496 , violators_rhs:  0.10355 , count:  107793
2021-04-23 04:56:41,772   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.9088 , pos_rank:  238.91 , mrr:  0.265378 , r1:  0.208457 , r10:  0.364109 , r50:  0.507663 , auc:  0.865073 , count:  26948
2021-04-23 04:56:41,773   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 27.06 s ( 0.004 M/sec ); Eval 2*26948 edges in 3.54 s ( 0.015 M/sec ); io: 0.40 s for 72,408,184 bytes ( 180.55 MB/sec )
2021-04-23 04:56:41,774   [Trainer-0] Swappi

2021-04-23 08:04:57,869   [Trainer-0] Writing the checkpoint
2021-04-23 08:04:57,870   [Trainer-0] Switching to the new checkpoint version
2021-04-23 08:04:57,880   [Trainer-0] Starting epoch 82 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 08:04:57,891   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 08:04:57,900   [Trainer-0] still in queue: 0
2021-04-23 08:04:57,901   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 08:04:57,902   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 08:04:59,938   [Trainer-0] ( 0 , 0 ): Stats before training: loss:  12.9067 , pos_rank:  238.843 , mrr:  0.265547 , r1:  0.207938 , r10:  0.36537 , r50:  0.507422 , auc:  0.863645 , count:  26948
2021-04-23 08:05:21,565   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.10881 , reg:  0.0826812 , violators_lhs:  5.04785 , violators_rhs:  0.102623 , count:  107793
2021-04-23 08:05:22,846   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.8983 , p

2021-04-23 09:29:21,147   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 09:29:21,314   [Trainer-0] Finished epoch 86 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 09:29:21,315   [Trainer-0] Writing the metadata
2021-04-23 09:29:21,346   [Trainer-0] Writing the training stats
2021-04-23 09:29:21,354   [Trainer-0] Writing the checkpoint
2021-04-23 09:29:21,355   [Trainer-0] Switching to the new checkpoint version
2021-04-23 09:29:21,368   [Trainer-0] Starting epoch 87 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 09:29:21,370   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 09:29:21,371   [Trainer-0] still in queue: 0
2021-04-23 09:29:21,371   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 09:29:21,372   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 09:29:23,243   [Trainer-0] ( 0 , 0 ): Stats before training: loss:  12.8929 , pos_rank:  238.8 , mrr:  0.26736 , r1:  0.210461 , r10:  0.365574 , r50:

2021-04-23 13:21:05,128   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 29.75 s ( 0.0036 M/sec ); Eval 2*26948 edges in 3.22 s ( 0.017 M/sec ); io: 0.54 s for 72,408,184 bytes ( 134.63 MB/sec )
2021-04-23 13:21:05,129   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 13:21:05,129   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 13:21:05,175   [Trainer-0] Finished epoch 91 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 13:21:05,176   [Trainer-0] Writing the metadata
2021-04-23 13:21:05,182   [Trainer-0] Writing the training stats
2021-04-23 13:21:05,185   [Trainer-0] Writing the checkpoint
2021-04-23 13:21:05,186   [Trainer-0] Switching to the new checkpoint version
2021-04-23 13:21:05,211   [Trainer-0] Starting epoch 92 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 13:21:05,213   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 13:21:05,214   [Trainer-0] still in queue: 0
2021-04-23 13:21:05,215   [Trainer-0

2021-04-23 13:56:22,654   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.09941 , reg:  0.0812029 , violators_lhs:  5.01036 , violators_rhs:  0.101611 , count:  107793
2021-04-23 13:56:25,735   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.8583 , pos_rank:  236.678 , mrr:  0.268851 , r1:  0.210888 , r10:  0.369211 , r50:  0.511188 , auc:  0.864424 , count:  26948
2021-04-23 13:56:25,737   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 62.06 s ( 0.0017 M/sec ); Eval 2*26948 edges in 5.04 s ( 0.011 M/sec ); io: 0.05 s for 72,408,184 bytes ( 1549.06 MB/sec )
2021-04-23 13:56:25,744   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 13:56:25,744   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 13:56:25,923   [Trainer-0] Finished epoch 96 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 13:56:25,924   [Trainer-0] Writing the metadata
2021-04-23 13:56:25,962   [Trainer-0] Writing the training stats
2021-04-23 13:56:25,969   

2021-04-23 13:58:23,196   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 13:58:23,197   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 13:58:24,562   [Trainer-0] ( 0 , 0 ): Stats before training: loss:  12.8495 , pos_rank:  237.12 , mrr:  0.269173 , r1:  0.211222 , r10:  0.369916 , r50:  0.511689 , auc:  0.865315 , count:  26948
2021-04-23 13:58:41,796   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.09781 , reg:  0.0806629 , violators_lhs:  4.98551 , violators_rhs:  0.101723 , count:  107793
2021-04-23 13:58:42,975   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.8573 , pos_rank:  236.852 , mrr:  0.269338 , r1:  0.211797 , r10:  0.36897 , r50:  0.511782 , auc:  0.866094 , count:  26948
2021-04-23 13:58:42,976   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 17.26 s ( 0.0062 M/sec ); Eval 2*26948 edges in 2.39 s ( 0.023 M/sec ); io: 0.14 s for 72,408,184 bytes ( 527.35 MB/sec )
2021-04-23 13:58:42,976   [Trainer-0] S

2021-04-23 13:59:58,918   [Trainer-0] Writing the checkpoint
2021-04-23 13:59:58,919   [Trainer-0] Switching to the new checkpoint version
2021-04-23 13:59:58,922   [Trainer-0] Starting epoch 106 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 13:59:58,923   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 13:59:58,923   [Trainer-0] still in queue: 0
2021-04-23 13:59:58,924   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 13:59:58,924   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 14:00:00,168   [Trainer-0] ( 0 , 0 ): Stats before training: loss:  12.8537 , pos_rank:  236.507 , mrr:  0.268492 , r1:  0.209811 , r10:  0.370974 , r50:  0.51399 , auc:  0.866446 , count:  26948
2021-04-23 14:00:15,877   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.09737 , reg:  0.0802323 , violators_lhs:  4.99801 , violators_rhs:  0.102131 , count:  107793
2021-04-23 14:00:16,737   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.8411 , 

2021-04-23 14:01:53,051   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 14:01:53,051   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 14:01:53,123   [Trainer-0] Finished epoch 110 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:01:53,124   [Trainer-0] Writing the metadata
2021-04-23 14:01:53,143   [Trainer-0] Writing the training stats
2021-04-23 14:01:53,145   [Trainer-0] Writing the checkpoint
2021-04-23 14:01:53,145   [Trainer-0] Switching to the new checkpoint version
2021-04-23 14:01:53,148   [Trainer-0] Starting epoch 111 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:01:53,149   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 14:01:53,150   [Trainer-0] still in queue: 0
2021-04-23 14:01:53,151   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 14:01:53,151   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 14:01:54,179   [Trainer-0] ( 0 , 0 ): Stats before training: lo

2021-04-23 14:03:28,870   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 14.65 s ( 0.0074 M/sec ); Eval 2*26948 edges in 2.02 s ( 0.027 M/sec ); io: 0.15 s for 72,408,184 bytes ( 480.38 MB/sec )
2021-04-23 14:03:28,871   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 14:03:28,871   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 14:03:28,900   [Trainer-0] Finished epoch 115 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:03:28,901   [Trainer-0] Writing the metadata
2021-04-23 14:03:28,916   [Trainer-0] Writing the training stats
2021-04-23 14:03:28,917   [Trainer-0] Writing the checkpoint
2021-04-23 14:03:28,918   [Trainer-0] Switching to the new checkpoint version
2021-04-23 14:03:28,921   [Trainer-0] Starting epoch 116 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:03:28,922   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 14:03:28,922   [Trainer-0] still in queue: 0
2021-04-23 14:03:28,923   [Trainer

2021-04-23 14:05:09,709   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.09002 , reg:  0.0789762 , violators_lhs:  4.98843 , violators_rhs:  0.103105 , count:  107793
2021-04-23 14:05:10,925   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.8169 , pos_rank:  234.866 , mrr:  0.271225 , r1:  0.212372 , r10:  0.373813 , r50:  0.516754 , auc:  0.868135 , count:  26948
2021-04-23 14:05:10,926   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 15.62 s ( 0.0069 M/sec ); Eval 2*26948 edges in 2.05 s ( 0.026 M/sec ); io: 0.16 s for 72,408,184 bytes ( 442.59 MB/sec )
2021-04-23 14:05:10,927   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 14:05:10,927   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 14:05:10,991   [Trainer-0] Finished epoch 120 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:05:10,992   [Trainer-0] Writing the metadata
2021-04-23 14:05:11,002   [Trainer-0] Writing the training stats
2021-04-23 14:05:11,005   

2021-04-23 14:06:26,030   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 14:06:26,030   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 14:06:27,067   [Trainer-0] ( 0 , 0 ): Stats before training: loss:  12.8057 , pos_rank:  235.036 , mrr:  0.270953 , r1:  0.212057 , r10:  0.373868 , r50:  0.516495 , auc:  0.867708 , count:  26948
2021-04-23 14:06:45,595   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.08858 , reg:  0.0786531 , violators_lhs:  4.98669 , violators_rhs:  0.100981 , count:  107793
2021-04-23 14:06:46,603   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.7966 , pos_rank:  235.116 , mrr:  0.270684 , r1:  0.211685 , r10:  0.373497 , r50:  0.516736 , auc:  0.866465 , count:  26948
2021-04-23 14:06:46,604   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 18.53 s ( 0.0058 M/sec ); Eval 2*26948 edges in 1.85 s ( 0.029 M/sec ); io: 0.19 s for 72,408,184 bytes ( 379.26 MB/sec )
2021-04-23 14:06:46,604   [Trainer-0]

2021-04-23 14:07:55,394   [Trainer-0] Writing the checkpoint
2021-04-23 14:07:55,395   [Trainer-0] Switching to the new checkpoint version
2021-04-23 14:07:55,397   [Trainer-0] Starting epoch 130 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:07:55,398   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 14:07:55,400   [Trainer-0] still in queue: 0
2021-04-23 14:07:55,401   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 14:07:55,401   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 14:07:56,447   [Trainer-0] ( 0 , 0 ): Stats before training: loss:  12.8049 , pos_rank:  234.518 , mrr:  0.270542 , r1:  0.211481 , r10:  0.374703 , r50:  0.517701 , auc:  0.868821 , count:  26948
2021-04-23 14:08:17,629   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.08503 , reg:  0.0783448 , violators_lhs:  4.98615 , violators_rhs:  0.101945 , count:  107793
2021-04-23 14:08:18,760   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.795 , 

2021-04-23 14:09:37,661   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 14:09:37,663   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 14:09:37,897   [Trainer-0] Finished epoch 134 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:09:37,898   [Trainer-0] Writing the metadata
2021-04-23 14:09:37,914   [Trainer-0] Writing the training stats
2021-04-23 14:09:37,916   [Trainer-0] Writing the checkpoint
2021-04-23 14:09:37,916   [Trainer-0] Switching to the new checkpoint version
2021-04-23 14:09:37,923   [Trainer-0] Starting epoch 135 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:09:37,924   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 14:09:37,924   [Trainer-0] still in queue: 0
2021-04-23 14:09:37,925   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 14:09:37,925   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 14:09:39,599   [Trainer-0] ( 0 , 0 ): Stats before training: lo

2021-04-23 14:11:27,520   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 17.33 s ( 0.0062 M/sec ); Eval 2*26948 edges in 2.15 s ( 0.025 M/sec ); io: 0.23 s for 72,408,184 bytes ( 316.53 MB/sec )
2021-04-23 14:11:27,521   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 14:11:27,522   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 14:11:27,600   [Trainer-0] Finished epoch 139 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:11:27,603   [Trainer-0] Writing the metadata
2021-04-23 14:11:27,612   [Trainer-0] Writing the training stats
2021-04-23 14:11:27,615   [Trainer-0] Writing the checkpoint
2021-04-23 14:11:27,615   [Trainer-0] Switching to the new checkpoint version
2021-04-23 14:11:27,624   [Trainer-0] Starting epoch 140 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:11:27,625   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 14:11:27,625   [Trainer-0] still in queue: 0
2021-04-23 14:11:27,625   [Trainer

2021-04-23 14:13:23,358   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.08206 , reg:  0.0773248 , violators_lhs:  4.96354 , violators_rhs:  0.100776 , count:  107793
2021-04-23 14:13:24,744   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.7921 , pos_rank:  234.247 , mrr:  0.272255 , r1:  0.212409 , r10:  0.37706 , r50:  0.518499 , auc:  0.867949 , count:  26948
2021-04-23 14:13:24,745   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 24.43 s ( 0.0044 M/sec ); Eval 2*26948 edges in 2.64 s ( 0.02 M/sec ); io: 0.63 s for 72,408,184 bytes ( 115.53 MB/sec )
2021-04-23 14:13:24,746   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 14:13:24,746   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 14:13:24,815   [Trainer-0] Finished epoch 144 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:13:24,815   [Trainer-0] Writing the metadata
2021-04-23 14:13:24,830   [Trainer-0] Writing the training stats
2021-04-23 14:13:24,832   [T

2021-04-23 14:14:40,324   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 14:14:40,324   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 14:14:41,317   [Trainer-0] ( 0 , 0 ): Stats before training: loss:  12.7601 , pos_rank:  232.946 , mrr:  0.272398 , r1:  0.212446 , r10:  0.377022 , r50:  0.520781 , auc:  0.865779 , count:  26948
2021-04-23 14:14:55,825   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.08133 , reg:  0.0770664 , violators_lhs:  4.92531 , violators_rhs:  0.102103 , count:  107793
2021-04-23 14:14:56,825   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.7607 , pos_rank:  233.776 , mrr:  0.272743 , r1:  0.212168 , r10:  0.378692 , r50:  0.520131 , auc:  0.866075 , count:  26948
2021-04-23 14:14:56,826   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 14.51 s ( 0.0074 M/sec ); Eval 2*26948 edges in 1.83 s ( 0.03 M/sec ); io: 0.16 s for 72,408,184 bytes ( 443.60 MB/sec )
2021-04-23 14:14:56,827   [Trainer-0] 

2021-04-23 14:16:14,711   [Trainer-0] Writing the checkpoint
2021-04-23 14:16:14,711   [Trainer-0] Switching to the new checkpoint version
2021-04-23 14:16:14,713   [Trainer-0] Starting epoch 154 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:16:14,713   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 14:16:14,714   [Trainer-0] still in queue: 0
2021-04-23 14:16:14,714   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 14:16:14,714   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 14:16:15,730   [Trainer-0] ( 0 , 0 ): Stats before training: loss:  12.7567 , pos_rank:  233.47 , mrr:  0.274567 , r1:  0.214858 , r10:  0.379453 , r50:  0.520688 , auc:  0.863533 , count:  26948
2021-04-23 14:16:29,465   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.07827 , reg:  0.0766426 , violators_lhs:  4.90467 , violators_rhs:  0.0992922 , count:  107793
2021-04-23 14:16:30,358   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.7771 ,

2021-04-23 14:17:31,837   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 14:17:31,837   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 14:17:31,914   [Trainer-0] Finished epoch 158 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:17:31,915   [Trainer-0] Writing the metadata
2021-04-23 14:17:31,926   [Trainer-0] Writing the training stats
2021-04-23 14:17:31,928   [Trainer-0] Writing the checkpoint
2021-04-23 14:17:31,931   [Trainer-0] Switching to the new checkpoint version
2021-04-23 14:17:31,933   [Trainer-0] Starting epoch 159 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:17:31,935   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 14:17:31,935   [Trainer-0] still in queue: 0
2021-04-23 14:17:31,936   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 14:17:31,936   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 14:17:32,935   [Trainer-0] ( 0 , 0 ): Stats before training: lo

2021-04-23 14:18:48,166   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 12.67 s ( 0.0085 M/sec ); Eval 2*26948 edges in 1.75 s ( 0.031 M/sec ); io: 0.30 s for 72,408,184 bytes ( 245.24 MB/sec )
2021-04-23 14:18:48,167   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 14:18:48,167   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 14:18:48,215   [Trainer-0] Finished epoch 163 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:18:48,216   [Trainer-0] Writing the metadata
2021-04-23 14:18:48,230   [Trainer-0] Writing the training stats
2021-04-23 14:18:48,232   [Trainer-0] Writing the checkpoint
2021-04-23 14:18:48,232   [Trainer-0] Switching to the new checkpoint version
2021-04-23 14:18:48,235   [Trainer-0] Starting epoch 164 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:18:48,235   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 14:18:48,236   [Trainer-0] still in queue: 0
2021-04-23 14:18:48,236   [Trainer

2021-04-23 14:19:58,946   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.07659 , reg:  0.0759723 , violators_lhs:  4.89071 , violators_rhs:  0.10048 , count:  107793
2021-04-23 14:19:59,845   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.7572 , pos_rank:  232.824 , mrr:  0.273673 , r1:  0.213448 , r10:  0.379342 , r50:  0.522302 , auc:  0.866409 , count:  26948
2021-04-23 14:19:59,845   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 12.10 s ( 0.0089 M/sec ); Eval 2*26948 edges in 1.67 s ( 0.032 M/sec ); io: 0.17 s for 72,408,184 bytes ( 431.60 MB/sec )
2021-04-23 14:19:59,846   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 14:19:59,847   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 14:19:59,911   [Trainer-0] Finished epoch 168 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:19:59,912   [Trainer-0] Writing the metadata
2021-04-23 14:19:59,920   [Trainer-0] Writing the training stats
2021-04-23 14:19:59,922   [

2021-04-23 14:21:03,517   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 14:21:03,517   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 14:21:04,608   [Trainer-0] ( 0 , 0 ): Stats before training: loss:  12.758 , pos_rank:  233.589 , mrr:  0.274784 , r1:  0.214951 , r10:  0.378729 , r50:  0.522636 , auc:  0.867857 , count:  26948
2021-04-23 14:21:18,930   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.07765 , reg:  0.0757126 , violators_lhs:  4.90728 , violators_rhs:  0.102298 , count:  107793
2021-04-23 14:21:19,820   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.7448 , pos_rank:  233.234 , mrr:  0.274303 , r1:  0.214506 , r10:  0.378859 , r50:  0.522877 , auc:  0.867931 , count:  26948
2021-04-23 14:21:19,821   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 14.34 s ( 0.0075 M/sec ); Eval 2*26948 edges in 1.69 s ( 0.032 M/sec ); io: 0.28 s for 72,408,184 bytes ( 260.67 MB/sec )
2021-04-23 14:21:19,822   [Trainer-0] 

2021-04-23 14:22:18,903   [Trainer-0] Writing the checkpoint
2021-04-23 14:22:18,903   [Trainer-0] Switching to the new checkpoint version
2021-04-23 14:22:18,905   [Trainer-0] Starting epoch 178 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:22:18,906   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 14:22:18,907   [Trainer-0] still in queue: 0
2021-04-23 14:22:18,907   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 14:22:18,907   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 14:22:20,438   [Trainer-0] ( 0 , 0 ): Stats before training: loss:  12.729 , pos_rank:  232.007 , mrr:  0.273636 , r1:  0.212706 , r10:  0.380344 , r50:  0.523156 , auc:  0.86858 , count:  26948
2021-04-23 14:22:35,217   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.07114 , reg:  0.0754228 , violators_lhs:  4.92115 , violators_rhs:  0.101992 , count:  107793
2021-04-23 14:22:36,125   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.7316 , p

2021-04-23 14:23:40,152   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 14:23:40,152   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 14:23:40,213   [Trainer-0] Finished epoch 182 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:23:40,213   [Trainer-0] Writing the metadata
2021-04-23 14:23:40,217   [Trainer-0] Writing the training stats
2021-04-23 14:23:40,219   [Trainer-0] Writing the checkpoint
2021-04-23 14:23:40,219   [Trainer-0] Switching to the new checkpoint version
2021-04-23 14:23:40,220   [Trainer-0] Starting epoch 183 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:23:40,223   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 14:23:40,223   [Trainer-0] still in queue: 0
2021-04-23 14:23:40,223   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 14:23:40,224   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 14:23:41,236   [Trainer-0] ( 0 , 0 ): Stats before training: lo

2021-04-23 14:25:07,377   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 18.41 s ( 0.0059 M/sec ); Eval 2*26948 edges in 1.87 s ( 0.029 M/sec ); io: 0.16 s for 72,408,184 bytes ( 439.43 MB/sec )
2021-04-23 14:25:07,379   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 14:25:07,380   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 14:25:07,439   [Trainer-0] Finished epoch 187 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:25:07,441   [Trainer-0] Writing the metadata
2021-04-23 14:25:07,452   [Trainer-0] Writing the training stats
2021-04-23 14:25:07,454   [Trainer-0] Writing the checkpoint
2021-04-23 14:25:07,454   [Trainer-0] Switching to the new checkpoint version
2021-04-23 14:25:07,456   [Trainer-0] Starting epoch 188 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:25:07,463   [Trainer-0] Edge path: data/example_2/edge_path
2021-04-23 14:25:07,464   [Trainer-0] still in queue: 0
2021-04-23 14:25:07,464   [Trainer

2021-04-23 14:26:24,012   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.07019 , reg:  0.0746753 , violators_lhs:  4.89447 , violators_rhs:  0.0999508 , count:  107793
2021-04-23 14:26:24,908   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.7175 , pos_rank:  231.868 , mrr:  0.275076 , r1:  0.214116 , r10:  0.382904 , r50:  0.52477 , auc:  0.868172 , count:  26948
2021-04-23 14:26:24,909   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 13.61 s ( 0.0079 M/sec ); Eval 2*26948 edges in 1.73 s ( 0.031 M/sec ); io: 0.03 s for 72,408,184 bytes ( 2592.41 MB/sec )
2021-04-23 14:26:24,909   [Trainer-0] Swapping partitioned embeddings ( 0 , 0 ) None
2021-04-23 14:26:24,910   [Trainer-0] Saving partitioned embeddings to checkpoint
2021-04-23 14:26:24,990   [Trainer-0] Finished epoch 192 / 200, edge path 1 / 1, edge chunk 1 / 1
2021-04-23 14:26:24,990   [Trainer-0] Writing the metadata
2021-04-23 14:26:24,998   [Trainer-0] Writing the training stats
2021-04-23 14:26:25,000  

2021-04-23 14:27:17,739   [Trainer-0] Swapping partitioned embeddings None ( 0 , 0 )
2021-04-23 14:27:17,739   [Trainer-0] Loading partitioned embeddings from checkpoint
2021-04-23 14:27:19,010   [Trainer-0] ( 0 , 0 ): Stats before training: loss:  12.6993 , pos_rank:  231.493 , mrr:  0.276328 , r1:  0.215229 , r10:  0.383535 , r50:  0.5249 , auc:  0.869155 , count:  26948
2021-04-23 14:27:29,922   [Trainer-0] ( 0 , 0 ): Training stats: loss:  1.06781 , reg:  0.0744099 , violators_lhs:  4.87774 , violators_rhs:  0.10112 , count:  107793
2021-04-23 14:27:30,807   [Trainer-0] ( 0 , 0 ): Stats after training: loss:  12.7079 , pos_rank:  231.631 , mrr:  0.276135 , r1:  0.214654 , r10:  0.383479 , r50:  0.524974 , auc:  0.867671 , count:  26948
2021-04-23 14:27:30,807   [Trainer-0] ( 0 , 0 ): bucket 1 / 1 : Trained 107793 edges in 10.96 s ( 0.0098 M/sec ); Eval 2*26948 edges in 1.80 s ( 0.03 M/sec ); io: 0.30 s for 72,408,184 bytes ( 239.02 MB/sec )
2021-04-23 14:27:30,808   [Trainer-0] Swa

In [34]:
with open('/Users/BrandenKang/Documents/GitHub/MetaQA_PBG/data/example_2/entity_names_all_0.json', 'r') as f:
    embeddings = json.load(f)

with h5py.File('/Users/BrandenKang/Documents/GitHub/MetaQA_PBG/model_2/embeddings_all_0.v200.h5', 'r') as g:
    embeddings_all = g['embeddings'][:]


embedding_final = dict(zip(embeddings, embeddings_all))

In [36]:
g = h5py.File('/Users/BrandenKang/Documents/GitHub/MetaQA_PBG/model_2/model.v200.h5', 'r')

In [37]:
relations_right = g['model/relations/0/operator/rhs/translations']
relations_left = g['model/relations/0/operator/lhs/translations']

In [38]:
relations_left[:]

array([[ 0.21865478,  0.2853011 ,  0.40159422, ..., -0.18174824,
         0.32732677, -0.48502034],
       [ 0.20991768,  0.14671926, -0.11882235, ...,  0.01309311,
        -0.25235838, -0.37653735],
       [-0.22295736, -0.30349153,  0.23949671, ..., -0.01469261,
         0.06565962, -0.40226698],
       ...,
       [-0.22331543,  0.05603404,  0.23919353, ...,  0.07602038,
         0.2768275 ,  0.01640979],
       [ 0.15918684,  0.13382927,  0.49134213, ..., -0.19542223,
         0.37968215, -0.381331  ],
       [ 0.20051032, -0.20698321, -0.24790175, ..., -0.26041842,
         0.3686345 , -0.3074067 ]], dtype=float32)

In [39]:
relations_right[:]

array([[-0.36259192, -0.40803045, -0.37224847, ...,  0.35986212,
        -0.44627145,  0.25324315],
       [-0.16606267, -0.21822801,  0.21862061, ...,  0.2941766 ,
         0.15934151,  0.21034053],
       [ 0.21650203,  0.2897291 , -0.01735711, ...,  0.2929325 ,
        -0.32193038,  0.39104363],
       ...,
       [ 0.24814905, -0.07195609, -0.12324741, ...,  0.26344427,
        -0.44739208, -0.21042469],
       [-0.12393223, -0.15010113, -0.57044804, ...,  0.5273425 ,
        -0.649543  ,  0.20126788],
       [-0.18542968,  0.20435613,  0.407901  , ...,  0.5385136 ,
        -0.5753108 ,  0.16237676]], dtype=float32)

In [40]:
print(embedding_final['Pal_Joey'])

list_1 = list(embedding_final.keys())
list_2 = list(embedding_final.values())

[ 0.51033103  0.03036635  0.06283304  0.35039482  0.14102378 -0.15584126
 -0.06909858  0.10056009 -0.12430669  0.57119495  0.24161927  0.08698756
 -0.19983537 -0.03320712 -0.25272903 -0.06555367 -0.27897155 -0.16131715
  0.1018642   0.09884441  0.07321175 -0.3045629   0.74419653 -0.36212048
  0.09532339 -0.3320424  -0.02525772  0.6417993  -0.22214678 -0.0059488
  0.27548715  0.23557724 -0.3091223  -0.12691845  0.2147052  -0.53641385
 -0.46322057  0.253377   -0.4845521  -0.41634384 -0.20695981  0.04110157
  0.01109679  0.5099484  -0.08906671  0.5244997   0.39698082 -0.02457639
 -0.22796789 -0.11731027 -0.05871784 -0.11460497  0.01404705 -0.1718606
 -0.01661239 -0.17754222 -0.4564989   0.08827218  0.10100427 -0.15504368
  0.5811918  -0.5132884  -0.24936116 -0.2897802  -0.02560045  0.0054861
 -0.2521472  -0.11304791  0.15137357 -0.09851966 -0.01950661  0.25187594
  0.3523791  -0.68075633 -0.12788388  0.10080092 -0.07855184  0.06619365
 -0.3103227   0.16740805  0.3893558   0.39888203  0.26

In [None]:
# !pip freeze
# !pip install gensim==4.0.0b0
# !pip install --upgrade gensim
# gensim.__version__

In [123]:
# from json import JSONEncoder
# class NumpyArrayEncoder(JSONEncoder):
#     def default(self, obj):
#         if isinstance(obj, numpy.ndarray):
#             return obj.tolist()
#         return JSONEncoder.default(self, obj)

In [11]:
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec, KeyedVectors   

In [41]:
list_1 = list(list_1)
list_2 = np.stack(list_2, axis=0)

In [42]:
gensim_models = KeyedVectors(400,count=len(list_1))
gensim_models.add_vectors(list_1,list_2)

In [43]:
gensim_models.save('gensim_model.model') #bin #kv

In [44]:
relations_left[0]

array([ 0.21865478,  0.2853011 ,  0.40159422,  0.3809845 ,  0.31712306,
        0.4907502 ,  0.2699243 ,  0.26271448, -0.46574578, -0.06488804,
        0.4244827 ,  0.3150751 ,  0.40627244,  0.4346973 ,  0.11768712,
       -0.29141554, -0.25403723, -0.26504874, -0.47186193,  0.3830049 ,
        0.1044362 , -0.24359064, -0.32620627,  0.50930685, -0.42501178,
        0.49163005,  0.2547748 , -0.615411  , -0.33978832, -0.10845868,
        0.35847124,  0.16136882,  0.41785538,  0.26463374, -0.292871  ,
       -0.0293926 ,  0.4670544 , -0.2379936 , -0.46268576,  0.27355805,
       -0.30974528, -0.14513108,  0.20539391, -0.3360885 , -0.29952657,
       -0.2578046 , -0.43837047, -0.01432132,  0.44997308, -0.3131817 ,
       -0.2063915 ,  0.26613328, -0.1419458 , -0.28814504,  0.07143813,
        0.11462472,  0.42526415,  0.37168562, -0.01822625,  0.48426524,
       -0.24448316,  0.31116578,  0.23402809, -0.11710633,  0.15277512,
        0.0845698 ,  0.10585778, -0.25453472,  0.33475417, -0.36

In [None]:
#     "written_by",
#     "has_imdb_votes",
#     "directed_by",
#     "starred_actors",
#     "has_imdb_rating",
#     "release_year",
#     "in_language",
#     "has_tags",
#     "has_genre"

In [217]:
# rel_types = ['written_by', 'has_imdb_votes','directed_by','starred_actors','has_imdb_rating','release_year','in_language','has_tags','has_genre']

In [229]:
rel_dict = {
    'written_by': relations_left[],
    'has_imdb_votes': relations_left[],
    'directed_by': relations_left[],
    'starred_actors': relations_left[],
    'has_imdb_rating': relations_left[],
    'release_year': relations_left[],
    'in_language': relations_left[],
    'has_tags': relations_left[],
    'has_genre': relations_left[]
}

In [59]:
## add relationships
result = gensim_models.most_similar(positive=['Kismet', relations_left[8]],topn=10)[:][:]
result

[('Helen_Mirren', 0.5621678233146667),
 ('John_Lithgow', 0.5546588897705078),
 ('Burt_Lancaster', 0.5494174361228943),
 ('Marlene_Dietrich', 0.5394538044929504),
 ('Ethan_Hawke', 0.5377938747406006),
 ('Jeanne_Moreau', 0.534210741519928),
 ('Danny_Glover', 0.5336334109306335),
 ('Michael_Redgrave', 0.5334655046463013),
 ('Kim_Novak', 0.5271657705307007),
 ('Julie_Delpy', 0.5253924131393433)]

In [32]:
relations[0]

NameError: name 'relations' is not defined

# Chatbot

In [2]:
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec, KeyedVectors   

In [68]:
re_gensim = KeyedVectors.load('gensim_model.model')
test = re_gensim.most_similar(positive=['Cobra_Woman', 'Sleep_with_Me'],topn=1)[0][0]
test

'Free_Enterprise'

In [None]:
## first need to have the relationship in the form of embeddings 
## King + Spouse = most related entity (how the translation works)
## how translation works: (take a vector and another vector, and take the sum) 
## translation model is a model where you take left hand side and shift it by the relationship and you hope you get close to right hand side
## as opposed to rotation or another kind of vector manipulation 

## can use gensim by saying this is the lhs, this is the relationship, take them as features using positive keyword
## hoping we can play with that (positive, negative)

## if we do not have information about relationship we cannot do translation 
## it should train entities of embeddings and also train relationship and provide them 

## how big is the model 

## goal is to experiment and see where relationship is (make repository with 4 dimensions and put to github)
## configuration file, output files and have a look 
## left hand side, right hand side, try to position vectors so that when you add up lhs and relationship you get rhs
## in process you teak embedding of LHS and relationship, such that you get close to RHS 

## in the meantime, try to integrate what you have 
## you have bot interface, you have very simple prediction model of similarity
## try to integrate them — spielberg – output is most similar stuff 
## and then leveredge positive keyword of most similar method — in case query is multiple entities 
## i.e. spielberg, and jurassic park — use those as positive and get most similar 
## just to create pipeline — and then plug in relationship part once it's figured out 

## next time ideally we'll create an instance on AWS or Heroku and we can put application up there

## deadlines: 
## Beginning of April, Sunday — finish 95% of development
## I have April to do fine tuning, and have time to prepare for report and presentation 
## first of all ahve to think about what i'm going to put inside 
## I will have to have significant part talking about theoretical things I have learned 

In [None]:
#model.wv.save_word2vec_format(entity2embedding)

## Back up

In [None]:
# DATA_DIR = 'data/example_2'
# GRAPH_PATH = DATA_DIR + '/edges.tsv'
# MODEL_DIR = 'model_2'

#     # ==================================================================
#     # 0. PREPARE THE GRAPH
#     # the result of this step is a single file 'data/example_2/graph.tsv'
#     # ==================================================================
#     # This the graph we will be embedding.
#     # It has 10 types of nodes, or entities, and 9 types of edges, or relationships. 
#     test_edges = []
#     count=0
#     with open('kb.txt', 'r') as f: 
#         for line in f: 
#            line=line.rstrip().split("|")
#            line[0] = line[0].split(" ")
#            line[0] = "_".join(line[0])
#         #    line[2] = line[2].split(" ")
#         #    line[2] = "_".join(line[2])
#            test_edges.append(line)
#            count+=1
#            if count == 134741:
#                break
           
#     os.makedirs(DATA_DIR, exist_ok=True)
#     with open(GRAPH_PATH, 'w') as f:
#         for edge in test_edges:
#             f.write('\t'.join(edge) + '\n')
# # # # ==================================================
# # # # 1. DEFINE CONFIG
# # # # this dictionary will be used in steps 2. and 3.
# # # # ==================================================

# raw_config = dict(
#     # I/O data
#     entity_path=DATA_DIR,
#     edge_paths=[
#         DATA_DIR + '/edges_partitioned',
#     ],
#     checkpoint_path=MODEL_DIR,
#     # Graph structure
#     entities={
#         "all": {"num_partitions": 1}
#     },
#     relations=[
#         {
#             "name": "directed_by",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         },
#         {
#             "name": "written_by",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         },
#         {
#             "name": "starred_actors",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         },
        
#         {
#             "name": "release_year",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         },
#         {
#             "name": "in_language",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         },
#         {
#             "name": "has_tags",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         },
#         {
#             "name": "has_genre",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         },
        
#         {
#             "name": "has_imdb_votes",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         },
        
#         {
#             "name": "all_edges",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         }
#     ],

#     dynamic_relations=False,
#     dimension=200,  
#     global_emb=False,
#     comparator="dot",
#     num_epochs=7,
#     num_uniform_negs=1000,
#     loss_fn="softmax",
#     lr=0.1,
#     regularization_coef=1e-3,
#     eval_fraction=0.,
# )

## Set Up Logging 


# # =======================================================================
# # 4. LOAD THE EMBEDDINGS
# # The final output of the process consists of a dictionary mapping each entity to its embedding

# # =======================================================================

# # entities_path = DATA_DIR + '/entity_names_entities_0.json'

# # entities_emb_path = MODEL_DIR + "/embeddings_entities.v{NUMBER_OF_EPOCHS}.h5" \
# #     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# # with open(entities_path, 'r') as f:
# #     entities = json.load(f)

# # with h5py.File(entities_emb_path, 'r') as g:
# #     entity_embeddings = g['embeddings'][:]

# # entity2embedding = dict(zip(entities, entity_embeddings))
# # print('entity embeddings')
# # print(entity2embedding)

# movies_path = DATA_DIR + '/entity_names_movie_0.json'
# directors_path = DATA_DIR + '/entity_names_director_0.json'
# writers_path = DATA_DIR + '/entity_names_writer_0.json'
# actors_path = DATA_DIR + '/entity_names_starred_actor_0.json'
# years_path = DATA_DIR + '/entity_names_year_0.json'
# languages_path = DATA_DIR + '/entity_names_language_0.json'
# tags_path = DATA_DIR + '/entity_names_tags_0.json'
# genres_path = DATA_DIR + '/entity_names_genre_0.json'
# votes_path = DATA_DIR + '/entity_names_votes_0.json'
# rating_path = DATA_DIR + '/entity_names_rating_0.json'


# movie_emb_path = MODEL_DIR + "/embeddings_movie_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# director_emb_path = MODEL_DIR + "/embeddings_director_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# writer_emb_path = MODEL_DIR + "/embeddings_writer_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# actor_emb_path = MODEL_DIR + "/embeddings_starred_actor_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# year_emb_path = MODEL_DIR + "/embeddings_year_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# language_emb_path = MODEL_DIR + "/embeddings_language_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# tags_emb_path = MODEL_DIR + "/embeddings_tags_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# genre_emb_path = MODEL_DIR + "/embeddings_genre_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# votes_emb_path = MODEL_DIR + "/embeddings_votes_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# rating_emb_path = MODEL_DIR + "/embeddings_rating_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# with open(movies_path, 'r') as f:
#     movies = json.load(f)

# with h5py.File(movie_emb_path, 'r') as g:
#     movie_embeddings = g['embeddings'][:]

# movie2embedding = dict(zip(movies, movie_embeddings))
# # print('movie embeddings')
# # print(movie2embedding)

# with open(directors_path, 'r') as f:
#     directors = json.load(f)

# with h5py.File(director_emb_path, 'r') as g:
#     director_embeddings = g['embeddings'][:]

# director2embedding = dict(zip(directors, director_embeddings))
# # print('director embeddings')
# # print(director2embedding)

# with open(writers_path, 'r') as f:
#     writers = json.load(f)

# with h5py.File(writer_emb_path, 'r') as g:
#     writer_embeddings = g['embeddings'][:]

# writer2embedding = dict(zip(writers, writer_embeddings))
# # print('writer embeddings')
# # print(writer2embedding)

# with open(actors_path, 'r') as f:
#     actors = json.load(f)

# with h5py.File(actor_emb_path, 'r') as g:
#     actor_embeddings = g['embeddings'][:]

# actor2embedding = dict(zip(actors, actor_embeddings))
# # print('actor embeddings')
# # print(actor2embedding)

# with open(years_path, 'r') as f:
#     years = json.load(f)

# with h5py.File(year_emb_path, 'r') as g:
#     year_embeddings = g['embeddings'][:]

# year2embedding = dict(zip(years, year_embeddings))
# # print('year embeddings')
# # print(year2embedding)

# with open(languages_path, 'r') as f:
#     languages = json.load(f)

# with h5py.File(language_emb_path, 'r') as g:
#     language_embeddings = g['embeddings'][:]

# language2embedding = dict(zip(languages, language_embeddings))
# # print('language embeddings')
# # print(language2embedding)

# with open(tags_path, 'r') as f:
#     tags = json.load(f)

# with h5py.File(tags_emb_path, 'r') as g:
#     tags_embeddings = g['embeddings'][:]

# tag2embedding = dict(zip(tags, tags_embeddings))
# # print('tag embeddings')
# # print(tag2embedding)

# with open(genres_path, 'r') as f:
#     genres = json.load(f)

# with h5py.File(genre_emb_path, 'r') as g:
#     genre_embeddings = g['embeddings'][:]

# genre2embedding = dict(zip(genres, genre_embeddings))
# # print('genre embeddings')
# # print(genre2embedding)

# with open(votes_path, 'r') as f:
#     votes = json.load(f)

# with h5py.File(votes_emb_path, 'r') as g:
#     votes_embeddings = g['embeddings'][:]

# votes2embedding = dict(zip(votes, votes_embeddings))
# # print('votes embeddings')
# # print(votes2embedding)

# with open(rating_path, 'r') as f:
#     ratings = json.load(f)

# with h5py.File(rating_emb_path, 'r') as g:
#     rating_embeddings = g['embeddings'][:]

# rating2embedding = dict(zip(ratings, rating_embeddings))
# # print('rating embeddings')
# # print(rating2embedding)

# entity2embedding = {**movie2embedding, **director2embedding, **writer2embedding, **actor2embedding, **year2embedding, **language2embedding, **tag2embedding, **genre2embedding, **votes2embedding, **rating2embedding}
# print('entity embeddings')
# print(entity2embedding)