In [10]:
import os
import shutil
import json
import h5py
import torch
import numpy as np
from pathlib import Path
from torchbiggraph.config import parse_config
from torchbiggraph.converters.importers import TSVEdgelistReader, convert_input_data
from torchbiggraph.train import train
from torchbiggraph.util import SubprocessInitializer, setup_logging

In [11]:
DATA_DIR = 'data/example_2'
GRAPH_PATH = DATA_DIR + '/edges.tsv'
TRAINING_PATH = DATA_DIR + '/training.tsv'
TEST_PATH = DATA_DIR + '/test.tsv'
MODEL_DIR = 'model_2'

config = dict(
    # I/O data
    entity_path=DATA_DIR,
    edge_paths=[
        DATA_DIR + '/edge_path'
    ],
    checkpoint_path=MODEL_DIR,
    # Graph structure
    entities={"all": {"num_partitions": 1}},
    relations=[
       {
            "name": "all_edges",
            "lhs": "all",
            "rhs": "all",
            "operator": "translation",
        }
    ],

    dynamic_relations=True,
    dimension=4,  
    global_emb=False,
    comparator="dot",
    num_epochs=7,
    num_uniform_negs=1000,
    loss_fn="softmax",
    lr=0.1,
    regularization_coef=1e-3,
    eval_fraction=0.,
)

In [None]:
# =================================================
# 2. TRANSFORM GRAPH TO A BIGGRAPH-FRIENDLY FORMAT
# This step generates the following metadata files:

# data/example_2/entity_count_director_0.txt
# data/example_2/entity_count_director_0.json

# and this file with data:
# data/example_2/edges_partitioned/edges_0_0.h5
# =================================================
setup_logging()
config = parse_config(config)
subprocess_init = SubprocessInitializer()
input_edge_paths = [Path(GRAPH_PATH)]

convert_input_data(
    config.entities,
    config.relations,
    config.entity_path,
    config.edge_paths,
    input_edge_paths,
    TSVEdgelistReader(lhs_col=0, rel_col=1, rhs_col=2),
    dynamic_relations=config.dynamic_relations,
)


In [14]:
# ===============================================
# 3. TRAIN THE EMBEDDINGS
# files generated in this step:
#
# checkpoint_version.txt
# config.json
# embeddings_all_0.v7.h5
# model.v7.h5
# training_stats.json
# ===============================================

train(config, subprocess_init=subprocess_init)

2021-04-15 16:14:57,151   [Trainer-0] Loading entity counts...
2021-04-15 16:14:57,648   [Trainer-0] Creating workers...
2021-04-15 16:14:57,854   [Trainer-0] Initializing global model...
2021-04-15 16:14:59,129   [Trainer-0] Exiting


In [26]:
with open('/Users/BrandenKang/Documents/GitHub/MetaQA_PBG/data/example_2/entity_names_all_0.json', 'r') as f:
    embeddings = json.load(f)

with h5py.File('/Users/BrandenKang/Documents/GitHub/MetaQA_PBG/model_2/embeddings_all_0.v7.h5', 'r') as g:
    embeddings_all = g['embeddings'][:]

embedding_final = dict(zip(embeddings, embeddings_all))

In [79]:
with h5py.File('/Users/BrandenKang/Documents/GitHub/MetaQA_PBG/model_2/model.v7.h5', 'r') as g:
    print(g['model'].encode('ascii')[:])

# with h5py.File("/Users/BrandenKang/Documents/GitHub/MetaQA_PBG/model_2/model.v7.h5", "r") as hf:
#     operator_state_dict = {
#         "real": torch.from_numpy(hf["model/relations/0/operator/rhs/real"][...]),
#         "imag": torch.from_numpy(hf["model/relations/0/operator/rhs/imag"][...]),
#     }

AttributeError: 'Group' object has no attribute 'encode'

In [69]:
print(embedding_final['Pal_Joey'])

[ 0.48531008 -0.02907323  0.409445   -0.27130792]


## Ranking

In [72]:
# import json
# import numpy as np
# import h5py
# import faiss

# # Create FAISS index
# index = faiss.IndexFlatL2(400)
# with h5py.File("/Users/BrandenKang/Documents/GitHub/MetaQA_PBG/model_2/embeddings_all_0.v7.h5", "r") as hf:
#     index.add(hf["embeddings"][...])

# # Get trained embedding of Paris
# with open("/Users/BrandenKang/Documents/GitHub/MetaQA_PBG/data/example_2/entity_names_all_0.json", "rt") as tf:
#     entity_names = json.load(tf)
# target_entity_offset = entity_names.index("/m/05qtj")  # Paris
# with h5py.File("/Users/BrandenKang/Documents/GitHub/MetaQA_PBG/model_2/embeddings_all_0.v7.h5", "r") as hf:
#     target_embedding = hf["embeddings"][target_entity_offset, :]

# print(target_embedding)

In [70]:
# import json
# import h5py
# import torch
# from torchbiggraph.model import ComplexDiagonalDynamicOperator, DotComparator

# # Load entity count
# with open("/Users/BrandenKang/Documents/GitHub/MetaQA_PBG/data/example_2/entity_count_all_0.txt", "rt") as tf:
#     entity_count = int(tf.read().strip())

# # Load count of dynamic relations
# with open("/Users/BrandenKang/Documents/GitHub/MetaQA_PBG/data/example_2/dynamic_rel_count.txt", "rt") as tf:
#     dynamic_rel_count = int(tf.read().strip())

# # Load the operator's state dict
# with h5py.File("/Users/BrandenKang/Documents/GitHub/MetaQA_PBG/model_2/model.v7.h5", "r") as hf:
#     operator_state_dict = {
#         "real": torch.from_numpy(hf["model/relations/0/operator/rhs/real"][...]),
#         "imag": torch.from_numpy(hf["model/relations/0/operator/rhs/imag"][...]),
#     }
# operator = ComplexDiagonalDynamicOperator(4, dynamic_rel_count)
# operator.load_state_dict(operator_state_dict)
# comparator = DotComparator()

# # Load the offsets of the entities and the index of the relation type
# with open("/Users/BrandenKang/Documents/GitHub/MetaQA_PBG/data/example_2/entity_names_all_0.json", "rt") as tf:
#     entity_names = json.load(tf)
# src_entity_offset = entity_names.index("/m/0f8l9c")  # France
# with open("/Users/BrandenKang/Documents/GitHub/MetaQA_PBG/data/example_2/dynamic_rel_names.json", "rt") as tf:
#     rel_type_names = json.load(tf)
# rel_type_index = rel_type_names.index("/location/country/capital")

# # Load the trained embeddings
# with h5py.File("/Users/BrandenKang/Documents/GitHub/MetaQA_PBG/model_2/embeddings_all_0.v7.h5", "r") as hf:
#     src_embedding = torch.from_numpy(hf["embeddings"][src_entity_offset, :])
#     dest_embeddings = torch.from_numpy(hf["embeddings"][...])

# # Calculate the scores
# scores, _, _ = comparator(
#     comparator.prepare(src_embedding.view(1, 1, 4)).expand(1, entity_count, 4),
#     comparator.prepare(
#         operator(
#             dest_embeddings,
#             torch.tensor([rel_type_index]).expand(entity_count),
#         ).view(1, entity_count, 400),
#     ),
#     torch.empty(1, 0, 4),  # Left-hand side negatives, not needed
#     torch.empty(1, 0, 4),  # Right-hand side negatives, not needed
# )

# # Sort the entities by their score
# permutation = scores.flatten().argsort(descending=True)
# top5_entities = [entity_names[index] for index in permutation[:5]]

# print(top5_entities)

ImportError: cannot import name 'ComplexDiagonalDynamicOperator' from 'torchbiggraph.model' (/Users/BrandenKang/anaconda3/lib/python3.7/site-packages/torchbiggraph/model.py)

## End

In [None]:
# !pip freeze
# !pip install gensim==4.0.0b0
# !pip install --upgrade gensim
# gensim.__version__

In [111]:
a = {list(entity2embedding.keys())[i]: list(entity2embedding.values())[i] for i in ranypege(len(list_1))}

In [123]:
from json import JSONEncoder
class NumpyArrayEncoder(JSONEncoder):
    def default(self, obj):
        if isinstance(obj, numpy.ndarray):
            return obj.tolist()
        return JSONEncoder.default(self, obj)

In [37]:
with open('embeddings.json','w') as fp:
    json.dump(a, fp)

In [None]:
entity2embedding
list_1 = entity2embedding.keys()
list_1 = list(list_1)
list_1[:10]

In [40]:
list_2 = entity2embedding.values()
## create 2D array out of list_2 dict values -- put in in a variable (embeddings)
embeddings = np.stack(list_2, axis=0)
embeddings.shape

  if (yield from self.run_code(code, result)):


(24713, 200)

In [132]:
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec, KeyedVectors   

In [43]:
gensim_models = KeyedVectors(4,count=len(list_1))
gensim_models.add_vectors(list_1,embeddings)

In [141]:
gensim_models.save('gensim_model.model') #bin #kv

In [148]:
## add relationships
result = gensim_models.most_similar(positive=['Drumline', ],topn=1)[0][0]

In [88]:
result

'Icon'

# Chatbot

In [2]:
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec, KeyedVectors   

In [68]:
re_gensim = KeyedVectors.load('gensim_model.model')
test = re_gensim.most_similar(positive=['Cobra_Woman', 'Sleep_with_Me'],topn=1)[0][0]
test

'Free_Enterprise'

In [None]:
## first need to have the relationship in the form of embeddings 
## King + Spouse = most related entity (how the translation works)
## how translation works: (take a vector and another vector, and take the sum) 
## translation model is a model where you take left hand side and shift it by the relationship and you hope you get close to right hand side
## as opposed to rotation or another kind of vector manipulation 

## can use gensim by saying this is the lhs, this is the relationship, take them as features using positive keyword
## hoping we can play with that (positive, negative)

## if we do not have information about relationship we cannot do translation 
## it should train entities of embeddings and also train relationship and provide them 

## how big is the model 

## goal is to experiment and see where relationship is (make repository with 4 dimensions and put to github)
## configuration file, output files and have a look 
## left hand side, right hand side, try to position vectors so that when you add up lhs and relationship you get rhs
## in process you teak embedding of LHS and relationship, such that you get close to RHS 

## in the meantime, try to integrate what you have 
## you have bot interface, you have very simple prediction model of similarity
## try to integrate them — spielberg – output is most similar stuff 
## and then leveredge positive keyword of most similar method — in case query is multiple entities 
## i.e. spielberg, and jurassic park — use those as positive and get most similar 
## just to create pipeline — and then plug in relationship part once it's figured out 

## next time ideally we'll create an instance on AWS or Heroku and we can put application up there

## deadlines: 
## Beginning of April, Sunday — finish 95% of development
## I have April to do fine tuning, and have time to prepare for report and presentation 
## first of all ahve to think about what i'm going to put inside 
## I will have to have significant part talking about theoretical things I have learned 

In [None]:
#model.wv.save_word2vec_format(entity2embedding)

## Back up

In [None]:
# DATA_DIR = 'data/example_2'
# GRAPH_PATH = DATA_DIR + '/edges.tsv'
# MODEL_DIR = 'model_2'

#     # ==================================================================
#     # 0. PREPARE THE GRAPH
#     # the result of this step is a single file 'data/example_2/graph.tsv'
#     # ==================================================================
#     # This the graph we will be embedding.
#     # It has 10 types of nodes, or entities, and 9 types of edges, or relationships. 
#     test_edges = []
#     count=0
#     with open('kb.txt', 'r') as f: 
#         for line in f: 
#            line=line.rstrip().split("|")
#            line[0] = line[0].split(" ")
#            line[0] = "_".join(line[0])
#         #    line[2] = line[2].split(" ")
#         #    line[2] = "_".join(line[2])
#            test_edges.append(line)
#            count+=1
#            if count == 134741:
#                break
           
#     os.makedirs(DATA_DIR, exist_ok=True)
#     with open(GRAPH_PATH, 'w') as f:
#         for edge in test_edges:
#             f.write('\t'.join(edge) + '\n')
# # # # ==================================================
# # # # 1. DEFINE CONFIG
# # # # this dictionary will be used in steps 2. and 3.
# # # # ==================================================

# raw_config = dict(
#     # I/O data
#     entity_path=DATA_DIR,
#     edge_paths=[
#         DATA_DIR + '/edges_partitioned',
#     ],
#     checkpoint_path=MODEL_DIR,
#     # Graph structure
#     entities={
#         "all": {"num_partitions": 1}
#     },
#     relations=[
#         {
#             "name": "directed_by",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         },
#         {
#             "name": "written_by",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         },
#         {
#             "name": "starred_actors",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         },
        
#         {
#             "name": "release_year",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         },
#         {
#             "name": "in_language",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         },
#         {
#             "name": "has_tags",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         },
#         {
#             "name": "has_genre",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         },
        
#         {
#             "name": "has_imdb_votes",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         },
        
#         {
#             "name": "all_edges",
#             "lhs": "all",
#             "rhs": "all",
#             "operator": "complex_diagonal",
#         }
#     ],

#     dynamic_relations=False,
#     dimension=200,  
#     global_emb=False,
#     comparator="dot",
#     num_epochs=7,
#     num_uniform_negs=1000,
#     loss_fn="softmax",
#     lr=0.1,
#     regularization_coef=1e-3,
#     eval_fraction=0.,
# )

## Set Up Logging 


# # =======================================================================
# # 4. LOAD THE EMBEDDINGS
# # The final output of the process consists of a dictionary mapping each entity to its embedding

# # =======================================================================

# # entities_path = DATA_DIR + '/entity_names_entities_0.json'

# # entities_emb_path = MODEL_DIR + "/embeddings_entities.v{NUMBER_OF_EPOCHS}.h5" \
# #     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# # with open(entities_path, 'r') as f:
# #     entities = json.load(f)

# # with h5py.File(entities_emb_path, 'r') as g:
# #     entity_embeddings = g['embeddings'][:]

# # entity2embedding = dict(zip(entities, entity_embeddings))
# # print('entity embeddings')
# # print(entity2embedding)

# movies_path = DATA_DIR + '/entity_names_movie_0.json'
# directors_path = DATA_DIR + '/entity_names_director_0.json'
# writers_path = DATA_DIR + '/entity_names_writer_0.json'
# actors_path = DATA_DIR + '/entity_names_starred_actor_0.json'
# years_path = DATA_DIR + '/entity_names_year_0.json'
# languages_path = DATA_DIR + '/entity_names_language_0.json'
# tags_path = DATA_DIR + '/entity_names_tags_0.json'
# genres_path = DATA_DIR + '/entity_names_genre_0.json'
# votes_path = DATA_DIR + '/entity_names_votes_0.json'
# rating_path = DATA_DIR + '/entity_names_rating_0.json'


# movie_emb_path = MODEL_DIR + "/embeddings_movie_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# director_emb_path = MODEL_DIR + "/embeddings_director_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# writer_emb_path = MODEL_DIR + "/embeddings_writer_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# actor_emb_path = MODEL_DIR + "/embeddings_starred_actor_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# year_emb_path = MODEL_DIR + "/embeddings_year_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# language_emb_path = MODEL_DIR + "/embeddings_language_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# tags_emb_path = MODEL_DIR + "/embeddings_tags_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# genre_emb_path = MODEL_DIR + "/embeddings_genre_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# votes_emb_path = MODEL_DIR + "/embeddings_votes_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# rating_emb_path = MODEL_DIR + "/embeddings_rating_0.v{NUMBER_OF_EPOCHS}.h5" \
#     .format(NUMBER_OF_EPOCHS=raw_config['num_epochs'])

# with open(movies_path, 'r') as f:
#     movies = json.load(f)

# with h5py.File(movie_emb_path, 'r') as g:
#     movie_embeddings = g['embeddings'][:]

# movie2embedding = dict(zip(movies, movie_embeddings))
# # print('movie embeddings')
# # print(movie2embedding)

# with open(directors_path, 'r') as f:
#     directors = json.load(f)

# with h5py.File(director_emb_path, 'r') as g:
#     director_embeddings = g['embeddings'][:]

# director2embedding = dict(zip(directors, director_embeddings))
# # print('director embeddings')
# # print(director2embedding)

# with open(writers_path, 'r') as f:
#     writers = json.load(f)

# with h5py.File(writer_emb_path, 'r') as g:
#     writer_embeddings = g['embeddings'][:]

# writer2embedding = dict(zip(writers, writer_embeddings))
# # print('writer embeddings')
# # print(writer2embedding)

# with open(actors_path, 'r') as f:
#     actors = json.load(f)

# with h5py.File(actor_emb_path, 'r') as g:
#     actor_embeddings = g['embeddings'][:]

# actor2embedding = dict(zip(actors, actor_embeddings))
# # print('actor embeddings')
# # print(actor2embedding)

# with open(years_path, 'r') as f:
#     years = json.load(f)

# with h5py.File(year_emb_path, 'r') as g:
#     year_embeddings = g['embeddings'][:]

# year2embedding = dict(zip(years, year_embeddings))
# # print('year embeddings')
# # print(year2embedding)

# with open(languages_path, 'r') as f:
#     languages = json.load(f)

# with h5py.File(language_emb_path, 'r') as g:
#     language_embeddings = g['embeddings'][:]

# language2embedding = dict(zip(languages, language_embeddings))
# # print('language embeddings')
# # print(language2embedding)

# with open(tags_path, 'r') as f:
#     tags = json.load(f)

# with h5py.File(tags_emb_path, 'r') as g:
#     tags_embeddings = g['embeddings'][:]

# tag2embedding = dict(zip(tags, tags_embeddings))
# # print('tag embeddings')
# # print(tag2embedding)

# with open(genres_path, 'r') as f:
#     genres = json.load(f)

# with h5py.File(genre_emb_path, 'r') as g:
#     genre_embeddings = g['embeddings'][:]

# genre2embedding = dict(zip(genres, genre_embeddings))
# # print('genre embeddings')
# # print(genre2embedding)

# with open(votes_path, 'r') as f:
#     votes = json.load(f)

# with h5py.File(votes_emb_path, 'r') as g:
#     votes_embeddings = g['embeddings'][:]

# votes2embedding = dict(zip(votes, votes_embeddings))
# # print('votes embeddings')
# # print(votes2embedding)

# with open(rating_path, 'r') as f:
#     ratings = json.load(f)

# with h5py.File(rating_emb_path, 'r') as g:
#     rating_embeddings = g['embeddings'][:]

# rating2embedding = dict(zip(ratings, rating_embeddings))
# # print('rating embeddings')
# # print(rating2embedding)

# entity2embedding = {**movie2embedding, **director2embedding, **writer2embedding, **actor2embedding, **year2embedding, **language2embedding, **tag2embedding, **genre2embedding, **votes2embedding, **rating2embedding}
# print('entity embeddings')
# print(entity2embedding)