In [None]:
import os
import pandas as pd
import numpy as np
import config, models
import itertools
import multiprocessing
import time

In [None]:
# Main variables: which trained embedding model to restore and which k values to sweep.
dataset_name               = "WN11"          # benchmark folder name under ./benchmarks/
embedding_model            = models.TransE   # model class; its __name__ is part of the results path
model_timestamp            = '1524623630'    # run identifier; used as a results sub-directory name
max_knn_k                  = 7 # neighbors fetched per entity; also the upper bound of the k sweep
knn_k_start                = 3 # first k (number of nearest neighbors) to predict with
knn_k_step                 = 2 # k grows by this many neighbors between successive predictions

# GPU settings
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # must be a string (environment values are strings)

In [None]:
# Directory layout. Every path below ends with a separator because other cells
# build file names by plain string concatenation (e.g. dataset_path + 'train2id.txt').
# Passing '' as the last os.path.join component is what appends that separator.
dataset_path = os.path.join('./benchmarks', dataset_name, '')
import_path = os.path.join(
    './results',
    dataset_name,
    embedding_model.__name__,
    model_timestamp,
    ''
)
# import_path already ends with a separator, so join (instead of + '/g_hat/')
# avoids producing a doubled '//' in the path.
g_hat_path = os.path.join(import_path, 'g_hat', '')
log_info = []

## Restore working model

In [None]:
# Load the metadata saved alongside the trained model (tab-separated file under import_path).
model_info_df = pd.read_csv('{}/model_info.tsv'.format(import_path), sep='\t')

In [None]:
# Transform model info into a plain dict holding only the first "row":
# DataFrame.to_dict() returns {column: {index: value}}, so keep the value at index 0.
# .items() is used instead of the Python-2-only .iteritems() so the cell runs on
# both Python 2 and Python 3; building a new dict also avoids rewriting the dict
# that is being iterated.
model_info = {
    key: column[0]
    for key, column in model_info_df.to_dict().items()
}

In [None]:
# Rebuild the configuration used by the saved run, then restore its trained weights.
con = config.Config()
# Dataset folder taken from the saved model info; this replaces the earlier
# dataset_path value and ends with '/' so file names can be appended directly.
dataset_path = "./benchmarks/{}/".format(model_info['dataset_name'])
con.set_in_path(dataset_path)
con.set_test_link_prediction(False)
con.set_test_triple_classification(True)
con.set_work_threads(multiprocessing.cpu_count())
con.set_dimension(int(model_info['k'])) # the saved 'k' field is passed as the embedding dimension
con.score_norm = model_info['score_norm']
con.init()
con.set_model(embedding_model)
con.import_variables("{}tf_model/model.vec.tf".format(import_path)) # loading model via tensor library

In [None]:
# Display the model info transposed, i.e. one field per line of output.
model_info_df.transpose()

## Export embedding parameters (save to disk)

In [None]:
# con.save_parameters(import_path + '/embedding.vec.json')

## Get embedding parameters

In [None]:
# Fetch the restored model's parameters; the 'ent_embeddings' entry is used by the cells below.
params = con.get_parameters()

In [None]:
# Inspect the entity embeddings (the data the nearest-neighbor index is fitted on further down).
params['ent_embeddings']

## Read datasets

In [None]:
# All *2id files are read the same way: the first line is skipped and the
# remaining space-separated rows are loaded as (head, tail, rel) columns.
_triple_csv_kwargs = dict(sep=' ', skiprows=1, names=['head', 'tail', 'rel'])

train = pd.read_csv(dataset_path + 'train2id.txt', **_triple_csv_kwargs)
valid = pd.read_csv(dataset_path + 'valid2id.txt', **_triple_csv_kwargs)
test = pd.read_csv(dataset_path + 'test2id.txt', **_triple_csv_kwargs)

# the *_neg files are presumably the negative examples of each split -- verify against the dataset
valid_neg = pd.read_csv(dataset_path + 'valid2id_neg.txt', **_triple_csv_kwargs)
test_neg = pd.read_csv(dataset_path + 'test2id_neg.txt', **_triple_csv_kwargs)

# train + valid rows stacked; this is the frame handed to the KNN triple generator
pos_train_and_valid = pd.concat([train, valid])
# all three splits stacked
data = pd.concat([train, valid, test])

In [None]:
# Read the dataset's type-constraint file with the project's dataset_tools helper.
from tools import dataset_tools
tc_dict = dataset_tools.read_type_constrain_file(dataset_path + '/type_constrain.txt')

## Get all neighbors (Train KNN)

In [None]:
from sklearn.neighbors import NearestNeighbors

start_time = time.time()

# Fit the neighbor index on the entity embeddings and query it with the same points,
# asking for max_knn_k neighbors per entity; smaller k values slice this result later.
# NOTE(review): since the query points are the fitted points, each entity is presumably
# returned as its own first neighbor -- confirm this is intended.
nbrs = NearestNeighbors(n_neighbors=max_knn_k, n_jobs=8).fit(params['ent_embeddings'])
knn_distance, knn_indices = nbrs.kneighbors(params['ent_embeddings'])

knn_learning_time = time.time() - start_time # wall-clock seconds spent on fit + query

print("KNN learning time: {}".format(knn_learning_time))

## KNN (perturbing only head or only tail) Generator

In [None]:
# def knn_perturb_one_entity_at_a_time_generator(knn_indices, pos_train_and_valid):    
#     for idx,row in pos_train_and_valid.iterrows():
#         for triple in itertools.product([row['head']], knn_indices[row['tail']], [row['rel']]):
#             yield {
#                 'head': triple[0],
#                 'tail': triple[1],
#                 'rel': triple[2]
#             }
#         for triple in itertools.product(knn_indices[row['head']], [row['tail']], [row['rel']]):
#             yield {
#                 'head': triple[0],
#                 'tail': triple[1],
#                 'rel': triple[2]
#             }

## KNN (perturbing head, tail and products for each positive example) Generator

In [None]:
def knn_products_generator(k, knn_indices, pos_train_and_valid):
    """Yield candidate triples built from the neighbors of each positive triple.

    For every row of `pos_train_and_valid`, the first `k` entries of
    `knn_indices[head]` are combined with the first `k` entries of
    `knn_indices[tail]` (head varies slowest), keeping the row's relation.

    Arguments:
    - k: how many neighbors of each entity to use
    - knn_indices: per-entity neighbor indices, indexable by entity id
    - pos_train_and_valid: DataFrame with 'head', 'tail' and 'rel' columns

    Yields dicts with 'head', 'tail' and 'rel' keys.
    """
    for _, positive in pos_train_and_valid.iterrows():
        near_heads = tuple(knn_indices[positive['head']][:k])
        near_tails = tuple(knn_indices[positive['tail']][:k])
        relation = positive['rel']
        for candidate_head in near_heads:
            for candidate_tail in near_tails:
                yield {
                    'head': candidate_head,
                    'tail': candidate_tail,
                    'rel': relation
                }

## KNN (cartesian product head_extended X tail_extended) Generator

In [None]:
# def knn_cartesian_product_generator(knn_indices, tc_dict):    
#     # extend the set of all heads/tails for each relation with the k nearest neighbors
#     extended_types = {}
#     for rel,dic in tc_dict.iteritems():
#         head_set = set(dic['head'])
#         tail_set = set(dic['tail'])
#         for ent_head,ent_tail in zip(dic['head'],dic['tail']):
#             head_set.update(knn_indices[ent_head])
#             tail_set.update(knn_indices[ent_tail])
#         extended_types[rel] = {
#             'head': head_set,
#             'tail': tail_set
#         }
        
#     for rel,dic in extended_types.iteritems():
#         for e1e2 in itertools.product(dic['head'], dic['tail']):
#             yield {
#                 'head': e1e2[0],
#                 'tail': e1e2[1],
#                 'rel': rel
#             }

## All Possible Triples Generator

In [None]:
# def all_possible_triples_generator(ents, rels):
#     """A generator for all possible triples in the current dataset (graph).
#     Don't forget to Pray for God to make it tractable.
    
#     Arguments:
#     - ents: the set of entities
#     - rels: the set of relations
#     """
#     for rel in rels:
#         ents_perm = itertools.permutations(ents, 2)
#         for e1e2 in ents_perm:
#             yield {
#                 'head': e1e2[0],
#                 'tail': e1e2[1],
#                 'rel': rel
#             }

## General functions for predicting Ĝ

In [None]:
def get_batch_from_generator(triples_iter, batch_size):
    """Pull up to `batch_size` triples from an iterator and split them by field.

    Arguments:
    - triples_iter: iterator yielding dicts with 'head', 'tail' and 'rel' keys
    - batch_size: maximum number of triples to consume from the iterator

    Returns ((heads, tails, rels), size): three parallel lists plus the number
    of triples actually read. A size smaller than `batch_size` means the
    iterator ran out.
    """
    batch_heads = []
    batch_tails = []
    batch_rels = []

    # islice stops by itself after batch_size items or when the iterator is
    # exhausted, and never consumes more than batch_size items.
    for triple in itertools.islice(triples_iter, batch_size):
        batch_heads.append(triple['head'])
        batch_tails.append(triple['tail'])
        batch_rels.append(triple['rel'])

    return (batch_heads, batch_tails, batch_rels), len(batch_heads)


def filter_positives(heads, tails, rels, preds):
    """Keep the triples whose prediction equals 1.

    Arguments:
    - heads, tails, rels: parallel sequences describing the classified triples
    - preds: array-like of predictions; the first axis lines up with the triples

    Returns a list of dicts with 'head', 'tail' and 'relation' keys, in the
    order the predictions are enumerated.
    """
    kept = []
    for position, label in np.ndenumerate(preds):
        # ndenumerate yields an index tuple (it handles the n-dimensional case);
        # only its first component selects the triple
        row = position[0]
        if label != 1:
            continue
        kept.append({
            'head': heads[row],
            'tail': tails[row],
            'relation': rels[row]
        })
    return kept


def predict_g_hat(triples_iterator, batch_size=10000):
    """Classify every triple produced by an iterator and collect the positives.

    Triples are drawn in batches of at most `batch_size`, classified with the
    restored model held in the notebook-level `con`, and those predicted as 1
    are kept.

    Arguments:
    - triples_iterator: iterator yielding dicts with 'head', 'tail', 'rel' keys
    - batch_size: number of triples classified per call

    Returns (positive_triples, triples_count): the list of positive triples
    and the total number of triples that were classified.
    """
    positive_triples = []
    triples_count = 0
    while True:
        (heads, tails, rels), current_batch_size = get_batch_from_generator(triples_iterator, batch_size)
        if current_batch_size == 0:
            # The iterator was empty or ended exactly on a batch boundary.
            # Stop here instead of sending an empty batch to the classifier
            # (this also ends the loop when batch_size <= 0).
            break
        preds = con.classify(heads, tails, rels, batch_size)
        positive_triples.extend(filter_positives(heads, tails, rels, preds))
        triples_count += current_batch_size
        if current_batch_size < batch_size: # we are at the end of generator
            break
    return positive_triples, triples_count


# def get_size_of_generator(gen):
#     size = 0
#     for i in gen:
#         size += 1
#     return size

## Define Pipeline

In [None]:
def pipeline(k, gen, batch_size=100000):
    """Run prediction over `gen`, save the positive triples, return run statistics.

    Arguments:
    - k: number of neighbors the generator was built with (used for the output
      file name and recorded in the statistics)
    - gen: iterator of candidate triples handed to predict_g_hat
    - batch_size: batch size forwarded to predict_g_hat

    Returns a dict with the keys 'knn_time', 'pred_time', 'positive_size',
    'total_time', 'predicted_size' and 'k'.

    Side effects: creates g_hat_path if it is missing and writes
    positives_<k>nn.tsv (tab-separated) into it.
    """
    knn_seconds = knn_learning_time

    clock_start = time.time()
    pos_triples, pred_size = predict_g_hat(triples_iterator=gen, batch_size=batch_size)
    pred_seconds = time.time() - clock_start

    prediction_info = {
        'knn_time': knn_seconds,
        'pred_time': pred_seconds,
        'positive_size': len(pos_triples),
        'total_time': knn_seconds + pred_seconds,
        'predicted_size': pred_size,
        'k': k,
    }

    # the output directory may not exist yet
    if not os.path.exists(g_hat_path):
        os.makedirs(g_hat_path)

    # write the positive triples for this k
    positives_frame = pd.DataFrame(pos_triples)
    positives_file = '{}/positives_{}nn.tsv'.format(g_hat_path, k)
    positives_frame.to_csv(positives_file, sep='\t')
    return prediction_info

# Predict Ĝ (for different k)

In [None]:
# Sweep k from knn_k_start up to max_knn_k (inclusive) in steps of knn_k_step.
prediction_info_list = []
for k in range(knn_k_start, max_knn_k+1, knn_k_step):
    # a new generator is built for each k (a generator can only be consumed once)
    prediction_info = pipeline(k, knn_products_generator(k, knn_indices, pos_train_and_valid))
    prediction_info_list.append(prediction_info)
    # save prediction info (the file is rewritten each iteration with all runs finished so far)
    pd.DataFrame(prediction_info_list).to_csv(g_hat_path + 'prediction_info.tsv', sep='\t')
    print("G_hat predicted for k={}".format(k))
    
# generators = [
#     knn_products_generator(knn_indices, pos_train_and_valid),
#     knn_products_generator(knn_indices, pos_train_and_valid),
# ]

# Debugging