The goal of this notebook is to check whether the training examples for the logits are the same, and to discover why the embedding accuracy for different "datasets" is different in the Explanator() output.

In [None]:
import pandas as pd
import numpy as np
import os, time
from explain.helpers import parse_feature_matrix, get_dirs

In [None]:
# "pra_explain/results" directories of the TransE runs to inspect
# (one path each for WN11, FB13 and NELL186).
pra_results_dirs = [
    "/home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/results/WN11/TransE/1527008113/pra_explain/results",
    "/home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/results/FB13/TransE/1527033688/pra_explain/results",
    "/home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/results/NELL186/TransE/1526711822/pra_explain/results",
]

# Names of the extracted-features directories; each one is joined onto a
# results directory to locate the per-relation feature files.
extracted_features_dirs_names = [
    "g_2negrate_bern___pra",
    "ghat_3nn_2negrate_bern___pra",
    "ghat_5nn_2negrate_bern___pra",
]

def extraction_results(pra_results_dirs, extracted_features_dirs_names):
    """Count the parsed examples per fold file for every results/features pair.

    For each combination of a results directory and an extracted-features
    directory name, every relation sub-directory (as listed by ``get_dirs``)
    is scanned and each file found in it is parsed with
    ``parse_feature_matrix``.  The number of parsed examples is accumulated
    per file name (e.g. ``'train.tsv'``) over all relations.

    Arguments:
    - pra_results_dirs: iterable of paths to results directories.
    - extracted_features_dirs_names: iterable of directory names that are
      joined onto each results directory.

    Returns a list with one dict per (results dir, features dir) pair, holding
    the keys 'results_dir', 'extracted features', 'total', 'train.tsv',
    'valid.tsv' and 'test.tsv'.  The per-fold entries are None when that file
    was not found for any relation; 'total' treats missing folds as 0.
    """
    results = []

    for pra_results_dir in pra_results_dirs:
        for extracted_features_dirs_name in extracted_features_dirs_names:
            # Join with the *current* directory name; joining with the whole
            # list of names (as before) is not a valid path component.
            dpath = os.path.join(pra_results_dir, extracted_features_dirs_name)
            relations = get_dirs(dpath)
            info = {}

            for rel in relations:
                for fm in os.listdir(os.path.join(dpath, rel)):
                    heads, tails, labels, feat_dicts = parse_feature_matrix(os.path.join(dpath, rel, fm))
                    # NOTE(review): the original statement was left unfinished
                    # ("info.get(fm, 0) +"); adding the number of parsed
                    # examples is assumed to be the intent -- confirm.
                    info[fm] = info.get(fm, 0) + len(feat_dicts)

            results.append({
                'results_dir': pra_results_dir,
                'extracted features': extracted_features_dirs_name,
                'total': info.get('train.tsv', 0) + info.get('valid.tsv', 0) + info.get('test.tsv', 0),
                'train.tsv': info.get('train.tsv', None),
                'valid.tsv': info.get('valid.tsv', None),
                'test.tsv': info.get('test.tsv', None),
            })

    return results

In [None]:
# Show the directory entries of one relation directory
# (NELL186 run, "g_2negrate_bern___pra" features, relation "concept:coachwontrophy").
os.listdir("/home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/results/NELL186/TransE/1526711822/pra_explain/results/g_2negrate_bern___pra/concept:coachwontrophy")

In [None]:
# Results directories of the three TransE runs (WN11, FB13, NELL186).
results_path_wn11 = "/home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/results/WN11/TransE/1527008113/pra_explain/results"
results_path_fb13 = "/home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/results/FB13/TransE/1527033688/pra_explain/results"
results_path_nell = "/home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/results/NELL186/TransE/1526711822/pra_explain/results"

# The run examined by the cells below; switch to another results_path_* to
# inspect a different run.
results_path = results_path_fb13

# Extracted-features directory names passed as `features_dir` to the helpers.
features_dir_xke_true = "g_2negrate_bern___pra"
features_dir_xke_pred_3nn = "ghat_3nn_2negrate_bern___pra"
features_dir_xke_pred_5nn = "ghat_5nn_2negrate_bern___pra"
# Initial relation name; reassigned by a later cell.
relation = "gender"

In [None]:
def get_list_of_ent_pairs(results_path, features_dir, fold='train.tsv'):
    """Return the (head, tail) pairs of one fold for every relation.

    Arguments:
    - results_path: path of the results directory.
    - features_dir: name of the extracted-features directory inside it.
    - fold: file name of the feature matrix read for each relation.

    Returns a dict mapping each relation name (as listed by ``get_dirs``) to
    a list of ``(head, tail)`` tuples, in the order returned by
    ``parse_feature_matrix``.
    """
    features_path = os.path.join(results_path, features_dir)
    ent_pairs_dict = {}
    for relation in get_dirs(features_path):
        fold_path = os.path.join(features_path, relation, fold)
        heads, tails, _labels, _feat_dicts = parse_feature_matrix(fold_path)
        # Materialise the pairs: on Python 3 a bare zip() is a one-shot
        # iterator without len(), which breaks the length comparison done by
        # compare_ent_pairs_dicts_len. On Python 2 this is an equivalent list.
        ent_pairs_dict[relation] = list(zip(heads, tails))
    return ent_pairs_dict

def compare_ent_pairs_dicts_len(d1, d2, d3, names=('d1', 'd2', 'd3')):
    """Tabulate, per relation, how many entries each of three dicts holds.

    Arguments:
    - d1, d2, d3: dicts mapping a relation name to a sized collection
      (anything supporting ``len``); a relation missing from a dict counts
      as 0 for that dict.
    - names: three column labels, one per dict, in the same order.

    Returns a ``pandas.DataFrame`` with columns ``['relation'] + names``: one
    row per relation (sorted by name so the output is reproducible), followed
    by a ``*SUM*`` row and an ``*AVERAGE*`` row.  Counts are kept numeric
    instead of being turned into strings, and the average is a true (float)
    mean; it is NaN when there are no relations at all.
    """
    dicts = (d1, d2, d3)
    rels = sorted(set(d1).union(d2, d3))

    rows = []
    totals = [0, 0, 0]
    for rel in rels:
        counts = [len(d.get(rel, [])) for d in dicts]
        totals = [total + count for total, count in zip(totals, counts)]
        rows.append([rel] + counts)

    n_rels = len(rels)
    # float() forces true division on Python 2 as well; guard the empty case,
    # where the mean is undefined.
    averages = [float(total) / n_rels if n_rels else float('nan')
                for total in totals]

    rows.append(['*SUM*'] + totals)
    rows.append(['*AVERAGE*'] + averages)
    return pd.DataFrame(rows, columns=['relation'] + list(names))

In [None]:
def get_features_len(results_path, features_dir, fold='train.tsv'):
    """Summarise the number of features per example for every relation.

    Arguments:
    - results_path: path of the results directory.
    - features_dir: name of the extracted-features directory inside it.
    - fold: file name of the feature matrix read for each relation.

    Returns a dict mapping each relation name (as listed by ``get_dirs``) to
    ``{'min': ..., 'max': ..., 'avg': ...}``, computed over ``len(fd)`` of
    every feature dict returned by ``parse_feature_matrix``.  'avg' is a
    float mean.  When a fold has no examples all three values are None.
    """
    features_path = os.path.join(results_path, features_dir)
    features_len_dict = {}
    for relation in get_dirs(features_path):
        fold_path = os.path.join(features_path, relation, fold)
        heads, tails, labels, feat_dicts = parse_feature_matrix(fold_path)
        lengths = [len(fd) for fd in feat_dicts]
        if lengths:
            stats = {
                'min': min(lengths),
                'max': max(lengths),
                # float() keeps the mean exact on Python 2, where int / int
                # would silently floor the result.
                'avg': float(sum(lengths)) / len(lengths),
            }
        else:
            # Nothing to summarise: report "undefined" rather than dividing
            # by zero or leaking a sentinel minimum.
            stats = {'min': None, 'max': None, 'avg': None}
        features_len_dict[relation] = stats
    return features_len_dict

def compate_features_len():
    """Placeholder with no behaviour yet; always returns None.

    The name (likely meant to read "compare") is kept unchanged so existing
    references keep working.
    """
    return None

In [None]:
def get_number_examples_directly_from_feature_files(results_path, features_dir, fold='train.tsv'):
    """Count the lines of the `fold` file of every relation.

    The relations are the directories listed by ``get_dirs`` under
    ``results_path/features_dir``; each relation's ``fold`` file is opened in
    text mode and its lines are counted without parsing them.

    Returns a dict mapping relation name -> number of lines in that file.
    """
    base_dir = os.path.join(results_path, features_dir)
    examples_count = {}
    for rel in get_dirs(base_dir):
        with open(os.path.join(base_dir, rel, fold), 'r') as fold_file:
            examples_count[rel] = sum(1 for _ in fold_file)
    return examples_count

In [None]:
# Per-relation line counts of train.tsv for the "xke_true" features directory
# of the currently selected results_path.
get_number_examples_directly_from_feature_files(results_path, features_dir_xke_true, fold='train.tsv')

In [None]:
# Load the (head, tail) pairs of the same fold from the three
# extracted-features directories so their sizes can be compared.
fold = 'train.tsv'
xke_true     = get_list_of_ent_pairs(results_path, features_dir_xke_true,     fold=fold)
xke_pred_3nn = get_list_of_ent_pairs(results_path, features_dir_xke_pred_3nn, fold=fold)
xke_pred_5nn = get_list_of_ent_pairs(results_path, features_dir_xke_pred_5nn, fold=fold)

In [None]:
# Side-by-side table of the number of entity pairs per relation in each set.
compare_ent_pairs_dicts_len(xke_true, xke_pred_3nn, xke_pred_5nn, names=['xke_true', 'xke_pred_3nn', 'xke_pred_5nn'])

# debug

In [None]:
# Parse the train.tsv feature matrix of the '_similar_to' relation from the
# "xke_true" features directory for manual inspection.
rel_features_path = os.path.join(results_path, features_dir_xke_true, '_similar_to')
heads, tails, labels, feat_dicts = parse_feature_matrix(rel_features_path + '/train.tsv')

In [None]:
# Print the number of features of every parsed example, one per line.
# The call form of print gives the same output on Python 2 and also runs on
# Python 3, where the print statement is a syntax error.
for fd in feat_dicts:
    print(len(fd))

In [None]:
# Feature-length statistics (min / max / avg per relation) for the three
# extracted-features directories, using get_features_len's default fold.
# The 3nn/5nn lines previously called `get_set_of_ent_pairs`, which is not
# defined in this notebook; the `flen_*` targets indicate get_features_len
# was intended.
flen_xke_true     = get_features_len(results_path, features_dir_xke_true)
flen_xke_pred_3nn = get_features_len(results_path, features_dir_xke_pred_3nn)
flen_xke_pred_5nn = get_features_len(results_path, features_dir_xke_pred_5nn)

In [None]:
# Feature-length statistics of the train fold for the "xke_true" features;
# the bare name on the last line displays the dict as the cell output.
flen_xke_true     = get_features_len(results_path, features_dir_xke_true, fold='train.tsv')
flen_xke_true

In [None]:
# Feature-length statistics for the "xke_pred_3nn" features.
# NOTE(review): this reads the 'test.tsv' fold while the xke_true cell reads
# 'train.tsv' -- confirm the fold mismatch is intentional before comparing.
flen_xke_pred_3nn = get_features_len(results_path, features_dir_xke_pred_3nn, fold='test.tsv')
flen_xke_pred_3nn

In [None]:
# Select one relation/fold of the "xke_true" features and compute how many
# features each of its examples has.
# results_path = fasdf
features_dir = features_dir_xke_true
relation = 'cause_of_death'
fold = 'train.tsv'

heads, tails, labels, feat_dicts = parse_feature_matrix(os.path.join(results_path, features_dir, relation, fold))
n_feats = pd.Series([len(fd) for fd in feat_dicts])
# The counting and histogram cells below read the series as `n_feats_series`,
# which was never assigned; bind both names to the same Series.
n_feats_series = n_feats

In [None]:
# Summary statistics of the per-example feature counts.
n_feats.describe()

In [None]:
# Number of entries of n_feats_series that are exactly 0.
len(n_feats_series.loc[n_feats_series == 0])

In [None]:
# Number of entries of n_feats_series greater than 1.
len(n_feats_series.loc[n_feats_series > 1])

In [None]:
# Number of entries of n_feats_series greater than 2.
len(n_feats_series.loc[n_feats_series > 2])

In [None]:
# Number of entries of n_feats_series greater than 3.
len(n_feats_series.loc[n_feats_series > 3])

In [None]:
# Number of entries of n_feats_series greater than 50.
len(n_feats_series.loc[n_feats_series > 50])

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Histogram of n_feats_series on an 18x12-inch figure; the number of bins is
# the largest value in the series.
fig = plt.figure()
fig.set_size_inches(18, 12)
plt.hist(n_feats_series, bins=max(n_feats_series))
# Presumably here so the cell does not end on an expression and the notebook
# does not echo the value returned by plt.hist().
pass

# New scheme