In [None]:
import pandas as pd
import numpy as np
from tools import dataset_tools
import tqdm

In [None]:
# Directory holding the FB13 benchmark files. File paths are built by appending
# a file name directly to this string, so the trailing slash is required.
dataset_path = './benchmarks/FB13/'

In [None]:
# Load the entity and relation lookup tables that ship with the dataset.
# NOTE(review): judging by the unpacking names, read_name2id_file returns a
# (name -> id, id -> name) pair of lookups -- confirm against tools.dataset_tools.
entity2id, id2entity = dataset_tools.read_name2id_file(
    '{}entity2id.txt'.format(dataset_path))
relation2id, id2relation = dataset_tools.read_name2id_file(
    '{}relation2id.txt'.format(dataset_path))

## Verify 2id files

In [None]:
def _load_id_triples(file_name, label):
    """Read one space-separated `*2id` file and tag every row with `label`.

    The first line of the file is skipped and the three remaining columns are
    named e1, e2, rel (in that order); a constant `label` column is appended.
    """
    triples = pd.read_csv(dataset_path + file_name, sep=' ', skiprows=1,
                          names=['e1', 'e2', 'rel'])
    triples['label'] = label
    return triples


# Positive splits are labelled 1, the negative valid/test files -1.
train = _load_id_triples('train2id.txt', 1)
valid_pos = _load_id_triples('valid2id.txt', 1)
test_pos = _load_id_triples('test2id.txt', 1)

valid_neg = _load_id_triples('valid2id_neg.txt', -1)
test_neg = _load_id_triples('test2id_neg.txt', -1)

# Within valid/test the positives come first, followed by the negatives.
valid = pd.concat((valid_pos, valid_neg), ignore_index=True)
test = pd.concat((test_pos, test_neg), ignore_index=True)

# `data_pos` stacks only the positive triples; `data` stacks every split,
# negatives included, in train -> valid -> test order.
data_pos = pd.concat([train, valid_pos, test_pos], ignore_index=True)
data = pd.concat([train, valid, test], ignore_index=True)

In [None]:
# Collect every distinct value that appears in the e1 or e2 column of any
# split (negative triples included, since `data` contains them).
ents = set()
ents.update(data.e1.unique())
ents.update(data.e2.unique())

print('Entities: {}'.format(len(ents)))
print('Relations: {}'.format(len(data.rel.unique())))

print('\nTrain triples: {}'.format(len(train)))
print('Positive Valid triples: {}'.format(len(valid_pos)))
print('Positive Test triples: {}'.format(len(test_pos)))
print('\nNegative Valid triples: {}'.format(len(valid_neg)))
print('Negative Test triples: {}'.format(len(test_neg)))

# .format() has to be applied to the string *inside* the print(...) call.
# Written as `print('...').format(x)` it only works as a Python 2 print
# statement; under Python 3 print() returns None, so the raw template is
# printed and an AttributeError follows.
print('\nAll positive triples: {}'.format(len(data_pos)))
print('All triples: {}'.format(len(data)))

## Convert from id to names

In [None]:
def convert_id2names(df):
    """Replace the ids in `df` with their names, modifying `df` in place.

    The `e1` and `e2` columns are mapped through the module-level `id2entity`
    lookup and the `rel` column through `id2relation`. Nothing is returned;
    other columns are left untouched.

    NOTE(review): values without an entry in the lookup presumably become NaN
    (pandas `Series.map` behaviour for dict lookups) -- confirm the lookup type.
    """
    column_lookups = (
        ('e1', id2entity),
        ('e2', id2entity),
        ('rel', id2relation),
    )
    for column, id2name in column_lookups:
        df[column] = df[column].map(id2name)

In [None]:
# Translate ids to names in place, for the combined frame and for each split.
# The conversion overwrites the id columns, so this cell is meant to be run
# once on freshly loaded frames.
for id_frame in (data, train, valid, test):
    convert_id2names(id_frame)

## Comparison with original `.txt` files

In [None]:
def _split_by_label(labelled):
    """Return the (label == 1, label == -1) row subsets of `labelled`."""
    positives = labelled.loc[labelled['label'] == 1]
    negatives = labelled.loc[labelled['label'] == -1]
    return positives, negatives


# The original files are tab-separated with the relation in the middle column.
# valid/test are read with a fourth `label` column; train is read without one.
orig_columns = ['e1', 'rel', 'e2']
train_orig = pd.read_csv(dataset_path + 'train.txt', sep='\t', names=orig_columns)
valid_orig_0 = pd.read_csv(dataset_path + 'valid.txt', sep='\t', names=orig_columns + ['label'])
test_orig_0 = pd.read_csv(dataset_path + 'test.txt', sep='\t', names=orig_columns + ['label'])

valid_orig_pos, valid_orig_neg = _split_by_label(valid_orig_0)
test_orig_pos, test_orig_neg = _split_by_label(test_orig_0)

# Re-stack as positives followed by negatives, the same layout used for the
# `valid` / `test` frames built from the 2id files.
valid_orig = pd.concat([valid_orig_pos, valid_orig_neg], ignore_index=True)
test_orig = pd.concat([test_orig_pos, test_orig_neg], ignore_index=True)

# Every training triple is treated as a positive example.
train_orig['label'] = 1

data_orig_pos = pd.concat([train_orig, valid_orig_pos, test_orig_pos], ignore_index=True)
data_orig = pd.concat([train_orig, valid_orig, test_orig], ignore_index=True)

In [None]:
# Collect every distinct value that appears in the e1 or e2 column of the
# original files (negative triples included, since `data_orig` contains them).
ents_orig = set()
ents_orig.update(data_orig.e1.unique())
ents_orig.update(data_orig.e2.unique())

print('Entities: {}'.format(len(ents_orig)))
print('Relations: {}'.format(len(data_orig.rel.unique())))

print('\nTrain triples: {}'.format(len(train_orig)))
print('Valid triples: {}'.format(len(valid_orig_pos)))
print('Test triples: {}\n'.format(len(test_orig_pos)))
print('Negative Valid triples: {}'.format(len(valid_orig_neg)))
print('Negative Test triples: {}'.format(len(test_orig_neg)))

# .format() has to be applied to the string *inside* the print(...) call.
# Written as `print('...').format(x)` it only works as a Python 2 print
# statement; under Python 3 print() returns None, so the raw template is
# printed and an AttributeError follows.
print('\nAll positive triples: {}'.format(len(data_orig_pos)))
print('All triples: {}'.format(len(data_orig)))

## Comparison of each data point

In [None]:
def compare_rows(df1, df2):
    """Assert that `df1` and `df2` hold the same values, row by row.

    Rows are paired by position (first with first, and so on), so neither
    frame needs a default integer index. Only the columns of `df1` are
    compared; they are looked up by name in `df2`, so column order may differ
    and extra columns in `df2` are ignored.

    Progress is printed every 10000 rows, followed by a final success message.

    Raises:
        AssertionError: if the frames differ in length or any compared cell
            differs.
        KeyError: if `df2` lacks one of `df1`'s columns.
    """
    columns = df1.columns.tolist()

    # Without this check, trailing rows present only in df2 would never be
    # looked at and the comparison would pass silently.
    assert len(df1) == len(df2), 'Something is wrong, bro.'

    # Plain tuples in a shared column order: far cheaper than building a
    # Series per row and indexing df2 once per cell.
    rows_df1 = df1[columns].itertuples(index=False, name=None)
    rows_df2 = df2[columns].itertuples(index=False, name=None)

    for pos, (row_df1, row_df2) in enumerate(zip(rows_df1, rows_df2)):
        for value_df1, value_df2 in zip(row_df1, row_df2):
            assert value_df1 == value_df2, 'Something is wrong, bro.'
        if pos % 10000 == 0:
            # print(...) with one argument behaves the same on Python 2 and 3.
            print(pos)
    print("Finished! Everything is ok!")

In [None]:
# Check the name-converted 2id data against the data read from the original
# .txt files; an AssertionError here means the two versions disagree.
compare_rows(data, data_orig)