In [1]:
import pandas as pd

In [33]:
# --- Input files: raw triple files located under the release directory ---
BASE_DIR = './Release/'
TRAIN_FILE = BASE_DIR + 'train.txt'
VALID_FILE = BASE_DIR + 'valid.txt'
TEST_FILE = BASE_DIR + 'test.txt'
TEXT_EMNLP_FILE = BASE_DIR + 'text_emnlp.txt'
TEXT_CVSC_FILE = BASE_DIR + 'text_cvsc.txt'

# --- Output files: written relative to the current working directory ---
# KB-only outputs.
VALID_CSV_FILE = 'fb15k_valid.csv'
TEST_CSV_FILE = 'fb15k_test.csv'
ENTITIES_CSV_FILE = 'fb15k_entities.csv'
RELATIONS_CSV_FILE = 'fb15k_relations.csv'
# KB + EMNLP text outputs.
EMNLP_TRAIN_CSV_FILE = 'fb15k_emnlp_train.csv'
EMNLP_PAIRS_CSV_FILE = 'fb15k_emnlp_pairs.csv'
EMNLP_RELATIONS_CSV_FILE = 'fb15k_emnlp_relations.csv'
# KB + CVSC text outputs.
CVSC_TRAIN_CSV_FILE = 'fb15k_cvsc_train.csv'
CVSC_PAIRS_CSV_FILE = 'fb15k_cvsc_pairs.csv'
CVSC_RELATIONS_CSV_FILE = 'fb15k_cvsc_relations.csv'

# --- Shared ID lookup tables (key -> integer ID), filled in by index() ---
# NOTE: re-running this cell replaces both tables with empty dicts, so any
# ID assigned afterwards starts again from 0. Run the notebook top to bottom.
ENTITY_PAIRS = dict()
RELATIONS = dict()

In [3]:
def index(val, idx):
    """Return the integer ID of ``val`` in the lookup table ``idx``.

    If ``val`` has not been seen before, it is registered with the next free
    ID (the current size of ``idx``), so IDs are handed out consecutively in
    first-seen order. ``idx`` is modified in place.
    """
    if val in idx:
        return idx[val]
    next_id = len(idx)
    idx[val] = next_id
    return next_id

In [4]:
def add_id_columns(df, pair_index=None, rel_index=None):
    """Add 'pair', 'pid' and 'rid' columns to ``df`` in place.

    * 'pair' is the string ``subj + ':' + obj``.
    * 'pid' is the integer ID of that pair, looked up / allocated via index().
    * 'rid' is the integer ID of the 'rel' value, looked up / allocated via
      index().

    Parameters
    ----------
    df : DataFrame with 'subj', 'rel' and 'obj' columns. Modified in place.
    pair_index : dict, optional
        Lookup table for entity pairs. Defaults to the notebook-level
        ENTITY_PAIRS table.
    rel_index : dict, optional
        Lookup table for relations. Defaults to the notebook-level RELATIONS
        table.

    Returns None. Both lookup tables grow as new keys are encountered.
    """
    # Compare against None explicitly: an empty dict supplied by the caller is
    # falsy but must still be used (and filled) instead of the global table.
    if pair_index is None:
        pair_index = ENTITY_PAIRS
    if rel_index is None:
        rel_index = RELATIONS
    df['pair'] = df['subj'] + ':' + df['obj']
    # All pairs are indexed before any relation, matching first-seen order.
    df['pid'] = df['pair'].apply(lambda pair: index(pair, pair_index))
    df['rid'] = df['rel'].apply(lambda rel: index(rel, rel_index))

## Prepare KB triples train/validate/test sets

In [5]:
# Load the KB training triples (tab-separated, no header row: subj, rel, obj)
# and attach the pair / relation ID columns. When the notebook is run top to
# bottom with empty lookup tables, this is the first frame indexed, so its
# pairs and relations receive the lowest IDs.
train_kb_triples = pd.read_csv(TRAIN_FILE, sep='\t', names=['subj', 'rel', 'obj'])
add_id_columns(train_kb_triples)
# A single-argument print(...) emits the same text on Python 2 and Python 3.
print('Train KB triples: %d' % len(train_kb_triples))

Train KB triples: 272115


In [27]:
# Load the KB validation triples (tab-separated, no header row), attach the
# pair / relation IDs, and export them as a tab-separated CSV with a header.
valid_kb_triples = pd.read_csv(VALID_FILE, sep='\t', names=['subj', 'rel', 'obj'])
add_id_columns(valid_kb_triples)
# A single-argument print(...) emits the same text on Python 2 and Python 3.
print('Validation KB triples: %d' % len(valid_kb_triples))
# The helper 'pair' column is intentionally left out of the export.
valid_kb_triples.to_csv(VALID_CSV_FILE, sep='\t', header=True, columns=['subj', 'rel', 'obj', 'pid', 'rid'])
print('Saved to %s' % VALID_CSV_FILE)

Validation KB triples: 17535
Saved to fb15k_valid.csv


In [7]:
# Load the KB test triples (tab-separated, no header row), attach the
# pair / relation IDs, and export them as a tab-separated CSV with a header.
test_kb_triples = pd.read_csv(TEST_FILE, sep='\t', names=['subj', 'rel', 'obj'])
add_id_columns(test_kb_triples)
# A single-argument print(...) emits the same text on Python 2 and Python 3.
print('Test KB triples: %d' % len(test_kb_triples))
# The helper 'pair' column is intentionally left out of the export.
test_kb_triples.to_csv(TEST_CSV_FILE, sep='\t', header=True, columns=['subj', 'rel', 'obj', 'pid', 'rid'])
print('Saved to %s' % TEST_CSV_FILE)

Test KB triples: 20466
Saved to fb15k_test.csv


## Count KB entities and relations

In [8]:
# Collect every distinct entity that occurs as a subject or an object in the
# KB training triples (validation / test triples are not consulted here).
#
# The concatenated series carries the row labels of both source columns, so
# after de-duplication two different entities can share the same label.
# reset_index(drop=True) replaces them with a clean 0..N-1 index, which is
# what ends up in the unnamed index column of the exported CSV.
entities = pd.DataFrame({
    'entity': pd.concat([train_kb_triples['subj'], train_kb_triples['obj']])
                .drop_duplicates()
                .reset_index(drop=True)
})
# A single-argument print(...) emits the same text on Python 2 and Python 3.
print('Entities: %d' % len(entities))
entities.to_csv(ENTITIES_CSV_FILE, sep='\t', header=True, columns=['entity'])
print('Saved to %s' % ENTITIES_CSV_FILE)

Entities: 14505
Saved to fb15k_entities.csv


In [9]:
# Collect the distinct relations of the KB training triples together with
# their integer IDs. Only the training split is consulted, so wrapping the
# single column in pd.concat([...]) is unnecessary.
relations = pd.DataFrame({'rel': train_kb_triples['rel'].drop_duplicates()})
# index() returns the ID already stored in RELATIONS for a known relation and
# allocates a new one otherwise.
relations['rid'] = relations['rel'].apply(lambda rel: index(rel, RELATIONS))
# A single-argument print(...) emits the same text on Python 2 and Python 3.
print('Relations: %d' % len(relations))
relations.to_csv(RELATIONS_CSV_FILE, sep='\t', header=True, columns=['rel', 'rid'])
print('Saved to %s' % RELATIONS_CSV_FILE)

Relations: 237
Saved to fb15k_relations.csv


## Prepare EMNLP datasets

In [10]:
# Load the EMNLP text triples (tab-separated, no header row). Besides
# subj / rel / obj they carry a fourth 'occ' column
# (presumably an occurrence count -- TODO confirm against the dataset README).
emnlp_text_triples = pd.read_csv(TEXT_EMNLP_FILE, sep='\t', names=['subj', 'rel', 'obj', 'occ'])
add_id_columns(emnlp_text_triples)
# A single-argument print(...) emits the same text on Python 2 and Python 3.
print('Text triples (EMNLP): %d' % len(emnlp_text_triples))
# Stack KB training triples on top of the text triples. The KB frame has no
# 'occ' column, so the outer join leaves it as NaN for those rows.
# ignore_index=True renumbers the rows 0..N-1; without it both inputs
# contribute labels starting at 0 and the exported index column repeats.
emnlp_train_triples = pd.concat([train_kb_triples, emnlp_text_triples], join="outer", ignore_index=True)
print('Training triples (EMNLP): %d' % len(emnlp_train_triples))
emnlp_train_triples.to_csv(EMNLP_TRAIN_CSV_FILE, sep='\t', header=True, columns=['subj', 'rel', 'obj', 'pid', 'rid', 'occ'])
print('Saved to %s' % EMNLP_TRAIN_CSV_FILE)

Text triples (EMNLP): 3978014
Training triples (EMNLP): 4250129
Saved to fb15k_emnlp_train.csv


In [37]:
# Number of distinct 'rel' values among the EMNLP text triples.
len(emnlp_text_triples['rel'].unique())

2740403

In [30]:
# Distinct (subj, obj, pid) combinations across the KB + EMNLP training set.
emnlp_pairs = emnlp_train_triples[['subj', 'obj', 'pid']].drop_duplicates()
# A single-argument print(...) emits the same text on Python 2 and Python 3.
print('Entity pairs (EMNLP): %d' % len(emnlp_pairs))
emnlp_pairs.to_csv(EMNLP_PAIRS_CSV_FILE, sep='\t', header=True, columns=['subj', 'obj', 'pid'])
print('Saved to %s' % EMNLP_PAIRS_CSV_FILE)

Entity pairs (EMNLP): 2014530
Saved to fb15k_emnlp_pairs.csv


In [34]:
# Distinct (rel, rid) combinations across the KB + EMNLP training set.
emnlp_relations = emnlp_train_triples[['rel', 'rid']].drop_duplicates()
# A single-argument print(...) emits the same text on Python 2 and Python 3.
print('Relations (EMNLP): %d' % len(emnlp_relations))
emnlp_relations.to_csv(EMNLP_RELATIONS_CSV_FILE, sep='\t', header=True, columns=['rel', 'rid'])
print('Saved to %s' % EMNLP_RELATIONS_CSV_FILE)

Relations (EMNLP): 2740640
Saved to fb15k_emnlp_relations.csv


## Prepare CVSC datasets

In [11]:
# Load the CVSC text triples (tab-separated, no header row). Besides
# subj / rel / obj they carry a fourth 'occ' column
# (presumably an occurrence count -- TODO confirm against the dataset README).
cvsc_text_triples = pd.read_csv(TEXT_CVSC_FILE, sep='\t', names=['subj', 'rel', 'obj', 'occ'])
add_id_columns(cvsc_text_triples)
# A single-argument print(...) emits the same text on Python 2 and Python 3.
print('Text triples (CVSC): %d' % len(cvsc_text_triples))
# Stack KB training triples on top of the text triples. The KB frame has no
# 'occ' column, so the outer join leaves it as NaN for those rows.
# ignore_index=True renumbers the rows 0..N-1; without it both inputs
# contribute labels starting at 0 and the exported index column repeats.
cvsc_train_triples = pd.concat([train_kb_triples, cvsc_text_triples], join="outer", ignore_index=True)
print('Training triples (CVSC): %d' % len(cvsc_train_triples))
cvsc_train_triples.to_csv(CVSC_TRAIN_CSV_FILE, sep='\t', header=True, columns=['subj', 'rel', 'obj', 'pid', 'rid', 'occ'])
print('Saved to %s' % CVSC_TRAIN_CSV_FILE)

Text triples (CVSC): 6600401
Training triples (CVSC): 6872516
Saved to fb15k_cvsc_train.csv


In [38]:
# Number of distinct 'rel' values among the CVSC text triples.
len(cvsc_text_triples['rel'].unique())

25917

In [31]:
# Distinct (subj, obj, pid) combinations across the KB + CVSC training set.
cvsc_pairs = cvsc_train_triples[['subj', 'obj', 'pid']].drop_duplicates()
# A single-argument print(...) emits the same text on Python 2 and Python 3.
print('Entity pairs (CVSC): %d' % len(cvsc_pairs))
cvsc_pairs.to_csv(CVSC_PAIRS_CSV_FILE, sep='\t', header=True, columns=['subj', 'obj', 'pid'])
print('Saved to %s' % CVSC_PAIRS_CSV_FILE)

Entity pairs (CVSC): 2966835
Saved to fb15k_cvsc_pairs.csv


In [35]:
# Distinct (rel, rid) combinations across the KB + CVSC training set.
cvsc_relations = cvsc_train_triples[['rel', 'rid']].drop_duplicates()
# A single-argument print(...) emits the same text on Python 2 and Python 3.
print('Relations (CVSC): %d' % len(cvsc_relations))
cvsc_relations.to_csv(CVSC_RELATIONS_CSV_FILE, sep='\t', header=True, columns=['rel', 'rid'])
print('Saved to %s' % CVSC_RELATIONS_CSV_FILE)

Relations (CVSC): 26154
Saved to fb15k_cvsc_relations.csv
