In [None]:
import os, subprocess
from tools import dataset_tools
import pandas as pd
import numpy as np
import config, models
import multiprocessing

# Extract features from original graph

In [None]:
# --- Experiment configuration ---------------------------------------------
# Benchmark folder (looked up under ./benchmarks/) and the embedding model
# class; the class name is also used to build the results path.
dataset_name = "FB13"
embedding_model = models.TransE

# Sub-folder of ./results/<dataset>/<model>/ that is used as the import path.
model_timestamp = 'test'

# Negative sampling: negative-to-positive ratio, and the sampling scheme
# ('bern' when True, 'unif' when False).
neg_rate = 2
bern = True

# Feature extractors to enable; allowed keys: pra, onesided, anyrel.
feature_extractors = [
    'pra',
    'onesided',
    'anyrel',
]

# --- GPU settings ---------------------------------------------------------
# The value should be a string.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [None]:
# Tag for the negative sampling scheme; it is embedded in file and split names.
if bern:
    distribution = 'bern'
else:
    distribution = 'unif'

# Benchmark inputs. Path pieces are joined by plain string concatenation, so
# the relative paths below contain doubled slashes.
dataset_path = './benchmarks/{}/'.format(dataset_name)
graph_input_dirname = '/pra_graph_input/'
pra_graph_input_dir = os.path.abspath(dataset_path + graph_input_dirname)

# Location of the corrupted (negative) training examples file.
corrupted_dirpath = dataset_path + '/corrupted/'
corrupted_filename = 'train2id_{}negrate_{}.txt'.format(neg_rate, distribution)
corrupted_filepath = corrupted_dirpath + corrupted_filename

# Name of the split built from this configuration.
split_name = 'g_{}negrate_{}'.format(neg_rate, distribution)

# Results folder of the embedding model and the pra_explain folders below it.
import_path = './results/{}/{}/{}/'.format(dataset_name, embedding_model.__name__, model_timestamp)
pra_explain_path = import_path + '/pra_explain/'
pra_explain_path_abs = os.path.abspath(pra_explain_path)
experiment_specs_path = pra_explain_path + '/experiment_specs/'

# ensure dirs exist
def ensure_dir(d):
    """Create directory `d`, including missing parents, unless it already exists.

    The directory is created directly instead of being checked for first, so
    the call also succeeds when something else creates `d` between a check and
    the creation.

    Args:
        d: path of the directory to create.

    Raises:
        OSError: if the creation fails and `d` still does not exist afterwards
            (for example because of missing permissions or an empty path).
    """
    try:
        os.makedirs(d)
    except OSError:
        # Only a failure that leaves the path missing is a real error; an
        # already existing path is the outcome the caller asked for.
        if not os.path.exists(d):
            raise
for required_dir in (pra_explain_path, experiment_specs_path):
    ensure_dir(required_dir)

# Maps the short keys accepted in `feature_extractors` to the extractor names
# that are written into the experiment spec.
feature_extractor_dict = {
    'pra': 'PraFeatureExtractor',
    'onesided': 'OneSidedPathAndEndNodeFeatureExtractor',
    'anyrel': 'AnyRelFeatureExtractor'
}

# Double-quoted extractor names, comma separated, ready to be placed inside
# the spec's "feature extractors" list.
feat_list = ['"{}"'.format(feature_extractor_dict[key]) for key in feature_extractors]
feat_extractor_string = ','.join(feat_list)

# Spec name: the split name, then '__', then '_<key>' for every extractor
# (so three underscores follow the split name).
spec_name = split_name + '__' + ''.join('_' + key for key in feature_extractors)

## Create original graph input for PRA

In [None]:
from tools.pra_setup import create_graph_input

# Folds that have labels.
labelled_folds = ['valid.txt', 'test.txt']

create_graph_input(dataset_path, labels=labelled_folds, graph_input_dirname=graph_input_dirname)

## Generate/Read Negative Examples

In [None]:
# Column layout of the corrupted training file (space separated, no header).
corrupted_columns = ['head', 'tail', 'relation', 'label']

if not os.path.exists(corrupted_filepath):
    # Reuse the notebook's helper instead of repeating the exists/makedirs logic.
    ensure_dir(corrupted_dirpath)
    # Generate the corrupted set and save it to disk in the `corrupted` folder.
    corrupted = dataset_tools.generate_corrupted_training_examples(
        dataset_path, neg_proportion=neg_rate, bern=bern)
    # Restrict the frame to the columns that are written to disk, so a first
    # run works on the same column layout as later runs that read the file.
    train2id = pd.DataFrame(corrupted, columns=corrupted_columns)
    train2id.to_csv(corrupted_filepath,
                    columns=corrupted_columns, index=False, header=False, sep=' ')
    print('Created corrupted file: {}.'.format(corrupted_filepath))
else:
    train2id = pd.read_csv(corrupted_filepath,
                           names=corrupted_columns, sep=' ', skiprows=0)
    print('Corrupted file already exists: {}.'.format(corrupted_filepath))

## Read validation and test examples

In [None]:
def _read_labelled_fold(filename, label):
    """Read `filename` from the dataset folder and attach a constant label.

    The first line of the file is skipped and the remaining space-separated
    columns are named head, tail and relation.
    """
    triples = pd.read_csv(dataset_path + filename, sep=' ', skiprows=1,
                          names=['head', 'tail', 'relation'])
    triples['label'] = label
    return triples


# Positive examples are labelled 1, negative examples -1.
valid2id_pos = _read_labelled_fold('valid2id.txt', 1)
valid2id_neg = _read_labelled_fold('valid2id_neg.txt', -1)
test2id_pos = _read_labelled_fold('test2id.txt', 1)
test2id_neg = _read_labelled_fold('test2id_neg.txt', -1)

# NOTE(review): concat is called without ignore_index, so each combined frame
# keeps the row labels of its two parts (index values repeat).
valid2id = pd.concat([valid2id_pos, valid2id_neg])
test2id = pd.concat([test2id_pos, test2id_neg])

In [None]:
# Preview the first rows of the train, validation and test folds, in that order.
display(train2id.head(), valid2id.head(), test2id.head())

## Restore working model

In [None]:
from tools import train_test
# Restore the model from `import_path`; `con` is the object whose `classify`
# method is called on every fold later in the notebook.
# NOTE(review): what exactly restore_model loads is defined in tools.train_test — confirm there.
con = train_test.restore_model(import_path)

## Predict and Update Data

In [None]:
def _to_signed_label(prediction):
    """Return 1 when `prediction` equals 1, and -1 for anything else."""
    return 1 if prediction == 1 else -1


for fold in (train2id, valid2id, test2id):
    # Overwrite the stored labels with the model's classification of each triple...
    fold['label'] = con.classify(fold['head'], fold['tail'], fold['relation'])
    # ...and normalise them to the 1 / -1 convention.
    fold['label'] = fold['label'].map(_to_signed_label)

## Decode from id to names

In [None]:
entity2id, id2entity = dataset_tools.read_name2id_file(dataset_path + 'entity2id.txt')
relation2id, id2relation = dataset_tools.read_name2id_file(dataset_path + 'relation2id.txt')
n_relations = len(relation2id)

# Which id -> name mapping decodes which column.
column_decoders = (
    ('head', id2entity),
    ('tail', id2entity),
    ('relation', id2relation),
)
for fold in (train2id, valid2id, test2id):
    for column, id2name in column_decoders:
        fold[column] = fold[column].map(id2name)

# WARNING: from this point on the dataframes have been transformed in place;
#   entities and relations are represented by their names, not by ids

## Setup PRA Experiment Specs

In [None]:
# Experiment spec template. Doubled braces are literal JSON braces; the four
# `{}` placeholders receive, in order: the graph input directory (for
# train.tsv), the same directory again (for valid.tsv), the split name and the
# comma-separated, quoted feature extractor names.
spec = """
{{
    "graph": {{
        "name": "g",
        "relation sets": [
            {{
                "is kb": false,
                "relation file": "{}/train.tsv"
            }},
            {{
                "is kb": false,
                "relation file": "{}/valid.tsv"
            }}
        ]
    }},
    "split": "{}",
    "operation": {{
        "type": "create matrices",
        "features": {{
            "type": "subgraphs",
            "path finder": {{
                "type": "BfsPathFinder",
                "number of steps": 2
            }},
            "feature extractors": [
                {}
            ],
            "feature size": -1
        }}
    }},
    "output": {{ "output matrices": true }}
}}

""".format(pra_graph_input_dir, pra_graph_input_dir, split_name, feat_extractor_string)

# The spec is stored as <pra_explain_path>/experiment_specs/<spec_name>.json.
spec_fpath = '{}/experiment_specs/{}.json'.format(pra_explain_path, spec_name)
with open(spec_fpath, 'w') as f:
    f.write(spec)
# Call form of print: same output under Python 2, valid syntax under Python 3,
# and consistent with the other cells of this notebook.
print("Spec file written: {}".format(spec_fpath))

## Create Split

Generate split (inside `./results/`) with random negative examples (bernoulli or uniform)

In [None]:
from tools import pra_setup

# Pass the three folds to pra_setup.create_split under the name `split_name`,
# with `<import_path>/pra_explain/splits` as the splits directory.
# At this point the frames hold the labels and names produced by the earlier
# "Predict and Update Data" and "Decode from id to names" cells.
pra_setup.create_split({'train': train2id, 'valid': valid2id, 'test': test2id},
                       splits_dirpath=import_path+'/pra_explain/splits',
                       split_name=split_name)

## Extract Paths for split

In [None]:
# import subprocess
# bash_command = '(cd /home/arthurcgusmao/Projects/xkbc/algorithms/pra/; sbt "runMain edu.cmu.ml.rtw.pra.experiments.ExperimentRunner {} {}")'.format(pra_explain_path_abs, spec_name)
# for r in relation2id:
#     process = subprocess.Popen(bash_command.split(), stdout=subprocess.PIPE)
#     output, error = process.communicate()
#     print output
# print("Features (paths) extracted and saved into:\n{}".format(os.path.abspath(pra_explain_path)))

### FIXME: not working

In [None]:
%%bash -s "$pra_explain_path_abs" "$spec_name" "$n_relations"

# Arguments: $1 = absolute pra_explain path, $2 = spec name, $3 = number of relations.
# The same ExperimentRunner command is launched $3 times; the loop index is
# not passed to it.
# NOTE(review): presumably every launch processes one not-yet-handled relation — confirm against PRA.
# NOTE(review): the PRA checkout below is a hardcoded absolute path; adjust it for your machine.
for i in $(seq 1 "$3")
do
    # `&&` keeps sbt from being started in the wrong directory when the cd fails.
    (cd /home/arthurcgusmao/Projects/xkbc/algorithms/pra/ && sbt "runMain edu.cmu.ml.rtw.pra.experiments.ExperimentRunner $1 $2")
done

# Debug

In [None]:
# !rm /home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/./results/FB13/TransE/1524490825//pra_explain//results/ -r