In [None]:
import os, subprocess
from tools import dataset_tools
import pandas as pd
import numpy as np
import config, models
import multiprocessing

# Extract features from original graph

In [None]:
# Experiment selection: benchmark name, embedding model class and the
# timestamp string identifying the trained model.
dataset_name = "FB13"
embedding_model = models.TransE
model_timestamp = '1524490825'

# GPU settings. Environment variable values must be strings, hence "1" and not 1.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [None]:
# Directory holding the benchmark files of the selected dataset (trailing slash included).
dataset_path = './benchmarks/' + dataset_name + '/'

## Create original graph for PRA

Create graph input (in `benchmarks/_DATASETNAME_/pra_graph_input2id/`)

In [None]:
from tools.pra_setup import create_graph_input

# Sub-directory name handed to `create_graph_input` for the generated graph input.
graph_input_dirname = '/pra_graph_input2id/'

# Keyword options forwarded unchanged to `create_graph_input`.
graph_input_options = dict(
    names_fname=['train2id.txt', 'test2id.txt', 'valid2id.txt'],
    labels=['valid.tsv', 'test.tsv'],  # folds that have labels
    sep=' ',
    skiprows=1,
    order=['head', 'tail', 'relation'],
    graph_input_dirname=graph_input_dirname,
)
create_graph_input(dataset_path, **graph_input_options)

In [None]:
%%bash -s "$dataset_path" "$graph_input_dirname"
# List the generated graph input files. The path is quoted so that a dataset
# path containing spaces or glob characters is passed to `ls` as a single argument.
ls "$1/$2"

Create the graph spec file in `benchmarks/pra/experiment_specs/`

In [None]:
# Name shared by the PRA graph and by the spec file that describes it.
spec_fname = '{}_2id'.format(dataset_name)

# Absolute directory of the graph input files. It is derived from `dataset_path`
# instead of being hard-coded to one user's home directory; when the notebook is
# run from the OpenKE root it resolves to the same location as before.
graph_input_abspath = os.path.join(
    os.path.abspath(dataset_path), graph_input_dirname.replace('/', ''))

# Named fields replace the five positional `{}` placeholders, so each value is
# passed once and cannot be mis-ordered. Literal JSON braces stay doubled.
spec = """
{{
    "graph": {{
        "name": "{graph_name}",
        "relation sets": [
            {{
                "is kb": false,
                "relation file": "{graph_input}/train2id.tsv"
            }},
            {{
                "is kb": false,
                "relation file": "{graph_input}/valid2id.tsv"
            }}
        ]
    }},
    "operation": {{
        "type": "no op"
    }}
}}
""".format(graph_name=spec_fname, graph_input=graph_input_abspath)
spec_fpath = './benchmarks/pra/experiment_specs/{}.json'.format(spec_fname)

# Create the target directory on first use so that `open` cannot fail on a fresh checkout.
spec_dirpath = os.path.dirname(spec_fpath)
if not os.path.isdir(spec_dirpath):
    os.makedirs(spec_dirpath)
with open(spec_fpath, 'w') as f:
    f.write(spec)
# Function-call form of print, matching the other cells of this notebook.
print("Spec file written: {}".format(spec_fpath))

Run PRA algorithm only to create graph.

The command below will automatically run `ExperimentRunner`, without asking for class selection, for the specific dataset. Make sure the benchmark (original) graph is created.

**Warning**: the cell below will return an error, but the files needed will probably be generated correctly. This happens because the `.json` spec files don't have all the attributes; this is something to fix later.

In [None]:
%%bash -s "$spec_fname"
# Run ExperimentRunner on the spec named by the first argument. The subshell
# keeps the directory change from affecting anything that follows.
(
    cd /home/arthurcgusmao/Projects/xkbc/algorithms/pra/
    sbt "runMain edu.cmu.ml.rtw.pra.experiments.ExperimentRunner /home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/benchmarks/pra/ $1"
)

Generate split (inside `./results/`) with random negative examples (bernoulli or uniform)

## Generate/Read Negative Examples

In [None]:
# Sampling settings for the corrupted (negative) training examples.
bern = True   # True selects the 'bern' distribution, False selects 'unif'
neg_rate = 2  # negative to positive ratio

In [None]:
# Short tag of the sampling distribution, used in file and split names.
if bern:
    distribution = 'bern'
else:
    distribution = 'unif'

# Location of the cached corrupted training set.
corrupted_dirpath = dataset_path + '/corrupted/'
corrupted_filename = 'train2id_{}_{}to1.txt'.format(distribution, neg_rate)
corrupted_filepath = corrupted_dirpath + corrupted_filename

In [None]:
# Columns, in on-disk order, of the cached corrupted training set.
corrupted_columns = ['head', 'tail', 'relation', 'label']

if os.path.exists(corrupted_filepath):
    # Reuse the corrupted set generated by a previous run.
    train2id = pd.read_csv(corrupted_filepath, names=corrupted_columns, sep=' ', skiprows=0)
    print('Corrupted file already exists: {}.'.format(corrupted_filepath))
else:
    # First run for these settings: make sure the cache directory exists,
    # generate the corrupted set and store it in the `corrupted` folder.
    if not os.path.exists(corrupted_dirpath):
        os.makedirs(corrupted_dirpath)
    corrupted = dataset_tools.generate_corrupted_training_examples(
        dataset_path, neg_proportion=neg_rate, bern=bern)
    train2id = pd.DataFrame(corrupted)
    train2id.to_csv(corrupted_filepath, columns=corrupted_columns,
                    index=False, header=False, sep=' ')
    print('Created corrupted file: {}.'.format(corrupted_filepath))

## Read validation and test examples

In [None]:
def _read_labeled_fold(fname, label):
    """Read one triple file from `dataset_path` and tag every row with `label`.

    The first line of the file is skipped and the remaining space-separated
    columns are named head, tail and relation.

    Args:
        fname: file name relative to `dataset_path`.
        label: value written into the new `label` column of every row.

    Returns:
        A DataFrame with columns head, tail, relation and label.
    """
    triples = pd.read_csv(dataset_path + fname, sep=' ', skiprows=1,
                          names=['head', 'tail', 'relation'])
    triples['label'] = label
    return triples


# Positive examples are labelled 1 and negative examples -1.
valid2id_pos = _read_labeled_fold('valid2id.txt', 1)
valid2id_neg = _read_labeled_fold('valid2id_neg.txt', -1)
test2id_pos = _read_labeled_fold('test2id.txt', 1)
test2id_neg = _read_labeled_fold('test2id_neg.txt', -1)

# `ignore_index=True` gives each combined fold a fresh 0..n-1 index. Without it
# the positive and negative halves keep their own row numbers, so index labels
# are duplicated and label-based selection or alignment becomes ambiguous.
valid2id = pd.concat((valid2id_pos, valid2id_neg), ignore_index=True)
test2id = pd.concat((test2id_pos, test2id_neg), ignore_index=True)

In [None]:
# Preview the first rows of the train, validation and test folds, in that order.
display(train2id.head(), valid2id.head(), test2id.head())

## Restore working model

In [None]:
from tools import train_test

# `import_path` was used here without ever being assigned in this notebook, so the
# cell failed with a NameError on a fresh kernel. It is built from the variables of
# the configuration cell.
# NOTE(review): the layout ./results/<dataset>/<model class name>/<timestamp>/ is taken
# from the paths in the Debug section at the end of the notebook -- confirm it matches
# where the trained model was saved.
import_path = './results/{}/{}/{}/'.format(
    dataset_name, embedding_model.__name__, model_timestamp)
con = train_test.restore_model(import_path)

## Predict and Update Data

In [None]:
def _to_signed_label(prediction):
    """Map a prediction to 1 when it equals 1, and to -1 otherwise."""
    if prediction == 1:
        return 1
    return -1


# Replace the `label` column of every fold with the model's own classification.
for fold in (train2id, valid2id, test2id):
    predictions = con.classify(fold['head'], fold['tail'], fold['relation'])
    fold['label'] = predictions
    fold['label'] = fold['label'].map(_to_signed_label)

## Move PRA template to results (model) dir

In [None]:
# Name of the PRA split, e.g. '<distribution>_<neg_rate>to1_2id'.
split_name = '_'.join([distribution, '{}to1'.format(neg_rate), '2id'])
# Directory under the model directory that holds the PRA specs and splits.
pra_explain_path = import_path + '/pra_explain/'

In [None]:
# Absolute path (trailing separator kept) of the PRA graph directory. It is derived
# from the relative benchmarks location instead of being hard-coded to one user's
# home directory; run from the OpenKE root it resolves to the same path as before.
graph_dirpath = os.path.join(
    os.path.abspath('./benchmarks/pra/graphs'), '{}_2id'.format(dataset_name), '')

# Spec asking PRA to create feature matrices from 2-step BFS subgraphs for the
# split named `split_name`. Literal JSON braces are doubled for `str.format`.
spec2 = """
{{
    "graph": "{graph}",
    "split": "{split}",
    "operation": {{
        "type": "create matrices",
        "features": {{
            "type": "subgraphs",
            "path finder": {{
                "type": "BfsPathFinder",
                "number of steps": 2
            }},
            "feature extractors": [
                "PraFeatureExtractor",
                "OneSidedPathAndEndNodeFeatureExtractor",
                "AnyRelFeatureExtractor"
            ],
            "feature size": -1
        }}
    }},
    "output": {{ "output matrices": true }}
}}

""".format(graph=graph_dirpath, split=split_name)
spec2_name = 'extract_all_feat2id__neg_by_random'
spec2_fpath = '{}/experiment_specs/{}.json'.format(pra_explain_path, spec2_name)

# The `experiment_specs` directory may not exist yet under the model directory;
# create it so that `open` does not fail.
spec2_dirpath = os.path.dirname(spec2_fpath)
if not os.path.isdir(spec2_dirpath):
    os.makedirs(spec2_dirpath)
with open(spec2_fpath, 'w') as f:
    f.write(spec2)
# Function-call form of print, matching the other cells of this notebook.
print("Spec file written: {}".format(spec2_fpath))

## Create Split

In [None]:
from tools import pra_setup

# Folds handed to `create_split`, keyed by fold name.
folds_by_name = {'train': train2id, 'valid': valid2id, 'test': test2id}
pra_setup.create_split(
    folds_by_name,
    splits_dirpath=import_path + '/pra_explain/splits',
    split_name=split_name,
)

## Extract Paths for split

In [None]:
# This step used to be a `%%bash` cell fed with `$pra_explain_path_abs`, `$spec_name`
# and `$n_relations`, none of which is assigned anywhere in the notebook, so it could
# not run on a fresh kernel. The same loop is now driven from Python, with every
# input derived from variables that the earlier cells do define.

# Absolute PRA base directory for this model, with a trailing separator.
pra_explain_path_abs = os.path.join(os.path.abspath(pra_explain_path), '')

# NOTE(review): the runner was invoked once per relation. The count is taken here as
# the number of distinct relation ids across the three folds -- confirm that this
# matches the relations present in the split.
n_relations = pd.concat([train2id, valid2id, test2id])['relation'].nunique()

pra_project_dirpath = '/home/arthurcgusmao/Projects/xkbc/algorithms/pra/'
# `spec2_name` is the feature-extraction spec written under `pra_explain_path`.
runner_command = 'runMain edu.cmu.ml.rtw.pra.experiments.ExperimentRunner {} {}'.format(
    pra_explain_path_abs, spec2_name)

for run_index in range(n_relations):
    # As in the shell loop, a failing run does not stop the remaining ones.
    runner = subprocess.Popen(['sbt', runner_command], cwd=pra_project_dirpath,
                              stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                              universal_newlines=True)
    # Stream the runner's output into the notebook line by line.
    for line in iter(runner.stdout.readline, ''):
        print(line.rstrip())
    runner.wait()

In [None]:
# Report the absolute location of the extracted features.
features_dirpath = os.path.abspath(pra_explain_path)
print("Features (paths) extracted and saved into:\n{}".format(features_dirpath))

# Debug

In [None]:
# !rm /home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/./results/FB13/TransE/1524490825//pra_explain//results/ -r

In [None]:
# !rm /home/arthurcgusmao/Projects/xkbc/algorithms/OpenKE/./results/FB13/TransE/1524490825//pra_explain//splits/ -r

In [None]:
# !rm ./benchmarks/FB13//pra_graph_input/ -r