In [5]:
import os
import pandas as pd
import multiprocessing as mp
import json

from sfe import Graph, SFE
from helpers import save_features_to_disk
import time


def pipeline(
    # Files & Directories paths
    pra_graph_input_path, # list containing the path to where the tsv files containing set of triples to be used to build the graph are, e.g., `XKE/benchmarks/FB13/pra_graph_input/`. There should be `train.tsv` and `valid.tsv` files. Each line should contain the head, relation and tail of an existing triple, in the mentioned order.
    datasets_paths, # list containing the path for each dataset for which features will be extracted. The dataset file should be a TSV file containing the columns (in the order): head, relation, tail, label
    output_dir, # path for the output dir, the directory where results will be saved

    # SFE options
    max_depth=2,
    max_fan_out=100,
    bfs_memory_size=1000,
    batch_size=10000, # number of features that will be processed in a row before saving them to disk (and freeing up memory space). Notice that this applies to each Process, so in practice this number is multiplied by the number of cores.
):

    # SAVE HYPERPARAMS TO DISK
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    params_str = json.dumps(locals())
    with open(os.path.join(output_dir, 'hyperparameters.json'), 'w') as f:
        f.write(params_str)

    # BUILD GRAPH
    g = Graph()
    train = pd.read_csv(os.path.join(pra_graph_input_path, 'train.tsv'), sep='\t', skiprows=0, names=['head', 'relation', 'tail'])
    valid = pd.read_csv(os.path.join(pra_graph_input_path, 'valid.tsv'), sep='\t', skiprows=0, names=['head', 'relation', 'tail'])
    print("Building graph..."),
    g.partial_build_from_df(train)
    g.partial_build_from_df(valid)
    print("Built.")


    # EXTRACT FEATURES
    sfe = SFE(g, max_depth, max_fan_out, bfs_memory_size)
    return sfe
    
    


In [12]:
dataset_str = 'NELL186'

sfe = pipeline(
    # Files & Directories paths
    pra_graph_input_path=\
        '../benchmarks/{}/pra_graph_input/'.format(dataset_str),
    datasets_paths=[
        '../benchmarks/{}/corrupted/train.txt'.format(dataset_str),
        '../benchmarks/{}/valid.txt'.format(dataset_str),
        '../benchmarks/{}/test.txt'.format(dataset_str),
    ], 
    output_dir=\
        './results/{}/{}'.format(dataset_str, int(time.time())),
    # SFE options
    max_depth=2,
    max_fan_out=100,
    bfs_memory_size=1000,
    batch_size=10000, # number of features that will be processed in a row before saving them to disk (and freeing up memory space). Notice that this applies to each Process, so in practice this number is multiplied by the number of cores.
)

IndentationError: unexpected indent (<ipython-input-12-fbe82b1cca16>, line 22)

In [19]:
# define parameters in this scope
pra_graph_input_path=\
    '../benchmarks/{}/pra_graph_input/'.format(dataset_str)
datasets_paths=[
    '../benchmarks/{}/corrupted/train.txt'.format(dataset_str),
    '../benchmarks/{}/valid.txt'.format(dataset_str),
    '../benchmarks/{}/test.txt'.format(dataset_str),
]
output_dir=\
    './results/{}/{}'.format(dataset_str, int(time.time()))
# SFE options
max_depth=2
max_fan_out=100
bfs_memory_size=1000
batch_size=10000 # number of features that will be processed in a row before saving them to disk (and freeing up memory space). Notice that this applies to each Process, so in practice this number is multiplied by the number of cores.

# Debug

In [7]:
sfe.

<sfe.SFE at 0x7f3f7482cc10>

# Last steps of pipeline

In [21]:
for filepath in datasets_paths:
    print("\nStarting feature extraction for `{}` ...".format(filepath))
    df = pd.read_csv(filepath, sep='\t', skiprows=0, names=['head', 'relation', 'tail', 'label'])
    df = df.sort_values(by=['head', 'tail']) # this is important if multiprocessing to help SFE save computing time using the BFS memory
    output_file_name = os.path.basename(filepath).replace('.txt', '.tsv')

    # @TODO: put this into multiprocessing (or maybe threading)
    count = 0
    for res in sfe.extract_features(df, batch_size=batch_size):
        save_features_to_disk(res, output_dir, output_file_name)
        count += batch_size
        print("{} examples processed...".format(count))

print("\nPipeline finished.")


Starting feature extraction for `../benchmarks/NELL186/corrupted/train.txt` ...


KeyError: 'concept:academicfield:accountancy'

In [22]:
sfe.graph.get_node('concept:academicfield:accountancy')

KeyError: 'concept:academicfield:accountancy'

In [25]:
sfe.graph.get_node('concept:coach:mark_hendrickson')

KeyError: 'concept:coach:mark_hendrickson'

In [24]:
sfe.graph.nodes

{nan: <sfe.Node at 0x7f3f74751a50>,
 nan: <sfe.Node at 0x7f3f74751d10>,
 '15355,concept:fish:tuna,concept:fishservedwithfood,concept:food:crust': <sfe.Node at 0x7f3f713ee050>,
 nan: <sfe.Node at 0x7f3f6f42ced0>,
 nan: <sfe.Node at 0x7f3f73edde10>,
 '12448,concept:clothing:trousers,concept:clothingtogowithclothing,concept:clothing:blouse': <sfe.Node at 0x7f3f71c12b90>,
 nan: <sfe.Node at 0x7f3f700e72d0>,
 nan: <sfe.Node at 0x7f3f6ff33dd0>,
 nan: <sfe.Node at 0x7f3f73102d50>,
 '11590,concept:city:woburn,concept:atlocation,concept:stateorprovince:massachusetts': <sfe.Node at 0x7f3f71ea57d0>,
 '25905,concept:sportsteam:baltimore_ravens,concept:teamwontrophy,concept:awardtrophytournament:afc_championship': <sfe.Node at 0x7f3f6f5f6ed0>,
 '9589,concept:city:boone,concept:atlocation,concept:geopoliticallocation:north_carolina': <sfe.Node at 0x7f3f72773f50>,
 '6574,concept:beverage:tea,concept:agriculturalproductcontainchemical,concept:chemical:acid': <sfe.Node at 0x7f3f72fe8410>,
 '16952,conce