In [7]:
from argparse import Namespace

import pickle

from griffon.utils import load_config
from griffon.preprocessing.pipeline.stage2.vocab import VocabTransform
from griffon.preprocessing.graph.distances import PersonalizedPageRank, ShortestPaths, AncestorShortestPaths, SiblingShortestPaths, DistanceBinning
from griffon.preprocessing.graph.binning import ExponentialBinning
from griffon.preprocessing.graph.transform import DistancesTransformer

from glob import glob

In [8]:
# Input locations. These are relative paths, so they resolve against the
# kernel's current working directory.
args = Namespace(vocab_path="../../../../../models/vocab.pickle",
                 config_file="../../../../../configs/config.json",
                 stage1_root="../../../../../data/processed/stage1")

# Load the pickled vocabulary inside a context manager so the file handle is
# closed as soon as loading finishes instead of lingering in the kernel.
# NOTE(review): pickle.load can execute arbitrary code -- only point
# vocab_path at a file you produced yourself or otherwise trust.
with open(args.vocab_path, "rb") as vocab_file:
    vocab = pickle.load(vocab_file)

# Stage-2 vocabulary transform built from the loaded vocabulary.
vocab_transform = VocabTransform(vocab)

# Dataset configuration; its 'distances' and 'binning' sections are read
# further down in this cell.
config = load_config(args.config_file)

# --- Distance settings: read from the 'distances' section of the config ---
distances_config = config['distances']

# Personalized PageRank parameters.
PPR_ALPHA, PPR_USE_LOG, PPR_THRESHOLD = (
    distances_config['ppr_alpha'],
    distances_config['ppr_use_log'],
    distances_config['ppr_threshold'],
)

# Plain shortest-path parameter.
SP_THRESHOLD = distances_config['sp_threshold']

# Ancestor shortest-path parameters.
(ANCESTOR_SP_FORWARD,
 ANCESTOR_SP_BACKWARD,
 ANCESTOR_SP_NEGATIVE_REVERSE_DISTS,
 ANCESTOR_SP_THRESHOLD) = (
    distances_config['ancestor_sp_forward'],
    distances_config['ancestor_sp_backward'],
    distances_config['ancestor_sp_negative_reverse_dists'],
    distances_config['ancestor_sp_threshold'],
)

# Sibling shortest-path parameters.
(SIBLING_SP_FORWARD,
 SIBLING_SP_BACKWARD,
 SIBLING_SP_NEGATIVE_REVERSE_DISTS,
 SIBLING_SP_THRESHOLD) = (
    distances_config['sibling_sp_forward'],
    distances_config['sibling_sp_backward'],
    distances_config['sibling_sp_negative_reverse_dists'],
    distances_config['sibling_sp_threshold'],
)

# --- Binning settings: read from the 'binning' section of the config ---
binning_config = config['binning']
EXPONENTIAL_BINNING_GROWTH_FACTOR, N_FIXED_BINS, NUM_BINS = (
    binning_config['exponential_binning_growth_factor'],
    binning_config['n_fixed_bins'],
    binning_config['num_bins'],
)

# Build one metric object per distance type, each parameterised by the
# constants read from the 'distances' config section. Construction order
# matches the order of the resulting list.
ppr_metric = PersonalizedPageRank(
    threshold=PPR_THRESHOLD,
    log=PPR_USE_LOG,
    alpha=PPR_ALPHA,
)
sp_metric = ShortestPaths(threshold=SP_THRESHOLD)
ancestor_sp_metric = AncestorShortestPaths(
    forward=ANCESTOR_SP_FORWARD,
    backward=ANCESTOR_SP_BACKWARD,
    negative_reverse_dists=ANCESTOR_SP_NEGATIVE_REVERSE_DISTS,
    threshold=ANCESTOR_SP_THRESHOLD,
)
sibling_sp_metric = SiblingShortestPaths(
    forward=SIBLING_SP_FORWARD,
    backward=SIBLING_SP_BACKWARD,
    negative_reverse_dists=SIBLING_SP_NEGATIVE_REVERSE_DISTS,
    threshold=SIBLING_SP_THRESHOLD,
)
distance_metrics = [ppr_metric, sp_metric, ancestor_sp_metric, sibling_sp_metric]

# Binning object assembled from the 'binning' config values.
exponential_binning = ExponentialBinning(EXPONENTIAL_BINNING_GROWTH_FACTOR)
db = DistanceBinning(NUM_BINS, N_FIXED_BINS, exponential_binning)

# Transformer that receives both the metric list and the binning object.
distances_transformer = DistancesTransformer(distance_metrics, db)


In [12]:
# Collect every stage-1 pickle below stage1_root (recursively). glob returns
# matches in arbitrary, filesystem-dependent order, so sort them to make
# "the first sample" the same file on every run and every machine.
files = sorted(glob(args.stage1_root + "/**/*.pickle", recursive=True))
if not files:
    # Fail with an actionable message instead of an IndexError on files[0].
    raise FileNotFoundError(
        "no .pickle files found under {!r}".format(args.stage1_root))

# Load the first sample; the context manager closes the file handle promptly.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# produced by a trusted stage-1 run.
with open(files[0], "rb") as sample_file:
    sample = pickle.load(sample_file)
print(sample.goal)  # goal as stored in the stage-1 pickle

# Apply the vocabulary transform, then the distances transform; `sample` is
# rebound to each result in turn.
sample = vocab_transform(sample)
sample = distances_transformer(sample)
print(sample.goal)  # goal after both transforms

goal : goal Logic Init Coq eq 0 Compare_Nat Lib_Numerals Factorization Hardware order 0 Compare_Nat Lib_Numerals Factorization Hardware comparison ValB Datatypes Init Coq nat 0 2 n Cons n x X ValB Datatypes Init Coq nat 0 2 n Cons n y Y Compare_Nat Lib_Numerals Factorization Hardware order 0 1
Stage2Statement(name='goal', tokens=[Stage1Token(subtokens=[71]), Stage1Token(subtokens=[6]), Stage1Token(subtokens=[4]), Stage1Token(subtokens=[2]), Stage1Token(subtokens=[23]), Stage1Token(subtokens=[3]), Stage1Token(subtokens=[2830, 96]), Stage1Token(subtokens=[951, 1319]), Stage1Token(subtokens=[1830]), Stage1Token(subtokens=[1653]), Stage1Token(subtokens=[782]), Stage1Token(subtokens=[3]), Stage1Token(subtokens=[2830, 96]), Stage1Token(subtokens=[951, 1319]), Stage1Token(subtokens=[1830]), Stage1Token(subtokens=[1653]), Stage1Token(subtokens=[197]), Stage1Token(subtokens=[4429]), Stage1Token(subtokens=[5]), Stage1Token(subtokens=[4]), Stage1Token(subtokens=[2]), Stage1Token(subtokens=[17]), 