In [2]:
import argparse
from argparse import Namespace

import os
import pickle

from griffon.utils import load_config
from griffon.coq_dataclasses import Stage1Sample

from griffon.preprocessing.pipeline.stage2.vocab import VocabTransform
from griffon.preprocessing.graph.distances import PersonalizedPageRank, ShortestPaths, AncestorShortestPaths, SiblingShortestPaths, DistanceBinning
from griffon.preprocessing.graph.binning import ExponentialBinning
from griffon.preprocessing.graph.transform import DistancesTransformer

import multiprocessing as mp

import itertools

from glob import glob
from tqdm import tqdm


from typing import Tuple

def get_vocab_transformer(args: Namespace) -> VocabTransform:
    """
    Build the vocabulary transform from a pickled vocabulary.

    Args:
        args: namespace exposing ``vocab_path``, the path of the pickled
            vocabulary file.

    Returns:
        A ``VocabTransform`` wrapping the unpickled vocabulary.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle even if unpickling fails.
    with open(args.vocab_path, "rb") as vocab_file:
        vocab = pickle.load(vocab_file)
    return VocabTransform(vocab)


def get_distances_transformer(config):
    """
    Build the ``DistancesTransformer`` described by the dataset config.

    The ``'distances'`` section of ``config`` supplies the parameters of the
    four distance metrics, and the ``'binning'`` section supplies the
    parameters of the distance binning.
    """
    # All settings are read up front, before any object is constructed.
    dist_cfg = config['distances']

    ppr_kwargs = {
        'alpha': dist_cfg['ppr_alpha'],
        'log': dist_cfg['ppr_use_log'],
        'threshold': dist_cfg['ppr_threshold'],
    }

    sp_kwargs = {
        'threshold': dist_cfg['sp_threshold'],
    }

    ancestor_kwargs = {
        'forward': dist_cfg['ancestor_sp_forward'],
        'backward': dist_cfg['ancestor_sp_backward'],
        'negative_reverse_dists': dist_cfg['ancestor_sp_negative_reverse_dists'],
        'threshold': dist_cfg['ancestor_sp_threshold'],
    }

    sibling_kwargs = {
        'forward': dist_cfg['sibling_sp_forward'],
        'backward': dist_cfg['sibling_sp_backward'],
        'negative_reverse_dists': dist_cfg['sibling_sp_negative_reverse_dists'],
        'threshold': dist_cfg['sibling_sp_threshold'],
    }

    # Binning settings from the dataset config
    bin_cfg = config['binning']
    growth_factor = bin_cfg['exponential_binning_growth_factor']
    n_fixed_bins = bin_cfg['n_fixed_bins']
    num_bins = bin_cfg['num_bins']

    metrics = [
        PersonalizedPageRank(**ppr_kwargs),
        ShortestPaths(**sp_kwargs),
        AncestorShortestPaths(**ancestor_kwargs),
        SiblingShortestPaths(**sibling_kwargs),
    ]

    binning = DistanceBinning(
        num_bins, n_fixed_bins, ExponentialBinning(growth_factor))

    return DistancesTransformer(metrics, binning)



In [16]:


# Notebook stand-in for command-line arguments; every path is relative to
# the directory the notebook is run from.
args = Namespace(
    stage1_root="../../../../../data/processed/stage1/",
    config_path="../../../../../configs/config.json",
    vocab_path="../../../../../models/vocab.pickle",
)

# Load the dataset config, then build the two transforms once at module
# level so `process_sample` can reuse them for every file.
config = load_config(args.config_path)
vocab_transformer = get_vocab_transformer(args)
distances_transformer = get_distances_transformer(config)

def process_sample(filename):
    """
    Load one pickled sample and run it through both transforms.

    Args:
        filename: path to a pickle file holding a ``Stage1Sample``.

    Returns:
        The loaded sample after applying the module-level
        ``vocab_transformer`` followed by ``distances_transformer``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle even if unpickling fails.
    with open(filename, "rb") as sample_file:
        sample: Stage1Sample = pickle.load(sample_file)

    sample = vocab_transformer(sample)
    sample = distances_transformer(sample)

    return sample

# Match every pickle file below the stage1 root, at any directory depth.
pattern = os.path.join(args.stage1_root, "**", "*.pickle")

# glob returns matches in arbitrary order; sort so that `files[0]` (and any
# later iteration) is the same on every run.
files = sorted(glob(pattern, recursive=True))

# Fail with an explicit message instead of a bare IndexError on `files[0]`.
if not files:
    raise FileNotFoundError(f"No sample files match {pattern!r}")

# Smoke test: process a single sample and display the result.
process_sample(files[0])

Stage2Sample(hypotheses=[Stage2Statement(name='orig_base_params', tokens=[Stage1Token(subtokens=[0, 373, 1824]), Stage1Token(subtokens=[0]), Stage1Token(subtokens=[0]), Stage1Token(subtokens=[0]), Stage1Token(subtokens=[3])], adjacency_matrix=tensor([[0., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 1., 0., 0., 0., 0., 1.],
        [0., 0., 0., 1., 0., 1., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1., 0., 1., 1., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]], dtype=torch.float64), distances=[(tensor([[ 1,  3,  7, 14, 20, 26, 27, 27, 22, 15],
        [ 9,  4, 14, 21, 28, 31, 31, 31, 29, 23],
        [ 7,  8,  3, 10, 16, 22, 24, 24, 18, 11],
        [10, 11,  7,  2,  9, 14, 16, 16, 10,  4],
        [17, 18, 13,  

In [7]:
# Shell out to list the contents of the stage1 root directory.
!ls ../../../../../data/processed/stage1/

Dockerfile  data	    notebooks	      src
Makefile    models	    requirements.txt  sum_fake_output
configs     my_archive.tar  setup.py	      test
