## How many nodes will be updated by the optimization?

In [1]:
from src.models.CORAL_BART.dataset import KaggleDiffsDataset, KaggleDiffsReader
from src.data.scrape_library_structures import TokenizedGraph
from src.models.CORAL_BART.utils import  block_shuffle
from transformers import BartTokenizerFast

import os
from collections import Counter
import pickle
import numpy as np

In [2]:
DIFF_PATH = "/homes/gws/mikeam/RobustDataScience/data/processed/filtered_less_than_5_lines.jsonl"
diff_reader = KaggleDiffsReader(DIFF_PATH)

Loading Diffs: 69898it [00:02, 26853.24it/s]


In [3]:
# Reorder the loaded diffs with block_shuffle, keyed on each diff's
# competition name (metadata -> comp_name).
diff_reader.diffs = block_shuffle(
    diff_reader.diffs,
    key_fn=lambda diff: diff["metadata"]["comp_name"],
)

In [4]:
# Build the tokenizer from the vocab/merges files stored in the tokenizer directory.
path_to_tokenizer = "../../tokenizer"
vocab_path, merges_path = (
    os.path.join(path_to_tokenizer, file_name)
    for file_name in ("vocab.json", "merges.txt")
)
tokenizer = BartTokenizerFast(vocab_path, merges_path)

In [5]:
all_diffs = KaggleDiffsDataset(diff_reader.diffs,tokenizer)

In [6]:
# Tally how often each token id occurs across the input_ids of every diff.
input_id_counter = Counter()
for diff in all_diffs:
    # A generator is passed so the ids are always counted element by element.
    input_id_counter.update(token_id for token_id in diff["input_ids"])

In [7]:
def load_tokenized_graph(path):
    """Load a pickled tokenized graph from disk.

    Args:
        path: Filesystem path of the pickle file to read.

    Returns:
        The object that was serialized into the file at ``path``.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the contents are not a valid pickle
            (pickle.load may also propagate other exceptions).
    """
    # NOTE(security): pickle.load can execute arbitrary code during
    # deserialization; only use this on files from a trusted source.
    # The context manager closes the handle even if unpickling raises.
    with open(path, "rb") as f:
        return pickle.load(f)

tokenized_graph = load_tokenized_graph("/homes/gws/mikeam/RobustDataScience/data/processed/top_10_lib_tree_no_args/tokenized_graph.pickle")

In [8]:
# Number of graph-supported token ids that occur at least once in the dataset inputs.
sum(1 for token_id in tokenized_graph.supported_input_ids if token_id in input_id_counter)

5110

In [9]:
len(tokenized_graph.supported_input_ids)

6670

In [10]:
# Graph-supported token ids that never appear in any dataset input.
in_graph_not_data = [
    token_id
    for token_id in tokenized_graph.supported_input_ids
    if token_id not in input_id_counter
]

In [11]:
tokenizer.convert_ids_to_tokens(in_graph_not_data)[:20]

['Ġhashing',
 'Ġspherical',
 'ĠModified',
 'Ġquadrat',
 'integrator',
 'ĠMember',
 'Ġlite',
 'ĠAdadelta',
 'frombuffer',
 'ĠRequired',
 'cca',
 'ĠModule',
 'Ġdivergence',
 'byshev',
 'fmax',
 'Dela',
 'Ġxarray',
 'rue',
 'ĠScope',
 'Ġduplicated']

How do we show what helps?
Performance on rarely/never seen tokens? 
How important is all of this? Need to set all of these hyperparameters.
Make ablation tables even when empty

## How often is the correct prediction something that would have been excluded?

In [12]:
# For every diff, keep only the decoder input ids at positions where loss_mask is truthy.
in_correct_span = [
    np.array(example["decoder_input_ids"])[np.where(example["loss_mask"])]
    for example in all_diffs
]

In [13]:
# Split every token id from the loss-masked spans by whether the graph lists it as supported.
supported_label_tokens = []
unsupported_label_tokens = []
for correct in in_correct_span:
    for input_id in correct:
        (
            supported_label_tokens
            if input_id in tokenized_graph.supported_input_ids
            else unsupported_label_tokens
        ).append(input_id)

In [14]:
# Supported count on the first line, unsupported count on the second.
print(len(supported_label_tokens), len(unsupported_label_tokens), sep="\n")

1259883
1429970


In [15]:
# Show 20 randomly drawn unsupported label tokens as strings.
# NOTE(review): no NumPy seed is set in the cells shown, so this sample
# presumably differs on every run — seed the RNG if the output must be reproducible.
tokenizer.convert_ids_to_tokens(np.random.choice(unsupported_label_tokens,size=20))

['],',
 '(',
 '(',
 '),',
 '))',
 '(',
 'BR',
 ').',
 '",',
 'Ġ=',
 'Ġ220',
 '<s>',
 'ĠvizElement',
 'Ġ=',
 ',',
 'pred',
 '.',
 '.',
 '")',
 "Ġ'"]

In [16]:
# Show 20 randomly drawn supported label tokens as strings.
# NOTE(review): no NumPy seed is set in the cells shown, so this sample
# presumably differs on every run — seed the RNG if the output must be reproducible.
tokenizer.convert_ids_to_tokens(np.random.choice(supported_label_tokens,size=20))

['ĠRegression',
 'Inception',
 'train',
 '_',
 'labels',
 'Ġ10',
 'sec',
 'isnull',
 'x',
 'Ġtrain',
 'train',
 'values',
 'y',
 'from',
 '_',
 'embedding',
 'classifier',
 'array',
 '_',
 'X']

In [17]:
tokenizer.encode(' scaler', add_special_tokens=False)

[6077]

In [18]:
tokenizer.convert_ids_to_tokens([6077])

['Ġscaler']


How do we show what helps?
Performance on rarely/never seen tokens? 
How important is all of this? Need to set all of these hyperparameters.
Make ablation tables even when empty


In [19]:
import torch

In [20]:
test = torch.tensor([[[1.0,1. ],[1.,2.]],[[3.,4.],[5.,6.]]])

In [21]:
test.view(-1, test.shape[-1])[[1,2]]

tensor([[1., 2.],
        [3., 4.]])

In [22]:
# Small integer tensor, flattened to 1-D, used for the indexing experiments below.
test2 = torch.tensor([[1, 1], [1, 2]]).view(-1)


In [23]:
# Boolean mask marking which entries of test2 are one of the values 1 or 3.
torch.tensor([value.item() in {1, 3} for value in test2.flatten()])

tensor([ True,  True,  True, False])

In [24]:
test2[[1,2]].detach().tolist()

[1, 1]

In [25]:
# Collapse all leading dimensions so each row is one vector, then keep rows 1 and 2.
# NOTE(review): this rebinds `test` to its own 2-row result, so the cell is not
# idempotent — running it again would index row 2 of a 2-row tensor.
test = test.view(-1, test.shape[-1])[[1,2]]

In [26]:
test

tensor([[1., 2.],
        [3., 4.]])

In [27]:
# NOTE(review): PairwiseDistance's forward() needs two inputs; calling it with only
# `test` raises the TypeError recorded below this cell (missing argument 'x2').
# A later cell calls torch.cdist(test, test) instead.
pdist = torch.nn.PairwiseDistance(p=2)
pdist(test)

TypeError: forward() missing 1 required positional argument: 'x2'

In [None]:
def expanded_pairwise_distances(x, y=None):
    '''
    Squared Euclidean distance between every row of x and every row of y.

    Input: x is an N x d matrix
           y is an optional M x d matrix; when omitted, x is compared with itself
    Output: an N x M matrix `dist` with
            dist[i, j] = ||x[i, :] - y[j, :]||^2
    '''
    other = x if y is None else y
    # Broadcasting (N, 1, d) against (1, M, d) yields all pairwise row differences.
    delta = x.unsqueeze(1) - other.unsqueeze(0)
    return (delta * delta).sum(dim=-1)

In [None]:
e_D = torch.cdist(test,test)

In [None]:
g_D = torch.tensor([[0,1],[1,0]])

In [None]:
torch.sum(e_D)

In [None]:
# Element-wise ratio of the two distance matrices. g_D is zero on its diagonal,
# so the diagonal entries come from a division by zero; a later cell resets them to 0.
ratio = torch.div(e_D, g_D)

In [None]:
ratio

In [None]:
# Zero the two diagonal entries of the 2x2 ratio matrix.
ratio[torch.arange(2), torch.arange(2)] = 0

In [None]:
type(torch.tensor(float("Inf")).item())