## How many tokens is a typical function name split into?

In [10]:
import numpy as np
import pandas as pd
import os
from transformers import BartTokenizerFast
from src.data.filter_pairs import build_function_vocabulary

In [5]:
# Path to the processed diffs, relative to this notebook's directory.
# NOTE(review): presumably diffs filtered to fewer than 5 lines, per the file name — confirm.
DATA_PATH = "../../data/processed/filtered_less_than_5_lines.jsonl"
# lines=True: the file is read as JSON Lines (one record per line).
diffs = pd.read_json(DATA_PATH, lines=True)

In [11]:
# Preview the first rows of the loaded diffs (original_path, metadata, cell_diff).
diffs.head()

Unnamed: 0,original_path,metadata,cell_diff
0,data/processed/competitions/bluebook-for-bulld...,"{'version_id': '16033940', 'slug_id': 'gstvolv...",n_valid = 12000\n-n_trn = len(df) - n_valid\n+...
1,data/processed/competitions/liberty-mutual-gro...,"{'version_id': '443427', 'slug_id': 'aarotang'...",# Uncomment the two lines below to roughly che...
2,data/processed/competitions/cdiscount-image-cl...,"{'version_id': '1530168', 'slug_id': 'jpizarro...","-input_tensor = Input(shape=(180, 180, 3))\n+i..."
3,data/processed/competitions/cdiscount-image-cl...,"{'version_id': '1530168', 'slug_id': 'jpizarro...",# create the base pre-trained model\n #base_mo...
4,data/processed/competitions/cdiscount-image-cl...,"{'version_id': '1530168', 'slug_id': 'jpizarro...",-num_images_test = 10\n+num_images_test = 1000...


In [8]:
# Build the BART tokenizer from the vocab/merges files stored under TOKENIZER_PATH.
TOKENIZER_PATH = "../../models/CORAL_BART/tokenizer/"
vocab = os.path.join(TOKENIZER_PATH, "vocab.json")
merges = os.path.join(TOKENIZER_PATH, "merges.txt")
tokenizer = BartTokenizerFast(vocab_file=vocab, merges_file=merges)

In [19]:
# Count function names across every diff row and keep the 1000 most common
# as a (Function, Count) table.
diff_rows = [row for _, row in diffs.iterrows()]
function_counts = build_function_vocabulary(diff_rows)
functions = pd.DataFrame(
    function_counts.most_common(1000),
    columns=["Function", "Count"],
)

In [20]:
# Preview the most frequent function names and their counts.
functions.head()

Unnamed: 0,Function,Count
0,predict,7447
1,,5054
2,fit,3607
3,train_test_split,3378
4,drop,2840


In [23]:
# Number of tokens the tokenizer produces for each function name
# (special tokens excluded).
# NOTE(review): the function list appears to contain an empty name (blank
# row in functions.head()), which would encode to zero tokens — consider
# whether it should be filtered out before computing statistics.
functions["num_tokens"] = [
    len(tokenizer.encode(name, add_special_tokens=False))
    for name in functions["Function"]
]

In [25]:
# Summary statistics of tokens-per-function-name over the 1000 most common names.
# NOTE(review): min == 0 presumably comes from the empty function name in the list — confirm.
functions["num_tokens"].describe()

count    1000.000000
mean        2.356000
std         1.563877
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max        10.000000
Name: num_tokens, dtype: float64

In [32]:
def tokenizer_example(string, tokenizer):
    """Return the token strings that `tokenizer` splits `string` into.

    The text is encoded without special tokens, and the resulting ids are
    mapped back to their token strings via `convert_ids_to_tokens`.
    """
    token_ids = tokenizer.encode(string, add_special_tokens=False)
    return tokenizer.convert_ids_to_tokens(token_ids)

In [28]:
# tokenizer_example requires the tokenizer as its second argument; omitting it
# raises TypeError on a fresh Restart & Run All.
tokenizer_example("LogisticRegression", tokenizer)

['LogisticRegression']

In [30]:
# tokenizer_example requires the tokenizer as its second argument; omitting it
# raises TypeError on a fresh Restart & Run All.
tokenizer_example("t_test", tokenizer)

['t', '_', 'test']

In [34]:
# Tokenize a complete statement to see how a realistic line of code is split.
tokenizer_example("clf = LogisticRegression(random_state=0).fit(X, y)",tokenizer)

['clf',
 'Ġ=',
 'ĠLogisticRegression',
 '(',
 'random',
 '_',
 'state',
 '=',
 '0',
 ').',
 'fit',
 '(',
 'X',
 ',',
 'Ġy',
 ')']

## What about the old tokenizer we used in the last paper? 

In [37]:
from src.models.CORAL_BART.dataset import CoralDiffsReader

In [38]:
# Build the reader over the same data file. Per the logged output this step is
# slow (~1.5 min in the recorded run): it loads the diffs, removes unparsable
# ones, and builds a vocabulary (queried below through `word2idx`).
coral_diffs = CoralDiffsReader(DATA_PATH)

Loading Diffs: 69898it [00:01, 40806.09it/s]
100%|██████████| 69898/69898 [00:02<00:00, 33771.13it/s]
  0%|          | 70/47272 [00:00<01:08, 694.10it/s]

Remove 22626 unparsable diffs


100%|██████████| 47272/47272 [01:27<00:00, 543.35it/s]
Building Vocab...: 100%|██████████| 47272/47272 [00:00<00:00, 123436.61it/s]


In [46]:
# How many of the most common function names appear in the reader's vocabulary.
sum(name in coral_diffs.word2idx for name in functions["Function"])

901

In [49]:
# Function names that are missing from the reader's vocabulary.
[name for name in functions["Function"] if name not in coral_diffs.word2idx]

['Capsule',
 'show_wordcloud',
 'UpConvBlock',
 'getElementById',
 'ConvTranspose2d',
 'KernelSettings',
 'fit_set',
 'geom_bar',
 'BatchNorm2d',
 'SN',
 'Model_train',
 'return_error',
 'build_model2',
 'most_frequent_category',
 'createDataPartition',
 'conv2d_block',
 'Dropout2d',
 'function',
 'deconv_block',
 'load_game_and_ngs',
 'geom_tile',
 'missForest',
 'left_join',
 'theme_bw',
 'build_model1',
 'rgba',
 'alt_session_wrapper',
 'get_data_generators',
 'merge_train_test',
 'load_patient_data',
 'ADAM',
 'group_by',
 'session_wrapper',
 'get_aggregate_metrics',
 'blocks',
 'ridge_regression_model',
 'transform_raw',
 '_DownSamplingBlock',
 'lrelu',
 'scaleSequential',
 'theme',
 'ConvBlock',
 'xgb_fit',
 'varImpPlot',
 'spectral_norm',
 'glm',
 'SaveFeatures',
 'ifelse',
 'decide_test_file',
 'CycleScheduler',
 'makeLearner',
 'forward_propagation',
 'build_dataset',
 'create_embedding_weights',
 'create_prediction',
 'Wave_Block',
 'pytorch_model_run_cv',
 'summarise_all',
 