# Using Facebook AI's adaptive span transformer to model code
This implementation of the transformer uses a smaller context window for the attention. The ooriginal model is trained on characters on the traditional language modeling task.

I'm using a huggingface bytePairEncoding tokenizer to make word peices.

In [118]:
import math
import time

import torch
import sys
sys.path.append("src/external_repos/adaptive_span")
from config import PARAMS_CONFIG
from data import get_train_val_test_data, Corpus, _get_train_val_test_data
from models import TransformerSeq
from trainer import train_iteration, full_eval
from utils import (
    get_params,
    set_up_env,
    get_optimizer_and_scheduler,
    load_checkpoint,
    save_checkpoint,
    Logger)
from main import launch
import tqdm.notebook as tqdm 
from tokenizers import ByteLevelBPETokenizer

# training a custom tokenizer
```python
from tokenizers import ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=["code_corpus_train.txt"], vocab_size=32_000, min_frequency=2)
tokenizer.save(".", "code_bpe_hugging_32k")
tokenizer = ByteLevelBPETokenizer("code_bpe_hugging_32k-vocab.json","code_bpe_hugging_32k-merges.txt",)

sent = "print('hello world!')"
ids = tokenizer.encode(sent).ids
print("token ids: ",ids)
print("token ids: ",tokenizer.encode(sent).tokens)
tokenizer.decode(ids)
```

## Tokenising already existing files into a format understood by the scripts
```python
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer("datasets/code_search_net/code_bpe_hugging_32k-vocab.json","datasets/code_search_net/code_bpe_hugging_32k-merges.txt",)

train_fps = ("datasets/code_search_net/code_corpus_train.txt", "train.txt")
valid_fps = ("datasets/code_search_net/code_corpus_valid.txt", "valid.txt")
test_fps = ("datasets/code_search_net/code_corpus_test.txt", "test.txt")

for source_file, target_file in [train_fps, valid_fps, test_fps]:
    with open(source_file, "r") as src_fp:
        file_len = sum(1 for line in (src_fp))
    with open(source_file, "r") as src_fp, open(target_file, "w") as tgt_fp:
        pbar = tqdm.tqdm(src_fp, total=file_len)
        for line in pbar:
            ids = tokenizer.encode(line).ids
            tgt_fp.write(' '.join(str(x) for x in ids)+"\n")
```

In [22]:
params = {'env_params': {'distributed': False, 'local_rank': 0}, 
                'data_params': {'data_path': 'datasets/code_search_net/tokenised'}, 
                'model_params': {'hidden_size': 256, 
                                 'inner_hidden_size': 1024, 
                                 'nb_layers': 8, 
                                 'block_size': 64, 
                                 'nb_heads': 4, 
                                 'attn_span': 1024, 
                                 'dropout': 0.2}, 
                'optim_params': {'lr': 0.00, 
                                 'momentum': 0, 
                                 'optim': 'adagrad', 
                                 'lr_warmup': 8000, 
                                 'grad_clip': 0.03}, 
                'trainer_params': {'batch_size': 128, 
                                   'batch_split': 1, 
                                   'nb_batches_per_iter': 1000, 
                                   'nb_iter': 150, 
                                   'checkpoint_path': 'code_adaptive_transformer_save', 
                                   'full_eval_mode': False}, 
                'adapt_span_params': {'adapt_span_enabled': True, 
                                      'adapt_span_loss': 0.000002, 
                                      'adapt_span_ramp': 32, 
                                      'adapt_span_init': 4, 
                                      'adapt_span_cache': True}}

In [17]:
launch(**model_params)

model_params:	 {'hidden_size': 256, 'inner_hidden_size': 1024, 'nb_layers': 8, 'block_size': 64, 'nb_heads': 4, 'attn_span': 32, 'dropout': 0.2}
optim_params:	 {'lr': 0.0, 'momentum': 0, 'optim': 'adagrad', 'lr_warmup': 8000, 'grad_clip': 0.03}
data_params:	 {'data_path': 'datasets/code_search_net/tokenised'}
trainer_params:	 {'batch_size': 128, 'batch_split': 1, 'nb_batches_per_iter': 1000, 'nb_iter': 150, 'checkpoint_path': 'code_adaptive_transformer_save', 'full_eval_mode': False}
adapt_span_params:	 {'adapt_span_enabled': True, 'adapt_span_loss': 2e-06, 'adapt_span_ramp': 32, 'adapt_span_init': 4, 'adapt_span_cache': True}
Loading an existing corpus file from datasets/code_search_net/tokenised/corpus.pt
nb_parameters=22.67M




KeyboardInterrupt: 

In [23]:
checkpoint_state = torch.load("./saved_models/code_adaptive_span_sved_model")

In [40]:
model = TransformerSeq(vocab_size=31886, **params["model_params"],adapt_span_params=params["adapt_span_params"])
model = torch.nn.DataParallel(model).to("cuda")

In [20]:
checkpoint_state['model']

OrderedDict([('module.key_pe',
              tensor([[[-0.5509,  0.4127,  0.4396,  ...,  1.1414,  0.0569,  2.2597],
                       [ 1.5849,  2.3193,  0.9480,  ..., -0.3057, -0.5050, -1.2480],
                       [ 0.6352,  2.1086,  1.8249,  ..., -0.1262, -0.9029, -1.6015],
                       ...,
                       [-2.4559, -1.6373, -1.0293,  ...,  0.5551,  1.0159,  1.1841],
                       [ 0.8418, -0.6450,  0.4996,  ..., -0.1534, -0.4597, -0.4217],
                       [ 1.4326,  0.5642,  1.4460,  ...,  0.7697,  0.1855, -0.1256]]],
                     device='cuda:0')),
             ('module.in_emb.weight',
              tensor([[ 1.2830,  1.1185, -0.1138,  ..., -1.8475, -0.7691,  0.6811],
                      [ 1.0629, -1.6809,  0.9716,  ..., -0.3081,  2.0924,  0.7215],
                      [-0.3928,  1.0188,  0.7129,  ..., -0.2347, -0.4224,  1.1415],
                      ...,
                      [-1.0662,  0.2969,  0.5476,  ..., -1.3524,  0.2323

In [128]:
%%capture
model.load_state_dict(checkpoint_state['model'])
model.eval()

In [129]:
corpus = Corpus("datasets/code_search_net/tokenised")

Tokenizing datasets/code_search_net/tokenised/train.txt
Tokenizing datasets/code_search_net/tokenised/valid.txt
Tokenizing datasets/code_search_net/tokenised/test.txt


In [516]:
corpus.train.shape

torch.Size([128137090])

In [529]:
tokenizer = ByteLevelBPETokenizer("datasets/code_search_net/code_bpe_hugging_32k-vocab.json",
                                  "datasets/code_search_net/code_bpe_hugging_32k-merges.txt",)

sent = "top_item = items[0"
# sent = "    if (a == b): "
ids = tokenizer.encode(sent).ids

In [530]:
src_input = torch.tensor([[corpus._dictionary[str(i)] for i in ids]]).to("cuda")

In [531]:
hid_cache = [[
        torch.zeros(
            1,
            layer.attn.attn.get_cache_size(),
            params["model_params"]["hidden_size"]).to("cuda")
        for layer in model.module.layers] for _ in range(2)]

In [532]:
model.module.layers[0].attn.attn.get_cache_size()

192

In [533]:
outputs = model(src_input,hid_cache[0])
v_out, h_cache = outputs
print(v_out.shape)
v_out = v_out[0]
print(v_out.shape)

torch.Size([1, 7, 31886])
torch.Size([7, 31886])


In [534]:
corpus_itos = [0]*len(corpus._dictionary)
for k,v in corpus._dictionary.items():
    try:
        corpus_itos[v] = int(k)
    except:
        pass

In [535]:
max_ids_outputs = torch.argsort(v_out, dim=-1, descending=True).tolist()
BPE_ids = [[corpus_itos[j] for j in i] for i in max_ids_outputs][-1][:5]
print(" or ".join([tokenizer.decode([i]) for i in BPE_ids]))

] or ][ or ]. or ], or ]]


In [528]:
!head -50 datasets/code_search_net/code_corpus_train.txt

def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo='ball_tree', verbose=False):
    """
    Trains a k-nearest neighbors classifier for face recognition.

    :param train_dir: directory that contains a sub-directory for each known person, with its name.

     (View in source code to see train_dir example tree structure)

     Structure:
        <train_dir>/
        ├── <person1>/
        │   ├── <somename1>.jpeg
        │   ├── <somename2>.jpeg
        │   ├── ...
        ├── <person2>/
        │   ├── <somename1>.jpeg
        │   └── <somename2>.jpeg
        └── ...

    :param model_save_path: (optional) path to save model on disk
    :param n_neighbors: (optional) number of neighbors to weigh in classification. Chosen automatically if not specified
    :param knn_algo: (optional) underlying data structure to support knn.default is ball_tree
    :param verbose: verbosity of training
    :return: returns knn classifier that was trained on the given data.
    """
  