# Scratch Pad: A symbolic playground for Seq2Seq models

```
what does Henry do? <scratch_pad> search('Henry') -> ['Henry is a School Psychologist'] </scratch_pad> School Psychologist

what is 556 - 301? <scratch_pad> 556 - 301 -> 255 </scratch_pad> 255
```

In [1]:
from rank_bm25 import BM25Okapi
from src.models_and_transforms.BERT_models import AlphaBERT
from src.models_and_transforms.text_transforms import Numericalise_Transform, Scratch_Pad_Sequence_Policy_Creator_Transform
from src.pipe_datasets import Scratch_Pad_Policy_Dataset
import re
from transformers import GPT2LMHeadModel, GPT2Config, GPT2Tokenizer, GPT2TokenizerFast
from transformers import BertConfig, BertTokenizerFast, BertForMaskedLM
from tokenizers import processors, Tokenizer
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning import Trainer, Callback, seed_everything
import numpy as np
import json
import sys
import tqdm

from src.useful_utils import chunks
from src.models_and_transforms.GPT2_models import GPT2_Scratch_Pad_Model
sys.path.insert(0,"src/external_repos/pyfuzz/")

%load_ext autoreload
%autoreload 2
%load_ext line_profiler
%load_ext memory_profiler

## Searching function

In [2]:
# Toy three-sentence corpus used to try out BM25 retrieval.
corpus = [
    "Hello there good man!",
    "It is quite windy in London",
    "How is the weather today?"
]
# Tokenise each document into word tokens (word characters and apostrophes)
# plus standalone punctuation marks from the set . , ! ? ;
tokenized_corpus = [re.findall(r"[\w']+|[.,!?;]", doc) for doc in corpus]
# Build the BM25 index over the tokenised documents; `search` below queries it.
bm25 = BM25Okapi(tokenized_corpus)

In [3]:
def search(q, n=1):
    """Return the ``n`` corpus documents that best match query ``q`` under BM25.

    The query is tokenised with the same regex that was used to build
    ``tokenized_corpus`` (word tokens that may contain apostrophes, plus
    standalone punctuation marks). A plain ``str.split`` would leave
    punctuation glued to words (``"man!"``), which never matches the indexed
    token ``"man"``.

    Args:
        q: free-text query string.
        n: number of documents to return.

    Returns:
        The top-``n`` documents, as strings taken from the module-level
        ``corpus``.
    """
    query_tokens = re.findall(r"[\w']+|[.,!?;]", q)
    return bm25.get_top_n(query_tokens, corpus, n=n)

## Executing code

In [4]:
def my_exec(code):
    """Run a Python snippet and return the value of its last line as a string.

    Every line but the last is executed as ordinary statements. The last line
    is assigned to a scratch name, so both expressions (``4+5``) and simple
    assignments (``a=3``) yield a value.

    The snippet runs in a throw-away copy of this module's globals:
      * it can still call helpers defined in the notebook (e.g. ``search``);
      * names it binds (including the scratch result name) do not leak into
        the notebook namespace between calls;
      * globals and locals are the same dict, so functions/lambdas defined in
        the snippet can see variables the snippet assigned earlier.

    WARNING: this is plain ``exec`` with no sandboxing. The snippet has full
    access to the interpreter, so only feed it text you are prepared to run.

    Args:
        code: source text, statements separated by newlines.

    Returns:
        ``''`` for empty input, ``str(value)`` of the last line on success, or
        ``"ERROR: <message>"`` if compiling or running the snippet raised.
    """
    if not code:
        return ''
    prior_code, _, last_line = code.rpartition('\n')
    # Shallow copy: snippet assignments stay private to this call.
    namespace = dict(globals())
    try:
        exec(f'{prior_code}\n__i__ = {last_line}', namespace)
        return str(namespace['__i__'])
    except Exception as e:
        # SyntaxError carries a short `.msg`; other exceptions only have str().
        if hasattr(e, 'msg'):
            return "ERROR: " + e.msg
        return "ERROR: " + str(e)

In [5]:
my_exec('''search('man')''')

"['Hello there good man!']"

## Making a function to process code in the scratch pad

In [6]:
my_exec('r="rrr"\nd+"4"')

"ERROR: name 'd' is not defined"

In [9]:
inputs = tokenizer(['test=5>>>', 'this is a test <ScratchPad>r=4>>></ScratchPad>Jhon<ScratchPad>search("Oliver")>>>'], padding=True, return_tensors='pt')
input_ids = inputs['input_ids']

def scratch_pad_complete(batch_input_ids):
    """Execute the pending scratch-pad statement of each sequence and append its output.

    A sequence is "pending" when its last token is the execution token ``>>>``
    and that token sits inside a ``<ScratchPad>`` that has not been closed yet.
    For such a sequence, every statement from all scratch pads so far (with
    previously inserted outputs stripped) is re-run through ``my_exec`` and the
    result, followed by a newline, is appended to the text. All other
    sequences pass through with only their padding removed.

    Args:
        batch_input_ids: 2-D tensor of token ids, one row per sequence. The
            last column is inspected, so rows are expected to be left-padded.

    Returns:
        ``batch_input_ids`` itself when no row ends with ``>>>``; otherwise a
        freshly tokenised (and re-padded) ``input_ids`` tensor for the batch.
    """
    # get_vocab() builds a full dict on every call, so fetch it once.
    vocab = tokenizer.get_vocab()
    execution_token_id = vocab['>>>']
    start_scratch_pad_id = vocab['<ScratchPad>']
    end_scratch_pad_id = vocab['</ScratchPad>']
    pad_token_id = tokenizer.pad_token_id

    if not (batch_input_ids[:, -1] == execution_token_id).any():
        # there are no execution tokens, so return
        return batch_input_ids

    full_sents = []
    for row in batch_input_ids:
        # Plain ints make the `in` / `.index` checks below unambiguous.
        input_ids = [tok_id for tok_id in row.tolist() if tok_id != pad_token_id]
        sequence = tokenizer.decode(input_ids, spaces_between_special_tokens=False)

        # Inside a scratch pad <=> the last opening tag has no closing tag after it.
        in_scratch_pad = False
        if start_scratch_pad_id in input_ids:
            last_start_idx = len(input_ids) - 1 - input_ids[::-1].index(start_scratch_pad_id)
            in_scratch_pad = end_scratch_pad_id not in input_ids[last_start_idx:]

        if row[-1].item() != execution_token_id or not in_scratch_pad:
            full_sents.append(sequence)
            continue

        prior_scratch_pad_sequence, _, last_scratch_pad_sequence = sequence.rpartition('<ScratchPad>')
        # Non-greedy + DOTALL: capture each closed pad separately, whatever it
        # contains (pads hold newlines and list reprs such as "['...']").
        prior_scratch_pad_sequences = re.findall(
            r'<ScratchPad>(.*?)</ScratchPad>', prior_scratch_pad_sequence, flags=re.DOTALL)
        all_statements = ''.join(prior_scratch_pad_sequences + [last_scratch_pad_sequence])
        # Drop every ">>>output\n" chunk; the trailing piece after the final
        # ">>>" is empty and discarded.
        individual_statements = re.split(r'>>>.*\n|>>>', all_statements)[:-1]
        stmnt_out = my_exec('\n'.join(individual_statements))
        full_sents.append(sequence + stmnt_out + '\n')

    return tokenizer(full_sents, padding=True, return_tensors='pt').input_ids

input_ids = scratch_pad_complete(input_ids)
[tokenizer.decode(input_ids[i], spaces_between_special_tokens=False) for i in range(input_ids.shape[0])]

NameError: name 'tokenizer' is not defined

## Creating a simple model

In [7]:
# Deliberately tiny GPT-2 configuration (1 attention head, 10-dim embeddings,
# 2 layers, 512 positions).
configuration = GPT2Config()
configuration.n_head = 1
configuration.n_embd = 10
configuration.n_layer = 2
configuration.n_ctx = 512
configuration.n_positions = 512
# NOTE(review): presumably sized to cover the GPT-2 vocabulary plus the special
# tokens added to the tokenizer — confirm against len(tokenizer.get_vocab()).
configuration.vocab_size = 50265

In [8]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", padding_side='left', pad_token='<pad>', eos_token='<eos>',
                                                       additional_special_tokens=['\n', '<ScratchPad>', '</ScratchPad>'])
model = GPT2_Scratch_Pad_Model(configuration)

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [9]:
tokens = tokenizer.encode('do<ScratchPad>4+5>>>', return_tensors='pt')
print("Input tokens: "+ str(tokens))
model.eval()
gen_sequences = model.generate(tokens, max_length=15, do_sample=True, num_beams=3, use_cache=False, pad_token_id=tokenizer.pad_token_id, 
                               num_return_sequences=3, postfix_additional_tokens_fn=scratch_pad_complete)
[tokenizer.decode(gen_sequences[i], spaces_between_special_tokens=False) for i in range(gen_sequences.shape[0])]

Input tokens: tensor([[ 4598, 50260,    19,    10,    20, 33409]])


NameError: name 'scratch_pad_complete' is not defined

In [None]:
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print("Num params: " + str(params))

In [13]:
model.prepare_inputs_for_generation(tokens)

{'input_ids': tensor([[   40,  2883,  6155, 50260,    19, 33409]]),
 'past_key_values': None,
 'use_cache': None,
 'position_ids': None,
 'attention_mask': None,
 'token_type_ids': None}

## Using BERT for AlphaZero style learning
The model is responsible both for evaluating a state $s$ into a value $v$ and for producing a policy $\textbf{p}$:

\begin{equation*}
(v,\textbf{p})=f(s|\theta)
\end{equation*}

In [10]:
config = BertConfig()
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", padding_side='left', pad_token='[PAD]', eos_token='[EOS]', 
                                                       additional_special_tokens=['\n', '<ScratchPad>', '</ScratchPad>', '[VALUE]', '[MASK]'])

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [34]:
# Shrink the BERT configuration to a tiny model: 1 attention head, hidden size
# 10, 2 layers, 64-dim feed-forward.
config.num_attention_heads = 1
config.hidden_size = 10
config.num_hidden_layers = 2
config.intermediate_size = 64
# NOTE(review): n_ctx / n_positions are GPT-2 config field names; BertConfig's
# own sequence-length field is max_position_embeddings — confirm these two
# assignments have any effect on the BERT model.
config.n_ctx = 512
config.n_positions = 512
# Size the embedding table from the tokenizer so the added special tokens fit.
config.vocab_size = len(tokenizer.get_vocab())
config.position_embedding_type = 'relative_key_query'

In [35]:
model = AlphaBERT(config)

In [36]:
tokens = tokenizer.encode('do<ScratchPad>4+5>>>[MASK][VALUE]', return_tensors='pt')
print("Input tokens: "+ str(tokens))
model(tokens)

Input tokens: tensor([[ 4598, 50260,    19,    10,    20, 33409, 50263, 50262]])


(tensor([[0.9300]], grad_fn=<AddmmBackward>),
 tensor([[-0.0425, -0.1846,  0.0738,  ..., -0.5901, -0.1247, -1.0043]],
        grad_fn=<AddmmBackward>))

In [14]:
batch = {'input_ids':tokens, 
         'target_policy':torch.tensor([[33409]]), 
#          'target_value':torch.tensor([[1.0]]),
         'attention_mask':torch.ones_like(tokens)}

In [15]:
pi = torch.tensor([[0,1.0,0.0]])
policy = torch.tensor([[0.01,0.01,0.9]])
temp = 1.0
torch.dot(pi.view(-1)**temp,torch.log(policy.view(-1)))

tensor(-4.6052)

In [16]:
target_policy = torch.tensor([[2]])
policy_dist = torch.tensor([[0.01,0.01,3.0]])
policy_loss = nn.NLLLoss()(policy_dist, target_policy.view(-1))
policy_loss

tensor(-3.)

In [17]:
model.training_step(batch,0)

{'loss': tensor(12.3789, grad_fn=<AddBackward0>),
 'log': {'train_loss': tensor(12.3789, grad_fn=<AddBackward0>)}}

### Training on a sequence

In [18]:
train_data = [{'input_text':'"$%'}]

In [19]:
numericalise_transform = Numericalise_Transform(numericaliser=lambda inp: tokenizer.encode(inp, add_special_tokens=False), 
                                                fields=[('input_text', 'input_seq')])

Numericaliser. Ex: 'This is a test' -> [1212, 318, 257, 1332]


In [20]:
train_data = numericalise_transform(train_data)
train_data

[{'input_text': '"$%', 'input_seq': [1, 3, 4]}]

In [21]:
policy_creator_transform = Scratch_Pad_Sequence_Policy_Creator_Transform(execution_token_id=tokenizer.get_vocab()['>>>'],
                                                                         newline_token_id=tokenizer.get_vocab()['\n'],
                                                                         mask_token_id=tokenizer.get_vocab()['[MASK]'],
                                                                         value_token_id=tokenizer.get_vocab()['[VALUE]'])
policy_train_data = policy_creator_transform(train_data)
policy_train_data[2]

{'input_ids': [1, 3, 50263, 50262], 'target_policy': [4]}

In [61]:
policy_train_data

[{'input_ids': [50263, 50262], 'target_policy': [1]},
 {'input_ids': [1, 50263, 50262], 'target_policy': [3]},
 {'input_ids': [1, 3, 50263, 50262], 'target_policy': [4]}]

In [22]:
scratch_pad_policy_dataset = Scratch_Pad_Policy_Dataset(train_data, slow_pipe=[numericalise_transform, policy_creator_transform], 
                                                        real_time_pipe=[], PAD=tokenizer.pad_token_id)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [60]:
train_dataloader = scratch_pad_policy_dataset.to_dataloader(3)
batch = next(iter(train_dataloader))
batch

{'input_ids': tensor([[50258, 50258, 50263, 50262],
         [50258,     1, 50263, 50262],
         [    1,     3, 50263, 50262]]),
 'attention_mask': tensor([[0., 0., 1., 1.],
         [0., 1., 1., 1.],
         [1., 1., 1., 1.]]),
 'target_policy': tensor([[1],
         [3],
         [4]])}

In [25]:
model.training_step(batch,0)['loss']

tensor(11.5069, grad_fn=<AddBackward0>)

In [550]:
batch['target_policy'].shape

torch.Size([3, 1])

In [551]:
loss = nn.CrossEntropyLoss()
inp = torch.randn(3, 5, requires_grad=True)
print(inp)
target = torch.empty(3, dtype=torch.long).random_(5)
print(target)
output = loss(inp, target)
output

tensor([[ 0.5614,  0.4802, -0.9755,  0.6096, -0.5561],
        [ 0.2958, -0.8632, -0.6333, -0.3695,  0.4399],
        [-0.8264, -0.4387,  0.2190,  0.3989,  0.2264]], requires_grad=True)
tensor([1, 2, 2])


tensor(1.6296, grad_fn=<NllLossBackward>)

In [552]:
output.backward()

In [553]:
inp.grad

tensor([[ 0.0949, -0.2459,  0.0204,  0.0996,  0.0310],
        [ 0.0987,  0.0310, -0.2944,  0.0507,  0.1140],
        [ 0.0288,  0.0424, -0.2515,  0.0979,  0.0824]])

In [55]:
# Hand-rolled optimisation loop: 10000 steps over the tiny dataloader.
pbar = tqdm.tqdm(range(10000))
opt = model.configure_optimizers()
it = iter(train_dataloader)
for i in pbar:
    # Only the batch fetch is guarded, so a StopIteration raised anywhere else
    # is not silently swallowed. When the dataloader is exhausted it is
    # restarted and the step still trains on a batch instead of being skipped.
    # An empty dataloader now raises rather than spinning without training.
    try:
        batch = next(it)
    except StopIteration:
        it = iter(train_dataloader)
        batch = next(it)
    opt.zero_grad()
    loss = model.training_step(batch, 0)['loss']
    loss.backward()
    opt.step()
    # Show the current loss in the progress bar.
    pbar.set_description(str(float(loss)))

0.000998951611109078:  10%|▉         | 974/10000 [00:06<01:04, 140.56it/s]  


KeyboardInterrupt: 

In [402]:
trainer = Trainer(gradient_clip_val=0.5, amp_level='O1')
trainer.fit(model, train_dataloader)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name        | Type      | Params
------------------------------------------
0 | BERT        | BertModel | 532 K 
1 | dropout     | Dropout   | 0     
2 | value_layer | Linear    | 11    
3 | LM_layer    | Linear    | 552 K 


HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

Saving latest checkpoint..





1

In [None]:
'[MASK][VALUE]'

In [64]:
model.eval()
model(torch.tensor([[1,3,50263, 50262]]))[1][0][:10]

tensor([-9.3421,  3.2267, -9.8514,  1.2210, 11.6785, -9.6219, -9.7639, -8.8435,
        -9.0634, -9.5598], grad_fn=<SliceBackward>)

In [372]:
class GPT2_Scratch_Pad_Batch_Numericaliser_Transform():
    """Tokenise batches of scratch-pad text into next-token training tensors.

    Tokens that follow an execution token ``>>>`` up to and including the next
    newline token are replaced by the pad id in the targets; the inputs are
    left untouched.
    """

    def __init__(self):
        special_tokens = ['\n', '<ScratchPad>', '</ScratchPad>']
        self.tokenizer = GPT2Tokenizer.from_pretrained(
            "gpt2",
            padding_side='left',
            pad_token='<pad>',
            eos_token='<eos>',
            additional_special_tokens=special_tokens,
        )

    def __call__(self, samples, **kwargs):
        """Add model inputs and shifted targets to every sample, in place.

        samples: list of dicts, each holding a list of strings under
            'sequences' (one batch per dict).
        returns: the same list; each dict gains 'input_ids' and
            'attention_mask' (last position dropped), 'target_ids' (first
            position dropped, auto-generated spans set to the pad id) and
            'pad_id'.
        """
        vocab = self.tokenizer.get_vocab()
        exec_id = vocab['>>>']
        newline_id = vocab['\n']
        pad_id = self.tokenizer.pad_token_id

        for sample in samples:
            encoded = self.tokenizer(sample['sequences'], return_tensors='pt', padding=True)
            ids = encoded['input_ids']
            mask = encoded['attention_mask']
            targets = ids.clone()

            # Rows of `targets` are views, so writing through them edits `targets`.
            for row_ids, row_targets in zip(ids, targets):
                masking = False
                for pos, tok in enumerate(row_ids):
                    # The flag is applied before it is updated: the execution
                    # token itself is kept, the closing newline is masked.
                    if masking:
                        row_targets[pos] = pad_id
                    if tok == exec_id:
                        masking = True
                    elif tok == newline_id:
                        masking = False

            # Shift by one position for next-token prediction.
            sample.update({
                'input_ids': ids[:, :-1],
                'attention_mask': mask[:, :-1],
                'target_ids': targets[:, 1:],
                'pad_id': pad_id,
            })
        return samples

In [115]:
SP_numericaliser_transform = GPT2_Scratch_Pad_Batch_Numericaliser_Transform()
test_samples = [{'sequences':['do <ScratchPad>4+4>>>8\n</ScratchPad>8<eos>']}]
SP_numericaliser_transform(test_samples)

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


[{'sequences': ['do <ScratchPad>4+4>>>8\n</ScratchPad>8<eos>'],
  'input_ids': tensor([[ 4598, 50260,    19,    10,    19, 33409,    23, 50259, 50261,    23]]),
  'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
  'target_ids': tensor([[50260,    19,    10,    19, 33409, 50258, 50258, 50261,    23, 50257]]),
  'pad_id': 50258}]

In [116]:
test_samples[0]

{'sequences': ['do <ScratchPad>4+4>>>8\n</ScratchPad>8<eos>'],
 'input_ids': tensor([[ 4598, 50260,    19,    10,    19, 33409,    23, 50259, 50261,    23]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'target_ids': tensor([[50260,    19,    10,    19, 33409, 50258, 50258, 50261,    23, 50257]]),
 'pad_id': 50258}

In [117]:
model.training_step(test_samples[0], 0)

{'loss': tensor(10.8125, grad_fn=<NllLossBackward>),
 'logits': tensor([[[-0.0474, -0.0402,  0.1575,  ..., -0.0349,  0.0304,  0.0153],
          [-0.0620, -0.0391, -0.1390,  ...,  0.0104, -0.0307, -0.0093],
          [ 0.0239, -0.0122, -0.1372,  ...,  0.0544,  0.0018, -0.0160],
          ...,
          [-0.0334,  0.0984,  0.0767,  ..., -0.0619, -0.0424, -0.0090],
          [ 0.0285,  0.0038, -0.1458,  ...,  0.1122, -0.0479,  0.0007],
          [ 0.0172, -0.0707,  0.0732,  ..., -0.0518,  0.0464,  0.0136]]],
        grad_fn=<UnsafeViewBackward>)}

In [99]:
# del test_samples[0]['sequences']
# del test_samples[0]['target_ids']
# del test_samples[0]['pad_id']
torch.argmax(model(**test_samples[0])[0][0][0])

TypeError: forward() got an unexpected keyword argument 'sequences'

In [118]:
model.train()
trainer = Trainer(gradient_clip_val=0.5, amp_level='O1')
trainer.fit(model, test_samples)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name        | Type      | Params
------------------------------------------
0 | transformer | GPT2Model | 510 K 
1 | lm_head     | Linear    | 502 K 


HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

Saving latest checkpoint..





1

In [122]:
# Load the people records; the context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('people.json', 'r') as people_file:
    people = json.load(people_file)

In [123]:
people[:3]

[['Oliver', 'Dentist'], ['Noah', 'Registered Nurse'], ['George', 'Pharmacist']]

In [124]:
people_sentences = [f"{name} is a {job}" for name, job in people]

In [125]:
tokenized_people_corpus = [re.findall(r"[\w']+|[.,!?;]", doc) for doc in people_sentences]
bm25 = BM25Okapi(tokenized_people_corpus)
def search(q, n=1):
    """Return the ``n`` people sentences that best match query ``q`` under BM25.

    The query is tokenised with the same regex that was used to build
    ``tokenized_people_corpus`` (word tokens that may contain apostrophes,
    plus standalone punctuation marks). A plain ``str.split`` would leave
    punctuation glued to words (``"Oliver?"``), which never matches the
    indexed token ``"Oliver"``.

    Args:
        q: free-text query string.
        n: number of sentences to return.

    Returns:
        The top-``n`` sentences, as strings taken from the module-level
        ``people_sentences``.
    """
    query_tokens = re.findall(r"[\w']+|[.,!?;]", q)
    return bm25.get_top_n(query_tokens, people_sentences, n=n)

In [126]:
search('Oliver')

['Oliver is a Dentist']

In [127]:
my_exec('''search('Oliver')''')

"['Oliver is a Dentist']"

## Overriding the generate function

In [22]:
from src.custom_generate_utils import GenerationMixin

In [16]:
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple

import torch
from torch.nn import functional as F

from transformers.file_utils import ModelOutput
from transformers.generation_beam_search import BeamScorer, BeamSearchScorer
from transformers.generation_logits_process import (
    HammingDiversityLogitsProcessor,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    NoBadWordsLogitsProcessor,
    NoRepeatNGramLogitsProcessor,
    PrefixConstrainedLogitsProcessor,
    RepetitionPenaltyLogitsProcessor,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
    TopPLogitsWarper,
)
from transformers.utils import logging

In [17]:
GenerationMixin().generate

<bound method GenerationMixin.generate of <src.custom_generate_utils.GenerationMixin object at 0x7f729bffa860>>

In [473]:
tokenizer.pad_token_id

50257

In [9]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", padding_side='left', pad_token='<pad>', additional_special_tokens=['\n', '<ScratchPad>', '</ScratchPad>'])

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [376]:
tokenizer.decode(clean_up_tokenization_spaces=True)

TypeError: decode() missing 1 required positional argument: 'token_ids'

In [191]:
inputs = tokenizer(['foo', 'bar baz'], padding=True, return_tensors='pt')
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

In [511]:
from src.custom_generate_utils import GenerationMixin
model.generate = GenerationMixin.generate
model.beam_sample = GenerationMixin.beam_sample
model.beam_search = GenerationMixin.beam_search

input_ids = tokenizer(['this is a test <ScratchPad>4>>>'], padding=True, return_tensors='pt').input_ids
print("Input tokens: ",  input_ids)
gen_sequences = model.generate(model, input_ids, max_length=15, num_beams=2, num_beam_groups=1, diversity_penalty=0, use_cache=False, pad_token_id=50257, 
               do_sample=False, num_return_sequences=2, postfix_additional_tokens_fn=scratch_pad_complete)
[tokenizer.decode(gen_sequences[i], spaces_between_special_tokens=False) for i in range(gen_sequences.shape[0])]

Input tokens:  tensor([[ 5661,   318,   257,  1332, 50259,    19, 33409]])
baz
i'm in beam search mode


['this is a test<ScratchPad>4>>>4\n decidedly decidedlymineilage force force',
 'this is a test<ScratchPad>4>>>4\n decidedly decidedlyionsions 84 84']

In [410]:
inputs = tokenizer(['this is a test<ScratchPad>a=3>>>'], padding=True, return_tensors='pt')
inputs['input_ids']

tensor([[ 5661,   318,   257,  1332, 50259,    64,    28,    18, 33409]])

In [411]:
tokenizer.decode([ 5661,   318,   257,  1332, 50259,    64,    28,    18, 33409], spaces_between_special_tokens=False)

'this is a test<ScratchPad>a=3>>>'

In [432]:
(input_ids!=50257).to(torch.int)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]], dtype=torch.int32)

In [77]:
s = 'mysrv events Generating Event Name <ScratchPad>Service Current, Category</ScratchPad> [Service] Test <ScratchPad>9991</ScratchPad> Value [1.22]'
print(re.findall(r'\<ScratchPad\>([^]]*)\</ScratchPad\>', s))
['Service Current', 'Service', '9991', '1.22']

['Service Current, Category', '9991']


['Service Current', 'Service', '9991', '1.22']

In [81]:
s.rpartition('<ScratchPad>')[2]

'9991</ScratchPad> Value [1.22]'