In [1]:
import os
import torch
import numpy as np
from data_augmentation import DataAugmenter
from transformers import GPT2Tokenizer, GPT2LMHeadModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fail fast when no CUDA device is present: a later cell in this notebook
# moves tensors to 'cuda' directly and would otherwise fail further down.
assert torch.cuda.is_available()

In [3]:
# Smoke test of the augmentation pipeline on two short sentences.
# DataAugmenter is imported from the project-local data_augmentation module
# and is constructed here with its default arguments.
augmenter = DataAugmenter()
sentences = ["The boy went to the park.", "She loves to read books."]
# Tokenize both sentences as one batch of PyTorch tensors, with padding and
# truncation enabled.
inputs = augmenter.tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
# Only the token ids are forwarded; the device is hard-coded to 'cuda' here.
# NOTE(review): any attention mask in `inputs` is not passed on — confirm that
# augment() copes with padded positions when sentence lengths differ.
input_ids_batch = inputs['input_ids'].to('cuda')
# NOTE(review): target_indices=[5,4] presumably selects one token position per
# sentence to be replaced — confirm against DataAugmenter.augment.
output, target_indices = augmenter.augment(input_ids_batch, target_indices=[5,4], do_filter=True)
# Row-wise sum of `output`, displayed as the cell result.
# Presumably the number of candidates kept per sentence — TODO confirm.
output.sum(axis=1)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


tensor([3, 1])

In [4]:
# Feed the token ids together with the two values returned by augment() into
# generate_new_sentences; the cell displays one list of strings per input sentence.
augmenter.generate_new_sentences(input_ids_batch, target_indices, output)

[['The boy went to the area.',
  'The boy went to the bench.',
  'The boy went to the entrance.'],
 ['She loves to read too.']]

In [5]:
# Sampling configuration read as module-level globals by get_batch().
# batch_size: number of windows drawn per call to get_batch().
batch_size = 32 # if gradient_accumulation_steps > 1, this is the micro-batch size
# block_size: length (in tokens) of every sampled window.
block_size = 32
# data_dir: directory that get_batch() expects to contain train.bin and val.bin.
data_dir = './shakespeare_data/'
# device: 'cuda' when a GPU is visible to torch, otherwise 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [6]:
def get_batch(split):
    """Draw one random batch of (input, target) token windows from disk.

    ``split == 'train'`` reads ``train.bin``; any other value reads
    ``val.bin``. Both files are looked up under the module-level ``data_dir``
    and interpreted as a flat array of uint16 values.

    Returns:
        A pair ``(x, y)`` of int64 tensors placed on ``device``, each with
        ``batch_size`` rows of ``block_size`` values. Every row of ``y`` is
        the matching row of ``x`` shifted forward by one position.
    """
    # The memmap is intentionally reopened on every call to avoid a memory leak, see
    # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
    bin_name = 'train.bin' if split == 'train' else 'val.bin'
    tokens = np.memmap(os.path.join(data_dir, bin_name), dtype=np.uint16, mode='r')

    # Start offsets lie in [0, len(tokens) - block_size), so the shifted
    # target window still ends inside the data.
    offsets = torch.randint(len(tokens) - block_size, (batch_size,)).tolist()

    x = torch.stack([
        torch.from_numpy(tokens[start:start + block_size].astype(np.int64))
        for start in offsets
    ])
    y = torch.stack([
        torch.from_numpy(tokens[start + 1:start + 1 + block_size].astype(np.int64))
        for start in offsets
    ])

    if device != 'cuda':
        return x.to(device), y.to(device)

    # Pinned host memory lets the host-to-GPU copies be issued asynchronously (non_blocking=True).
    x = x.pin_memory().to(device, non_blocking=True)
    y = y.pin_memory().to(device, non_blocking=True)
    return x, y

In [7]:
# Draw one random training batch: X holds the input windows and y the same
# windows shifted by one token (see get_batch).
# NOTE(review): no RNG seed is set in the visible cells, so X differs between runs.
X, y = get_batch('train')

In [8]:
# Rebind `augmenter` to a fresh DataAugmenter built with k=3, replacing the
# default-constructed instance used in the earlier smoke test.
# NOTE(review): k presumably sets the number of candidates per sequence — confirm.
augmenter = DataAugmenter(k=3)
# Augment the sampled batch without filtering and without explicit
# target_indices; the indices used are taken from augment()'s return value.
output, target_indices = augmenter.augment(X, do_filter=False)

In [9]:
# Row-wise sum of `output`, one value per sequence in the batch.
# Presumably the per-sequence candidate count, expected to equal k when
# do_filter=False — TODO confirm against DataAugmenter.augment.
output.sum(axis=1)

tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3])

In [10]:
# Turn the augmented batch back into text: X plus the two values returned by
# augment() are passed to generate_new_sentences, and the result is displayed.
# NOTE(review): this prints every variant for the whole batch; consider
# slicing the result to keep the saved output short.
augmenter.generate_new_sentences(X, target_indices, output)

[[" her best array;\nBut, like a misbehaved and sullen wench,\nThou pout'st upon thy fortune and own love",
  " her best array;\nBut, like a misbehaved and sullen wench,\nThou pout'st upon thy fortune and good love",
  " her best array;\nBut, like a misbehaved and sullen wench,\nThou pout'st upon thy fortune and fortune love"],
 [" know your daring tongue\nScorns to unsay what once it hath deliver'd.\nIn that dead time when Gloucester's, was plotted,\nI",
  " know your daring tongue\nScorns to unsay what once it hath deliver'd.\nIn that dead time when Gloucester's was was plotted,\nI",
  " know your daring tongue\nScorns to unsay what once it hath deliver'd.\nIn that dead time when Gloucester's had was plotted,\nI"],
 [', how this is encompasseth finger.\nEven so thy breast encloseth my poor heart;\nWear both of them, for both of them are',
  ', how this was encompasseth finger.\nEven so thy breast encloseth my poor heart;\nWear both of them, for both of them are',
  ', how this will e