# Example use

## Setup & Imports

In [3]:
# Enable python import reloading
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import math
import urllib
import urllib.request
from pathlib import Path
from typing import Tuple

import torch
import torch.nn.functional as F
from datasets import load_dataset
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import WikiText2
from torchtext.vocab import build_vocab_from_iterator
from transformers import GPTNeoXTokenizerFast

from alan_transformer.transformer import Transformer
from alan_transformer.train import train_loop

## Task: Induction Heads

The steps are:

- Download text file from https://www.gutenberg.org/files/100/100-0.txt
- Tokenize it
- Break up into prompts of 1024 tokens each; the answer for each prompt is the
  same size (the same token sequence, offset by 1 token).
- One-hot-encode these
- Group prompts into batches
- Initialise model
- Run train loop - noting that we need to optimise based on all answers.

We'll give the model a dataset with examples of tokens that are repeated (e.g.
" a b c a b c"). The sequence will be random tokens. We can then test if the
model has learnt induction heads (so that it can do " a b c a" -> " b").

### Create the dataset

## Task: Complete Works of Shakespeare

In [16]:
# Download the text file once; later runs reuse the local copy instead of
# fetching it from the server again.
data_dir = Path(".data")
data_dir.mkdir(parents=True, exist_ok=True)
data_path = data_dir / "shakespeare.txt"
data_url = "https://www.gutenberg.org/files/100/100-0.txt"
if not data_path.exists():
    # Download to a temporary name and rename afterwards, so an interrupted
    # download never leaves a truncated file that would pass the exists() check.
    partial_path = data_path.with_name(data_path.name + ".part")
    urllib.request.urlretrieve(data_url, partial_path)
    partial_path.replace(data_path)

# Load as a dataset.
# NOTE: this rebinds `dataset`, shadowing the `torch.utils.data.dataset` module
# imported at the top of the notebook.
dataset = load_dataset("text", data_files=str(data_path))

Using custom data configuration default-9d75120557a0bbf5


Downloading and preparing dataset text/default to /home/user/.cache/huggingface/datasets/text/default-9d75120557a0bbf5/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 2481.84it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 809.71it/s]
                                                                     

Dataset text downloaded and prepared to /home/user/.cache/huggingface/datasets/text/default-9d75120557a0bbf5/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 281.70it/s]


In [29]:
# Tokenize it
tokenizer = GPTNeoXTokenizerFast.from_pretrained(
    "EleutherAI/gpt-neox-20b",
    pad_token="<|endoftext|>",
)


def tokenize_batch(examples):
    """Tokenize the "text" field of a batch of examples.

    Every example is padded and truncated to exactly 1024 tokens, the default
    max length for our transformer.

    NOTE(review): each example appears to be a single line of the source text,
    so most of every 1024-token row is presumably padding - confirm this is
    intended rather than concatenating the text and chunking it into prompts.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",  # Pad to the max length
        truncation=True,  # Truncate to the max length
        max_length=1024,
    )


# batched=True hands the function a batch of examples per call.
dataset = dataset.map(tokenize_batch, batched=True)

 82%|████████▏ | 142/173 [01:01<00:13,  2.27ba/s]

In [None]:
# Number of token ids in the first training example; with padding="max_length"
# and truncation enabled in the tokenize cell this should equal max_length.
first_example = dataset["train"][0]
len(first_example["input_ids"])

[9846,
 452,
 281,
 2451,
 253,
 5323,
 273,
 253,
 2586,
 835,
 368,
 403,
 4441,
 1078,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]