# Example use

## Setup & Imports

In [None]:
# Enable python import reloading: load IPython's autoreload extension and
# select mode 2, so edits to imported modules (e.g. the local
# alan_transformer package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import math
import urllib
import urllib.request
from pathlib import Path
from typing import Tuple

import torch
import torch.nn.functional as F
from datasets import load_dataset, Dataset, load_from_disk
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import DataLoader
from torch.utils.data import dataset
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import WikiText2
from torchtext.vocab import build_vocab_from_iterator
from transformers import GPTNeoXTokenizerFast

from alan_transformer.transformer import Transformer
from alan_transformer.train import train_loop

  from .autonotebook import tqdm as notebook_tqdm


## Task: Complete Works of Shakespeare

### Get the dataset

In [26]:
def create_dataset(data_dir = Path(".data"), load_if_exists: bool = True) -> Dataset:
    """Download the complete works of Shakespeare and tokenize them.

    The raw text is fetched from Project Gutenberg into ``data_dir``, loaded
    with the Hugging Face ``"text"`` loader, tokenized with the GPT-NeoX
    tokenizer (padded and truncated to 1024 tokens per example) and saved to
    ``data_dir / "shakespeare_dataset"``.

    Args:
        data_dir: Directory holding both the downloaded text file and the
            saved tokenized dataset. Accepts a ``Path`` or a ``str``.
        load_if_exists: When True and a saved dataset already exists under
            ``data_dir``, load and return it instead of downloading and
            tokenizing again.

    Returns:
        The tokenized dataset.
        NOTE(review): callers index the result with ``["train"]``, so this
        looks like a mapping of splits rather than a single ``Dataset`` --
        confirm and tighten the return annotation.
    """
    # Honour the caller's directory for *everything* this function writes
    # (previously the argument was overwritten with Path(".data") before the
    # download, so the text file and the saved dataset could end up in
    # different places).
    data_dir = Path(data_dir)
    dataset_path = data_dir / "shakespeare_dataset"

    # Return the dataset from disk if it already exists
    if load_if_exists and dataset_path.exists():
        return load_from_disk(str(dataset_path))

    # Download text file
    data_dir.mkdir(parents=True, exist_ok=True)
    data_path = data_dir / "shakespeare.txt"
    data_url = "https://www.gutenberg.org/files/100/100-0.txt"
    urllib.request.urlretrieve(data_url, data_path)

    # Load as a dataset
    raw_dataset = load_dataset("text", data_files=str(data_path))

    # Tokenize it; the end-of-text token doubles as the padding token
    tokenizer = GPTNeoXTokenizerFast.from_pretrained("EleutherAI/gpt-neox-20b", pad_token = "<|endoftext|>")
    tokenized_dataset = raw_dataset.map(
            lambda examples: tokenizer(
                examples["text"],
                padding="max_length", # Pad to the max length
                truncation=True, # Truncate to the max length
                max_length=1024, # 1024 is the default max length for our transformer,
                is_split_into_words=False,
                return_tensors="pt" # Return a pytorch tensor per prompt
            )
        )

    # Save the dataset so later calls can take the load_if_exists shortcut
    tokenized_dataset.save_to_disk(str(dataset_path))

    # Return the dataset
    return tokenized_dataset

# Build the tokenized dataset. load_if_exists=False bypasses any copy already
# saved on disk, so every run of this cell downloads and tokenizes the text
# again.
dataset = create_dataset(load_if_exists = False)

# dataloader = DataLoader(dataset["train"], batch_size=8, shuffle=True)

Using custom data configuration default-0fb7688fd30e1ad5


Downloading and preparing dataset text/default to /home/user/.cache/huggingface/datasets/text/default-0fb7688fd30e1ad5/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 5461.33it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 1011.89it/s]
                                                                      

Dataset text downloaded and prepared to /home/user/.cache/huggingface/datasets/text/default-0fb7688fd30e1ad5/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 329.92it/s]
100%|██████████| 172423/172423 [01:33<00:00, 1849.96ex/s]
                                                                                                   

In [2]:
# Peek at a single shuffled batch of 8 examples from the training split
train_split = dataset["train"]
batch_iter = iter(DataLoader(train_split, batch_size=8, shuffle=True))
next(batch_iter)

NameError: name 'DataLoader' is not defined

In [12]:
# GPT-NeoX tokenizer, with the end-of-text token reused as the padding token
tokenizer = GPTNeoXTokenizerFast.from_pretrained(
    "EleutherAI/gpt-neox-20b",
    pad_token = "<|endoftext|>",
)

# A handful of sample lines (including empty ones) to run through the tokenizer
text = [
    '    This seven years did not Talbot see his son;',
    '',
    '',
    'CLOWN.',
    "    Freshly on me. 'Tis surely for a name.",
    'Till thou the lie-giver and that lie do lie',
    'Apt to be render’d, for someone to say,',
    'think thee an honest man; thou shouldst neither want my means for thy',
]

# Pad/truncate every sample to 1024 tokens (the default max length for our
# transformer) and return PyTorch tensors
res = tokenizer(
    text,
    padding="max_length",
    truncation=True,
    max_length=1024,
    is_split_into_words=False,
    return_tensors="pt",
)

# Token ids of the first sample
res["input_ids"][0]

tensor([50274,  1552,  5093,  ...,     0,     0,     0])

### Run the training loop

In [1]:
# Build the training batches. `dataloader` must exist before train_loop is
# called; without this line the cell fails with a NameError on a fresh
# kernel. Batch size and shuffling follow the configuration used for the
# training split elsewhere in this notebook.
# NOTE(review): confirm that train_loop accepts the batch format this
# DataLoader yields for the tokenized dataset.
dataloader = DataLoader(dataset["train"], batch_size=8, shuffle=True)

model = Transformer()

# Train on the CPU
train_loop(
    model,
    dataloader,
    device=torch.device("cpu")
)


KeyboardInterrupt

