```
python3 -m venv run --system-site-packages
source run/bin/activate
pip install wandb==0.14.0
wandb login e5292edda95a11630042fdf943d60d2bbf749fcf
pip install datasets
pip install tokenizers
pip install transformers
```

In [2]:
# pyt
import torch as t
import torch.nn as nn
from torch.utils.data import DataLoader

# data pipeline
from datasets import load_dataset, DatasetDict, load_from_disk
from typing import cast
import math, random

# tokenization
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import TemplateProcessing
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.trainers import BpeTrainer
from transformers import PreTrainedTokenizerFast

# logging
import os, argparse
import wandb


  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
device = t.device("cuda" if t.cuda.is_available() else "cpu")
print(f"Using {device} device")

Using cuda device


In [1]:
hyper = {
    'vs': 2**13,
    'ly': 4,
    'hs': 768,
    'ah': 4,
    'cx': 512,
    'lr': 1e-4,
    'bs': 32,
    'ac': 4,
    'ep': 10,
}

hyper = argparse.Namespace(**hyper)


NameError: name 'argparse' is not defined

In [4]:
dataset = cast(DatasetDict, load_dataset('skeskinen/TinyStories-Instruct-hf'))
dataset['train'].set_format(type='torch', columns=['text'])
dataset['train'].format['type']
dataset['validation'].set_format(type='torch', columns=['text'])
dataset['validation'].format['type']
print(dataset)

Found cached dataset parquet (/Users/galen/.cache/huggingface/datasets/skeskinen___parquet/skeskinen--TinyStories-Instruct-hf-1f9111cb77858404/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2476533
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 25028
    })
})


In [5]:
tok = Tokenizer(BPE())
tok.normalizer = Lowercase()
tok.pre_tokenizer = ByteLevel()
tok.decoder = ByteLevelDecoder()
tok.post_processor = TemplateProcessing(single='$0 <|endoftext|>', special_tokens=[('<|endoftext|>', 1)],)
tok.enable_truncation(max_length=hyper.cx)
tok.enable_padding(pad_token='<pad>', length=hyper.cx)
trainer = BpeTrainer(vocab_size=hyper.vs, initial_alphabet=ByteLevel.alphabet(), special_tokens=['<pad>', '<|endoftext|>', '\n','Words: ', 'Features: ', 'Random sentence: ', 'Summary: ', 'Story: '])

In [6]:
if os.path.isfile('tiny.json'): tok = Tokenizer.from_file('tiny.json')
else: tok.train_from_iterator(dataset['train']['text'], trainer=trainer); tok.save('tiny.json')

tok = PreTrainedTokenizerFast(tokenizer_object=tok)
tok.pad_token = 0


In [7]:
def tokenization(example):
    return tok(example['text'], truncation=True, max_length=hyper.cx, padding='max_length')

if os.path.exists('train_dataset') and os.path.exists('valid_dataset'):
    train = load_from_disk('train_dataset')
    valid = load_from_disk('valid_dataset')
else:
    train = dataset['train'].map(tokenization, batched=True)
    valid = dataset['validation'].map(tokenization, batched=True)
    train.save_to_disk('train_dataset')
    valid.save_to_disk('valid_dataset')


In [8]:
train.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask'])
train.format['type']
valid.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask'])
valid.format['type']

'torch'

In [9]:
trainl = DataLoader(train, batch_size=hyper.bs, shuffle=True)
validl = DataLoader(valid, batch_size=hyper.bs, shuffle=True)

In [10]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len, dropout = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = t.arange(max_len).unsqueeze(1)
        div_term = t.exp(t.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = t.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = t.sin(position * div_term)
        pe[:, 0, 1::2] = t.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
    
    
class trans(nn.Module):
    def __init__(self):
        super().__init__()
        self.inbed = nn.Embedding(hyper.vs, hyper.hs)
        self.posit = PositionalEncoding(hyper.hs, hyper.cx)
        self.think = nn.TransformerEncoderLayer(d_model=hyper.hs, nhead=hyper.ah, dim_feedforward=hyper.hs*4, activation='gelu')
        self.thnkr = nn.TransformerEncoder(self.think, num_layers=hyper.ly)
        self.speak = nn.Linear(hyper.hs, hyper.vs)
        self.cmask= t.triu(t.ones(hyper.cx, hyper.cx) * float('-inf'), diagonal=1)
    def forward(self, x, pask=None):
        x = self.inbed(x) * (hyper.hs ** .5)
        x = self.posit(x)
        x = self.thnkr(x, is_causal=True, mask=pask if pask is not None else self.cmask)
        return self.speak(x)


In [11]:
# run = wandb.init(
#     project="tinystories",
#     config={
#         "learning_rate": hyper.lr,
#         "epochs": 1,
#     })

In [12]:
storytell = trans()

print(f'There are {round((sum(p.numel() for p in storytell.parameters()) - hyper.vs*hyper.hs*2)/1e6, 1)} million parameters in the model, plus {round((hyper.vs*hyper.hs*2)/1e6, 1)} million embeddings parameters.')

There are 35.4 million parameters in the model, plus 12.6 million embeddings parameters.


In [13]:
# wandb.watch(storytell, log_freq=100)

ValueError: You must call `wandb.init` before calling watch

In [14]:
optim = t.optim.Adam(storytell.parameters(), lr=hyper.lr)

In [15]:
lossf = nn.CrossEntropyLoss()

In [None]:
step = 110001
for epoch in range(hyper.ep):
for batch in trainl:
    step += 1
    seq = batch['input_ids']
    out = storytell(seq)
    if hyper.cu:
        loss = lossf(t.flatten(out, end_dim=1), t.flatten(t.roll(seq, -1)))
    else:
        loss = lossf(t.flatten(out, end_dim=1), t.flatten(t.roll(seq.to('cpu'), -1).to('mps')))
    loss.backward()

    if (step % hyper.ac == 0) or (step + 1 == len(trainl)):
            optim.step()
            optim.zero_grad()
            # wandb.log({"loss": loss})
            print(f'Step {step+1} of {len(trainl)}: loss {loss.item()}')

    if step % 100 == 0:
        t.save(storytell.state_dict(), f'story_model_{step}.pt')
        t.save(optim.state_dict(), f'story_optim_{step}.pt')
    
        with t.no_grad():
            tloss = 0
            steps = 0
            storytell.eval()
            for batch in validl:
                seq = batch['input_ids'].to('cuda')
                out = storytell(seq)
                tloss += lossf(t.flatten(out, end_dim=1), t.flatten(t.roll(seq, -1))).item()
                steps += 1
            print(f'validation: loss {tloss/steps}')
            storytell.train()

RuntimeError: Placeholder storage has not been allocated on MPS device!

In [16]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len, dropout = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = t.arange(max_len).unsqueeze(1)
        div_term = t.exp(t.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = t.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = t.sin(position * div_term)
        pe[:, 0, 1::2] = t.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
    
    
class trans(nn.Module):
    def __init__(self):
        super().__init__()
        self.inbed = nn.Embedding(hyper.vs, hyper.hs)
        self.posit = PositionalEncoding(hyper.hs, hyper.cx)
        self.think = nn.TransformerEncoderLayer(d_model=hyper.hs, nhead=hyper.ah, dim_feedforward=hyper.hs*4, activation='gelu')
        self.thnkr = nn.TransformerEncoder(self.think, num_layers=hyper.ly)
        self.speak = nn.Linear(hyper.hs, hyper.vs)
        self.cmask= t.triu(t.ones(hyper.cx, hyper.cx) * float('-inf'), diagonal=1)
    def forward(self, x, pask=None):
        x = self.inbed(x) * (hyper.hs ** .5)
        x = self.posit(x)
        x = self.thnkr(x, is_causal=True, mask=pask if pask is not None else self.cmask)
        return self.speak(x)


In [17]:
storytell = trans()
optim = t.optim.Adam(storytell.parameters(), lr=hyper.lr)

In [18]:
# load model state dict story_model_1000.pt
storytell.load_state_dict(t.load('story_model_11000.pt', map_location=t.device('mps')))
# load optim state dict
# optim.load_state_dict(t.load('story_optim_4500.pt', map_location=t.device('mps')))


<All keys matched successfully>

In [302]:

idx = random.randint(0, len(valid) - 1)
print(idx)

# print the original text
print(tok.decode(valid['input_ids'][idx]))

# generate a story using the model
print('model gen:')
print(tok.decode(storytell(valid['input_ids'][idx].unsqueeze(0).to(t.long)).argmax(dim=-1)[0]))

3174
Features:  conflict
Summary:  dave the duck wanted to win a prize and encountered another duck who wouldn't let him have it. he convinced her to share the prize with him and they both enjoyed it.
Random sentence:  she agreed and dave grabbed the prize proudly.
Story:  

 once there lived a duck named dave. dave was excited to enter the new prize game. every day, he patiently waited for it to start.

 one day, dave saw a big prize near a pond. he wanted to make it his own. he moved closer to get it, but another duck swam in front of him. she wouldn't let him take the prize.

 dave tried to convince her by talking nicely to her. he thought this was the only way he could get the prize. but the other duck would not listen.

 finally, dave had an idea. he told the other duck that if she moved aside, he would share the warm prize with her. she agreed and dave grabbed the prize proudly. he and the duck shared the prize and both enjoyed it.<|endoftext|>000000000000000000000000000000000000

In [19]:

def generate_text(prompt, model, tokenizer, temperature=1.0, max_len=512):
    model.eval()
    with t.no_grad():
        input_ids = tokenizer.encode(prompt, return_tensors='pt').to('mps')[:, :-1]
        print(input_ids.shape)
        cur_len = input_ids.shape[1]
        while cur_len < max_len:
            outputs = model(input_ids)
            next_token_logits = outputs[0][-1, :] / temperature
            next_token_logits[1] = -float('inf')
            next_token_id = t.multinomial(t.softmax(next_token_logits, dim=-1), num_samples=1).unsqueeze(-1)
            input_ids = t.cat([input_ids, next_token_id], dim=1)
            cur_len += 1
            if next_token_id[0][0] == tokenizer.eos_token_id:
                break
        return tokenizer.decode(input_ids.squeeze()[55:], skip_special_tokens=False)
    
idx = random.randint(0, len(valid) - 1)
prompt = tok.decode(valid['input_ids'][idx][:60])
print(prompt)
print(generate_text(prompt, storytell, tok, temperature=1, max_len=200))

Features:  dialogue
Words:  watch, pasta, nosy
Story: 

 abby was so excited to go to the park. she was on her way when she noticed the pasta shop. she wanted to get a snack before the park.

 abby's mom told her to be careful with the money. abby
torch.Size([1, 60])
 with the money. abby ate it in the ladder was about him with water. one day, her mommy tried to the truck to balance. it. he encountered a little boy named timmy. as they saw a big pose! i can listen carefully this movie could, there was a big open the little seed felt good idea, "because i could fly and lay down. you and dance in the balls and bob loved to buy a big box. she found a boy, "the moon, the little girl was to the sun that she loved paper otter was so glad to play with a mean children and unique journey home and welcomed jack was told her tomatoes and kept flying high for the wet leaf! 
Story:  once upon
