In [1]:
import sentencepiece as spm
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import math
from collections import OrderedDict
import importlib
import teenygpt

In [2]:
# Load text.
with open("data/tiny_shakespeare.txt", "rt", encoding="utf-8-sig") as f:
    text = f.read()

# Set up encoder.
encoder = teenygpt.CharEncoder(text)

# Set up datasets.
dataset_train, dataset_val, dataset_test = teenygpt.create_datasets(text, encoder)

In [5]:
xs, ys = dataset_train.get_batches(teenygpt.Config())

for i in range(len(ys)):
    x = repr(encoder.decode(xs[i].tolist()))
    y = repr(encoder.decode(ys[i].tolist()))
    print(f"{i:2} {x:24} {y}")

 0 ' down,\nAnd pay y'      'down,\nAnd pay yo'
 1 'e will meet them'       ' will meet them,'
 2 'd let it be more'       ' let it be more '
 3 'titchery; I must'       'itchery; I must '
 4 "e welcome: what'"       " welcome: what's"
 5 't do well, I do '       ' do well, I do n'
 6 'e half of my lan'       ' half of my land'
 7 'nceit, my gracio'       'ceit, my graciou'
 8 'here protest,\nUp'      'ere protest,\nUpo'
 9 ' supply the room'       'supply the room:'
10 'all be satisfied'       'll be satisfied\n'
11 'conditions, whic'       'onditions, which'
12 ' legs and not pr'       'legs and not pre'
13 'y your voices, w'       ' your voices, wi'
14 'an; against whos'       'n; against whose'
15 'rson than myself'       'son than myself,'
16 ' III:\nO Ratcliff'      'III:\nO Ratcliff,'
17 'r:\nI wis it is n'      ':\nI wis it is no'
18 't up.\n\nANTIGONUS'     ' up.\n\nANTIGONUS:'
19 'my spiriting gen'       'y spiriting gent'


In [218]:
importlib.reload(teenygpt.config)
importlib.reload(teenygpt.model)
importlib.reload(teenygpt.model_bkitano)
importlib.reload(teenygpt)

<module 'teenygpt' from '/Volumes/git/src/achang/teenyGPT/teenygpt/__init__.py'>

In [213]:
config = teenygpt.Config(
    vocab_size=67,
    batch_size=128,
    d_model=128,
    d_ffn=128,
    attention_heads=8,
    attention_layers=4,
    dropout_p=0.1,
)
#losses = {}

In [219]:
# model = teenygpt.LlamaModel(config, "LlamaModel")
model = teenygpt.AttentionModel(config, "AttentionModel - v4")
optimizer = torch.optim.Adam(model.parameters())

print(f'model parameters: {model.param_count()}')

def train(iterations):
    train_losses = []
    val_losses = []
    best_loss = math.inf
    
    progress = tqdm(range(iterations))
    for i in progress:
        xs, ys = dataset_train.get_batches(config)
    
        optimizer.zero_grad()
        
        logits = model(xs)
        loss = model.loss(logits, ys)
        
        loss.backward()
        optimizer.step()
    
        if i % 10 == 0:
            train_loss = model.estimate_loss(dataset_train)
            val_loss = model.estimate_loss(dataset_val)

            train_losses.append(train_loss)
            val_losses.append(val_loss)
            
            best_loss = min(best_loss, val_loss)
            progress.set_description(f"val_loss: {best_loss:0.2f}")

    return train_losses, val_losses

train_losses, val_losses = train(5000)
losses = losses | {f"train ({model.name})": train_losses, f"val ({model.name})": val_losses}
losses_df = pd.DataFrame({k: pd.Series(v) for k, v in losses.items()})
losses_df.plot()
losses_df.min()

model parameters: 543432


val_loss: 1.09:  71%|█████████████████████████████████████████████████████████████████████████████▍                               | 3553/5000 [09:43<04:23,  5.48it/s][E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument
val_loss: 1.09:  71%|█████████████████████████████████████████████████████████████████████████████▍                               | 3554/5000 [09:43<03:57,  6.09it/s]


KeyboardInterrupt: 

In [186]:
teenygpt.AttentionModel(
    teenygpt.Config(
        vocab_size=67,
        batch_size=64,
        d_model=128,
        attention_heads=8,
        attention_layers=3,
        dropout_p=0.1,
    ),
"dummy").param_count()

413511

In [70]:
generated = model.generate(xs, count=100)
for i in range(generated.shape[b0]):
    print(repr(encoder.decode(generated[i])))

" down,\nAnd pay yourst,\nStenter not.\na we speak:\nAnge-pole he 'trayal nhegele's comess: I no the foor thest\nWill for "
"e will meet them, loes have well atccedried me?\n\n\nGLOUCESTESBY:\n\nKING ROK:\nBy, redet, Lecomengen'd? droof.\n\nAUTOESn "
"d let it be more mince,\nBoth remeanies, or thy low-k noble'd like\nAnd 'ses drumngerating our me\nTraining the sonleea"
"titchery; I must and Kin,\nO gaint notreast\nFor us gooIr am say like? Forbad, you, my unig band reace,\nI'll my suturn"
"e welcome: what's lay Sergerowart here'el,\nFain\nFerelmard me frisitshfuld?\n\nANTONIO:\nO, good now befe I reselver,\nIs"
't do well, I do idnot fe, behopard, I foge I must prerson as excills and prayed my court,\nAnd\nnones what pame\nFectur'
'e half of my landes!\n\nMAtcENIO:\nUas I\nWARINA:\nThough ponate, as ome draiter confort, Chartion, fur prayed by resome '
'nceit, my graciousan, lead me cumble a limess and me\nAs Cirtizen and pyour thansess,\nWhen?\nWhy, would is MursEraUR:\n'
'here prote

# Train

In [None]:
spm.SentencePieceTrainer.train(
    input='data/tiny_shakespeare.txt',
    model_prefix='model/sentencepiece/tiny_shakespeare_67',
    vocab_size=67,
    remove_extra_whitespaces=False,
    character_coverage=1.0,
)

In [None]:
processor = spm.SentencePieceProcessor(
    model_file="model/sentencepiece/tiny_shakespeare.model"
)

In [None]:
x = processor.encode('this is a test', add_eos=True)

In [None]:
processor.decode(x)

In [None]:
processor.piece_to_id('<s>')