In [1]:
import time
import torch
import torch.nn as nn
import numpy as np
import random
from torch import optim
import matplotlib.pyplot as plt
from typing import List
from utils import *
from torch.utils.data import Dataset, DataLoader, RandomSampler
import tqdm
from sklearn.decomposition import PCA
from scipy.stats import ttest_ind
from bus_transformer import *
from datasets import load_dataset
from transformers import AutoTokenizer

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

cuda


- limit sequences to 128
- limit tasks to sentence classification
- use single sequence training without NSP
- 


In [2]:
data = load_dataset('Salesforce/wikitext', 'wikitext-103-v1')

In [3]:
data.column_names

{'test': ['text'], 'train': ['text'], 'validation': ['text']}

In [4]:
data['train'].num_rows

1801350

In [5]:
data['test'].num_rows

4358

In [6]:
data['validation'].num_rows

3760

In [7]:
train = data['train']
validation = data['validation']
test = data['test']

In [61]:
seq_len = 512 

In [9]:
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')



In [10]:
def remove_empty(examples):
    ret = {}
    ret['text'] = []
    for ex in examples['text']:
        if len(ex) > 2:
            ret['text'].append(ex)

    return ret


In [11]:
def tokenize_func(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

In [12]:
data_train_remove_empty = train.map(remove_empty, batched=True)

In [13]:
data_valid = validation.map(remove_empty, batched=True)
data_test = test.map(remove_empty, batched=True)

In [14]:
data_train_remove_empty['text'][0:10]

[' = Valkyria Chronicles III = \n',
 ' Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . \n',
 " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for serie

In [15]:
tk_train = data_train_remove_empty.map(tokenize_func, batched=True)

In [16]:
tk_valid = data_valid.map(tokenize_func, batched=True)

In [17]:
tk_test = data_test.map(tokenize_func, batched=True)

Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

In [18]:
tk_train

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1165029
})

In [19]:
model = Decoder(num_blocks=3, d_model=128, d_internal=64, vocab_size=30522, num_heads=3, d_hidden=2048, final_dmodel=128)

In [20]:
torch.tensor(tk_train[100]['input_ids']).shape

torch.Size([128])

In [21]:
from importlib import reload
import bus_transformer as bus

In [35]:
reload(bus)
import bus_transformer as bus
model = bus.Decoder(num_blocks=3, d_model=128, d_internal=64, vocab_size=30522, num_heads=3, d_hidden=2048, final_dmodel=128)
model(torch.tensor(tk_train[100]['input_ids'])).shape

torch.Size([1, 128, 30522])

In [36]:
py = model(torch.tensor(tk_train[101]['input_ids']))

In [37]:
import tensorboard_data_server

In [48]:
tk_train['text'][24986:25000]

[' In its original American broadcast on October 31 , 1999 , " Treehouse of Horror X " received an 8 @.@ 6 rating , according to Nielsen Media Research , translating to approximately 8 @.@ 7 million viewers . The episode finished in 34th place in the ratings for the week of October 25 @-@ 31 , 1999 . Following the episode was a rerun of " Treehouse of Horror IX " , which was originally broadcast the previous year . In 2000 , " Treehouse of Horror X " was nominated for a <unk> Golden Eagle Award , which it ultimately won . It was also nominated for a Golden Reel Award in the category of " Best Sound Editing - Television Animation " , which it lost to the SpongeBob SquarePants episode " Mermaid Man & Barnacle Boy " . On October 7 , 2008 , " Treehouse of Horror X " was released as part of The Simpsons : The Complete Eleventh Season DVD set . Mike Scully , George Meyer , Ian Maxtone @-@ Graham , Ron Hauge , Donick Cary , Tim Long , Matt Selman and Pete Michels participated in the audio com

In [38]:
print(model)

Decoder(
  (SoftMax): LogSoftmax(dim=-1)
  (connection): Linear(in_features=128, out_features=2048, bias=True)
  (FFN): Sequential(
    (0): Dropout(p=0.1, inplace=False)
    (1): ReLU()
    (2): Linear(in_features=2048, out_features=8192, bias=True)
    (3): Dropout(p=0.1, inplace=False)
    (4): ReLU()
    (5): Linear(in_features=8192, out_features=30522, bias=True)
    (6): LogSoftmax(dim=-1)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (embeddings): Embedding(30522, 128)
  (pos_embedding): PositionalEncoding(
    (emb): Embedding(128, 128)
  )
  (layernorm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
)


In [78]:
def generate_samples(input:List[int]):
    stop = torch.nonzero(input, as_tuple=True)
    print(stop)


In [79]:
def training_loop(model, data, dev, epochs):
    optimizer = optim.AdamW(model.parameters(), lr=1e-5)
    lr_sched = optim.lr_scheduler.CosineAnnealingLR(optimizer, 1e10)
    loss = torch.nn.CrossEntropyLoss()
    for t in range(epochs):
        for i, d in enumerate(data):
            print(d['text'])
            print(d['input_ids'])
            print(d)
            generate_samples(torch.tensor(d['input_ids']))
            if i == 5:
                return
            continue
            y = d
            py = model(d)
            l = loss(py, d)
            model.zero_grad()
            loss.backward()
            # optimizer.step()
            lr_sched.step()

    return model


In [80]:
training_loop(model, tk_train, None, 1)

 = Valkyria Chronicles III = 

[101, 1027, 11748, 4801, 4360, 11906, 3523, 1027, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
{'text': ' = Valkyria Chronicles III = \n', 'input_ids': [101, 1027, 11748, 4801, 4360, 11906, 3523, 1027, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [29]:
loss = torch.nn.CrossEntropyLoss()
loss(py, torch.tensor(tk_train[101]['input_ids']))

ValueError: Expected input batch_size (1) to match target batch_size (128).

In [30]:
tk_train[101]['text']

" Cicely Mary Barker ( 28 June 1895 – 16 February 1973 ) was an English illustrator best known for a series of fantasy illustrations depicting fairies and flowers . Barker 's art education began in girlhood with correspondence courses and instruction at the Croydon School of Art . Her earliest professional work included greeting cards and juvenile magazine illustrations , and her first book , Flower Fairies of the Spring , was published in 1923 . Similar books were published in the following decades . \n"

In [31]:
tk_train[101]['input_ids']

[101,
 25022,
 29109,
 2100,
 2984,
 12852,
 1006,
 2654,
 2238,
 6301,
 1516,
 2385,
 2337,
 3381,
 1007,
 2001,
 2019,
 2394,
 13825,
 2190,
 2124,
 2005,
 1037,
 2186,
 1997,
 5913,
 11249,
 10775,
 20182,
 1998,
 4870,
 1012,
 12852,
 1005,
 1055,
 2396,
 2495,
 2211,
 1999,
 2611,
 9021,
 2007,
 11061,
 5352,
 1998,
 7899,
 2012,
 1996,
 21838,
 2082,
 1997,
 2396,
 1012,
 2014,
 5700,
 2658,
 2147,
 2443,
 14806,
 5329,
 1998,
 11799,
 2932,
 11249,
 1010,
 1998,
 2014,
 2034,
 2338,
 1010,
 6546,
 20182,
 1997,
 1996,
 3500,
 1010,
 2001,
 2405,
 1999,
 4927,
 1012,
 2714,
 2808,
 2020,
 2405,
 1999,
 1996,
 2206,
 5109,
 1012,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]