In [1]:
import time
import torch
import torch.nn as nn
import numpy as np
import random
from torch import optim
import matplotlib.pyplot as plt
from typing import List
from utils import *
from torch.utils.data import Dataset, DataLoader, RandomSampler
import tqdm
from sklearn.decomposition import PCA
from scipy.stats import ttest_ind
from bus_transformer import *
from datasets import load_dataset
from transformers import AutoTokenizer

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

cuda


- limit sequences to 128
- limit tasks to sentence classification
- use single sequence training without NSP
- 


In [2]:
data = load_dataset('Salesforce/wikitext', 'wikitext-103-v1')

In [3]:
data.column_names

{'test': ['text'], 'train': ['text'], 'validation': ['text']}

In [4]:
data['train'].num_rows

1801350

In [5]:
data['test'].num_rows

4358

In [6]:
data['validation'].num_rows

3760

In [7]:
train = data['train']
validation = data['validation']
test = data['test']

In [8]:
seq_len = 128

In [9]:
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')



In [10]:
def remove_empty(examples):
    ret = {}
    ret['text'] = []
    for ex in examples['text']:
        if len(ex) > 2:
            ret['text'].append(ex)

    return ret


In [16]:
def tokenize_func(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

In [12]:
data_train_remove_empty = train.map(remove_empty, batched=True)

Map:   0%|          | 0/1801350 [00:00<?, ? examples/s]

In [14]:
data_valid = validation.map(remove_empty, batched=True)
data_test = test.map(remove_empty, batched=True)

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

In [13]:
data_train_remove_empty['text'][0:10]

[' = Valkyria Chronicles III = \n',
 ' Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . \n',
 " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for serie

In [17]:
tk_train = data_train_remove_empty.map(tokenize_func, batched=True)

Map:   0%|          | 0/1165029 [00:00<?, ? examples/s]

In [18]:
tk_valid = data_valid.map(tokenize_func, batched=True)

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

In [19]:
tk_test = data_test.map(tokenize_func, batched=True)

Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

In [20]:
tk_train

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1165029
})

In [21]:
tk_train.save_to_disk('data/llm_data/tk_train_wiki_dataset.hf')

Saving the dataset (0/9 shards):   0%|          | 0/1165029 [00:00<?, ? examples/s]

In [22]:
tk_valid.save_to_disk('data/llm_data/tk_valid_wiki_dataset.hf')

Saving the dataset (0/1 shards):   0%|          | 0/2461 [00:00<?, ? examples/s]

In [23]:
tk_test.save_to_disk('data/llm_data/tk__wiki_dataset.hf')

Saving the dataset (0/1 shards):   0%|          | 0/2891 [00:00<?, ? examples/s]

In [59]:
model = Decoder(num_blocks=3, d_model=128, d_internal=64, vocab_size=5000, num_heads=3)



In [69]:
model(tk_train[100]['input_ids'])

InternalTorchDynamoError: heads

from user code:
   File "/home/brian/Desktop/school/extra/transformer_pretrain/transfer_transformers/bus_transformer.py", line 31, in forward
    """

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information


You can suppress this exception and fall back to eager by setting:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True


In [78]:
tk_train[1092:2000]['text']

[' = = <unk> variations = = \n',
 '',
 ' The development of an enantioselective ( i.e. yielding an enantiomeric excess , which is labelled as " ee " ) variant of the Johnson – Corey – Chaykovsky reaction remains an active area of academic research . The use of chiral sulfides in a stoichiometric fashion has proved more successful than the corresponding catalytic variants , but the substrate scope is still limited in all cases . The catalytic variants have been developed almost exclusively for enantioselective purposes ; typical <unk> reagents are not prohibitively expensive and the racemic reactions can be carried out with equimolar amounts of ylide without raising costs significantly . Chiral sulfides , on the other hand , are more costly to prepare , spurring the advancement of catalytic enantioselective methods . \n',
 '',
 ' = = = <unk> reagents = = = \n',
 '',
 ' The most successful reagents employed in a stoichiometric fashion are shown below . The first is a bicyclic <unk> that 

In [75]:
tk_train[1]['input_ids']

[101,
 1027,
 11748,
 4801,
 4360,
 11906,
 3523,
 1027,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 