In [1]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import json
import os

In [2]:
with open("inter_vocab.json", "r") as f:
    inter_vocab = json.load(f)

free_tokens = list(inter_vocab.values())
encode_tokens = free_tokens
decode_tokens = {t: i for i, t in enumerate(free_tokens)}
def encode(token):
    return free_tokens[token]

def decode(token):
    return decode_tokens[token]

def change_token_format(token_file):
    with open(token_file, "r") as f:
        blobs = json.load(f)
    
    new_blobs = []
    for blob in blobs:
        new_blobs.append([encode(token - 65000) if token > 65000 else token for token in blob])
    
    with open(token_file.split(".json")[0] + "_converted.json", "w") as f:
        json.dump(new_blobs, f)

change_token_format("../src/outputs/blobs_tokenized_timit.json")


In [3]:
from torch.utils.data import TensorDataset, Dataset
with open("../src/outputs/blobs_tokenized_timit_converted.json", "r") as f:
    tokenized_dataset = json.load(f)
# pad each blob to length of 2048
tokenized_dataset = [blob + [0] * (2048 - len(blob)) for blob in tokenized_dataset]
# Cut blobs that are too long, print how many such blobs there are
print(len([blob for blob in tokenized_dataset if len(blob) > 2048]))
tokenized_dataset = [blob[:2048] for blob in tokenized_dataset]
tokenized_dataset = [torch.tensor(blob) for blob in tokenized_dataset]
# Print total number of blobs
print(len(tokenized_dataset))
print(tokenized_dataset)
# Create pytorch dataset, with one feature called "input_ids"
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data
    
    def __getitem__(self, index):
        input_ids = self.data[index]
        attention_mask = [1] * len(input_ids)
        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask)
        }
    
    def __len__(self):
        return len(self.data)

dataset = MyDataset(tokenized_dataset)

batch_size = 1
data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
print(tokenized_dataset)

120
380
[tensor([ 1936, 63458,  2549,  ...,     0,     0,     0]), tensor([ 1936, 63458,  5000,  ...,     0,     0,     0]), tensor([ 1936, 63458,  8872,  ...,     0,     0,     0]), tensor([ 1936, 63458,  2319,  ...,     0,     0,     0]), tensor([ 1936, 63458,   368,  ..., 63990, 63966, 17581]), tensor([ 1936, 63458,   619,  ...,     0,     0,     0]), tensor([ 1936, 63458,  2422,  ...,     0,     0,     0]), tensor([ 1936, 63458,   619,  ...,     0,     0,     0]), tensor([ 1936, 63458,   409,  ...,     0,     0,     0]), tensor([ 1936, 63458,  9075,  ...,     0,     0,     0]), tensor([ 1936, 63458,  2549,  ...,     0,     0,     0]), tensor([ 1936, 63458,  5000,  ...,     0,     0,     0]), tensor([ 1936, 63458,  1652,  ..., 63978, 63924, 51535]), tensor([ 1936, 63458,  1395,  ...,     0,     0,     0]), tensor([ 1936, 63458,  2598,  ..., 63598, 63925, 63748]), tensor([ 1936, 63458, 13015,  ...,     0,     0,     0]), tensor([ 1936, 63458, 34891,  ...,     0,     0,     0]), tenso

In [4]:
model_name = "AI-Sweden-Models/gpt-sw3-126m"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [6]:
model.device

device(type='cuda', index=0)

In [7]:
training_args = TrainingArguments(
    output_dir="./gpt-sw3",
    per_device_train_batch_size=batch_size,
    overwrite_output_dir=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

In [8]:
trainer.train()

***** Running training *****
  Num examples = 380
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 1140
  Number of trainable parameters = 135780864
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: anforsm (dt2112g1). Use `wandb login --relogin` to force relogin


  'input_ids': torch.tensor(input_ids),


OutOfMemoryError: CUDA out of memory. Tried to allocate 500.00 MiB (GPU 0; 8.00 GiB total capacity; 6.85 GiB already allocated; 0 bytes free; 6.93 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF