In [1]:
!pip install transformers
!pip install datasets
!pip install torch

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import datetime
import random
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                          bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>',
                                          pad_token='<|pad|>')
tokenizer(["Lets tokenize this text"
           "also this"], return_attention_mask=True)

{'input_ids': [[43, 1039, 11241, 1096, 428, 2420, 14508, 428]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1]]}

In [4]:
from datasets import load_dataset
dataset = load_dataset("BEE-spoke-data/fineweb-literature-100k")
max_length = tokenizer.model_max_length
dataset = dataset.filter(lambda example: len(example["text"].split()) < max_length)
dataset = dataset.map(lambda x: {"tokenized_output": tokenizer('<|startoftext|>'+ x["text"] + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length" )})

In [5]:
splitted_ds = dataset["train"].train_test_split(test_size=0.1)

In [6]:
splitted_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'tokenized_output'],
        num_rows: 53003
    })
    test: Dataset({
        features: ['text', 'tokenized_output'],
        num_rows: 5890
    })
})

In [7]:
class GPT2Dataset(Dataset):

  def __init__(self, dataset):
      self.dataset = dataset


  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, idx):
    return (torch.tensor(self.dataset[idx]["tokenized_output"]["input_ids"]),
           torch.tensor(self.dataset[idx]["tokenized_output"]["attention_mask"]))

In [8]:
train_dataset = GPT2Dataset(splitted_ds["train"])
val_dataset = GPT2Dataset(splitted_ds["test"])

In [9]:
batch_size = 16

train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )

In [10]:
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)
model.resize_token_embeddings(len(tokenizer))
device = torch.device("cuda")
model.cuda()

seed_val = 42
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [11]:
epochs = 1
learning_rate = 5e-4
warmup_steps = 1e2
output_loss_every_steps = 100

In [12]:
from transformers import AdamW

optimizer = AdamW(model.parameters(),
                  lr = learning_rate)



In [13]:
from tqdm.auto import tqdm
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

training_stats = []
model = model.to(device)

for epoch_i in range(0, epochs):

    print("#"*50)
    print(f'Starting Epoch {epoch_i + 1} / {epochs}')


    total_train_loss = 0

    model.train()

    for step, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc="Training"):

        # extract the data from the batch and place it on the GPU
        input_ids = batch[0].to(device)
        labels = batch[0].to(device)
        masks = batch[1].to(device)
        
        # reset the gradients
        model.zero_grad()

        # perform a forward pass
        outputs = model(input_ids,
                        labels=labels,
                        attention_mask=masks)
        # get the loss
        loss = outputs[0]

        # Accumulate the training loss over all of the batches
        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Perform the update step
        optimizer.step()

        # Peek at the loss every output_loss_every_steps
        if step % output_loss_every_steps == 0 and not step == 0:
            print(f"Batch Loss: {batch_loss}")


    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)


    print("#"*50)
    print(f"###  Average epoch training loss: {avg_train_loss}".format(avg_train_loss))
    print("#"*50)



    print("")
    # print("Running Validation...")
    print("#"*50)
    print("## Running Validation")
    print("#"*50)

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in tqdm(validation_dataloader, total=len(validation_dataloader), desc="Validating"):

        input_ids = batch[0].to(device)
        labels = batch[0].to(device)
        masks = batch[1].to(device)

        with torch.no_grad():

            outputs  = model(input_ids,
                            attention_mask=masks,
                            labels=labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    print(f"  Validation Loss: {avg_val_loss}".format())

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss
        }
    )

print("Training Finished")


##################################################
Starting Epoch 1 / 1


Training:   3%|▎         | 101/3313 [01:58<1:02:55,  1.18s/it]

Batch Loss: 2.4491968154907227


Training:   6%|▌         | 201/3313 [03:55<1:01:04,  1.18s/it]

Batch Loss: 2.339693069458008


Training:   9%|▉         | 301/3313 [05:53<59:07,  1.18s/it]  

Batch Loss: 2.1089723110198975


Training:  12%|█▏        | 401/3313 [07:51<57:11,  1.18s/it]

Batch Loss: 2.0105273723602295


Training:  13%|█▎        | 419/3313 [08:12<56:50,  1.18s/it]

In [None]:
# Generate some text to see how the model is doing
model.eval()
for i in range(5):

    sample_outputs = model.generate(
                            bos_token_id=random.randint(1,30000),
                            do_sample=True,
                            top_k=50,
                            max_length = 200,
                            top_p=0.95,
                            num_return_sequences=1
                        )
    for i, sample_output in enumerate(sample_outputs):
            print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

  
