In [84]:
# Print the NVIDIA driver/CUDA versions and the GPU(s) visible to this runtime.
!nvidia-smi

Sat Sep 26 11:07:23 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    35W / 250W |   1427MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install Hugging Face Transformers. `%pip` (unlike `!pip`) installs into the
# environment of the kernel that is running this notebook.
%pip install transformers

In [12]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
from transformers import GPT2TokenizerFast, TextDataset, GPT2LMHeadModel, DataCollatorForLanguageModeling, GPT2Tokenizer, get_linear_schedule_with_warmup
import random
from sklearn.model_selection import train_test_split
import os
from tqdm import tqdm, trange
import numpy as np
from dataclasses import dataclass
import itertools

In [21]:
class LinesTextDatasetWithEpochs(Dataset):
    """Fixed-size token blocks cut from several shuffled passes over the examples.

    Every example is prefixed with ``example_del`` and tokenized once. For each
    of the ``num_epochs`` passes the tokenized examples are shuffled (with the
    global ``random`` module) and concatenated into one long id stream. The
    stream is then cut into consecutive, non-overlapping windows of
    ``block_size`` ids; ids at the end that do not fill a whole window are
    dropped.

    Args:
        examples: Iterable of raw text strings.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids`` and
            ``build_inputs_with_special_tokens``.
        block_size: Number of ids per window (before the tokenizer adds any
            special tokens).
        num_epochs: Number of independently shuffled copies to concatenate.
        example_del: Delimiter text prepended to every example.
    """

    def __init__(self, examples, tokenizer, block_size, num_epochs, example_del="<|endoftext|>"):
        super().__init__()

        def encode(text):
            # Delimiter first, so each example starts with the separator token(s).
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(example_del + text))

        per_example_ids = [encode(text) for text in examples]

        # One long stream: num_epochs differently-shuffled copies back to back.
        token_stream = []
        for _ in range(num_epochs):
            shuffled = list(per_example_ids)
            random.shuffle(shuffled)
            for ids in shuffled:
                token_stream.extend(ids)

        # Only start offsets that leave room for a full window are used.
        last_start = len(token_stream) - block_size
        self.data = [
            tokenizer.build_inputs_with_special_tokens(token_stream[start: start + block_size])
            for start in range(0, last_start + 1, block_size)
        ]

    def __getitem__(self, i):
        """Return window ``i`` as a ``torch.long`` tensor."""
        return torch.tensor(self.data[i], dtype=torch.long)

    def __len__(self):
        """Return the number of windows."""
        return len(self.data)

In [82]:
def train(model, train_dataset, val_dataset, device, run_config, collate_fn):
    """Fine-tune ``model`` with a single shuffled pass over ``train_dataset``.

    The loader is iterated exactly once. During that pass the model is
    checkpointed and evaluated on ``val_dataset`` roughly
    ``run_config.num_epochs`` times, and once more (checkpoint only) at the end.

    Args:
        model: Module called as ``model(**batch)`` whose first output is the
            loss; must also provide ``save_pretrained(path)``.
        train_dataset: Dataset to draw training batches from.
        val_dataset: Dataset passed to ``evaluate`` at every intermediate
            checkpoint.
        device: Device the batches are moved to.
        run_config: Object with ``output_dir``, ``batch_size``,
            ``learning_rate``, ``num_warmup_steps`` and ``num_epochs``.
        collate_fn: Collate function handed to the ``DataLoader``.

    Side effects:
        Creates ``run_config.output_dir`` if needed and writes checkpoints to
        ``Step_<step>`` / ``Epoch_<n>`` sub-directories; prints progress.
    """
    os.makedirs(run_config.output_dir, exist_ok=True)

    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset),
                                  batch_size=run_config.batch_size, collate_fn=collate_fn)

    # Only one pass over the loader is executed, so the LR schedule has to span
    # exactly that many optimizer steps (not num_epochs times as many),
    # otherwise the linear decay never completes.
    num_passes = 1
    total_steps = len(train_dataloader) * num_passes

    optimizer = AdamW(model.parameters(), lr=run_config.learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=run_config.num_warmup_steps,
                                                num_training_steps=total_steps)

    # Checkpoint about num_epochs times per pass. The interval is clamped to at
    # least 1 so that a loader with fewer batches than num_epochs cannot cause
    # a modulo-by-zero below.
    checkpoint_every = max(1, len(train_dataloader) // max(1, run_config.num_epochs))

    print("Training started:")
    print(f"\tNum examples = {len(train_dataset)}")
    print(f"\tNum Epochs = {run_config.num_epochs}")

    train_iterator = trange(0, num_passes, desc="Epoch")
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", position=0, leave=True)
        model.train()
        epoch_losses = []
        for step, inputs in enumerate(epoch_iterator):
            # move batch to GPU
            if isinstance(inputs, dict):
                inputs = {k: v.to(device) for k, v in inputs.items()}
            else:
                inputs = inputs.to(device)

            # forward pass to compute logits; the loss is the first output
            outputs = model(**inputs)
            loss = outputs[0]

            # read the scalar once (each .item() forces a device sync)
            loss_value = loss.item()
            epoch_losses.append(loss_value)

            # backward pass - backprop
            model.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            epoch_iterator.set_description(f"Training loss = {loss_value:.4f}")

            if step > 0 and step % checkpoint_every == 0:
                output_dir = os.path.join(run_config.output_dir, f"Step_{step}")
                model.save_pretrained(output_dir)
                test_ce = evaluate(model, val_dataset, device, run_config, collate_fn)
                # evaluate() switches the model to eval mode; switch back so the
                # remaining steps of this pass still train with dropout enabled.
                model.train()
                print(f"After step {step + 1}: \n-train CE={np.mean(epoch_losses)}\n-testCE={test_ce}")

        output_dir = os.path.join(run_config.output_dir, f"Epoch_{epoch + 1}")
        model.save_pretrained(output_dir)


In [86]:
def evaluate(model, test_dataset, device, run_config, collate_fn):
    """Return the mean per-batch loss of ``model`` over ``test_dataset``.

    Batches are visited in order, without gradient tracking, with the model in
    eval mode. The model's previous train/eval mode is restored before
    returning, so calling this in the middle of training does not silently
    leave dropout disabled.

    Args:
        model: Module called as ``model(**batch)`` whose first output is the loss.
        test_dataset: Dataset to evaluate on.
        device: Device the batches are moved to.
        run_config: Object providing ``batch_size``.
        collate_fn: Collate function handed to the ``DataLoader``.

    Returns:
        The unweighted mean of the batch losses, or NaN when the dataset
        yields no batches.
    """
    test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset),
                                 batch_size=run_config.batch_size, collate_fn=collate_fn)
    was_training = model.training
    model.eval()
    ce_losses = []
    try:
        with torch.no_grad():
            for inputs in tqdm(test_dataloader, desc="Evaluating", position=0, leave=True):
                # move batch to GPU
                if isinstance(inputs, dict):
                    inputs = {k: v.to(device) for k, v in inputs.items()}
                else:
                    inputs = inputs.to(device)

                ce_losses.append(model(**inputs)[0].item())
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)

    if not ce_losses:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return float("nan")
    return np.mean(ce_losses)

In [7]:
@dataclass
class RunConfig:
    """Hyper-parameters and output location for one fine-tuning run."""
    # Learning rate handed to the optimizer.
    learning_rate: float = 3e-5
    # Number of items per batch for both training and evaluation loaders.
    batch_size: int = 4
    # Number of epochs' worth of data / intermediate checkpoints.
    num_epochs: int = 1
    # Warm-up steps of the linear learning-rate schedule.
    num_warmup_steps: int = 10
    # Not read by any cell visible in this notebook.
    max_len: int = 512
    # Directory that receives the saved checkpoints.
    output_dir: str = "./model/"
    # Token-block length used when building the datasets. Annotated so that it
    # is a real dataclass field: it can be passed to the constructor and shows
    # up in repr/equality, like every other setting.
    block_size: int = 128

In [8]:
# Colab only: mount the user's Google Drive at /content/drive/ so the training
# data and checkpoints referenced below are reachable through the filesystem.
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [9]:
# Settings for this run; anything not listed keeps its RunConfig default.
run_config = RunConfig(batch_size=32, num_epochs=5,
                       output_dir="/content/drive/My Drive/NLP workshop/quotes")

In [None]:
# One example per line. The encoding is stated explicitly so decoding does not
# depend on the platform default, and the file is iterated lazily instead of
# being materialised by readlines() first.
with open("/content/drive/My Drive/NLP workshop/quotes/quotes_train.txt", "r", encoding="utf-8") as f:
    examples = [line.strip() for line in f]

# Blank lines would otherwise become "examples" consisting of the delimiter only.
examples = [ex for ex in examples if ex]

# Fixed seed so the train/validation split is the same on every run.
train_examples, valid_examples = train_test_split(examples, test_size=0.2, random_state=42)

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# mlm=False selects the causal (non-masked) language-modeling collator.
collate_call = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [22]:
# Training data holds run_config.num_epochs shuffled passes; validation holds one.
train_dataset = LinesTextDatasetWithEpochs(
    examples=train_examples,
    tokenizer=tokenizer,
    block_size=run_config.block_size,
    num_epochs=run_config.num_epochs,
)
val_dataset = LinesTextDatasetWithEpochs(
    examples=valid_examples,
    tokenizer=tokenizer,
    block_size=run_config.block_size,
    num_epochs=1,
)

In [83]:
# Use the GPU when one is available, otherwise fall back to the CPU, and move
# the pretrained GPT-2 language model onto that device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

In [None]:
# Launch fine-tuning; checkpoints are written below run_config.output_dir.
train(
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    device=device,
    run_config=run_config,
    collate_fn=collate_call,
)

In [126]:
def generate_text_greedy(prompt="", max_length=64):
  """Continue ``prompt`` with ``model.generate`` using no sampling or beam options.

  Uses the notebook-level ``model`` and ``tokenizer``. The prompt is prefixed
  with the ``<|endoftext|>`` delimiter before encoding.

  Args:
    prompt: Text the generation starts from (may be empty).
    max_length: Passed to ``model.generate`` as ``max_length``.

  Returns:
    List of decoded strings, one per generated sequence, with special tokens
    stripped.
  """
  model.eval()
  # Follow the model's own device instead of assuming CUDA, so this also works
  # when the notebook fell back to the CPU.
  input_ids = tokenizer.encode("<|endoftext|>" + prompt, return_tensors='pt').to(model.device)
  # pad_token_id is given explicitly (EOS), which is what generate() would pick
  # anyway, but without emitting a warning on every call.
  generated_ids = model.generate(input_ids, max_length=max_length,
                                 pad_token_id=tokenizer.eos_token_id).cpu().tolist()

  generated_text = [tokenizer.decode(g, skip_special_tokens=True) for g in generated_ids]
  return generated_text


def generate_text_beam(prompt="", max_length=64, num_beams=5):
  """Continue ``prompt`` with beam search, forbidding repeated 2-grams.

  Uses the notebook-level ``model`` and ``tokenizer``. The prompt is prefixed
  with the ``<|endoftext|>`` delimiter before encoding.

  Args:
    prompt: Text the generation starts from (may be empty).
    max_length: Passed to ``model.generate`` as ``max_length``.
    num_beams: Passed to ``model.generate`` as ``num_beams``.

  Returns:
    List of decoded strings, one per generated sequence, with special tokens
    stripped.
  """
  model.eval()
  # Follow the model's own device instead of assuming CUDA, so this also works
  # when the notebook fell back to the CPU.
  input_ids = tokenizer.encode("<|endoftext|>" + prompt, return_tensors='pt').to(model.device)
  # pad_token_id is given explicitly (EOS), which is what generate() would pick
  # anyway, but without emitting a warning on every call.
  generated_ids = model.generate(input_ids, max_length=max_length, num_beams=num_beams,
                                 no_repeat_ngram_size=2,
                                 pad_token_id=tokenizer.eos_token_id).cpu().tolist()

  generated_text = [tokenizer.decode(g, skip_special_tokens=True) for g in generated_ids]
  return generated_text


def generate_text_sampling(prompt="", max_length=64, top_k=50, top_p=0.95, temp=1.0, num_return=1):
  """Continue ``prompt`` by sampling (``do_sample=True``) with top-k / top-p filtering.

  Uses the notebook-level ``model`` and ``tokenizer``. The prompt is prefixed
  with the ``<|endoftext|>`` delimiter before encoding.

  Args:
    prompt: Text the generation starts from (may be empty).
    max_length: Passed to ``model.generate`` as ``max_length``.
    top_k: Passed to ``model.generate`` as ``top_k``.
    top_p: Passed to ``model.generate`` as ``top_p``.
    temp: Passed to ``model.generate`` as ``temperature``.
    num_return: Passed to ``model.generate`` as ``num_return_sequences``.

  Returns:
    List of decoded strings, one per generated sequence, with special tokens
    stripped.
  """
  model.eval()
  # Follow the model's own device instead of assuming CUDA, so this also works
  # when the notebook fell back to the CPU.
  input_ids = tokenizer.encode("<|endoftext|>" + prompt, return_tensors='pt').to(model.device)
  # pad_token_id is given explicitly (EOS), which is what generate() would pick
  # anyway, but without emitting a warning on every call.
  generated_ids = model.generate(input_ids, do_sample=True, max_length=max_length, temperature=temp,
                                 top_k=top_k, top_p=top_p, num_return_sequences=num_return,
                                 pad_token_id=tokenizer.eos_token_id).cpu().tolist()

  generated_text = [tokenizer.decode(g, skip_special_tokens=True) for g in generated_ids]
  return generated_text

In [130]:
# Compare the three decoding helpers on an empty prompt: greedy, beam search,
# and three independently sampled continuations.
print(generate_text_greedy())
print(generate_text_beam())
print(generate_text_sampling(num_return=3))

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


['The best way to get lost in the beauty of life is to be yourself.']


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


["If you want to be a success, you must be willing to take risks. If you don't, chances are you won't be successful."]
['The worst kind of revenge is when you beat up someone because they were not responsible for his/her bad actions.', 'I used to believe all things were just a dream. Now I know they were real when you believe everything is real.', 'Your thoughts and actions shape your future. You are what you think and you are what you do.']


In [144]:
# Draw three more samples with the sampling knobs spelled out (these are the
# same values as the function's defaults) so they are easy to tweak in place.
generate_text_sampling(num_return=3, top_k=50, top_p=0.95, temp=1.0)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


["If we don't let someone take us to the right place, we're dead.",
 'The beauty of life is that it is so simple.',
 'I always feel safe, because I know my life is mine.']

In [79]:
# NOTE(review): this always raises ZeroDivisionError. Presumably it is a
# deliberate stop so that "Run all" does not reach the cleanup cell below and
# discard the trained model -- confirm, and consider an explicit, clearly
# worded guard instead.
1 / 0

ZeroDivisionError: ignored

In [80]:
# Release GPU memory: drop the notebook-level references that may keep CUDA
# tensors alive, collect garbage, then return cached blocks to the driver.
import gc

model = None
train_dataloader = None  # was misspelled "train_dataloder", which cleared nothing useful
epoch_iterator = None
# NOTE(review): the names below are not assigned at notebook level in the
# visible cells; presumably left over from earlier experiments -- confirm.
x, y = None, None
loss, optimizer = None, None
logits_y = None
scheduler = None

gc.collect()
torch.cuda.empty_cache()