# Installations

In [11]:
!pip install transformers



# GPU Debugging

In [1]:
!nvidia-smi

Fri Aug 27 11:46:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   37C    P8    27W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

# Imports

In [1]:

import numpy as np
import os
import random
import glob
from tqdm import tqdm
import logging
import os
import pickle

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler

import transformers
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2PreTrainedModel,
    GPT2Tokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    AdamW, 
    get_linear_schedule_with_warmup
)

torch.autograd.set_detect_anomaly(False)
torch.autograd.profiler.profile(False)
torch.autograd.profiler.emit_nvtx(False)
torch.backends.cudnn.benchmark = True

# GPU Info

In [2]:
# Returns device name:
print(f"{torch.cuda.get_device_name(0)} is using as a GPU ")

# Returns the current GPU memory usage by tensors in bytes for a given device
print(f"CUDA memory allocation in bytes: {torch.cuda.memory_allocated()}")

# Returns the current GPU memory managed by the caching allocator in bytes for a given device
print(f"CUDA Caching allocation in bytes: {torch.cuda.memory_cached()}")

# GPU Summary
print(torch.cuda.memory_summary(device=None, abbreviated=False))

Tesla K80 is using as a GPU 
CUDA memory allocation in bytes: 0
CUDA Caching allocation in bytes: 0
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |    



# Path & Device Arrangement

In [10]:
# Storage path:
output_dir = "./storage/models"
assert os.path.isdir(output_dir), f"{output_dir} could not found." 

# Device:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"{device} is using as a accelerator!")

# Script Path:
love_script_name = "data.txt"
data_dir = os.path.join("data", love_script_name)

cuda is using as a accelerator!


# Dataset 

In [11]:
class ScriptData(Dataset):
    """ MODEL_CLASSES = {"gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)} """
    logger = logging.getLogger(__name__)

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        file_path: str,
        block_size=512,
        overwrite_cache=False,
    ):
        assert os.path.isfile(file_path)

        block_size = block_size - (
            tokenizer.model_max_length - tokenizer.max_len_single_sentence
        )

        directory, filename = os.path.split(file_path)

        # change if args are added at later point
        cached_features_file = os.path.join(
            directory, "gpt2" + "_" + str(block_size) + "_" + filename
        )

        if os.path.exists(cached_features_file) and not overwrite_cache:
            self.logger.info(
                f"Loading features from your cached file {cached_features_file}"
            )
            with open(cached_features_file, "rb") as cache:
                self.examples = pickle.load(cache)
                self.logger.debug("Loaded examples from cache")
        else:
            self.logger.info(f"Creating features from file {filename} at {directory}")

            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                text = f.read()
                self.logger.debug("Succesfully read text from file")

            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

            for i in range(0, len(tokenized_text) - block_size + 1, block_size):
                self.examples.append(
                    tokenizer.build_inputs_with_special_tokens(
                        tokenized_text[i : i + block_size]
                    )
                )

            self.logger.info(f"Saving features into cached file {cached_features_file}")
            
            with open(cached_features_file, "wb") as cache:
                pickle.dump(self.examples, cache, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

# Initializing GPT-2 from Storage or Web 

In [12]:
gpt_models = {
    'small'  : 'gpt2',
    'medium' : 'gpt2-medium',
    'large'  : 'gpt2-large',
    'x-large': 'gpt2-xl'
}



init_gpt_path = gpt_models['small']
print(f"{init_gpt_path.upper()} model is using as a language model.")

tokenizer = GPT2Tokenizer.from_pretrained(init_gpt_path)
model = GPT2LMHeadModel.from_pretrained(init_gpt_path)

GPT2 model is using as a language model.


Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

# Hyperparameter and Initializing Optimizer & Scheduler

In [21]:
hyperparameters = {
    'BATCH_SIZE': 8,
    'START_EPOCH': 0,
    'END_EPOCH' : 20,
    'LEARNING_RATE': 0.00002,
    'WARMUP_STEPS' : 10000,
    'SHUFFLE' : True,
    'BATCH_SIZE_GPU' : 1,
    'PIN_MEMORY': True,
    'NUM_WORKERS': 4
}



optimizer = AdamW(
        model.parameters(),
        lr = hyperparameters['LEARNING_RATE']
)

scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps = hyperparameters['WARMUP_STEPS'],
        num_training_steps = -1
)

# PyTorch Script Dataset

In [14]:
dataset = ScriptData(
        tokenizer = tokenizer,
        file_path = data_dir
)
script_loader = DataLoader(
        dataset, 
        batch_size = hyperparameters['BATCH_SIZE_GPU'],
        shuffle = hyperparameters['SHUFFLE'],
        pin_memory = hyperparameters['PIN_MEMORY'],
        num_workers = hyperparameters['NUM_WORKERS']
)

x = next(iter(script_loader))
print(f"Shape of the tensors :{x.size()}")

Shape of the tensors :torch.Size([1, 512])


# Test GPT-2 Model

In [11]:
import gc 
gc.collect()

x = next(iter(script_loader))

# Caution: CPU Runnning!
with torch.no_grad():
    loss, logits = model(x.to(non_blocking=False))[:2]

del loss, logits, x

# Clear CUDA Memory

In [53]:
import gc
torch.cuda.empty_cache()
gc.collect()

20

# Parameteter Checkpoint

In [22]:
script_count = 0
sum_loss = 0.0
batch_count = 0
start_epoch = hyperparameters['START_EPOCH']
end_epoch = hyperparameters['END_EPOCH']
batch_iter = 0
BATCH_SIZE = hyperparameters['BATCH_SIZE']

In [23]:
start_epoch, end_epoch, batch_iter, BATCH_SIZE

(0, 20, 0, 8)

# Moving CPU To CUDA

In [24]:
model.train()
model = model.to(device)

In [25]:
def optimizer_to(optim, device) -> None:
    """ Moving PyTorch Optimizer to device """
    for param in optim.state.values():
        if isinstance(param, torch.Tensor):
            param.data = param.data.to(device)
            if param._grad is not None:
                param._grad.data = param._grad.data.to(device)
        elif isinstance(param, dict):
            for subparam in param.values():
                if isinstance(subparam, torch.Tensor):
                    subparam.data = subparam.data.to(device)
                    if subparam._grad is not None:
                        subparam._grad.data = subparam._grad.data.to(device)
optimizer_to(optimizer, device)

# Fine-tuning GPT-2 Loop

In [27]:
for epoch in range(start_epoch, end_epoch):
    print(f"Epoch number : {epoch}")

    for idx, script in tqdm(enumerate(script_loader), total = len(script_loader), leave=False):
        
        # Moving to CUDA:
        x = script.to(device, 
                      #non_blocking=False
            )
        outputs = model(x, labels = x)
        
        loss, logits = outputs[:2] 

        # Normalize loss to account for batch accumulation:                    
        loss = loss / BATCH_SIZE
        loss.backward()

        sum_loss = sum_loss + loss.detach().item()
                       
        script_count = script_count + 1
        batch_iter += 1

        if script_count == BATCH_SIZE:
            script_count = 0    
            batch_count += 1
            optimizer.step()
            scheduler.step() 

            for param in model.parameters():
                param.grad = None

            optimizer.zero_grad()
            model.zero_grad()
            
        if batch_count == 100:
            model.eval()
            print(f"Summed loss : {sum_loss}")

            with torch.no_grad():
                sample_outputs = model.generate(
                                        bos_token_id=random.randint(1, 30000),
                                        do_sample=True,   
                                        top_k=50, 
                                        max_length = 500,
                                        top_p=0.95, 
                                        num_return_sequences=1
                                )

            print("Output:\n" + 100 * '-')
            for i, sample_output in enumerate(sample_outputs):
                sample_text = tokenizer.decode(sample_output, skip_special_tokens=True)
                print("{}: {}".format(i, sample_text))
                    
            batch_save_path = os.path.join(output_dir, 'batch', f"gpt_script_epoch_{epoch}_batch_{batch_iter}.bin")
            state = {
                'epoch': epoch,
                'batch_iter': batch_iter,
                'loss': sum_loss,
                'model_state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler' : scheduler.state_dict()
            }
            torch.save(state, batch_save_path)
            print(f"Checkpoint {batch_save_path} is succesfully saved.")

            #del state
            #torch.save(model.state_dict(), batch_save_path)
            
            batch_count = 0
            sum_loss = 0.0
            model.train()

Epoch number : 0
Summed loss : 312.8936052918434
Output:
----------------------------------------------------------------------------------------------------
0: ook

"I've read about it, but it's really disturbing"

–Michael Keaton

"The Last Unicorn" is available here

The Lost Season

"We're at a time where humans are disappearing, but we have a world in which we have hope and we have hope that there are new people in the future, and all of a sudden this monster is just coming!"

–Willa Holland, Game of Thrones creator

"The Last Unicorn" is available here

Lights Out

"Everyone should have the opportunity to experience and experience a magical fantasy world from one of The Last Unicorn's characters"

–John Ridley, director

"Lights Out" is available here

"We all should feel very privileged and grateful for The Last Unicorn"

–Matt Lopatin, creator

"Lights Out" is available here

The Last Unicorn

"The Last Unicorn" is available here

The Last Unicorn Season 3

"The last Unicorn is

 28%|██▊       | 750/2720 [03:44<09:41,  3.39it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 57%|█████▋    | 1550/2720 [08:12<05:48,  3.36it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 86%|████████▋ | 2350/2720 [12:33<01:51,  3.33it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 16%|█▌        | 430/2720 [02:10<11:25,  3.34it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 45%|████▌     | 1230/2720 [06:27<07:23,  3.36it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 75%|███████▍  | 2030/2720 [10:49<03:25,  3.36it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 110/2720 [00:33<13:05,  3.32it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███▎      | 910/2720 [04:53<09:00,  3.35it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 63%|██████▎   | 171

In [65]:
batch_save_path = os.path.join(output_dir, 'batch', f"gpt2_film_epoch_{epoch}_batch_{batch_iter}.bin")
state = {
    'epoch': epoch,
    'batch_iter': batch_iter,
    'loss': sum_loss,
    'model_state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    'sample_texts' : sample_texts,
    'scheduler' : scheduler.state_dict()
}
torch.save(state, batch_save_path)

# Loading PyTorch GPT-2 Model from Last Pretrained Path

In [None]:
# Retrieve last pretrained model and convert to cuda tensors:
batch_save_path = os.path.join(output_dir, 'batch', f"gpt2_film_epoch_{epoch}_batch_{batch_iter}.bin")
state = torch.load(batch_save_path)
model.load_state_dict(state['model_state_dict'])
model.to(device)

# Saving Hugging Face Transformer Format

In [28]:
from transformers import WEIGHTS_NAME, CONFIG_NAME
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

torch.save(model.state_dict(), output_model_file)
model.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_dir)

('./storage/models/vocab.json', './storage/models/merges.txt')

# Loading Hugging Face GPT-2 Model & Tokenizer from Last Pretrained Path

In [None]:
model = GPT2LMHeadModel.from_pretrained(output_dir)
model.eval()
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

#  Script Generation

In [14]:
input_ids = tokenizer.encode('         He kisses her softly in İstanbul.           ', return_tensors='pt').to(device)

In [15]:
sample_outputs = model.generate(
                        input_ids = input_ids.to(device),
                        num_beams = 3,
                        max_length = 1000,
                        top_p=0.85, 
                        num_return_sequences=1,
                )

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
      print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /opt/conda/conda-bld/pytorch_1623448234945/work/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


Output:
----------------------------------------------------------------------------------------------------
0:          He kisses her softly in İstanbul.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [37]:
def generate(model, tokenizer, input_text=None, num_samples=1, max_length=1000, device = torch.device('cuda:0')):
    model.eval()
    

    if input_text:

        input_ids = tokenizer.encode(input_text, return_tensors='pt')

        if model.device.type == 'cuda':
            input_ids = input_ids.to(device)
            
        output = model.generate(
            input_ids= input_ids,
            do_sample=True,   
            top_k=50, 
            max_length = max_length,
            top_p=0.95, 
            num_return_sequences= num_samples
        )
    else:
        output = model.generate(
            bos_token_id = random.randint(1,50000),
            do_sample=True,   
            top_k=50, 
            max_length = max_length,
            top_p=0.95, 
            num_return_sequences=num_samples

        )


    decoded_output = []
    for sample in output:
        decoded_output.append(tokenizer.decode(
            sample, skip_special_tokens=True))

    return decoded_output

def script_to_txt(file_name:str,  text:str) -> None:
    if not file_name.endswith('.txt'):
        file_name += '.txt'
        
    with open(file_name, "w") as f:
        f.write(text)

In [20]:
input_text = 'A lonely night in İstanbul, two haters walks around the city streets.'
max_length = 100
decoded_output = generate(model, tokenizer, input_text = input_text, num_samples=1, max_length=max_length)
print(decoded_output[0])

A beautiful night in İstanbul, two lovers walk around the city streets.  It's 
               not unusual for me and your father to walk together 
               alone.


                                      DYLE
                           That, I guess, isn't too uncommon.

                They walk down street, with their backs to her. She looks 
               past. He follows behind them and stops.


                                      DYLE
                               (To himself)
                         This isn't just going on like that, 
                          it's not just 
                               (To her)
                          That's just your father, that's you.

               As they pass in front of him in front of 
               the street-view window, DYLE is still standing close 
               beside her, looking straight down at her.


                                       BOB
                          You're looking down at me, Dyle.


        