# Script Writing

GPT-2 Model Experiment 2<br>
Script writing in Korean
- Data: [짤툰](https://www.youtube.com/c/%EC%A7%A4%ED%88%B01) script data
- Model: [SKT AI KoGPT2](https://github.com/SKT-AI/KoGPT2) fine-tuning

Author: [Seongbum Seo](https://github.com/Seongbuming)

In [1]:
import torch
torch.cuda.empty_cache()

## Background Setup

In [2]:
# Install transformers library
%pip install -q git+https://github.com/huggingface/transformers.git
# Install helper functions
%pip install -q git+https://github.com/gmihaila/ml_things.git
%pip install -q fastai==2.2.5

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Clone base model
!git clone https://github.com/SKT-AI/KoGPT2
%pip install matplotlib==3.1.3

fatal: destination path 'KoGPT2' already exists and is not an empty directory.
Collecting matplotlib==3.1.3
  Using cached matplotlib-3.1.3-cp38-cp38-manylinux1_x86_64.whl (13.1 MB)
Installing collected packages: matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.5.2
    Uninstalling matplotlib-3.5.2:
      Successfully uninstalled matplotlib-3.5.2
[31mERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

ml-things 0.0.1 requires matplotlib>=3.4.0, but you'll have matplotlib 3.1.3 which is incompatible.[0m
Successfully installed matplotlib-3.1.3
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.

## Model setup

In [3]:
import io
import os
import torch
import transformers
import fastai
import re
import pandas as pd
from typing import Optional
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader, random_split
from ml_things import fix_text
from transformers import AutoModelWithLMHead, PreTrainedTokenizerFast, GPT2Config, AdamW, get_linear_schedule_with_warmup
from fastai.text.all import *

# Set seed for reproducibility
set_seed(123)

# Number of training epochs
epochs = 10

# Number of batches - depending on the max sequence length and GPU memory
# For 512 sequence length batch of 10 works without cuda memory issues
# For small sequence length can try batch of 32 or higher
batch_size = 8

# Pad or truncate text sequences to a specific length
# If 'None' it will use maximum sequence of word piece tokens allowed by model
max_length = 256

# Look for GPU to use
# Will use 'cpu' by default if no GPU found
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Name of the base model to use
model_name_or_path = 'skt/kogpt2-base-v2'

# Path of data to use for training
data_path = './dataset/jjaltoon_scripts'

kogpt2_config = {
    'initializer_range': 0.02,
    'layer_norm_epsilon': 1e-05,
    'n_ctx': 1024,
    'n_embd': 768,
    'n_head': 12,
    'n_layer': 12,
    'n_positions': 1024,
    'vocab_size': 50000
}

## Data

In [18]:
class ScriptDataset(Dataset):
    def __init__(self, path, use_tokenizer, max_sequence_len=None):
        # Check if path exists
        if not os.path.isdir(path):
            # Raise error if path is invalid
            raise ValueError('Invalid `path` variable. Needs to be a directory.')
            
        # Check max sequence length
        max_length = use_tokenizer.model_max_length if max_sequence_len is None else max_sequence_len
        
        self.dataset = []
        
        # Get all files from path
        files_names = os.listdir(path)
        # Go through each file and read its content
        for file_name in tqdm(files_names, desc=f'script files'):
            file_path = os.path.join(path, file_name)
            
            # Read content as pandas dataframe
            #content = io.open(file_path, mode='r', encoding='utf-8').read()
            df = pd.read_csv(file_path)
            
            # Load features from each row of content
            data = []
            for _, row in df.iterrows():
                # Fix any unicode issues
                speech = fix_text(row['speech'])
                speech_type = fix_text(row['speech_type'])
                speaker = fix_text(row['speaker'])
                
                # Call tokenizer on speech text to convert into tensors of numbers with appropriate padding
                tokenized = use_tokenizer(text=speech, return_tensors='pt', padding=True, max_length=max_length)
                
                # Save dataset
                self.dataset.append({
                    tokenized, row['speech_type'], row['speaker']
                })
        
        # Number of examples
        self.n_dataset = len(self.dataset)
    
    def __len__(self):
        r'''When used `len` return the number of examples.
        '''
        
        return self.n_dataset
    
    def __getitem__(self, item):
        r'''Given an index return an example from the position.
        
        Arguments:
            item(:obj:`int`):
                Index position to pick an example to return.
        
        Returns:
            :obj:`Dict[?]`: Dictionary of inputs.
        '''
        
        return self.dataset[item]

In [6]:
class Gpt2ScriptWritingCollator(object):
    r'''Data Collator used for GPT-2 in a script writing task.
    
    It uses a given tokenizer and its encoder to convert any text to numbers that can go straight into a GPT-2 model.
    
    Arguments:
        use_tokenizer(:obj:`transformers.tokenization_?`):
            Transformer type tokenizer used to process raw text into numbers
        max_sequence_len(:obj:`int`, `optional`):
            Value to indicate the maximum desired sequence to truncate or pad text sequences.
            If no value is passed it will used maximum sequence size supported by the tokenizer and model.
    '''
    
    def __init__(self, use_tokenizer, max_sequence_len=None):
        # Tokenizer to be used inside the class
        self.use_tokenizer = use_tokenizer
        # Check max sequence length
        self.max_sequence_len = use_tokenizer.model_max_length if max_sequence_len is None else max_sequence_len
    
    def __call__(self, sequences):
        r'''This function allowes the class object to be used as a function call.
        
        Since the PyTorch DataLoader needs a collator function, can use this class as a function.
        
        Arguments:
            item(:obj:`list`):
                List of texts.
        
        Returns:
            :obj:`Dict[str, object]`: Dictionary of inputs that feed into the model.
            It holds the statement `model(**Returned Dictionary)`.
        '''
        
        # Get all texts from sequences list
        texts = [sequence['text'] for sequence in sequences]
        # Call tokenizer on all texts to convert into tensors of numbers with appropriate padding
        inputs = self.use_tokenizer(text=texts, return_tensors='pt', padding=True, truncate=True, max_length=self.max_sequence_len)
        inputs.update()
        print(inputs)
        
        return inputs

In [7]:
def train(dataloader, optimizer_, scheduler_, device_):
    r'''Train PyTorch model on a single pass through the data loader.
    
    It will use the global variable `model` which is the transformer model loaded on `device_` that we want to train on.
    
    Arguments:
        dataloader(:obj:`torch.utils.data.dataloader.DataLoader`):
            Parsed data into batches of tensors.
        optimizer_(:obj:`transformers.optimization.AdamW`):
            Optimizer used for training.
        scheduler_(:obj:`torch.optim.lr_scheduler.LambdaLR`):
            PyTorch scheduler.
        device_(:obj:`torch.device`):
            Device used to load tensors before feeding to model.
        
        Returns:
            :obj:`List[
    '''
    
    #learn = Learner(dataloader, loss_func=CrossEntropyLossFlat(), cbs=[Dropout], metrics=Perplexity()).to_fp16()
    #lr = learn.lr_find()
    #print(f'learning rate: {lr}')
    #learn.fine_tune(epochs)
    
    # Use global variable for model
    global model
    
    # Total loss for this epoch
    total_loss = 0
    
    # Put the model into training mode
    model.train()
    
    # Bor deach batch of training data
    for batch in tqdm(dataloader, total=len(dataloader)):
        for k, v in batch.items():
            print(v.type(torch.long))
            break
        batch = {k: v.type(torch.long).to(device_) for k, v in batch.items()}
        
        # Always clear any previously calculated gradients before performing a backward pass
        # If use `optimizer.zero_grad()` it will call zero_grad() for all parameters registered in the optimizer
        optimizer.zero_grad()
        
        # Since it is composed of a list of tensors, it converts the list through the stack
        #data = torch.stack(batch)
        #data = batch
        #data = data.transpose(1, 0)
        # Move batch to device
        #data = data.to(device_)
        #model = model.to(device_)
        
        # Perform a forward pass
        outputs = model(**batch)
        
        # The call to `model` always returns a tuple, so we need to pull
        # the loss value out of the tuple along with the logits
        loss, logits = outputs[:2]
        # Accumulate the training loss over all of the batches so that we can
        # caculate the average loss at the end
        # The `loss` is a Tensor containing a single value
        # The `.item()` function just returns the Python value from the tensor
        total_loss += loss.item()
        # Perform a backward pass to calculate the gradients
        loss.backward()
        
        # Clip the norm of the gradients to 1.0
        # This is to help prevent the exploding radients problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # Update parameters and take a step using the computed gradient
        # The optimizer dictates the update rule - how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer_.step()
        # Update the learning rate
        scheduler_.step()
    
    # Calculate the average loss over the training data
    avg_epoch_loss = total_loss / len(dataloader)
        
    # Reutrn average loss at this epoch
    return avg_epoch_loss

## Model

In [16]:
# Get model configuration
print('Loading configuration...')
model_config = GPT2Config.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path
)

# Get model's tokenizer
print('Loading tokenizer...')
from transformers import PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    model_name_or_path,
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>'
)
# Default to left padding
tokenizer.padding_side = 'left'
# Define PAD Token = EOS Token = 50256
tokenizer.pad_token = tokenizer.eos_token

# Get the actual model
print('Loading model...')
model = AutoModelWithLMHead.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path,
    config=model_config
)

# Resize model embedding to match new tokenizer
model.resize_token_embeddings(len(tokenizer))
# Fix model padding token id
model.config.pad_token_id = model.config.eos_token_id
# Load model to define device
model.to(device)
print(f'Model loaded to `{device}`.')

Loading configuration...
Loading tokenizer...


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


Loading model...
Model loaded to `cuda`.


In [19]:
# Create data collator to encode texts into numbers
gpt2_script_writing_collator = Gpt2ScriptWritingCollator(
    use_tokenizer=tokenizer,
    max_sequence_len=max_length
)

# Create PyTorch dataset
print('Dealing with train...')
train_dataset = ScriptDataset(path=data_path, use_tokenizer=tokenizer, max_sequence_len=max_length)
print(f'Created `dataset` with {len(train_dataset)} examples.')

# Move PyTorch dataset into dataloader
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_script_writing_collator)
print(f'Created `train_dataloader` with {len(train_dataloader)} batches.')

Dealing with train...


script files:   0%|          | 0/1 [00:00<?, ?it/s]



TypeError: unhashable type: 'BatchEncoding'

In [69]:
for ex in train_dataset:
    print(ex)
    break
print(len(train_dataset))
    
print()
print()

for ex in train_dataloader:
    print(ex)
    break

{'text': {'input_ids': tensor([[18643,  8422,  6919,  7285,  9020,  8756,   375,  7653, 12380,   375,
          7055, 13729, 19633,  8608,  7489, 14087, 44308,  9535,  7467,  7405,
         10187,  6889,  9080,  9025,  7586,   389,  9089,  7282, 14870, 14258,
          6889, 28895,  7281,  8660,  9021, 15148, 10190, 20564,  9051,  7847,
          7586,  9223,  9440,  7760,  8277, 14870, 14258,  7530,   384,  8133,
          8133,  8139, 10765,  9051,  8713,  7281,  8660, 32392,  9247,  7888,
         16633,  7671,  9585,  7847,  7586,  9223, 17368,  8139, 29045, 11059,
         11781,  8017,  7098, 13568,  9051, 16146, 14870,  9244,  7162,  9199,
          9025, 12398, 35233,  9797,  8265,  9016,  9051,  7847,  7586,  9223,
         19897,  7043,  7489,  9267, 27561,  8155,  9317, 10124,  7991,  8247,
         44657, 16146,  9025,  7586, 35233, 14870, 45510, 19674,   389, 28895,
          7281,  8660, 32392,  9303,  7888, 17965,  7671,  9585,  7847,  7586,
          9223,  9440,  7518,

## Train

In [76]:
optimizer = AdamW(model.parameters(), lr=3e-5)

total_steps = len(train_dataset) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

all_loss = {'train_loss': [], 'val_loss': []}
all_acc = {'train_acc': [], 'val_acc': []}

print('Epoch')
for epoch in tqdm(range(epochs)):
    print()
    print('Training on batches...')
    
    train_loss = train(train_dataset, optimizer, scheduler, device)
    
    print('  train_loss: %.5f' % train_loss)
    
    all_loss['train_loss'].append(train_loss)
        
    if epoch > 0 and epoch % 10 == 0:
        # Move model to CPU and generate sentences to evaluate writing performance in the current epoch
        #sentence = generate_sentences(model.to('cpu'), tokenizer, vocab, sent='내일', text_size=100, temperature=0.7, top_p=0.8, top_k=40)
        #sentence = sentence.replace('<unused0>', '\n')
        #sentence = auto_enter(sentence)
        inputs = tokenizer('내일', return_tensors='pt', padding=True, truncate=True, max_length=max_length)
        generation = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
        
        print(generation[:2])

        # Visualize the generated sentences through tensorboard
        #summary.add_text('Text', sentence, count)
    
    print()

    count += 1


# Plot loss curves
plot_dict(all_loss, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'], use_title='Loss')


Epoch


  0%|          | 0/10 [00:00<?, ?it/s]


Training on batches...


  0%|          | 0/30 [00:00<?, ?it/s]

AttributeError: 