# Topic Classification

GPT-2 Model Experiment 1
- Data: [국립국어원 신문 말뭉치(v2)](https://corpus.korean.go.kr/) sampling data
- Model: [SKT AI KoGPT2](https://github.com/SKT-AI/KoGPT2) fine-tuning

Author: [Seongbum Seo](https://github.com/Seongbuming)

In [1]:
import torch
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


## Background Setup

In [2]:
%pip install -q git+https://github.com/huggingface/transformers.git
%pip install -q git+https://github.com/gmihaila/ml_things.git

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
!git clone https://github.com/SKT-AI/KoGPT2
%pip install matplotlib==3.1.3

fatal: destination path 'KoGPT2' already exists and is not an empty directory.
Collecting matplotlib==3.1.3
  Using cached matplotlib-3.1.3-cp38-cp38-manylinux1_x86_64.whl (13.1 MB)
[31mERROR: ml-things 0.0.1 has requirement matplotlib>=3.4.0, but you'll have matplotlib 3.1.3 which is incompatible.[0m
Installing collected packages: matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.5.2
    Uninstalling matplotlib-3.5.2:
      Successfully uninstalled matplotlib-3.5.2
Successfully installed matplotlib-3.1.3
Note: you may need to restart the kernel to use updated packages.


## Model Setup

In [12]:
import io
import os
import torch
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from ml_things import plot_dict, plot_confusion_matrix, fix_text
from sklearn.metrics import classification_report, accuracy_score
from transformers import (set_seed, TrainingArguments, Trainer, GPT2Config, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup, GPT2ForTokenClassification)

set_seed(123)

epochs = 4
batch_size = 32
max_length = 60
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name_or_path = 'skt/kogpt2-base-v2'
train_data_path = 'sampling_train_data'
test_data_path = 'sampling_test_data'

labels_ids = {
    'ITscience': 0,
    'culture': 1,
    'economy': 2,
    'entertainment': 3,
    'health': 4,
    'life': 5,
    'politic': 6,
    'social': 7,
    'sport': 8
}
label_names = list(labels_ids.keys())
n_labels = len(labels_ids)

## Data

In [5]:
class NewsDataset(Dataset):
    def __init__(self, path, use_tokenizer):
        if not os.path.isdir(path):
            raise ValueError('Invalid `path` variable. Needs to be a directory.')
        
        self.texts = []
        self.labels = []

        for label in label_names:
            sentiment_path = os.path.join(path, label)

            files_names = os.listdir(sentiment_path)#[:10] # Sample for debugging
            for file_name in tqdm(files_names, desc=f'{label} files'):
                file_path = os.path.join(sentiment_path, file_name)

                content = io.open(file_path, mode='r', encoding='utf-8').read()
                content = fix_text(content)
                self.texts.append(content)
                self.labels.append(label)
            
        self.n_examples = len(self.labels)
    
    def __len__(self):
        return self.n_examples
    
    def __getitem__(self, item):
        return {
            'text': self.texts[item],
            'label': self.labels[item]
        }

In [6]:
class Gpt2ClassificationCollator(object):
    def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):
        self.use_tokenizer = use_tokenizer
        self.max_sequence_len = use_tokenizer.model_max_length if max_sequence_len is None else max_sequence_len
        self.labels.encoder = labels_encoder
    
    def __call__(self, sequences):
        texts = [sequence['text'] for sequence in sequences]
        labels = [sequence['label'] for sequence in sequences]
        labels = [self.labels_encoder[label] for label in labels]
        inputs = self.use_tokenizer(text=texts, return_tensors='pt', padding=True, truncation=True, max_length=self.max_sequence_len)
        inputs.update({'labels': torch.tensor(labels)})
        
        return inputs

In [7]:
def train(dataloader, optimizer_, scheduler_, device_):
    # Use global variable for model
    global model

    predictions_labels = []
    true_labels = []
    total_loss = 0

    model.train()

    for batch in tqdm(dataloader, total=len(dataloader)):
        true_labels += batch['labels'].numpy().flatten().tolist()
        batch = {k: v.type(torch.long).to(device_) for k, v in batch.items()}

        model.zero_grad()

        outputs = model(**batch)

        loss, logits = outputs[:2]
        total_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer_.step()
        scheduler_.step()

        logits = logits.detach().cpu().numpy()
        predictions_labels += logits.argmax(axis=-1).flatten().tolist()

    avg_epoch_loss = total_loss / len(dataloader)

    return true_labels, predictions_labels, avg_epoch_loss

In [8]:
def validation(dataloader, device_):
    # Use global variable for model
    global model

    predictions_labels = []
    true_labels = []
    total_loss = 0

    model.eval()

    for batch in tqdm(dataloader, total=len(dataloader)):
        true_labels += batch['labels'].numpy().flatten().tolist()
        batch = {k: v.type(torch.long).to(device_) for k, v in batch.items()}

        while torch.no_grad():
            outputs = model(**batch)
            
            loss, logits = outputs[:2]
            logits = logits.detach().cpu().numpy()
            total_loss += loss.item()

            predict_content = logits.argmax(axis=-1).flatten().tolist()
            predictions_labels += predict_content
    
    avg_epoch_loss = total_loss / len(dataloader)

    return true_labels, predictions_labels, avg_epoch_loss

## Model

In [9]:
print('Loading configuration...')
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name_or_path, num_labels=n_labels)

print('Loading tokenizer...')
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name_or_path,
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>'
)
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

print('Loading model...')
model = GPT2ForTokenClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config)

model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id
model.to(device)
print(f'Model loaded to `{device}`.')

Loading configuration...
Loading tokenizer...


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


Loading model...


Some weights of the model checkpoint at skt/kogpt2-base-v2 were not used when initializing GPT2ForTokenClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to `cuda`.


NVIDIA GeForce RTX 3060 Laptop GPU with CUDA capability sm_86 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the NVIDIA GeForce RTX 3060 Laptop GPU GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [10]:
gpt2_classification_collator = Gpt2ClassificationCollator(
    use_tokenizer=tokenizer,
    labels_encoder=labels_ids,
    max_sequence_len=max_length
)

print('Dealing with train...')
train_dataset = NewsDataset(path=train_data_path, use_tokenizer=tokenizer)
print(f'Created `train_dataset` with {len(train_dataset)} examples.')

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_classification_collator)
print(f'Created `train_dataloader` with {len(train_dataloader)} batches.')
print()

print('Dealing with validation...')
valid_dataset = NewsDataset(path=test_data_path, use_tokenizer=tokenizer)
print(f'Created `valid_dataset` with {len(valid_dataset)} examples.')

valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_classification_collator)
print(f'Created `valid_dataloader` with {len(valid_dataloader)} batches.')

AttributeError: 'Gpt2ClassificationCollator' object has no attribute 'labels'

## Train

In [None]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8) # by default lr is 5e-5 and eps is 1e-8

total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

all_loss = {'train_loss': [], 'val_loss': []}
all_acc = {'train_acc': [], 'val_acc': []}

print('Epoch')
for epoch in tqdm(range(epochs)):
    print()
    print('Training on batches...')
    train_labels, train_predict, train_loss = train(train_dataloader, optimizer, scheduler, device)
    train_acc = accuracy_score(train_labels, train_predict)

    print('Validation on batches...')
    valid_labels, valid_predict, val_loss = validation(valid_dataloader, device)
    val_acc = accuracy_score(valid_labels, valid_predict)

    print('  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - val_acc: %.5f' % (train_loss, val_loss, train_acc, val_acc))
    print()

    all_loss['train_loss'].append(train_loss)
    all_loss['val_loss'].append(val_loss)
    all_acc['train_acc'].append(train_acc)
    all_acc['val_acc'].append(val_acc)

plot_dict(all_loss, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'], use_title='Loss')
plot_dict(all_acc, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'], use_title='Accuracy')

## Evaluation

In [None]:
true_labels, predictions_labels, avg_epoch_loss = validation(valid_dataloader, device)

evaluation_report = classification_report(true_labels, predictions_labels, labels=list(labels_ids.values()), target_names=label_names)
print(evaluation_report)

plot_confusion_matrix(y_true=true_labels, y_pred=predictions_labels, classes=label_names, normalize=True, magnify=0.1)