# Topic Classification

GPT-2 Model Experiment 1
- Data: a sample drawn from the [국립국어원 신문 말뭉치(v2)](https://corpus.korean.go.kr/) (National Institute of Korean Language Newspaper Corpus, v2)
- Model: fine-tuned [SKT AI KoGPT2](https://github.com/SKT-AI/KoGPT2)

Author: [Seongbum Seo](https://github.com/Seongbuming)

In [2]:
import torch
# Release unoccupied cached GPU memory held by PyTorch before the notebook runs,
# so a re-run in the same kernel starts with as much free GPU memory as possible.
torch.cuda.empty_cache()

## Background Setup

In [8]:
# Install `transformers` and `ml_things` directly from their GitHub repositories.
# NOTE(review): neither URL is pinned to a tag or commit, so each run installs
# whatever the default branch currently contains — consider pinning for reproducibility.
%pip install -q git+https://github.com/huggingface/transformers.git
%pip install -q git+https://github.com/gmihaila/ml_things.git

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Clone the KoGPT2 repository into the working directory and pin matplotlib to 3.1.3.
# NOTE(review): `git clone` is not idempotent — re-running this cell fails with
# "destination path 'KoGPT2' already exists" (harmless, but noisy).
# NOTE(review): the matplotlib pin is presumably needed by the plotting helpers — confirm.
!git clone https://github.com/SKT-AI/KoGPT2
%pip install matplotlib==3.1.3

fatal: destination path 'KoGPT2' already exists and is not an empty directory.
Note: you may need to restart the kernel to use updated packages.


## Model Setup

In [10]:
import io
import os
import torch
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from ml_things import plot_dict, plot_confusion_matrix, fix_text
from sklearn.metrics import classification_report, accuracy_score
from transformers import (set_seed, TrainingArguments, Trainer, GPT2Config, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup, GPT2ForTokenClassification)

# Fix the random seed up front so that runs are repeatable.
set_seed(123)

# Training hyper-parameters.
# NOTE(review): these are consumed by cells outside this section — presumably
# `max_length` is the tokenizer truncation length; confirm against the caller.
epochs = 4
batch_size = 32
max_length = 60
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Identifier of the pretrained KoGPT2 checkpoint to load.
model_name_or_path = 'skt/kogpt2-base-v2'

# Topic name -> integer class id. The position of a topic in the tuple below
# is its id, so the order must not be changed.
labels_ids = {
    topic: class_id
    for class_id, topic in enumerate((
        'ITscience',
        'culture',
        'economy',
        'entertainment',
        'health',
        'life',
        'politic',
        'social',
        'sport',
    ))
}
# Live view of the topic names, in id order.
label_names = labels_ids.keys()
# Number of target classes.
n_labels = len(labels_ids)

## Data

In [12]:
class NewsDataset(Dataset):
    """In-memory dataset of news articles stored as one text file per example.

    The directory at ``path`` must contain one sub-directory per topic, named
    after the entries of the notebook-level ``label_names``. Every file inside
    a topic directory is read as UTF-8, cleaned with ``fix_text`` and stored
    together with the name of its directory as the label.

    Args:
        path: Root directory holding the per-label sub-directories.
        use_tokenizer: Accepted for interface compatibility; not used by
            this class.

    Raises:
        ValueError: If ``path`` is not an existing directory.
    """

    def __init__(self, path, use_tokenizer):
        if not os.path.isdir(path):
            raise ValueError('Invalid `path` variable. Needs to be a directory.')

        self.texts = []
        self.labels = []

        for label in label_names:
            label_dir = os.path.join(path, label)

            # Slice this list (e.g. `[:10]`) to debug on a small sample.
            files_names = os.listdir(label_dir)
            for file_name in tqdm(files_names, desc=f'{label} files'):
                file_path = os.path.join(label_dir, file_name)

                # Use a context manager so every file handle is closed right
                # away instead of leaking one descriptor per article.
                with io.open(file_path, mode='r', encoding='utf-8') as handle:
                    content = handle.read()

                self.texts.append(fix_text(content))
                self.labels.append(label)

        self.n_examples = len(self.labels)

    def __len__(self):
        """Return the number of loaded examples."""
        return self.n_examples

    def __getitem__(self, item):
        """Return example ``item`` as ``{'text': str, 'label': label name}``."""
        return {
            'text': self.texts[item],
            'label': self.labels[item]
        }

In [13]:
class Gpt2ClassificationCollator(object):
    """Turn a list of ``{'text', 'label'}`` examples into one tokenized batch.

    Args:
        use_tokenizer: Callable tokenizer. It is invoked with ``text=``,
            ``return_tensors='pt'``, ``padding=True``, ``truncation=True`` and
            ``max_length=``, and must return a mapping that supports
            ``update``. Its ``model_max_length`` attribute is read when
            ``max_sequence_len`` is ``None``.
        labels_encoder: Mapping from label name to integer class id.
        max_sequence_len: Truncation length passed to the tokenizer; defaults
            to ``use_tokenizer.model_max_length``.
    """

    def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):
        self.use_tokenizer = use_tokenizer
        self.max_sequence_len = use_tokenizer.model_max_length if max_sequence_len is None else max_sequence_len
        # Store under the attribute name that `__call__` reads. The previous
        # `self.labels.encoder = ...` raised AttributeError on construction.
        self.labels_encoder = labels_encoder

    def __call__(self, sequences):
        """Tokenize the texts and attach the encoded labels.

        Args:
            sequences: Iterable of dicts with ``'text'`` and ``'label'`` keys.

        Returns:
            The tokenizer output with an added ``'labels'`` tensor holding the
            integer id of each example's label, in input order.

        Raises:
            KeyError: If a label is missing from ``labels_encoder``.
        """
        texts = [sequence['text'] for sequence in sequences]
        labels = [self.labels_encoder[sequence['label']] for sequence in sequences]
        inputs = self.use_tokenizer(text=texts, return_tensors='pt', padding=True, truncation=True, max_length=self.max_sequence_len)
        inputs.update({'labels': torch.tensor(labels)})

        return inputs

In [None]:
def train(dataloader, optimizer_, scheduler_, device_):
    """Run one training pass over ``dataloader`` with the notebook-level ``model``.

    Args:
        dataloader: Iterable of batches; each batch is a mapping of tensors
            that includes a ``'labels'`` entry.
        optimizer_: Optimizer stepped once per batch.
        scheduler_: Learning-rate scheduler stepped once per batch.
        device_: Device every batch tensor is moved to.

    Returns:
        Tuple ``(true_labels, predictions_labels, avg_epoch_loss)``: the flat
        list of target ids, the flat list of argmax predictions, and the loss
        averaged over the number of batches.
    """
    # The model is defined at notebook scope rather than passed in.
    global model

    model.train()

    epoch_targets = []
    epoch_predictions = []
    running_loss = 0

    for raw_batch in tqdm(dataloader, total=len(dataloader)):
        # Record the ground truth before the tensors are moved to the device.
        epoch_targets.extend(raw_batch['labels'].numpy().flatten().tolist())

        # Cast every tensor to long and move it to the target device.
        model_inputs = {}
        for key, tensor in raw_batch.items():
            model_inputs[key] = tensor.type(torch.long).to(device_)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # The first two outputs are the loss and the logits.
        loss, logits = model(**model_inputs)[:2]
        running_loss += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer_.step()
        scheduler_.step()

        batch_scores = logits.detach().cpu().numpy()
        epoch_predictions.extend(batch_scores.argmax(axis=-1).flatten().tolist())

    return epoch_targets, epoch_predictions, running_loss / len(dataloader)

In [None]:
def validation(dataloader, device_):
    """Evaluate the notebook-level ``model`` on ``dataloader`` without training.

    Args:
        dataloader: Iterable of batches; each batch is a mapping of tensors
            that includes a ``'labels'`` entry.
        device_: Device every batch tensor is moved to.

    Returns:
        Tuple ``(true_labels, predictions_labels, avg_epoch_loss)``: the flat
        list of target ids, the flat list of argmax predictions, and the loss
        averaged over the number of batches.
    """
    # Use global variable for model
    global model

    predictions_labels = []
    true_labels = []
    total_loss = 0

    model.eval()

    for batch in tqdm(dataloader, total=len(dataloader)):
        # Record the ground truth before the tensors are moved to the device.
        true_labels += batch['labels'].numpy().flatten().tolist()
        batch = {k: v.type(torch.long).to(device_) for k, v in batch.items()}

        # Must be `with`, not `while`: a `torch.no_grad()` object is always
        # truthy, so `while` looped forever on the first batch and never
        # actually disabled gradient tracking.
        with torch.no_grad():
            outputs = model(**batch)

        # The first two outputs are the loss and the logits.
        loss, logits = outputs[:2]
        logits = logits.detach().cpu().numpy()
        total_loss += loss.item()

        predict_content = logits.argmax(axis=-1).flatten().tolist()
        predictions_labels += predict_content

    avg_epoch_loss = total_loss / len(dataloader)

    return true_labels, predictions_labels, avg_epoch_loss