# Data-Centric NLP 대회: 주제 분류 프로젝트

## Load Libraries

In [None]:
import os
import random
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader

import evaluate
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

from tokenization_kobert import KoBertTokenizer

## Set Hyperparameters

In [None]:
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and CUDA) with the same value so runs are repeatable.
SEED = 456
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_rng(SEED)
del _seed_rng

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

In [None]:
# All paths are resolved relative to the directory the notebook is run from.
BASE_DIR = os.getcwd()
DATA_DIR, OUTPUT_DIR = (
    os.path.join(BASE_DIR, relative) for relative in ('../data', '../output')
)

## Load Tokenizer and Model

In [None]:
# Load the pretrained checkpoint: its tokenizer plus a sequence-classification
# model with 7 output labels, then move the model to the selected device.
model_name = 'monologg/kobert'
tokenizer = KoBertTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=7,
)
model = model.to(DEVICE)

## Define Dataset

In [None]:
# Read the training CSV and hold out 30% of its rows for validation; the split
# is seeded with SEED so it is the same on every run.
data = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
dataset_train, dataset_valid = train_test_split(
    data,
    test_size=0.3,
    random_state=SEED,
)

In [None]:
class BERTDataset(Dataset):
    """Map-style dataset that tokenizes every text once, at construction time.

    Args:
        data: Object indexable by the keys ``'text'`` and ``'target'``; the
            two columns are iterated in lockstep.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, return_tensors='pt')`` whose result is indexable
            by ``'input_ids'`` and ``'attention_mask'``.

    Each item is a dict with the keys ``'input_ids'``, ``'attention_mask'``
    and ``'labels'``.
    """

    def __init__(self, data, tokenizer):
        texts, targets = data['text'], data['target']
        self.inputs = []
        self.labels = []
        for sentence, target in zip(texts, targets):
            self.inputs.append(
                tokenizer(sentence, padding='max_length', truncation=True, return_tensors='pt')
            )
            self.labels.append(torch.tensor(target))

    def __getitem__(self, idx):
        encoding = self.inputs[idx]
        # The tokenizer output is requested as 'pt' tensors; squeeze(0) drops
        # the leading dimension so each item is a single example.
        item = {
            key: encoding[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = self.labels[idx].squeeze(0)
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
# Tokenize the training split first, then the validation split.
data_train, data_valid = (
    BERTDataset(split, tokenizer) for split in (dataset_train, dataset_valid)
)

In [None]:
# Batch collator built from the tokenizer; it is handed to the Trainer below.
# NOTE(review): BERTDataset already pads every example with
# padding='max_length', so this collator presumably has no extra padding to
# do - confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Define Metric

In [None]:
f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    """Compute macro-averaged F1 for one evaluation pass and cache its outputs.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one
            row of per-class scores per sample; ``labels`` holds the reference
            class ids.

    Returns:
        The result of ``f1.compute(..., average='macro')``.

    Side effects:
        Overwrites the module-level globals ``pred_probs``, ``true_labels``
        and ``predicted_labels`` with the values from this call.
    """
    global predicted_labels
    global true_labels
    global pred_probs

    predictions, labels = eval_pred
    # Stored exactly as received; no softmax is applied here.
    pred_probs = predictions
    argmax_predictions = np.argmax(predictions, axis=1)
    # NOTE: these two names are inverted relative to their contents
    # (`true_labels` holds the predicted class ids, `predicted_labels` holds
    # the references). The classification-report cell passes them in an order
    # that relies on this inversion, so it is kept as-is here.
    true_labels = argmax_predictions
    predicted_labels = labels

    # The F1 metric compares class ids, so it must be given the argmax of the
    # scores rather than the raw score matrix.
    return f1.compute(predictions=argmax_predictions, references=labels, average='macro')

## Train Model

In [None]:
### Weights & Biases (wandb) setting: uncomment the line below to disable wandb logging
#os.environ['WANDB_DISABLED'] = 'true'

In [None]:
# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Output location, run modes and seed.
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    seed=SEED,
    # Log, evaluate and checkpoint every 100 steps; keep at most 2 checkpoints.
    logging_strategy='steps',
    logging_steps=100,
    evaluation_strategy='steps',
    eval_steps=100,
    save_strategy='steps',
    save_steps=100,
    save_total_limit=2,
    # Optimizer and learning-rate schedule.
    learning_rate=2e-5,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    # Batch sizes and training length.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    # Reload the checkpoint with the highest eval_f1 when training ends.
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
)

In [None]:
# Wire the model, configuration, data and metric function into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    # Data: pre-tokenized splits plus the batch collator.
    data_collator=data_collator,
    train_dataset=data_train,
    eval_dataset=data_valid,
    # Called on every evaluation pass; reports macro-F1.
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the Trainer configured above.
trainer.train()

## Classification Report & Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

# NOTE: compute_metrics stores the reference labels in `predicted_labels` and
# the argmax predictions in `true_labels`, so despite the names the argument
# order below is (y_true, y_pred), which is what sklearn expects.
report = classification_report(predicted_labels, true_labels)

print(report)

# Class labels used as tick labels.
# Assumes class ids are 0..6 in exactly this order - confirm against the dataset.
labels=['IT/Science', 'Economy','Social','Life&Culture','World','Sports', 'Politics']
class_ids = list(range(len(labels)))

# Store the result under its own name so the imported `confusion_matrix`
# function is not overwritten. Passing `labels=` pins the matrix to one
# row/column per class even when a class is missing from this evaluation,
# which keeps it aligned with the tick labels.
conf_mat = confusion_matrix(predicted_labels, true_labels, labels=class_ids)

# Row-normalise (each row sums to 1); rows with no samples stay all-zero
# instead of producing a division by zero.
row_totals = conf_mat.sum(axis=1, keepdims=True)
confusion_matrix_norm = np.divide(
    conf_mat.astype('float'),
    row_totals,
    out=np.zeros(conf_mat.shape, dtype=float),
    where=row_totals != 0,
)

# Heatmap visualisation: raw counts.
fig, ax = plt.subplots()
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

# Heatmap visualisation: row-normalised.
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix_norm, annot=True, cmap='Blues',
            xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix Normalized')
plt.show()

## Evaluate Model

In [None]:
# Load the test set; the inference loop below reads its 'text' column.
dataset_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [None]:
# Predict one class id per test text, in row order.
model.eval()
preds = []
# Inference only: gradients are disabled once for the whole loop.
with torch.no_grad():
    for text in dataset_test['text']:
        # truncation=True matches the option used when building BERTDataset;
        # without it an over-long text may exceed the model's maximum input
        # length.
        inputs = tokenizer(text, truncation=True, return_tensors="pt").to(DEVICE)
        logits = model(**inputs).logits
        # Softmax does not change which class scores highest, so take the
        # argmax of the logits directly.
        preds.append(logits.argmax(dim=1).item())

In [None]:
# Attach the predicted class ids and write the result to output.csv.
# Note that the file goes to BASE_DIR, not OUTPUT_DIR.
dataset_test['target'] = preds
dataset_test.to_csv(os.path.join(BASE_DIR, 'output.csv'), index=False)