# Data-Centric NLP 대회: 주제 분류 프로젝트

## Load Libraries

In [None]:
import os
import random
import numpy as np
import pandas as pd
import wandb
from json import loads

import torch
from torch.utils.data import Dataset, DataLoader

import evaluate
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm as tqdm

from tokenization_kobert import KoBertTokenizer

from utils import plot_confusion_matrix
from filtering import ood_cls_filter

In [None]:
# Topic name -> integer class id. The ids follow the order of the names below
# (0 through 6); the synthetic-data cells use this mapping to convert the
# string 'target' field of each record into a class id.
label_to_num = {
    topic: class_id
    for class_id, topic in enumerate(
        ('IT과학', '경제', '사회', '생활문화', '세계', '스포츠', '정치')
    )
}

## Set Hyperparameters

In [None]:
# Seed every random number generator this notebook touches (Python, NumPy,
# PyTorch CPU and CUDA) with the same value, in the same order as before.
SEED = 456
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(SEED)

In [None]:
# Use the GPU when CUDA is available, otherwise the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

In [None]:
# Working directories, all resolved relative to the current working directory.
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, 'data')
OUTPUT_DIR = os.path.join(BASE_DIR, 'output')
# Always bind the path itself. The previous conditional expression assigned the
# return value of os.mkdir() (None) whenever the directory had to be created,
# which made every later os.path.join(PREDICT_DIR, ...) fail on a first run.
PREDICT_DIR = os.path.join(BASE_DIR, "prediction")
os.makedirs(PREDICT_DIR, exist_ok=True)

## Load Tokenizer and Model

In [None]:
# One checkpoint id is shared by the tokenizer and the classifier.
model_name = 'monologg/kobert'
tokenizer = KoBertTokenizer.from_pretrained(model_name)

# Sequence-classification head with 7 output labels, moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=7,
)
model = model.to(DEVICE)

## Define Dataset

In [None]:
# Read the labelled data and hold out 30% of it for validation (seeded split).
data = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
dataset_train, dataset_valid = train_test_split(data, test_size=0.3, random_state=SEED)

# Persist each split once; files that already exist are never overwritten.
for split_frame, split_file in (
    (dataset_train, "origin_train.csv"),
    (dataset_valid, "origin_valid.csv"),
):
    split_path = os.path.join(DATA_DIR, split_file)
    if not os.path.exists(split_path):
        split_frame.to_csv(split_path, index=False)

# Disabled alternative: replace the training split with a pre-processed file.
# dataset_train = pd.read_csv(os.path.join(DATA_DIR, "processed_train_10.csv"))

In [None]:
# Append the synthetic examples to the training split.
#
# Each non-empty line of synthetic_data.txt is parsed as one JSON object with a
# 'text' field and a 'target' field holding a topic name; the name is converted
# to its class id through label_to_num.
#
# The rows are collected first and concatenated once: calling pd.concat for
# every line copies the whole frame each time (quadratic in the file length).
# The encoding is given explicitly so the Korean text does not depend on the
# platform default (assumes the file is UTF-8 - TODO confirm).
synthetic_rows = []
with open(os.path.join(DATA_DIR, 'synthetic_data.txt'), 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # A blank line would make loads() raise; skip it instead.
            continue
        record = loads(line)
        synthetic_rows.append([record['text'], label_to_num[record['target']]])

# As before, dataset_train is left untouched when the file has no records.
if synthetic_rows:
    dataset_train = pd.concat(
        [dataset_train, pd.DataFrame(synthetic_rows, columns=['text', 'target'])],
        ignore_index=True,
    )

In [None]:
class BERTDataset(Dataset):
    """Map-style dataset that tokenizes every text once, at construction time.

    Args:
        data: Object whose ``'text'`` and ``'target'`` entries are iterated in
            lockstep (the notebook passes pandas DataFrames).
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``.
            Its result must provide ``'input_ids'`` and ``'attention_mask'``
            tensors; ``__getitem__`` squeezes their first dimension.
        max_length: Length every sample is padded or truncated to. Defaults to
            64, the value that used to be hard-coded.

    Attributes:
        inputs: List with one tokenizer output per sample.
        labels: List with one label tensor per sample.
    """

    def __init__(self, data, tokenizer, max_length=64):
        self.inputs = []
        self.labels = []
        for text, label in zip(data['text'], data['target']):
            encoded = tokenizer(text,
                                max_length=max_length,
                                padding='max_length',
                                truncation=True,
                                return_tensors='pt')
            self.inputs.append(encoded)
            self.labels.append(torch.tensor(label))

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` of sample ``idx``."""
        encoded = self.inputs[idx]
        return {
            # squeeze(0) drops the size-1 batch dimension added by return_tensors='pt'.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': self.labels[idx].squeeze(0),
        }

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

In [None]:
# Tokenize the training split first, then the validation split, with the
# shared tokenizer.
data_train, data_valid = (
    BERTDataset(split, tokenizer) for split in (dataset_train, dataset_valid)
)

In [None]:
# Collator that is handed to the Trainer to assemble batches.
# NOTE(review): BERTDataset already pads every sample to max_length, so the
# collator's own padding is presumably a no-op here - confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Define Metric

In [None]:
# F1 metric object; it is reused by the analysis cells further down.
f1 = evaluate.load('f1')


def compute_metrics(eval_pred):
    """Return the macro-averaged F1 of the argmax predictions.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` is reduced
            to class ids with an argmax over axis 1.

    Returns:
        The result of ``f1.compute`` for the predicted ids against ``labels``
        with ``average='macro'``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return f1.compute(
        predictions=predicted_ids,
        references=references,
        average='macro',
    )


## Train Model

In [None]:
### for wandb setting
#os.environ['WANDB_DISABLED'] = 'true'

In [None]:
# Start the Weights & Biases run and label it with this experiment's settings.
run = wandb.init(project="data centric", entity="nlp-10")
run.name = "BaseLine Test Split=0.3,max_len=64,label_2aug100"

training_args = TrainingArguments(
    # Output location and run modes.
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=True,
    # Log, evaluate and save on the same 100-step cadence.
    logging_strategy='steps',
    logging_steps=100,
    evaluation_strategy='steps',
    eval_steps=100,
    save_strategy='steps',
    save_steps=100,
    save_total_limit=2,
    # Optimizer and learning-rate schedule.
    learning_rate=2e-05,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    # Batch sizes and training length.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    # Select the best checkpoint by evaluation F1 (higher is better).
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    # Reproducibility and reporting.
    seed=SEED,
    report_to="wandb",
    disable_tqdm=True,
)

In [None]:
# Wire the model, the tokenized splits, the collator and the metric function
# into one Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=data_train,
    eval_dataset=data_valid,
)

In [None]:
# Run training and always close the W&B run afterwards. Without the finally
# clause an exception raised by trainer.train() would skip run.finish() and
# leave the run open.
try:
    trainer.train()
finally:
    run.finish()

In [None]:
# Reload the fine-tuned classifier for inference.
#
# The fixed 'checkpoint-2000' directory only exists if training saved at step
# 2000 and the directory was not removed by save_total_limit, and it is not
# necessarily the best checkpoint. Prefer the checkpoint the Trainer recorded
# as best and keep the fixed path only as a fallback.
try:
    checkpoint_dir = trainer.state.best_model_checkpoint
except NameError:
    # No Trainer exists in this session (for example an inference-only run).
    checkpoint_dir = None
if checkpoint_dir is None:
    checkpoint_dir = os.path.join(OUTPUT_DIR, 'checkpoint-2000')
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir).to(DEVICE)

In [None]:
# Bare expression: the notebook shows the model's repr as the cell output.
model

## Evaluate Model

In [None]:
# Test data read from DATA_DIR/test.csv; the prediction cell reads its 'text' column.
dataset_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [None]:
# Predict a class id for every row of the test set.
#
# Texts go through the model in batches instead of one forward pass per row,
# so the progress bar now counts batches. Every sample is still padded or
# truncated to 64 tokens exactly as before. The softmax is dropped because the
# argmax of the logits and the argmax of their softmax are the same class.
model.eval()
preds = []
test_texts = dataset_test['text'].tolist()
batch_size = 32
with torch.no_grad():
    for start in tqdm(range(0, len(test_texts), batch_size), desc='Predicting'):
        inputs = tokenizer(test_texts[start:start + batch_size],
                           max_length=64,
                           padding='max_length',
                           truncation=True,
                           return_tensors="pt").to(DEVICE)
        logits = model(**inputs).logits
        # One integer class id per sample, appended in input order.
        preds.extend(torch.argmax(logits, dim=1).cpu().numpy())

In [None]:
# Attach the predicted class ids as the 'target' column and write the result
# (without the index) to PREDICT_DIR/output.csv.
dataset_test['target'] = preds
dataset_test.to_csv(os.path.join(PREDICT_DIR, 'output.csv'), index=False)

In [None]:
# Bare expression: the notebook shows the test frame, including the new 'target' column.
dataset_test

# Training Data Analysis

In [None]:
# Compute the class-probability vector for every training row.
#
# Texts go through the model in batches instead of one forward pass per row,
# so the progress bar now counts batches. Every sample is still padded or
# truncated to 64 tokens exactly as before.
model.eval()
preds = []
train_texts = dataset_train['text'].tolist()
batch_size = 32
with torch.no_grad():
    for start in tqdm(range(0, len(train_texts), batch_size), desc='Predicting'):
        inputs = tokenizer(train_texts[start:start + batch_size],
                           max_length=64,
                           padding='max_length',
                           truncation=True,
                           return_tensors="pt").to(DEVICE)
        logits = model(**inputs).logits
        # Softmax over the class axis; extend() adds one 1-D array per sample.
        preds.extend(torch.softmax(logits, dim=1).cpu().numpy())

In [None]:
# Store each row's probability vector as a plain Python list in 'preds_value',
# then write the frame (without the index) to PREDICT_DIR/train_prediction.csv.
dataset_train['preds_value'] = np.array(preds).tolist()
dataset_train.to_csv(os.path.join(PREDICT_DIR, 'train_prediction.csv'), index=False)

In [None]:
# Macro F1 on the training rows: argmax of the stored probability vectors
# against the 'target' column.
f1.compute(
    predictions=torch.argmax(torch.tensor(dataset_train['preds_value'].to_list()), dim=1),
    references=torch.tensor(dataset_train['target'].to_list()),
    average='macro',
)

# Validation Data Analysis

In [None]:
# Compute the class-probability vector for every validation row.
#
# Texts go through the model in batches instead of one forward pass per row,
# so the progress bar now counts batches. Every sample is still padded or
# truncated to 64 tokens exactly as before.
model.eval()
preds = []
valid_texts = dataset_valid['text'].tolist()
batch_size = 32
with torch.no_grad():
    for start in tqdm(range(0, len(valid_texts), batch_size), desc='Predicting'):
        inputs = tokenizer(valid_texts[start:start + batch_size],
                           max_length=64,
                           padding='max_length',
                           truncation=True,
                           return_tensors="pt").to(DEVICE)
        logits = model(**inputs).logits
        # Softmax over the class axis; extend() adds one 1-D array per sample.
        preds.extend(torch.softmax(logits, dim=1).cpu().numpy())

In [None]:
# Store each row's probability vector as a plain Python list in 'preds_value',
# then write the frame (without the index) to PREDICT_DIR/valid_prediction.csv.
dataset_valid['preds_value'] = np.array(preds).tolist()
dataset_valid.to_csv(os.path.join(PREDICT_DIR, 'valid_prediction.csv'), index=False)

In [None]:
# Bare expression: the notebook shows the validation frame, including 'preds_value'.
dataset_valid

In [None]:
# Macro F1 on the validation rows: argmax of the stored probability vectors
# against the 'target' column.
f1.compute(
    predictions=torch.argmax(torch.tensor(dataset_valid['preds_value'].to_list()), dim=1),
    references=torch.tensor(dataset_valid['target'].to_list()),
    average='macro',
)

# Synthetic Data

In [None]:
# Load the synthetic examples into their own frame with 'text' and 'target' columns.
#
# Each non-empty line of synthetic_data.txt is parsed as one JSON object with a
# 'text' field and a 'target' field holding a topic name; the name is converted
# to its class id through label_to_num.
#
# The rows are collected first and the frame is built once: calling pd.concat
# for every line copies the whole frame each time (quadratic in the file
# length), and starting from an empty frame can leave 'target' with object
# dtype. The encoding is given explicitly so the Korean text does not depend on
# the platform default (assumes the file is UTF-8 - TODO confirm).
synthetic_rows = []
with open(os.path.join(DATA_DIR, 'synthetic_data.txt'), 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # A blank line would make loads() raise; skip it instead.
            continue
        record = loads(line)
        synthetic_rows.append([record['text'], label_to_num[record['target']]])
synthetic_data = pd.DataFrame(synthetic_rows, columns=['text', 'target'])

In [None]:
# Compute the class-probability vector for every synthetic example.
#
# Texts go through the model in batches instead of one forward pass per row,
# so the progress bar now counts batches. Every sample is still padded or
# truncated to 64 tokens exactly as before.
model.eval()
preds = []
synthetic_texts = synthetic_data['text'].tolist()
batch_size = 32
with torch.no_grad():
    for start in tqdm(range(0, len(synthetic_texts), batch_size), desc='Predicting'):
        inputs = tokenizer(synthetic_texts[start:start + batch_size],
                           max_length=64,
                           padding='max_length',
                           truncation=True,
                           return_tensors="pt").to(DEVICE)
        logits = model(**inputs).logits
        # Softmax over the class axis; extend() adds one 1-D array per sample.
        preds.extend(torch.softmax(logits, dim=1).cpu().numpy())

In [None]:
# Keep the per-example probability vectors next to the text and label.
synthetic_data['preds'] = preds

In [None]:
# Predicted class id per synthetic example: the argmax of its probability vector.
# Computed with NumPy so the column holds plain integers; the earlier form
# built a torch tensor from a list of ndarrays (slow) and assigned the tensor
# itself as the column.
synthetic_data['pred_idx'] = np.asarray(preds).argmax(axis=1)

In [None]:
# Show the synthetic examples whose predicted class differs from their label.
synthetic_data[synthetic_data['pred_idx'] != synthetic_data['target']]