# Text Classification

- Ensemble of 10 mini-models -> voting

### Library

In [None]:
from tqdm import tqdm
import transformers
import pandas as pd
import numpy as np
import os

from transformers import AutoTokenizer
from datasets import load_dataset, load_metric
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline

### GPU settings

In [None]:
# Order CUDA devices by PCI bus id and expose GPUs 0-7 to this process.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7",
    }
)

### Hyperparameters

In [None]:
# --- Data ---
SAMPLE_RATIO = 0.01  # fraction of the cleaned training rows sampled for this run
TEST_RATIO = 0.1  # fraction of the sampled rows held out for validation

# --- Model ---
BACKBONE_MODEL = "klue/roberta-large"  # checkpoint id used for tokenizer and model

# --- Optimization ---
MAX_EPOCHS = 1
BATCH_SIZE = 4  # used as the per-device batch size for both training and evaluation
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 0.01

### Dataset

In [None]:
# Load the raw training table from train.csv in the working directory.
df = pd.read_csv("train.csv")

In [None]:
# Drop every row that contains at least one missing value.
df = df.dropna()

In [None]:
# Replace any remaining missing values with empty strings.
# NOTE(review): when the preceding dropna() cell has run there is nothing left
# to fill, so this cell is a no-op; it only matters if dropna() is skipped.
df = df.fillna("")

In [None]:
df = df.sample(int(len(df)*SAMPLE_RATIO))

In [None]:
df = df.rename(columns={"data": "sentence", "category": "label"})

In [None]:
label_list = sorted(list(set(list(df["label"]))))
label_list

In [None]:
# Split by position: the leading (1 - TEST_RATIO) share of rows is used for
# training, the remaining rows for validation.
split_at = int(len(df) * (1 - TEST_RATIO))
train_df, val_df = df[:split_at], df[split_at:]

In [None]:
# Persist both splits as CSV (without the index column) so they can be
# reloaded from disk.
for frame, csv_path in ((train_df, "./train_df.csv"), (val_df, "./val_df.csv")):
    frame.to_csv(csv_path, index=False)

In [None]:
# Display the training split for a quick visual check.
train_df

In [None]:
# Display the validation split for a quick visual check.
val_df

In [None]:
# Reload the two CSV files as a dataset; the validation CSV is exposed
# under the split name "test".
csv_splits = {"train": "./train_df.csv", "test": "./val_df.csv"}
dataset = load_dataset("csv", data_files=csv_splits)

In [None]:
# Display the loaded dataset object.
dataset

In [None]:
# Peek at the first example of the "train" split.
dataset["train"][0]

In [None]:
# Peek at the first example of the "test" (validation) split.
dataset["test"][0]

### Metrics

In [None]:
# Load the F1 metric used for evaluation.
# NOTE(review): `datasets.load_metric` is deprecated in recent `datasets`
# releases in favor of the separate `evaluate` package — confirm against the
# installed version.
metric = load_metric("f1")

In [None]:
# Display the metric object.
metric

In [None]:
# Sanity-check the metric call on random data: 64 predictions and 64
# references, each drawn from the integers 0, 1 and 2, scored with macro F1.
import numpy as np
fake_preds = np.random.randint(0, 3, size=(64,))
fake_labels = np.random.randint(0, 3, size=(64,))
metric.compute(predictions=fake_preds, references=fake_labels, average="macro")

### Preprocess

In [None]:
# Load the tokenizer for BACKBONE_MODEL, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(BACKBONE_MODEL, use_fast=True)

In [None]:
# Smoke test: tokenize two Korean sentences together and display the encoding.
tokenizer("이순신은 조선 중기의 무신이다.", "대한민국은 자유민주주의 국가이다.")

In [None]:
def preprocess_function(examples):
    """Tokenize the ``sentence`` field of ``examples`` with truncation enabled.

    Args:
        examples: Mapping with a ``"sentence"`` entry (a single row or a batch
            of rows from the dataset).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those sentences.
    """
    sentences = examples["sentence"]
    return tokenizer(sentences, truncation=True)

In [None]:
# Try the preprocessing function on the first five training rows.
preprocess_function(dataset['train'][:5])

In [None]:
# Tokenize every split; batched=True hands preprocess_function batches of rows
# instead of one row at a time.
encoded_dataset = dataset.map(preprocess_function, batched=True)

### Training

In [None]:
# Number of output classes = number of distinct labels collected in label_list.
num_labels = len(label_list)

In [None]:
# Load the backbone checkpoint as a sequence classifier with num_labels outputs.
model = AutoModelForSequenceClassification.from_pretrained(BACKBONE_MODEL, num_labels=num_labels)

In [None]:
# Key of the evaluation metric used to pick the best checkpoint.
metric_name = "f1"

In [None]:
# Short model name: the part of BACKBONE_MODEL after the last "/"
# (the whole string when it contains no "/").
model_name = BACKBONE_MODEL.rpartition("/")[2]

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best `metric_name` value when training finishes.
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned",
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    num_train_epochs=MAX_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
)

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with macro-averaged F1.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; the class with the
            highest score along axis 1 of ``predictions`` is taken as the
            predicted label.

    Returns:
        The result of ``metric.compute`` with ``average="macro"``.
    """
    scores, references = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=1),
        references=references,
        average="macro",
    )

In [None]:
# Wire the model, training configuration, tokenized splits and metric
# callback into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
)

In [None]:
# Run fine-tuning with the configuration above.
trainer.train()

In [None]:
# Evaluate on the eval dataset given to the Trainer and display the metrics.
trainer.evaluate()

### Inference

In [None]:
# Load the test set to predict on.
# NOTE(review): only the first 100 rows are kept, so the submission written
# below covers just those rows — remove the slice for a full submission.
submission_df = pd.read_csv("./test.csv")[:100]

In [None]:
# Collect the input texts from the "data" column as a plain list.
sentences = list(submission_df["data"])

In [None]:
# Start with an empty list of predicted labels.
preds = []

In [None]:
# Classify the test sentences with the fine-tuned model on GPU 0.
inferencer = pipeline(task="text-classification", model=model, tokenizer=tokenizer, device=0)

# Feed the pipeline lists of sentences so that batch_size actually batches the
# forward passes (a single-sentence call always runs with one input), and
# rebuild `preds` here so re-running this cell does not append duplicates.
infer_batch_size = 64
preds = []
for start in tqdm(range(0, len(sentences), infer_batch_size)):
    chunk = sentences[start:start + infer_batch_size]
    outputs = inferencer(chunk, batch_size=infer_batch_size, truncation="only_first")
    # Labels are expected to look like "LABEL_<id>"; keep the part after the
    # underscore. Assumes the model config uses default label names — confirm
    # if a custom id2label mapping is set.
    preds.extend(out["label"].split("_")[1] for out in outputs)

In [None]:
# Attach the predicted labels as the "category" column (one per test row).
submission_df["category"] = preds

In [None]:
# Keep only the columns written to the submission file.
submission_df = submission_df[["index", "category"]]

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv("./submission.csv", index=False)