# Benchmarking LLMs for the Korean Language
- Created: 2025-06-26 (Thu)
- Updated: 2025-06-26 (Thu)

## 1. Environment Set-up
Create a Jupyter notebook that runs on the following PyTorch kernel:

- `PyTorch 1-13` 

In [1]:
!pip install transformers datasets evaluate accelerate

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting accelerate
  Downloading accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.1-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface-

## 2. Load the dataset


In [4]:
from datasets import load_dataset

# Fetch the 'sts' configuration of the KLUE benchmark.
klue_sts_dataset = load_dataset( path='klue', name='sts' )

# Show the available splits with their columns and row counts.
print( klue_sts_dataset )

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.52M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/68.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/519 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['guid', 'source', 'sentence1', 'sentence2', 'labels'],
        num_rows: 11668
    })
    validation: Dataset({
        features: ['guid', 'source', 'sentence1', 'sentence2', 'labels'],
        num_rows: 519
    })
})


In [5]:
# Inspect the first validation record to see the raw field layout.
first_validation_record = klue_sts_dataset['validation'][0]
print( first_validation_record )

{'guid': 'klue-sts-v1_dev_00000', 'source': 'airbnb-rtt', 'sentence1': '무엇보다도 호스트분들이 너무 친절하셨습니다.', 'sentence2': '무엇보다도, 호스트들은 매우 친절했습니다.', 'labels': {'label': 4.9, 'real-label': 4.857142857142857, 'binary-label': 1}}


- `binary-label`: whether the two sentences are similar (`1`) or dissimilar (`0`)
- `real-label`: similarity score on a 0–5 scale

## 3. Model & Tokenizer Set-up

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint to benchmark.
model_name = "klue/roberta-base"

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Number of target classes for the classification head.
num_labels = 2

In [12]:
# Load the checkpoint with a sequence-classification head. The class count
# comes from the `num_labels` variable defined next to `model_name`, so the
# setting lives in one place instead of being repeated as a literal.
model = AutoModelForSequenceClassification.from_pretrained( model_name, num_labels=num_labels )

ValueError: Could not find RobertaForSequenceClassification neither in <module 'transformers.models.roberta' from '/opt/conda/envs/pytorch/lib/python3.10/site-packages/transformers/models/roberta/__init__.py'> nor in <module 'transformers' from '/opt/conda/envs/pytorch/lib/python3.10/site-packages/transformers/__init__.py'>!

## 4. Preprocessing

In [13]:
def preprocess( examples ):
    """Tokenize a batch of KLUE-STS sentence pairs and attach binary labels.

    `examples` is a batch as handed over by `map(..., batched=True)`: a mapping
    from column name to a list of values. The columns read here are
    `sentence1`, `sentence2` and `labels`; each `labels` entry is a dict such as
    `{'label': 4.9, 'real-label': 4.857142857142857, 'binary-label': 1}`.

    Returns the tokenizer output with an added `labels` entry that holds the
    `binary-label` value of every example, in input order.
    """
    # Encode every (sentence1, sentence2) pair with the module-level tokenizer.
    encoded_batch = tokenizer(
        examples['sentence1'],
        examples['sentence2'],
        truncation=True,
        max_length=128  # Adjust for model and data
    )

    # Keep only the binary target from each per-example label dict.
    binary_targets = []
    for label_info in examples['labels']:
        binary_targets.append( label_info['binary-label'] )
    encoded_batch['labels'] = binary_targets

    return encoded_batch

In [14]:
# Tokenize every split; batched=True passes `preprocess` a batch of examples
# per call instead of a single example.
tokenized_datasets = klue_sts_dataset.map(
    function=preprocess,
    batched=True,
)

Map:   0%|          | 0/11668 [00:00<?, ? examples/s]

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

In [16]:
# Remove the raw columns that are no longer needed after tokenization.
# Only columns that are still present are dropped, so re-running this cell
# (which rebinds `tokenized_datasets` from itself) does not fail on columns
# that an earlier run already removed. The validation split is used to check
# presence; the dataset summary printed earlier lists the same columns for
# both splits.
raw_columns = ['guid', 'source', 'sentence1', 'sentence2']
present_raw_columns = [
    column for column in raw_columns
    if column in tokenized_datasets['validation'].column_names
]
if present_raw_columns:
    tokenized_datasets = tokenized_datasets.remove_columns( present_raw_columns )
print( tokenized_datasets['validation'][0] )

{'labels': 1, 'input_ids': [0, 3890, 2178, 2062, 2119, 24769, 2377, 7285, 3760, 7798, 2205, 3011, 2219, 3606, 18, 2, 3890, 2178, 2062, 2119, 16, 24769, 2031, 2073, 4230, 7798, 2371, 2219, 3606, 18, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [17]:
# Print the same record from the untouched dataset, for comparison with its
# tokenized counterpart.
raw_validation_record = klue_sts_dataset['validation'][0]
print( raw_validation_record )

{'guid': 'klue-sts-v1_dev_00000', 'source': 'airbnb-rtt', 'sentence1': '무엇보다도 호스트분들이 너무 친절하셨습니다.', 'sentence2': '무엇보다도, 호스트들은 매우 친절했습니다.', 'labels': {'label': 4.9, 'real-label': 4.857142857142857, 'binary-label': 1}}


## 5. Inference & Metrics
Binary-classification performance is measured with the F1 score.

In [19]:
import numpy as np
import evaluate

# Load the F1 metric from the `evaluate` library.
metric = evaluate.load( path="f1" )

AttributeError: NEAREST_EXACT

In [20]:
import numpy as np
import evaluate

# Load the F1 score metric.
# NOTE(review): this cell repeats the metric-loading cell directly above it.
metric = evaluate.load("f1")

AttributeError: NEAREST_EXACT

In [21]:
def compute_metrics( eval_pred ):
    """Convert raw evaluation outputs into the metric result.

    `eval_pred` unpacks into a pair: the per-class scores for every example
    and the reference labels. The predicted class of an example is the index
    of its largest score. Returns whatever the module-level `metric` object
    yields from `compute` for those predictions and references.
    """
    class_scores, references = eval_pred

    # Pick the highest-scoring class along the class axis.
    predicted_classes = np.argmax( class_scores, axis=1 )

    scores = metric.compute( predictions=predicted_classes, references=references )
    return scores

In [22]:
from transformers import TrainingArguments, Trainer

# Evaluation configuration. report_to="none" (not "non") is the value that
# switches off all logging/reporting integrations.
training_args = TrainingArguments(
    output_dir = "./results",
    per_device_eval_batch_size=64,
    report_to="none",
)

# Bundle the model, data, tokenizer and metric function for evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],  # for fine-tuning
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Score the model on the validation split.
# NOTE(review): no training step precedes this call in the visible notebook,
# so the scores describe the checkpoint as loaded. Confirm that is intended.
results = trainer.evaluate()
print( results )

ImportError: cannot import name 'PreTrainedModel' from 'transformers' (/opt/conda/envs/pytorch/lib/python3.10/site-packages/transformers/__init__.py)