# Online Market Product Review Classification


## Load Modules


In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


## Load Dataset


In [2]:
# The review file is tab-separated and has no header row, so pandas labels
# the two columns 0 and 1.
reviews_path = "../data/naver_shopping.txt"
df = pd.read_csv(reviews_path, sep="\t", header=None, encoding="utf-8")
df.head()

Unnamed: 0,0,1
0,5,배공빠르고 굿
1,2,택배가 엉망이네용 저희집 밑에층에 말도없이 놔두고가고
2,5,아주좋아요 바지 정말 좋아서2개 더 구매했어요 이가격에 대박입니다. 바느질이 조금 ...
3,2,선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다. 전...
4,5,민트색상 예뻐요. 옆 손잡이는 거는 용도로도 사용되네요 ㅎㅎ


## Select Features and Labels


In [3]:
def to_binary_label(rating):
    """Map a rating to a binary class: 0 when it is below 3, otherwise 1."""
    if rating < 3:
        return 0
    return 1


# Column 1 holds the review text; column 0 holds the rating to binarize.
features = df[1]
labels = df[0].map(to_binary_label)
features.head(), labels.head()

(0                                              배공빠르고 굿
 1                        택배가 엉망이네용 저희집 밑에층에 말도없이 놔두고가고
 2    아주좋아요 바지 정말 좋아서2개 더 구매했어요 이가격에 대박입니다. 바느질이 조금 ...
 3    선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다. 전...
 4                    민트색상 예뻐요. 옆 손잡이는 거는 용도로도 사용되네요 ㅎㅎ
 Name: 1, dtype: object,
 0    1
 1    0
 2    1
 3    0
 4    1
 Name: 0, dtype: int64)

## Split Dataset into Train and Test Data


In [4]:
# train_test_split returns the pieces grouped per input array, i.e.
# (features_train, features_test, labels_train, labels_test). Unpacking in
# that order keeps every name truthful about what it holds.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# Pair each text split with the label split that shares its rows.
train_df = pd.DataFrame({"text": X_train, "label": y_train})
test_df = pd.DataFrame({"text": X_test, "label": y_test})

# Sanity check: the split must neither drop nor duplicate rows.
assert len(train_df) + len(test_df) == len(features)

train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(test_df)

## Select Model and Tokenizer


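Load the `beomi/KcELECTRA-base` tokenizer and a two-label sequence-classification model, then move the model to the GPU when one is available.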


In [5]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "beomi/KcELECTRA-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# num_labels=2 attaches a two-class classification head to the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Only ask for the GPU name when CUDA is actually in use; querying it
# unconditionally raises on CPU-only machines and defeats the fallback above.
gpu_name = torch.cuda.get_device_name(0) if device.type == "cuda" else None
torch.cuda.is_available(), gpu_name, next(model.parameters()).device

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(True, 'NVIDIA GeForce RTX 4060 Laptop GPU', device(type='cuda', index=0))

## Preprocess Data


In [6]:
# Longest review measured in characters. Informational only: the tokenizer
# limit below is counted in tokens, so this value is not passed to it.
max_length = features.apply(len).max()

# Number of tokens every review is padded or truncated to.
MAX_TOKEN_LENGTH = 128


def preprocess(examples):
    """Tokenize a batch of reviews to a fixed length.

    Args:
        examples: A batch from the dataset; its ``"text"`` entry is passed to
            the tokenizer.

    Returns:
        The tokenizer output for the batch, with each sequence padded or
        truncated to ``MAX_TOKEN_LENGTH`` tokens.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_TOKEN_LENGTH,
    )


train_dataset = train_dataset.map(preprocess, batched=True)
eval_dataset = eval_dataset.map(preprocess, batched=True)

Map: 100%|██████████| 160000/160000 [00:10<00:00, 15652.08 examples/s]
Map: 100%|██████████| 40000/40000 [00:02<00:00, 15222.15 examples/s]


## Train The Model


In [7]:
def compute_metrics(pred):
    """Score one evaluation pass.

    The predicted class for each row is the index of the largest value along
    axis 1 of ``pred.predictions``; it is compared against ``pred.label_ids``.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    scores = {}
    scores["accuracy"] = accuracy_score(y_true, y_pred)
    scores["f1"] = f1_score(y_true, y_pred)
    return scores


# The same batch size is used for training and evaluation, and both
# evaluation and checkpointing happen once per epoch.
batch_size = 16
per_epoch = "epoch"

training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy=per_epoch,
    save_strategy=per_epoch,
)

# Collect the Trainer inputs in one place before constructing it.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "tokenizer": tokenizer,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1728,0.188437,0.937775,0.937577
2,0.1473,0.200703,0.94035,0.940829
3,0.1248,0.233105,0.9378,0.937995


TrainOutput(global_step=30000, training_loss=0.16115894991556803, metrics={'train_runtime': 7977.0265, 'train_samples_per_second': 60.173, 'train_steps_per_second': 3.761, 'total_flos': 3.15733266432e+16, 'train_loss': 0.16115894991556803, 'epoch': 3.0})

## Predict Using the Trained Model


In [None]:
test_texts = ["배송이 너무 느렸어요", "포장이 예쁘고 마음에 들어요"]
inputs = tokenizer(test_texts, return_tensors="pt", padding=True, truncation=True).to(
    device
)

# Put the model in evaluation mode so dropout is off regardless of what ran
# before this cell, and skip autograd bookkeeping since no gradients are needed.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# The predicted class is the index of the largest logit in each row.
predictions = outputs.logits.argmax(dim=1)
print(predictions.tolist())  # [0, 1]

[0, 1]


## Evaluate Trained Model


In [10]:
# Run evaluation on the eval_dataset the Trainer was built with and display
# the resulting metrics dict.
eval_metrics = trainer.evaluate()
eval_metrics

{'eval_loss': 0.23310546576976776,
 'eval_accuracy': 0.9378,
 'eval_f1': 0.9379953147585107,
 'eval_runtime': 452.9274,
 'eval_samples_per_second': 88.314,
 'eval_steps_per_second': 5.52,
 'epoch': 3.0}

## Save The Model


In [11]:
# Write the model and the tokenizer files into the same directory.
save_dir = "./my_korean_review_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./my_korean_review_model\\tokenizer_config.json',
 './my_korean_review_model\\special_tokens_map.json',
 './my_korean_review_model\\tokenizer.json')