In [None]:
!pip install datasets
!pip install pandas
!pip install pandas-profiling
!pip install transformers
!pip install evaluate
!pip install sentencepiece
!pip install -U scikit-learn

In [None]:
import os
import shutil

dataset_name = "wongnai-dataset"
os.makedirs(dataset_name, exist_ok=True) 

!gdown --id 1N4oTVeumUFMaG6s5x4bN4TMC4diPB5fw
shutil.move("review_dataset.zip", dataset_name)

# for windows
!tar -xzvf wongnai-dataset/review_dataset.zip -C wongnai-dataset 

# for linux
# !unzip wongnai-dataset/review_dataset.zip -d wongnai-dataset

Data Preparation

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# The file is ';'-separated and has no header row; the two columns are named
# "review" and "rating" right after loading.
df = pd.read_csv('wongnai-dataset/w_review_train.csv', sep=";", header=None).drop_duplicates()
df.columns = ["review", "rating"]

# Ratings 1-4 become class ids 0-3; every other value is assigned class id 4.
rating_to_label = {1: 0, 2: 1, 3: 2, 4: 3}
df["label"] = [rating_to_label.get(rate, 4) for rate in df["rating"]]

# Shuffle before taking the subset. DataFrame.sample returns a new frame, so
# the result has to be assigned back; the seed keeps the subset reproducible.
df = df.sample(frac=1, random_state=42)

# .copy() so the column assignment below writes to an independent frame
# instead of a slice of the original.
df = df[["review", "label"]].copy()
df["review"] = df["review"].str.replace('\n', '', regex=False)  # drop newline characters

df = df[:1000]  # keep 1000 samples for this run
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)  # 80/20 split
train_df.to_csv("./train.csv", index=False)
val_df.to_csv("./test.csv", index=False)

Preprocessing

In [None]:
from datasets import load_dataset

# Load the two CSV files written by the data preparation cell as the
# "train" and "test" splits of one dataset object.
split_files = {"train": "train.csv", "test": "test.csv"}
dataset = load_dataset("csv", data_files=split_files)
dataset  # last expression: display the dataset summary

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Tokenizer (and its vocabulary) of the pretrained checkpoint, with the
# maximum input length set to 100.
tokenizer = AutoTokenizer.from_pretrained(
    "airesearch/wangchanberta-base-att-spm-uncased",
    use_fast=False,
    model_max_length=100,
)


def preprocess_function(examples):
    """Tokenize the "review" field of a batch of examples, with truncation enabled."""
    reviews = examples["review"]
    return tokenizer(reviews, truncation=True)


# Apply the tokenizer to every split, passing examples in batches.
tokenized_imdb = dataset.map(preprocess_function, batched=True)

# Show the first row of the tokenized training split.
print(tokenized_imdb["train"][0])

# Collator that pads the examples of each batch (padding='max_length').
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='max_length')

Training model

In [None]:
import evaluate
import numpy as np

# Accuracy metric; defined before compute_metrics, which reads it as a global.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: a (predictions, labels) pair; the predicted class is taken
            as the arg-max of `predictions` along axis 1.

    Returns:
        The result of `accuracy.compute` for the arg-max predictions against
        the reference labels.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


# Display names for class ids 0-4.
id2label = {0: "Very poor (1)", 1: "Poor (2)", 2: "Average (3)", 3: "Good (4)", 4: "Exellent (5)"}
# Built as the exact inverse of id2label so the two mappings cannot disagree
# (previously the name for id 0 carried a trailing space in only one of them).
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint for sequence classification with 5 labels,
# attaching the id <-> name mappings defined earlier.
model = AutoModelForSequenceClassification.from_pretrained(
    "airesearch/wangchanberta-base-att-spm-uncased",
    num_labels=5,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Settings for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="Wongnai_classification",  # path where outputs are saved
    num_train_epochs=10,                  # number of training epochs
    learning_rate=1e-5,                   # learning rate
    weight_decay=0.01,
    per_device_train_batch_size=1,        # batch size
    per_device_eval_batch_size=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none",
)

# Wire the model, data splits, tokenizer, collator and metric into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()

Inference

In [None]:
# Example Thai review used as the input for the inference cells below.
text = "ปกติมาทานบ่อยอยู่แล้ว วันนี้ลองมาทานก๋วยเตี๋ยวเจดู รสชาติสู้แบบปกติไม่ได้เพราะปกติอร่อยมากๆ แต่พอเป็นแนวเจ รสชาติของเครื่องจึงตกลง แต่น้ำซุปยังอร่อยเหมือนเดิม ส่วนไอศครีมกะทิราดซุปข้าวโพด อร่อยดี ทานไม่บ่อย แต่คราวนี้ลองดู ช่วยให้มื้อนี้ดูดีขึ้นเยอะเลย" # input text

In [None]:
from transformers import pipeline

# Load the fine-tuned model from its saved checkpoint directory and classify
# the example review.
checkpoint_dir = "Wongnai_classification/checkpoint-8000/"
classifier = pipeline("sentiment-analysis", model=checkpoint_dir)
classifier(text)

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Model weights come from a local training checkpoint; the tokenizer is loaded
# from the base pretrained model.
model = AutoModelForSequenceClassification.from_pretrained(
    "Wongnai_classification/checkpoint-1600/",
    local_files_only=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    "airesearch/wangchanberta-base-att-spm-uncased",
    use_fast=False,
)
inputs = tokenizer(text, return_tensors="pt")

# Forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Index of the highest logit, then its display name from the model config.
predicted_class_id = torch.argmax(logits).item()
model.config.id2label[predicted_class_id]