# Installation

In [None]:
!pip install pandas
!pip install pandas-profiling
!pip install transformers
!pip install evaluate
!pip install sentencepiece
!pip install -U scikit-learn
!pip install accelerate
!pip install gradio

# Download the dataset

In [None]:
import os

# Create a directory for the downloaded dataset.
dataset_name = "wongnai-dataset"
os.makedirs(dataset_name, exist_ok=True) 

# Download the dataset from google drive.
!wget https://github.com/wongnai/wongnai-corpus/raw/master/review/review_dataset.zip

# Unzip the dataset.
!unzip review_dataset.zip -d wongnai-dataset # for linux
# !tar -xzvf review_dataset.zip -C wongnai-dataset # for windows

# Remove the zip file.
!rm review_dataset.zip
# Remove the unrelated __MACOSX folder.
!rm -r wongnai-dataset/__MACOSX

# Data Preparation

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Number of reviews kept for a quick training run. Set to None to use the whole dataset.
SAMPLE_SIZE = 3000
# One seed for both sampling and splitting, so train.csv / test.csv are identical on every run.
SEED = 42

# Read the dataset into a pandas dataframe (';'-separated, no header row).
df = pd.read_csv(
    "wongnai-dataset/w_review_train.csv",
    sep=";",
    names=["review", "rating"],
    header=None,
)
# Remove duplicate rows.
df = df.drop_duplicates()
# Remove newline (\n) characters from reviews (literal match, not a regex).
df["review"] = df["review"].str.replace("\n", "", regex=False)
# Create the zero-based label column from the 1-5 rating column.
df["label"] = df["rating"].map({1: 0, 2: 1, 3: 2, 4: 3, 5: 4})
# Drop rating column because we only want review and label columns.
df = df.drop(columns="rating")
# Sample the dataset to speed up training. The size is capped at the number of
# available rows so that sampling never raises on a small dataset.
if SAMPLE_SIZE is not None:
    df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SEED)
# Split the dataset into train and held-out sets (90% / 10%).
train_df, val_df = train_test_split(df, test_size=0.1, random_state=SEED)
# Save both splits to csv files; the held-out split is stored as test.csv.
train_df.to_csv("./train.csv", index=False)
val_df.to_csv("./test.csv", index=False)

# Load Dataset

In [None]:
from datasets import load_dataset

# Map each split name to the csv file written during data preparation.
data_files = {
    "train": "train.csv",
    "test": "test.csv",
}
# Build the dataset from the csv files and display its summary as the cell output.
dataset = load_dataset("csv", data_files=data_files)
dataset

# Create Metrics

In [None]:
import evaluate
import numpy as np

def compute_metrics(eval_pred):
    """
    Compute accuracy for one evaluation pass.

    `eval_pred` unpacks into (model scores, reference labels). The predicted
    class of each row is the index of its highest score along axis 1; the
    result is whatever the module-level `accuracy` metric returns.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Load accuracy metric.
# compute_metrics looks this module-level name up when it is called,
# so it only needs to exist before the first evaluation runs.
accuracy = evaluate.load("accuracy")

# Human-readable rating name for each class id predicted by the model.
id2label = {0: "Very poor (1)", 1: "Poor (2)", 2: "Average (3)", 3: "Good (4)", 4: "Excellent (5)"}
# Inverse mapping, derived from id2label so the two dictionaries always agree.
label2id = {label: class_id for class_id, label in id2label.items()}

# Load Tokenizer and Model

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification

pretrained_model_name = "airesearch/wangchanberta-base-att-spm-uncased"

# Load the pretrained encoder with a 5-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Freeze the whole encoder, then unfreeze only its last layer, so that training
# updates just that layer and the classification head.
# Indexing with -1 (instead of a fixed 11) keeps this correct for encoders of any depth.
model.roberta.requires_grad_(False)
model.roberta.encoder.layer[-1].requires_grad_(True)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name,
    use_fast=False,
    model_max_length=256,  # The maximum length (in number of tokens) for the inputs to the transformer model
)


def tokenize(examples):
    """
    Turn the "review" texts of a batch into tokenizer output.

    The texts are passed to the module-level `tokenizer` with truncation
    enabled, and the tokenizer's result is returned unchanged.
    """
    reviews = examples["review"]
    return tokenizer(reviews, truncation=True)

# Apply the tokenize function to every split of our dataset, in batches.
dataset = dataset.map(tokenize, batched=True)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

# The data collator builds batches of input data from the dataset.
# DataCollatorWithPadding pads the tokens dynamically to the longest example in each batch.
# More info on data collators: https://www.youtube.com/watch?v=-RPeakdlHYo
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Let's take a look at our first sample in the training set.
# Because the dataset was mapped through tokenize, the sample carries the
# tokenizer's output fields alongside its "review" and "label" columns.
print(train_dataset[0])

# Training model

In [None]:
from transformers import TrainingArguments, Trainer

import torch

# Create a TrainingArguments object to configure how to train the model.
training_args = TrainingArguments(
    output_dir="Wongnai_classification",  # Where checkpoints of the model are stored.
    learning_rate=5e-5,  # Set learning rate
    per_device_train_batch_size=64,  # Batch size for training.
    per_device_eval_batch_size=64,  # Batch size for evaluation.
    num_train_epochs=10,  # Number of training epochs.
    weight_decay=0.01,
    evaluation_strategy="epoch",  # How often to evaluate the model.
    save_strategy="epoch",  # How often to save the model.
    load_best_model_at_end=True,  # Whether to load the best model at the end of training.
    push_to_hub=False,  # Whether to upload the final model to the Huggingface Hub.
    report_to="none",  # Whether to report logging to any services.
    # Mixed precision speeds up training but requires a GPU; enabling it
    # unconditionally makes TrainingArguments fail on a CPU-only machine.
    fp16=torch.cuda.is_available(),
)

# Create a trainer to train the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training
# Uses the settings in training_args: the model is evaluated and a checkpoint
# is saved once per epoch, and the best model is loaded back when training ends.
trainer.train()

# Inference

In [None]:
# A Thai-language restaurant review (a mixed opinion about a vegetarian noodle dish)
# used as the sample input for the inference cells below.
# The adjacent string literals are concatenated into one single-line string.
example_text = (
"ปกติมาทานบ่อยอยู่แล้ว วันนี้ลองมาทานก๋วยเตี๋ยวเจดู "
"รสชาติสู้แบบปกติไม่ได้เพราะปกติอร่อยมากๆ แต่พอเป็นแนวเจ " 
"รสชาติของเครื่องจึงตกลง แต่น้ำซุปยังอร่อยเหมือนเดิม ส่วนไอศครีมกะทิราดซุปข้าวโพด "
"อร่อยดี ทานไม่บ่อย แต่คราวนี้ลองดู ช่วยให้มื้อนี้ดูดีขึ้นเยอะเลย"
)

## Use pipeline to predict

In [None]:
from transformers import pipeline

# Point to the directory where the model weight is stored.
# The checkpoint number depends on dataset size, batch size and epochs, so a
# hard-coded "checkpoint-8000" does not exist for this configuration. Ask the
# trainer for the best checkpoint it recorded instead.
model_dir = trainer.state.best_model_checkpoint
# Create a pipeline to classify the sentiment of the input text.
classifier = pipeline("sentiment-analysis", model=model_dir)
# Let's try it out! truncation=True cuts reviews that exceed the tokenizer's maximum length.
output = classifier(example_text, truncation=True)
print(output)

## Manually predict

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Point to the directory where the model weight is stored.
# The checkpoint number depends on dataset size, batch size and epochs, so a
# hard-coded "checkpoint-8000" does not exist for this configuration. Ask the
# trainer for the best checkpoint it recorded instead.
model_dir = trainer.state.best_model_checkpoint

# Create model from our fine-tuned checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    model_dir,
    local_files_only=True,  # Look for the model in the local directory, not on the Huggingface Hub.
)
# Load tokenizer from our pretrained model, with the same length limit used for training.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name,
    use_fast=False,
    model_max_length=256,
)

# Tokenize the example text. Despite its name, `input_ids` holds the full
# tokenizer output, which is unpacked into the model call below.
# truncation=True keeps long reviews within the model's maximum input length.
input_ids = tokenizer(example_text, truncation=True, return_tensors="pt")

with torch.no_grad():  # Disable gradient calculation because we are not training
    logits = model(**input_ids).logits

# Get the most probable class for the input text (argmax over the class dimension).
predicted_class_id = logits.argmax(dim=-1).item()
# Get the rating label from the predicted rating
predicted_rating_label = model.config.id2label[predicted_class_id]


print(f"Input review: {example_text}\nPredicted rating label: \"{predicted_rating_label}\"")

# Wrap with gradio

For a nice interactive demo, we wrap our model in a Gradio interface so that it is easy to try out.

In [None]:
from gradio.components import Textbox, Label
from gradio import Interface

# Define a function that will process the input text from gradio widget.
def classify_text(text: str) -> tuple:
    """
    Classify the sentiment of the input text.

    Args:
        text: The review text entered in the gradio widget.

    Returns:
        A `(predicted_rating, confidence_score)` tuple: the label of the top
        prediction and its score rounded to 4 decimal places.
    """
    # The text comes from a free-form input box, so truncate it to the
    # tokenizer's maximum length instead of failing on very long reviews.
    output = classifier(text, truncation=True)
    prediction = output[0]

    predicted_rating = prediction["label"]
    confidence_score = round(prediction["score"], 4)
    return predicted_rating, confidence_score


# Input widget: a single text box where the review is typed.
review_input = Textbox(label="Review")
# Output widgets: one for each value returned by classify_text.
result_outputs = [
    Label(label="Predicted rating"),
    Label(label="Confidence score"),
]

# Build the gradio interface around classify_text and start it.
rating_inferface = Interface(
    fn=classify_text,
    inputs=review_input,
    outputs=result_outputs,
    title="Wongnai review rating prediction",
    description="Classify the sentiment of the review into the predicted rating with confidence score.",
)
rating_inferface.launch()