# Installation

In [None]:
!pip install scikit-learn
!pip install pythainlp
!pip install emoji
!pip install gradio

In [None]:
import os
import shutil
import urllib.request
import zipfile

# Directory that holds the extracted dataset.
dataset_name = "wongnai-dataset"
os.makedirs(dataset_name, exist_ok=True)

# Location of the dataset archive on GitHub and the temporary local copy.
dataset_url = "https://github.com/wongnai/wongnai-corpus/raw/master/review/review_dataset.zip"
zip_path = "review_dataset.zip"

# Files the rest of the notebook reads; when they are already present the
# download is skipped so the cell can be re-run cheaply and safely.
expected_files = ["w_review_train.csv", "test_file.csv"]
already_extracted = all(
    os.path.exists(os.path.join(dataset_name, file_name))
    for file_name in expected_files
)

if not already_extracted:
    # Pure-Python download/extract: works the same on Linux, macOS and Windows.
    urllib.request.urlretrieve(dataset_url, zip_path)
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(dataset_name)
    # Remove the zip file.
    os.remove(zip_path)

# Remove the unrelated __MACOSX folder (no error if it is not there).
shutil.rmtree(os.path.join(dataset_name, "__MACOSX"), ignore_errors=True)

# Data Preparation

In [None]:
import pandas as pd

# Training split: semicolon-separated with no header row, so the column
# names are supplied here. Exact duplicate rows are dropped right away.
train_df = pd.read_csv(
    "wongnai-dataset/w_review_train.csv",
    sep=";",
    header=None,
    names=["review", "rating"],
).drop_duplicates()

# Test split: column names come from the file's own first row.
# A placeholder rating of 0 is attached to every row.
test_df = pd.read_csv("wongnai-dataset/test_file.csv", sep=";").assign(rating=0)

In [None]:
# Show the share of the training set that each rating accounts for
# (as a fraction of all training rows).
n_samples = train_df.shape[0]
print(f"Total training samples: {n_samples}")
rating_counts = train_df["rating"].value_counts()
rating_counts / n_samples

In [None]:
# Pull out the model inputs (review text) and the training targets (rating).
X_train = train_df["review"]
y_train = train_df["rating"]
X_test = test_df["review"]

# Report the size of each split.
print(f"Total train samples: {len(X_train)}")
print(f"Total test samples: {len(X_test)}")

# Training Model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from pythainlp.tokenize import word_tokenize

# Text-classification pipeline:
#   pythainlp word tokenization -> uni/bi-gram counts -> TF-IDF weighting -> linear SVM.
rating_classifier = Pipeline([
    # token_pattern=None: the custom tokenizer replaces the default regex, and
    # leaving the default pattern in place makes scikit-learn warn that it is ignored.
    ("vect", CountVectorizer(tokenizer=word_tokenize, token_pattern=None, ngram_range=(1, 2))),
    ("tfidf", TfidfTransformer()),
    # Fixed seed so that repeated runs train the same model.
    ("clf", LinearSVC(random_state=42)),
])

In [None]:
# Fit every pipeline step (vectorizer, TF-IDF, classifier) on the training reviews.
rating_classifier.fit(X=X_train, y=y_train)

# Prediction

In [None]:
# Predict a rating for every review in the test dataset.
rating_predictions = rating_classifier.predict(X_test)

# Pair each test review with its predicted rating.
submit_df = pd.DataFrame({
    "review": test_df["review"],
    "rating": rating_predictions,
})

# Widen the text column so long reviews are easier to read.
pd.set_option("display.max_colwidth", 150)
# See the first 10 results (a bare head() would only show 5).
submit_df.head(10)

In [None]:
# Convenience wrapper so a single review string can be scored.
def predict_rating(review: str) -> str:
  """Return the classifier's predicted rating for one review, as a string.

  The review is wrapped in a one-element list because the pipeline's
  ``predict`` works on a batch of texts; the single result is unpacked
  and converted with ``str``.
  """
  (rating,) = rating_classifier.predict([review])
  return str(rating)

# Two hand-written sample reviews: one negative, one positive.
bad_review_text = "อาหารแย่มากๆ ไม่อร่อยเลย บรรยากาศร้านสกปรกและไม่ดูเอาจริงในราคาที่แพงมาก พนักงานบริการก็ไม่ใส่ใจเลย ไม่แนะนำเลยค่ะ อย่าไปเสียเวลาและเงินกับร้านนี้"
good_review_text = "อาหารอร่อยมากค่ะ! บรรยากาศร้านสวยงามและเป็นกันเอง พนักงานบริการดีมาก ไม่เคยผิดหวังเลย ขอแนะนำเมนูทานเล่นและสเต็กที่นี่นะคะ สั่งมาทานหลายครั้งแล้วครับ ถ้ามีโอกาสจะกลับมาใหม่แน่นอน!"

# Score each sample in turn (bad first, then good) and print the result.
for sample_text in (bad_review_text, good_review_text):
    predicted_rating = predict_rating(sample_text)
    print(f"Review: {sample_text}")
    print(f"Predicted rating: {predicted_rating}")

# Gradio Interface

In [None]:
from gradio.components import Textbox, Label
from gradio import Interface

# Input and output widgets for the web app.
review_box = Textbox(label="Review")
rating_label = Label(label="Predicted Rating")

# Wire the prediction function to the widgets.
rating_interface = Interface(fn=predict_rating, inputs=review_box, outputs=rating_label)

# Start serving the app.
rating_interface.launch()