In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import collections

In [4]:
import torch
import random
from scipy.special import softmax
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
from sklearn.metrics import f1_score

import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

# **1st Model Test**

In [None]:
# Load data
df_pattaya = pd.read_csv('clean_pattaya.csv', encoding='utf-8')
df_pattaya_sen = df_pattaya[['processed_text']].dropna()

# Load model
def load_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return tokenizer, model

# Compute sentiment probabilities
def sentiment_score(text, tokenizer, model):
    tokens = tokenizer(text, truncation=True, padding=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        logits = model(**tokens).logits
    return softmax(logits.numpy()[0])  # Returns [negative, neutral, positive]

# Convert probabilities to weighted sentiment intensity score
def intensity_score(probs):
    weights = [-1, 0, 1]  # Corresponding to [negative, neutral, positive]
    return round(sum(p * w for p, w in zip(probs, weights)), 3)

# Run sentiment analysis
models = {
    "RoBERTa": "cardiffnlp/twitter-roberta-base-sentiment"
}

for model_name, model_path in models.items():
    print(f"\nUsing model: {model_name}")
    tokenizer, model = load_model(model_path)

    # Apply sentiment scoring
    df_pattaya_sen['score_' + model_name] = df_pattaya_sen['processed_text'].apply(
        lambda x: sentiment_score(x, tokenizer, model)
    )

    # Apply intensity scoring
    df_pattaya_sen['intensity_' + model_name] = df_pattaya_sen['score_' + model_name].apply(intensity_score)

In [None]:
# convert scoreRoBERTa and scoreDistilBERT column into 2 other columns
df_pattaya_sen['resultRoBERTa'] = df_pattaya_sen['score_RoBERTa'].apply(lambda x: x.argmax())

# scoreRoBERTa will be resultRoBERTa with the max value being negative, neutral and positive according to their position
df_pattaya_sen['resultRoBERTa'] = df_pattaya_sen['resultRoBERTa'].replace({0: 'Negative', 1: 'Neutral', 2: 'Positive'})

df_pattaya_sen

In [None]:
# Merge on 'processed_text'
df_pattaya_cleaned = df_pattaya_sen.merge(
    df_pattaya[['processed_text', 'rating', 'date', 'travel_type', 'location']],  # choose columns to bring in
    on='processed_text',
    how='left'
)
df_pattaya_cleaned

In [None]:
df_pattaya_cleaned.to_csv('pattaya_sentiment.csv', index=False)

# **RoBERTa Enhancement**

In [5]:
df_pattaya = pd.read_csv('pattaya_sentiment.csv', encoding='utf-8')

In [6]:
# Encode labels
label_map = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
df_pattaya['label_encoded'] = df_pattaya['resultRoBERTa'].map(label_map)

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0,1,2]), y=df_pattaya['label_encoded'])
weights_tensor = torch.tensor(class_weights, dtype=torch.float)

In [7]:
# Set random seed for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Load tokenizer and model
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Tokenize function
def tokenize_function(examples):
    return tokenizer(examples["processed_text"], padding="max_length", truncation=True, max_length=128)

# Convert DataFrame to HuggingFace Dataset
dataset = Dataset.from_pandas(df_pattaya)
dataset = dataset.rename_column("label_encoded", "labels")
dataset = dataset.map(tokenize_function, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Train/Test Split
train_test = dataset.train_test_split(test_size=0.2, seed=seed)
train_dataset = train_test["train"]
eval_dataset = train_test["test"]

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=2e-5,
    seed=seed,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True
)

# Custom loss function with class weights
def compute_loss(model, inputs, return_outputs=False, num_items_in_batch=0):
    labels = inputs.get("labels")
    outputs = model(**inputs)
    logits = outputs.get("logits")
    loss_fct = torch.nn.CrossEntropyLoss(weight=weights_tensor.to(logits.device))
    loss = loss_fct(logits, labels)
    return (loss, outputs) if return_outputs else loss

# Evaluation metric
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    macro_f1 = f1_score(labels, preds, average='macro')
    return {"macro_f1": macro_f1}

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    # compute_loss_func=compute_loss
)

# Train the model
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkg24543[0m ([33mkg24543-university-of-bristol[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Macro F1
1,No log,0.140443,0.794595
2,No log,0.092955,0.780309




TrainOutput(global_step=50, training_loss=0.08690813064575195, metrics={'train_runtime': 1529.2847, 'train_samples_per_second': 0.523, 'train_steps_per_second': 0.033, 'total_flos': 52622683545600.0, 'train_loss': 0.08690813064575195, 'epoch': 2.0})

Key - a42eb9bc954e96d4d958b9cacfadb39921dead98

In [8]:
# Save the trained model
model_path = "./roberta_finetune_model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

print(f"Model saved to: {model_path}")

Model saved to: ./roberta_finetune_model


**Pattaya Fitting**

In [9]:
# Load new data
df_pattaya = pd.read_csv('clean_pattaya.csv', encoding='utf-8')
df_pattaya_sen = df_pattaya[['processed_text']].dropna()

In [10]:
# Load fine-tuned model and tokenizer
model_path = "./roberta_finetune_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Compute sentiment probabilities
def sentiment_score(text, tokenizer, model):
    tokens = tokenizer(text, truncation=True, padding=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        logits = model(**tokens).logits
    return softmax(logits.numpy()[0])  # Returns [negative, neutral, positive]

# Convert probabilities to weighted sentiment intensity score
def intensity_score(probs):
    weights = [-1, 0, 1]  # Corresponding to [negative, neutral, positive]
    return round(sum(p * w for p, w in zip(probs, weights)), 3)

# Apply sentiment scoring
df_pattaya_sen['score_RoBERTa'] = df_pattaya_sen['processed_text'].apply(
    lambda x: sentiment_score(x, tokenizer, model)
)

# Apply intensity scoring
df_pattaya_sen['intensity_RoBERTa'] = df_pattaya_sen['score_RoBERTa'].apply(intensity_score)

In [11]:
# convert scoreRoBERTa column into 2 other columns
df_pattaya_sen['resultRoBERTa'] = df_pattaya_sen['score_RoBERTa'].apply(lambda x: x.argmax())

# scoreRoBERTa will be resultRoBERTa with the max value being negative, neutral and positive according to their position
df_pattaya_sen['resultRoBERTa'] = df_pattaya_sen['resultRoBERTa'].replace({0: 'Negative', 1: 'Neutral', 2: 'Positive'})

df_pattaya_sen

Unnamed: 0,processed_text,score_RoBERTa,intensity_RoBERTa,resultRoBERTa
0,best massage existence absolutely fabulous mas...,"[0.000545775, 0.0006527193, 0.99880147]",0.998,Positive
1,great place gem gallery pattaya amazing place ...,"[0.00043520975, 0.0006707744, 0.99889404]",0.998,Positive
2,recommend place beautiful organized people wor...,"[0.00041371837, 0.0007984634, 0.9987878]",0.998,Positive
3,life altering massage experience absolutely in...,"[0.00044122155, 0.0008386694, 0.99872017]",0.998,Positive
4,finally relaxed rejuvenated friend got massage...,"[0.0005455051, 0.00066375395, 0.9987908]",0.998,Positive
...,...,...,...,...
2333,great day well run water park start finish wen...,"[0.00052692054, 0.000617436, 0.9988556]",0.998,Positive
2334,awesome beach beach go ever restaurant bar lun...,"[0.00031094882, 0.0017167189, 0.9979723]",0.998,Positive
2335,rainbow beach must go look rainbow beach jomti...,"[0.00043922983, 0.00066251185, 0.99889827]",0.998,Positive
2336,beautiful building come animal rowing boat san...,"[0.00027121327, 0.0014958514, 0.99823296]",0.998,Positive


In [12]:
# Merge on 'processed_text'
df_pattaya_cleaned = df_pattaya_sen.merge(
    df_pattaya[['processed_text', 'rating', 'date', 'travel_type', 'location']],  # choose columns to bring in
    on='processed_text',
    how='left'
)
df_pattaya_cleaned

Unnamed: 0,processed_text,score_RoBERTa,intensity_RoBERTa,resultRoBERTa,rating,date,travel_type,location
0,best massage existence absolutely fabulous mas...,"[0.000545775, 0.0006527193, 0.99880147]",0.998,Positive,5,2025-08,Friends,
1,great place gem gallery pattaya amazing place ...,"[0.00043520975, 0.0006707744, 0.99889404]",0.998,Positive,5,2025-08,Friends,India
2,recommend place beautiful organized people wor...,"[0.00041371837, 0.0007984634, 0.9987878]",0.998,Positive,5,2025-08,Family,
3,life altering massage experience absolutely in...,"[0.00044122155, 0.0008386694, 0.99872017]",0.998,Positive,5,2025-08,Friends,
4,finally relaxed rejuvenated friend got massage...,"[0.0005455051, 0.00066375395, 0.9987908]",0.998,Positive,5,2025-08,Friends,
...,...,...,...,...,...,...,...,...
2333,great day well run water park start finish wen...,"[0.00052692054, 0.000617436, 0.9988556]",0.998,Positive,5,2023-01,Family,Australia
2334,awesome beach beach go ever restaurant bar lun...,"[0.00031094882, 0.0017167189, 0.9979723]",0.998,Positive,5,2023-01,Friends,Canada
2335,rainbow beach must go look rainbow beach jomti...,"[0.00043922983, 0.00066251185, 0.99889827]",0.998,Positive,4,2023-01,Friends,UK
2336,beautiful building come animal rowing boat san...,"[0.00027121327, 0.0014958514, 0.99823296]",0.998,Positive,3,2023-01,,Denmark


In [13]:
df_pattaya_cleaned.to_csv('pattaya_sentiment_1.csv', index=False)

# **Sentiment Analysis**

**HANOI Sentiment Analysis**

In [None]:
# Load data
df_hanoi = pd.read_csv('clean_hanoi.csv', encoding='utf-8')
df_hanoi_sen = df_hanoi[['processed_text']].dropna()

# Load model
def load_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return tokenizer, model

# Compute sentiment probabilities
def sentiment_score(text, tokenizer, model):
    tokens = tokenizer(text, truncation=True, padding=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        logits = model(**tokens).logits
    return softmax(logits.numpy()[0])  # Returns [negative, neutral, positive]

# Convert probabilities to weighted sentiment intensity score
def intensity_score(probs):
    weights = [-1, 0, 1]  # Corresponding to [negative, neutral, positive]
    return round(sum(p * w for p, w in zip(probs, weights)), 3)

# Run sentiment analysis
models = {
    "RoBERTa": "cardiffnlp/twitter-roberta-base-sentiment"
}

for model_name, model_path in models.items():
    print(f"\nUsing model: {model_name}")
    tokenizer, model = load_model(model_path)

    # Apply sentiment scoring
    df_hanoi_sen['score_' + model_name] = df_hanoi_sen['processed_text'].apply(
        lambda x: sentiment_score(x, tokenizer, model)
    )

    # Apply intensity scoring
    df_hanoi_sen['intensity_' + model_name] = df_hanoi_sen['score_' + model_name].apply(intensity_score)


Using model: RoBERTa


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

  return forward_call(*args, **kwargs)


In [None]:
# convert scoreRoBERTa and scoreDistilBERT column into 2 other columns
df_hanoi_sen['resultRoBERTa'] = df_hanoi_sen['score_RoBERTa'].apply(lambda x: x.argmax())

# scoreRoBERTa will be resultRoBERTa with the max value being negative, neutral and positive according to their position
df_hanoi_sen['resultRoBERTa'] = df_hanoi_sen['resultRoBERTa'].replace({0: 'Negative', 1: 'Neutral', 2: 'Positive'})

df_hanoi_sen

Unnamed: 0,processed_text,score_RoBERTa,intensity_RoBERTa,rating,date,travel_type,location,resultRoBERTa
0,hà nội really like good food nice people frien...,"[0.016993705, 0.08790884, 0.89509743]",0.878,4.0,2025-07,Friends,,Positive
1,great thing hanoi great thing hanoi bella grea...,"[0.0022518958, 0.019215526, 0.9785326]",0.976,5.0,2025-07,Couples,Australia,Positive
2,great thing hanoi great thing hanoi bella grea...,"[0.0022518958, 0.019215526, 0.9785326]",0.976,5.0,2025-07,Couples,Australia,Positive
3,teacher bright lot fun food together meal grea...,"[0.0018884634, 0.024878994, 0.97323257]",0.971,5.0,2025-07,,,Positive
4,teacher bright lot fun food together meal grea...,"[0.0018884634, 0.024878994, 0.97323257]",0.971,5.0,2025-07,,,Positive
...,...,...,...,...,...,...,...,...
38797,fantastic couple massage easily best massage v...,"[0.0019025911, 0.021646915, 0.9764505]",0.975,5.0,2023-01,Couples,UK,Positive
38798,great hanoi food guide hieu undergrad studying...,"[0.0041389074, 0.0473428, 0.94851834]",0.944,5.0,2023-01,Family,Singapore,Positive
38799,really good afternoon really nice afternoon ng...,"[0.0014665928, 0.009970934, 0.98856246]",0.987,5.0,2023-01,Couples,,Positive
38800,best experience hanoi guide thuy amazing one t...,"[0.0020888352, 0.022353826, 0.9755574]",0.973,5.0,2023-01,Solo,,Positive


In [None]:
# Merge on 'processed_text'
df_hanoi_cleaned = df_hanoi_sen.merge(
    df_hanoi[['processed_text', 'rating', 'date', 'travel_type', 'location']],  # choose columns to bring in
    on='processed_text',
    how='left'
)
df_hanoi_cleaned

In [None]:
df_hanoi_cleaned.to_csv('hanoi_sentiment.csv', index=False)

**DANANG Sentiment Analysis**

In [None]:
# Load new data
df_danang = pd.read_csv('clean_danang.csv', encoding='utf-8')
df_danang_sen = df_danang[['processed_text']].dropna()

# Apply sentiment scoring
df_danang_sen['score_RoBERTa'] = df_danang_sen['processed_text'].apply(
    lambda x: sentiment_score(x, tokenizer, model)
)

# Apply intensity scoring
df_danang_sen['intensity_RoBERTa'] = df_danang_sen['score_RoBERTa'].apply(intensity_score)

  return forward_call(*args, **kwargs)


In [None]:
# convert scoreRoBERTa column into 2 other columns
df_danang_sen['resultRoBERTa'] = df_danang_sen['score_RoBERTa'].apply(lambda x: x.argmax())

# scoreRoBERTa will be resultRoBERTa with the max value being negative, neutral and positive according to their position
df_danang_sen['resultRoBERTa'] = df_danang_sen['resultRoBERTa'].replace({0: 'Negative', 1: 'Neutral', 2: 'Positive'})

df_danang_sen

Unnamed: 0,processed_text,score_RoBERTa,intensity_RoBERTa,resultRoBERTa
0,excellent experience went donk silk tailor jac...,"[0.0035434982, 0.048638206, 0.9478183]",0.944,Positive
1,really deliver ba ri tailor delivered outstand...,"[0.008937679, 0.09708384, 0.89397854]",0.885,Positive
2,clothes came amazingly super happy staff super...,"[0.002327867, 0.013598603, 0.9840735]",0.982,Positive
3,tailored clothing came shop recommended friend...,"[0.010668688, 0.0975591, 0.8917722]",0.881,Positive
4,relaxing experience came without booking récep...,"[0.002591971, 0.03365725, 0.9637508]",0.961,Positive
...,...,...,...,...
13666,app theatre show important raining bring umbre...,"[0.07968553, 0.79999715, 0.120317355]",0.041,Neutral
13667,visit worth effort step back time visiting imp...,"[0.009126912, 0.18701436, 0.8038587]",0.795,Positive
13668,rehahn experience came across gallery hochimin...,"[0.0017663914, 0.022590123, 0.97564346]",0.974,Positive
13669,subliming vietnam picture gallery minute walki...,"[0.010314843, 0.15958993, 0.83009523]",0.820,Positive


In [None]:
# Merge on 'processed_text'
df_danang_cleaned = df_danang_sen.merge(
    df_danang[['processed_text', 'rating', 'date', 'travel_type', 'location']],  # choose columns to bring in
    on='processed_text',
    how='left'
)
df_danang_cleaned

Unnamed: 0,processed_text,score_RoBERTa,intensity_RoBERTa,resultRoBERTa,rating,date,travel_type,location
0,excellent experience went donk silk tailor jac...,"[0.0035434982, 0.048638206, 0.9478183]",0.944,Positive,5,2025-08,Solo,
1,really deliver ba ri tailor delivered outstand...,"[0.008937679, 0.09708384, 0.89397854]",0.885,Positive,5,2025-08,Solo,CA
2,clothes came amazingly super happy staff super...,"[0.002327867, 0.013598603, 0.9840735]",0.982,Positive,5,2025-08,Family,
3,tailored clothing came shop recommended friend...,"[0.010668688, 0.0975591, 0.8917722]",0.881,Positive,5,2025-08,Family,
4,relaxing experience came without booking récep...,"[0.002591971, 0.03365725, 0.9637508]",0.961,Positive,5,2025-08,Couples,
...,...,...,...,...,...,...,...,...
13670,app theatre show important raining bring umbre...,"[0.07968553, 0.79999715, 0.120317355]",0.041,Neutral,4,2023-01,Friends,Australia
13671,visit worth effort step back time visiting imp...,"[0.009126912, 0.18701436, 0.8038587]",0.795,Positive,4,2023-01,Solo,Australia
13672,rehahn experience came across gallery hochimin...,"[0.0017663914, 0.022590123, 0.97564346]",0.974,Positive,5,2023-01,Couples,
13673,subliming vietnam picture gallery minute walki...,"[0.010314843, 0.15958993, 0.83009523]",0.820,Positive,5,2023-01,Family,


In [None]:
df_danang_cleaned.to_csv('danang_sentiment.csv', index=False)

**BANGKOK Sentiment Analysis**

In [None]:
# Load new data
df_bangkok = pd.read_csv('clean_bangkok.csv', encoding='utf-8')
df_bangkok_sen = df_bangkok[['processed_text']].dropna()

# Apply sentiment scoring
df_bangkok_sen['score_RoBERTa'] = df_bangkok_sen['processed_text'].apply(
    lambda x: sentiment_score(x, tokenizer, model)
)

# Apply intensity scoring
df_bangkok_sen['intensity_RoBERTa'] = df_bangkok_sen['score_RoBERTa'].apply(intensity_score)

  return forward_call(*args, **kwargs)


In [None]:
# convert scoreRoBERTa column into 2 other columns
df_bangkok_sen['resultRoBERTa'] = df_bangkok_sen['score_RoBERTa'].apply(lambda x: x.argmax())

# scoreRoBERTa will be resultRoBERTa with the max value being negative, neutral and positive according to their position
df_bangkok_sen['resultRoBERTa'] = df_bangkok_sen['resultRoBERTa'].replace({0: 'Negative', 1: 'Neutral', 2: 'Positive'})

df_bangkok_sen

Unnamed: 0,processed_text,score_RoBERTa,intensity_RoBERTa,resultRoBERTa
0,great tour anna great tour guide alot fun saw ...,"[0.0018257378, 0.013423596, 0.9847506]",0.983,Positive
1,boat tour excellent excursion fun ken attentiv...,"[0.002018995, 0.022882884, 0.9750981]",0.973,Positive
2,thon buri canal trip great insight thon buri c...,"[0.002734813, 0.041992098, 0.95527303]",0.953,Positive
3,great tour great guide showed u lot interestin...,"[0.0029229203, 0.02523496, 0.9718421]",0.969,Positive
4,must great experience would definitely guide f...,"[0.0028189837, 0.027880639, 0.96930045]",0.966,Positive
...,...,...,...,...
14509,high end shopping mall opposite hotel staying ...,"[0.009450286, 0.111303054, 0.8792466]",0.870,Positive
14510,overwhelmingly good mall architectural wonder ...,"[0.0078194225, 0.06344658, 0.928734]",0.921,Positive
14511,worth visiting took green flag boat wat arun p...,"[0.0018982462, 0.056462947, 0.94163877]",0.940,Positive
14512,huge shopping mall huge shopping mall lot food...,"[0.009233985, 0.7506472, 0.24011883]",0.231,Neutral


In [None]:
# Merge on 'processed_text'
df_bangkok_cleaned = df_bangkok_sen.merge(
    df_bangkok[['processed_text', 'rating', 'date', 'travel_type', 'location']],  # choose columns to bring in
    on='processed_text',
    how='left'
)
df_bangkok_cleaned

Unnamed: 0,processed_text,score_RoBERTa,intensity_RoBERTa,resultRoBERTa,rating,date,travel_type,location
0,great tour anna great tour guide alot fun saw ...,"[0.0018257378, 0.013423596, 0.9847506]",0.983,Positive,5,2025-07,Couples,
1,boat tour excellent excursion fun ken attentiv...,"[0.002018995, 0.022882884, 0.9750981]",0.973,Positive,5,2025-07,Family,
2,thon buri canal trip great insight thon buri c...,"[0.002734813, 0.041992098, 0.95527303]",0.953,Positive,5,2025-07,Family,
3,great tour great guide showed u lot interestin...,"[0.0029229203, 0.02523496, 0.9718421]",0.969,Positive,5,2025-07,Friends,
4,must great experience would definitely guide f...,"[0.0028189837, 0.027880639, 0.96930045]",0.966,Positive,5,2025-07,Solo,
...,...,...,...,...,...,...,...,...
14858,high end shopping mall opposite hotel staying ...,"[0.009450286, 0.111303054, 0.8792466]",0.870,Positive,4,2023-01,Couples,France
14859,overwhelmingly good mall architectural wonder ...,"[0.0078194225, 0.06344658, 0.928734]",0.921,Positive,5,2023-01,Couples,
14860,worth visiting took green flag boat wat arun p...,"[0.0018982462, 0.056462947, 0.94163877]",0.940,Positive,5,2023-01,,Singapore
14861,huge shopping mall huge shopping mall lot food...,"[0.009233985, 0.7506472, 0.24011883]",0.231,Neutral,5,2023-01,Family,China


In [None]:
df_bangkok_cleaned.to_csv('bangkok_sentiment.csv', index=False)

**PATTAYA Sentiment Analysis**

In [None]:
# Load new data
df_pattaya = pd.read_csv('clean_pattaya.csv', encoding='utf-8')
df_pattaya_sen = df_pattaya[['processed_text']].dropna()

# Apply sentiment scoring
df_pattaya_sen['score_RoBERTa'] = df_pattaya_sen['processed_text'].apply(
    lambda x: sentiment_score(x, tokenizer, model)
)

# Apply intensity scoring
df_pattaya_sen['intensity_RoBERTa'] = df_pattaya_sen['score_RoBERTa'].apply(intensity_score)

  return forward_call(*args, **kwargs)


In [None]:
# convert scoreRoBERTa column into 2 other columns
df_pattaya_sen['resultRoBERTa'] = df_pattaya_sen['score_RoBERTa'].apply(lambda x: x.argmax())

# scoreRoBERTa will be resultRoBERTa with the max value being negative, neutral and positive according to their position
df_pattaya_sen['resultRoBERTa'] = df_pattaya_sen['resultRoBERTa'].replace({0: 'Negative', 1: 'Neutral', 2: 'Positive'})

df_pattaya_sen

Unnamed: 0,processed_text,score_RoBERTa,intensity_RoBERTa,resultRoBERTa
0,best massage existence absolutely fabulous mas...,"[0.0072626476, 0.050901104, 0.9418363]",0.935,Positive
1,great place gem gallery pattaya amazing place ...,"[0.0029863047, 0.034561317, 0.9624524]",0.959,Positive
2,recommend place beautiful organized people wor...,"[0.0049539725, 0.07577563, 0.9192704]",0.914,Positive
3,life altering massage experience absolutely in...,"[0.014548055, 0.12608872, 0.85936326]",0.845,Positive
4,finally relaxed rejuvenated friend got massage...,"[0.0035248916, 0.031173743, 0.9653014]",0.962,Positive
...,...,...,...,...
2333,great day well run water park start finish wen...,"[0.0024267584, 0.01431447, 0.9832587]",0.981,Positive
2334,awesome beach beach go ever restaurant bar lun...,"[0.0027717587, 0.13596162, 0.8612667]",0.858,Positive
2335,rainbow beach must go look rainbow beach jomti...,"[0.0026613665, 0.031255536, 0.9660831]",0.963,Positive
2336,beautiful building come animal rowing boat san...,"[0.009857115, 0.121590525, 0.8685523]",0.859,Positive


In [None]:
# Merge on 'processed_text'
df_pattaya_cleaned = df_pattaya_sen.merge(
    df_pattaya[['processed_text', 'rating', 'date', 'travel_type', 'location']],  # choose columns to bring in
    on='processed_text',
    how='left'
)
df_pattaya_cleaned

Unnamed: 0,processed_text,score_RoBERTa,intensity_RoBERTa,resultRoBERTa,rating,date,travel_type,location
0,best massage existence absolutely fabulous mas...,"[0.0072626476, 0.050901104, 0.9418363]",0.935,Positive,5,2025-08,Friends,
1,great place gem gallery pattaya amazing place ...,"[0.0029863047, 0.034561317, 0.9624524]",0.959,Positive,5,2025-08,Friends,India
2,recommend place beautiful organized people wor...,"[0.0049539725, 0.07577563, 0.9192704]",0.914,Positive,5,2025-08,Family,
3,life altering massage experience absolutely in...,"[0.014548055, 0.12608872, 0.85936326]",0.845,Positive,5,2025-08,Friends,
4,finally relaxed rejuvenated friend got massage...,"[0.0035248916, 0.031173743, 0.9653014]",0.962,Positive,5,2025-08,Friends,
...,...,...,...,...,...,...,...,...
2333,great day well run water park start finish wen...,"[0.0024267584, 0.01431447, 0.9832587]",0.981,Positive,5,2023-01,Family,Australia
2334,awesome beach beach go ever restaurant bar lun...,"[0.0027717587, 0.13596162, 0.8612667]",0.858,Positive,5,2023-01,Friends,Canada
2335,rainbow beach must go look rainbow beach jomti...,"[0.0026613665, 0.031255536, 0.9660831]",0.963,Positive,4,2023-01,Friends,UK
2336,beautiful building come animal rowing boat san...,"[0.009857115, 0.121590525, 0.8685523]",0.859,Positive,3,2023-01,,Denmark


In [None]:
df_pattaya_cleaned.to_csv('pattaya_sentiment.csv', index=False)