In [14]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
from tqdm import tqdm
df = pd.read_csv('../data/training_data.csv',sep=";",names=["review","rating"])
MAX_LENGTH = 128
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
EPOCHS = 3

In [7]:
tokenizer = BertTokenizer.from_pretrained('monsoon-nlp/bert-base-thai')
model = BertForSequenceClassification.from_pretrained('monsoon-nlp/bert-base-thai', num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monsoon-nlp/bert-base-thai and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
tokenized_texts = [tokenizer.encode(review, max_length=MAX_LENGTH, truncation=True) for review in df['review']]
labels = torch.tensor(df['rating'] - 1) 
padded_texts = torch.nn.utils.rnn.pad_sequence([torch.tensor(tokens) for tokens in tokenized_texts], batch_first=True, padding_value=0)
dataset = TensorDataset(padded_texts, labels)
train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

ValueError: could not determine the shape of object type 'Series'

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()



In [6]:

# Fine-tuning loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{EPOCHS}'):
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)[0]
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Calculate validation loss
    model.eval()
    val_loss = 0
    val_predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)[0]
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Store predictions and true labels for metrics calculation
            predictions = torch.argmax(outputs, dim=1).cpu().numpy()
            val_predictions.extend(predictions)
            true_labels.extend(labels.cpu().numpy())

    avg_train_loss = total_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)

    print(f'Epoch {epoch + 1}/{EPOCHS} - Avg Train Loss: {avg_train_loss:.4f}, Avg Val Loss: {avg_val_loss:.4f}')

    # Print classification report at the validation step
    print("Validation Classification Report:")
    print(classification_report(true_labels, val_predictions))

Epoch 1/3:   0%|          | 0/1000 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Epoch 1/3:   0%|          | 3/1000 [01:46<9:44:14, 35.16s/it] 