In [6]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel

tokenizer = Tokenizer(BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = ByteLevel()  # thay vì Whitespace()

trainer = BpeTrainer(
    vocab_size=20000,
    min_frequency=1,
    special_tokens=["<pad>", "<unk>" , "<sos>", "<eos>"]
)

tokenizer.train(["data/VSFC/train/sents.txt"], trainer)
tokenizer.save("bpe_tokenizer.json")


In [None]:
vocab_size = tokenizer.get_vocab_size()

3526

In [28]:
import torch
from torch import nn
from torch.utils.data import DataLoader,TensorDataset

In [10]:
train_feature_path, train_label_path = "data/VSFC/train/sents.txt","data/VSFC/train/sentiments.txt"
test_feature_path, test_label_path = "data/VSFC/test/sents.txt","data/VSFC/test/sentiments.txt"


In [62]:
def load_file(path_src,path_tgt):
    with open(path_src, "r", encoding="utf-8") as f:
        src_lines = [line.strip() for line in f.readlines()]

    with open(path_tgt, "r", encoding="utf-8") as f:
        tgt_lines = [line.strip() for line in f.readlines()]
    return src_lines,tgt_lines

In [63]:
train_features, train_labels = load_file(train_feature_path,train_label_path)
test_features, test_labels = load_file(test_feature_path,test_label_path)

In [64]:
len(train_features), len(train_labels), len(test_features), len(test_labels)

(11426, 11426, 3166, 3166)

In [67]:
train_feature_ids, test_feature_ids = [], []

for train_feature_line in train_features:
        train_feature_ids.append(tokenizer.encode(train_feature_line).ids)
for test_feature_line in test_features:
        test_feature_ids.append(tokenizer.encode(test_feature_line).ids)


# for train_feature_line, test_feature_line in zip(train_features,test_features):
#     train_feature_ids.append(tokenizer.encode(train_feature_line).ids)
#     test_feature_ids.append(tokenizer.encode(test_feature_line).ids)

11426

In [69]:
max_len = max(len(seq) for seq in train_feature_ids)
max_len

159

In [25]:
from torch.nn.utils.rnn import pad_sequence

In [70]:
padded_train_feature = pad_sequence([torch.tensor(item) for item in train_feature_ids],batch_first=True,padding_value=0)
padded_test_feature = pad_sequence([torch.tensor(item) for item in test_feature_ids], batch_first=True,padding_value=0)

In [76]:
padded_test_feature.shape

torch.Size([3166, 103])

In [71]:
tensor_train_label = torch.tensor([int(item) for item in train_labels])
tensor_test_label = torch.tensor([int(item) for item in test_labels])

In [73]:
data_train = TensorDataset(padded_train_feature, tensor_train_label)
data_test = TensorDataset(padded_test_feature, tensor_test_label)

In [74]:
train_dataloader = DataLoader(dataset=data_train,batch_size=32,shuffle=True)
test_dataloader = DataLoader(dataset=data_test,batch_size=32)

In [108]:
class BasicClassifierRNN(nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.embedding = nn.Embedding(vocab_size,embedding_dim=64,padding_idx=0)
        self.encoder = nn.LSTM(input_size=64,hidden_size=128,bidirectional=True,batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(256,3)
    # def forward(self,x):
    #     x = self.embedding(x)
    #     output, (hn, cn) = self.encoder(x)
    #     hn = torch.cat(hn[-2,:,:],hn[-1,:,:],dim=1)
    #     # x = self.encoder(x)
    #     x = self.linear(hn)
    #     return x
    def forward(self, x):
        x = self.embedding(x)  # shape: (batch, seq_len, 64)
        output, (hn, cn) = self.encoder(x)  # output: (batch, seq_len, 256)
        # Lấy hidden state cuối cùng từ cả 2 hướng
        hn = torch.cat([hn[-2,:,:], hn[-1,:,:]], dim=1)  # shape: (batch, 256)
        hn = self.dropout(hn)
        logits = self.linear(hn)  # shape: (batch, 3)
        
        return logits  # ✅ Trả về tensor 2D

In [80]:
device = 'cuda'

In [113]:
model = BasicClassifierRNN().to(device=device)

In [115]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(),lr=0.01,weight_decay=0.0001)

In [116]:
epochs = 5
for epoch in range(epochs):
    model.train()
    train_loss, train_accuracy = 0,0
    for x,y in train_dataloader:
        x,y = x.to(device),y.to(device)

        y_logits = model(x)

        loss = loss_fn(y_logits,y)
        y_pred = torch.softmax(y_logits,dim=1).argmax(dim=1)
        train_loss += loss.item()
        train_accuracy += (y == y_pred).sum().item() / len(y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    with torch.inference_mode():
        model.eval()
        test_loss,test_accuracy = 0,0
        for x,y in test_dataloader:
            x,y = x.to(device),y.to(device)

            y_logits = model(x)

            loss = loss_fn(y_logits,y)
            y_pred = torch.softmax(y_logits,dim=1).argmax(dim=1)
            test_loss += loss.item()
            test_accuracy += (y == y_pred).sum().item() / len(y)
    
    train_accuracy /= len(train_dataloader)
    train_loss /= len(train_dataloader)

    test_accuracy /= len(test_dataloader)
    test_loss /= len(test_dataloader)

    print(f"Epoch: {epoch+1}/{epochs} | Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f} | Test Loss: {test_loss:.4f}, Test Acc: {test_accuracy:.4f}")


Epoch: 1/5 | Train Loss: 0.3599, Train Acc: 0.8748 | Test Loss: 0.7955, Test Acc: 0.6408
Epoch: 2/5 | Train Loss: 0.3170, Train Acc: 0.8966 | Test Loss: 0.3387, Test Acc: 0.8907
Epoch: 3/5 | Train Loss: 0.2596, Train Acc: 0.9145 | Test Loss: 0.3101, Test Acc: 0.8980
Epoch: 4/5 | Train Loss: 0.2397, Train Acc: 0.9221 | Test Loss: 0.3194, Test Acc: 0.8932
Epoch: 5/5 | Train Loss: 0.2255, Train Acc: 0.9247 | Test Loss: 0.3430, Test Acc: 0.8926


In [117]:
model_path = "sentiment.pt"

In [119]:
torch.save(obj=model.state_dict(),f=model_path)

In [120]:
new_model  = BasicClassifierRNN().to(device=device)

In [None]:
new_model.load_state_dict(torch.load(f=model_path))