# Install HuggingFace datasets package

In [1]:
!pip install datasets



# Import packages

In [2]:
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing import sequence # For padding sentence
from tensorflow.keras.preprocessing.text import Tokenizer # For tokenize sentence and make corpus

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

2024-03-10 08:09:52.474534: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-10 08:09:52.474591: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-10 08:09:52.475999: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Import dataset

In [3]:
dataset = load_dataset("imdb")
train_data = dataset["train"]
test_data = dataset["test"]

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

  0%|          | 0/3 [00:00<?, ?it/s]

# Configuration

In [4]:
config = {
    "batch_size": 50,
    "epoch": 5,
    "lr": 0.01,
    "device": torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
    "vocab_size": 5001 # Plus one for padding
}
print(f"Device: {config['device']}")

Device: cuda


# Utility functions

In [5]:
def clean_text(text):
    before_clean = text
    after_clean = re.sub("[^a-zA-Z\s]", "", before_clean).lower() # Only keep english
   
    return after_clean

# Text preprocessing

## Text cleaning

In [6]:
train_df["clean_text"] = train_df["text"].apply(clean_text)
test_df["clean_text"] = test_df["text"].apply(clean_text)

## Tokenization

In [7]:
tokenizer = Tokenizer(num_words = config["vocab_size"])
tokenizer.fit_on_texts(train_df["clean_text"])

## Text to sequence (represent text by number)

In [8]:
train_seq = tokenizer.texts_to_sequences(train_df['clean_text'])
test_seq = tokenizer.texts_to_sequences(test_df['clean_text'])

## Sequence padding (Let sequence length uniform)

In [9]:
train_pad = sequence.pad_sequences(train_seq, maxlen = 300)
test_pad = sequence.pad_sequences(test_seq, maxlen = 300)

# Ready dataset

In [10]:
y_train = torch.Tensor(train_df['label'])
y_test = torch.Tensor(test_df['label'])

train_tensor_dataset = TensorDataset(torch.from_numpy(train_pad), y_train)
test_tensor_dataset = TensorDataset(torch.from_numpy(test_pad), y_test)

train_loader = DataLoader(train_tensor_dataset, shuffle = True, batch_size = config['batch_size'])
test_loader = DataLoader(test_tensor_dataset, shuffle = False, batch_size = config['batch_size'])

# Model

In [11]:
class Model(nn.Module):
    def __init__(self, emb_size, hidden_size, output_size, vocab_size):
        super().__init__()
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.lstm = nn.LSTM(input_size = emb_size, hidden_size = hidden_size, batch_first = True)
        self.fc = nn.Linear(in_features = hidden_size, out_features = output_size)
        
    def forward(self, x):
        # x => [batch_size(幾篇文章), sequence_length(一篇文章幾個字), embedding_size(一個字用幾維表示)] => [50, 300, 1]
        embeds = self.embedding(x)
        # embeds => [50, 300, 64]
        output, (hn, cn) = self.lstm(embeds)
        output = output[:, -1, :]
        output = output.view(-1, self.hidden_size)
        y = self.fc(output)
        y = F.sigmoid(y)
        
        return y

In [12]:
emb_size = 64
hidden_size = 256
output_size = 1

model = Model(emb_size, hidden_size, output_size, config['vocab_size'])
model = model.to(config['device'])

loss_function = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = config['lr'])

# Training

In [13]:
def Train(model, config, train_loader, test_loader):
    epoch = config['epoch']
    device = config['device']
    progress = tqdm(total = epoch, leave = False) # leave = False, 表示輸出不換行
    epoch_train_loss = []
    epoch_train_acc = []
    for e in range(epoch):
        batch_train_loss = []
        batch_train_acc = []
        model.train()
        
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            preds = model(inputs)
            preds = preds.squeeze()
            loss = loss_function(preds, labels.float())
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            preds = torch.round(preds)
            batch_train_loss.append(loss.detach().item())
            batch_train_acc.append(torch.eq(preds, labels).sum().item())
        
        epoch_train_loss.append(sum(batch_train_loss) / (len(train_loader) * config['batch_size']))
        epoch_train_acc.append(sum(batch_train_acc) / (len(train_loader) * config['batch_size']))
        progress.update(1)
        progress.set_description(f"Epoch [{e+1}/{epoch}]")
        progress.set_postfix({"Loss": epoch_train_loss[-1], "Acc": epoch_train_acc[-1]})
        Test(model, config, test_loader)
    
    return epoch_train_loss, epoch_train_acc

# Testing

In [15]:
def Test(model, config, test_loader):
    device = config['device']
    test_loss = []
    test_acc = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            preds = model(inputs)
            loss = loss_function(preds.squeeze(), labels.float())
            
            preds = torch.round(preds)
            test_loss.append(loss.detach().item())
            test_acc.append(torch.eq(preds, labels).sum().item())
        print(f"Test Loss: {sum(test_loss) / (len(test_loader) * config['batch_size'])}")
        print(f"Test Acc: {sum(test_acc) / (len(test_loader) * config['batch_size'])}")

In [None]:
Train(model, config, train_loader, test_loader)


  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:09<00:38,  9.65s/it][A
Epoch [1/5]:  20%|██        | 1/5 [00:09<00:38,  9.65s/it][A
Epoch [1/5]:  20%|██        | 1/5 [00:09<00:38,  9.65s/it, Loss=0.0063, Acc=0.866][A

Test Loss: 0.0064561963897943496
Test Acc: 43.036



Epoch [1/5]:  40%|████      | 2/5 [00:23<00:36, 12.16s/it, Loss=0.0063, Acc=0.866][A
Epoch [2/5]:  40%|████      | 2/5 [00:23<00:36, 12.16s/it, Loss=0.0063, Acc=0.866][A
Epoch [2/5]:  40%|████      | 2/5 [00:23<00:36, 12.16s/it, Loss=0.00501, Acc=0.897][A

Test Loss: 0.0063907060727477075
Test Acc: 43.41



Epoch [2/5]:  60%|██████    | 3/5 [00:37<00:26, 13.09s/it, Loss=0.00501, Acc=0.897][A
Epoch [3/5]:  60%|██████    | 3/5 [00:37<00:26, 13.09s/it, Loss=0.00501, Acc=0.897][A
Epoch [3/5]:  60%|██████    | 3/5 [00:37<00:26, 13.09s/it, Loss=0.00421, Acc=0.918][A

Test Loss: 0.006669541134685278
Test Acc: 43.258



Epoch [3/5]:  80%|████████  | 4/5 [00:51<00:13, 13.44s/it, Loss=0.00421, Acc=0.918][A
Epoch [4/5]:  80%|████████  | 4/5 [00:51<00:13, 13.44s/it, Loss=0.00421, Acc=0.918][A
Epoch [4/5]:  80%|████████  | 4/5 [00:51<00:13, 13.44s/it, Loss=0.00365, Acc=0.929][A

In [None]:
Test(model, config, test_loader)