In [None]:
import torch
from torch import nn

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [None]:
sentences = []
labels = []
sentence = []
label = []
# max_sen = 1000
with open('eng.train', 'r') as f:
  cnt = 0
  for line in f:
    if line == "\n":
      sentences.append(sentence)
      labels.append(label)
      sentence, label = [], []
      # cnt += 1
    else:
      word, tag = line.split()
      sentence.append(word)
      label.append(tag)
    # if cnt == max_sen:
    #   break

In [None]:
import string

In [None]:
def transform_word(word):
  if word in string.punctuation + '...':
    return 'SYMBOL'
  elif any(char.isdigit() for char in word):
    return 'NUMBER'
  else:
    return word

transform_sents = [[transform_word(word) for word in sentence] for sentence in sentences]

In [None]:
from collections import Counter

In [None]:
counter = Counter()
for sentence in transform_sents:
  for word in sentence:
    counter[word] += 1

In [None]:
items = counter.items()
print(len(items))

19738


In [None]:
vocab = list(counter.keys())

In [None]:
vocab_dict = {word: i + 1 for i, word in enumerate(vocab)}
vocab_dict['UNKNOWN'] = 0

In [None]:
print(vocab_dict.keys())
print(len(vocab_dict.keys()))

19739


In [None]:
label_counter = Counter()
for sublist in labels:
  for label in sublist:
    label_counter[label] +=1
len_classes = len(label_counter.items())
print('Number of classes {0}'.format(len_classes))

Number of classes 17


In [None]:
label_dict = {label: i+1 for i,label in enumerate(label_counter.keys())}

In [None]:
label_dict

{'S-ORG': 1,
 'O': 2,
 'S-MISC': 3,
 'B-PER': 4,
 'E-PER': 5,
 'S-LOC': 6,
 'B-ORG': 7,
 'E-ORG': 8,
 'I-PER': 9,
 'S-PER': 10,
 'B-MISC': 11,
 'I-MISC': 12,
 'E-MISC': 13,
 'I-ORG': 14,
 'B-LOC': 15,
 'E-LOC': 16,
 'I-LOC': 17}

In [None]:
# Tokenize sentences
tokenized_sents = [[vocab_dict.get(word, vocab_dict['UNKNOWN']) for word in sentence] for sentence in transform_sents]

In [None]:
# tokenize labels
labels = [[label_dict.get(label) for label in sublist] for sublist in labels]

In [None]:
# find the longest sentence. It will determine the padding len
len_counter = Counter()
for sentence in sentences:
  len_counter[len(sentence)]+=1


In [None]:
max_len = max(len_counter.keys())
max_len

113

In [None]:
import numpy as np

In [None]:
# padding
def padding(tokenized_list, max_len):
  features = np.zeros((len(tokenized_list), max_len), dtype = int)
  for i,row in enumerate(tokenized_list):
    features[i, -len(row):] = np.array(row)[:max_len]
  return features

In [None]:
features = padding(tokenized_sents, max_len)

In [None]:
features

array([[   0,    0,    0, ...,    7,    8,    9],
       [   0,    0,    0, ...,    0,   10,   11],
       [   0,    0,    0, ...,    0,   12,   13],
       ...,
       [   0,    0,    0, ...,   13, 8959,   13],
       [   0,    0,    0, ...,    0, 2724,  970],
       [   0,    0,    0, ...,   13, 4754,   13]])

In [None]:
labels = padding(labels, max_len)

In [None]:
# 17 in labels

In [None]:
labels

array([[0, 0, 0, ..., 3, 2, 2],
       [0, 0, 0, ..., 0, 4, 5],
       [0, 0, 0, ..., 0, 6, 2],
       ...,
       [0, 0, 0, ..., 2, 1, 2],
       [0, 0, 0, ..., 0, 2, 2],
       [0, 0, 0, ..., 2, 1, 2]])

In [None]:
split_frac = 0.8
split_idx = int(len(features) * split_frac)
train_x, test_x = features[:split_idx], features[split_idx:]
train_y, test_y = labels[:split_idx], labels[split_idx:]

print(train_x.shape, test_x.shape)
print(train_y.shape, test_y.shape)

(11988, 113) (2998, 113)
(11988, 113) (2998, 113)


In [None]:
from torch.utils.data import TensorDataset, DataLoader

In [None]:
train_dataset = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
test_dataset = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

In [None]:
import os
num_workers = os.cpu_count()
num_workers
# should not use in colab if dataset is very large

2

In [None]:
train_dataloader = DataLoader(train_dataset, batch_size = 32, shuffle = True, num_workers = 0)
test_dataloader = DataLoader(test_dataset, batch_size = 32, shuffle = False, num_workers = 0)

In [None]:
X, y = next(iter(train_dataloader))
print(X.shape, y.shape)

torch.Size([32, 113]) torch.Size([32, 113])


In [None]:
from torch.nn import functional as F

In [None]:
vocab_size = len(vocab_dict.keys())
print(f'vocab size: {vocab_size}')

vocab size: 19739


In [None]:
class LSTM(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, num_layers):
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings= vocab_size, embedding_dim = embedding_dim)
    self.lstm = nn.LSTM(input_size = embedding_dim,
                        hidden_size = hidden_size,
                        num_layers = num_layers,
                        batch_first= True)
    self.fc = nn.Linear(in_features = hidden_size, out_features= output_size)

  def forward(self, x):
    embedded = self.embedding(x) # (batch_size, seq_len, embedding_dim)
    lstm_out, (hn, cn) = self.lstm(embedded) # lstm_out có shape (batch_size, seq_len, hidden_size)
    output = self.fc(lstm_out) # (batch_size, seq_len, output_size)
    return output

In [None]:
class Bidirectional_LSTM(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, num_layers):
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings= vocab_size, embedding_dim = embedding_dim)

    # LSTM bidirectional
    self.lstm = nn.LSTM(input_size = embedding_dim,
                        hidden_size = hidden_size,
                        num_layers = num_layers,
                        batch_first= True,
                        bidirectional=True)  # Thêm tham số bidirectional

    self.fc = nn.Linear(in_features = 2 * hidden_size, out_features= output_size)  # Vì là bidirectional nên output sẽ có kích thước gấp đôi

  def forward(self, x):
    embedded = self.embedding(x)  # (batch_size, seq_len, embedding_dim)
    lstm_out, (hn, cn) = self.lstm(embedded)  # lstm_out có shape (batch_size, seq_len, 2*hidden_size)
    output = self.fc(lstm_out)  # (batch_size, seq_len, output_size)
    return output


In [None]:
lstm_model = LSTM(vocab_size = vocab_size,
                  embedding_dim = 300,
                  hidden_size = 128,
                  output_size = len_classes + 1,
                  num_layers = 10).to(device)

# y_logits = lstm_model(X.to(device))
# y_pred = torch.argmax(y_logits, dim = -1)
# correct_preds = torch.eq(y_pred, y).sum().item()
# acc = (correct_preds / (y.shape[0]*y.shape[1]))
# print(acc)

In [None]:
lr=0.001

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=lr)

In [None]:
from tqdm.auto import tqdm

In [None]:
def train(model: torch.nn.Module,
          train_loader: torch.utils.data.DataLoader,
          test_loader: torch.utils.data.DataLoader,
          epochs,
          device = device):
  model.train()
  for epoch in tqdm(range(epochs)):
    train_loss, train_acc = 0, 0
    for i, (X, y) in tqdm(enumerate(train_loader)):
      X, y = X.to(device), y.to(device)
      y_logits = model(X)
      loss = criterion(y_logits.view(-1, len_classes+1), y.view(-1))
      train_loss += loss.item()
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      y_pred = torch.argmax(y_logits, dim = -1)
      corrects_per_batch_train = torch.eq(y_pred, y).sum().item()
      train_acc += (corrects_per_batch_train / y.numel())

    test_loss, test_acc = 0, 0
    model.eval()
    with torch.inference_mode():
      for i, (X, y) in enumerate(test_loader):
        X, y = X.to(device), y.to(device)
        y_logits = model(X)
        loss = criterion(y_logits.view(-1, len_classes+1), y.view(-1))
        test_loss += loss.item()
        y_pred = torch.argmax(y_logits, dim = -1)
        corrects_per_batch_valid = torch.eq(y_pred, y).sum().item()
        test_acc += (corrects_per_batch_valid / y.numel())

    print(f'Epoch {epoch+1}')
    print(f'Train_loss: {train_loss / len(train_loader):.4f}')
    print(f'Train_acc: {train_acc / len(train_loader):.4f}')
    print(f'Valid_loss: {test_loss / len(test_loader):.4f}')
    print(f'Valid_acc: {test_acc / len(test_loader):.4f}')



In [None]:
train(lstm_model, train_dataloader, test_dataloader, 3)

  0%|          | 0/3 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch 1
Train_loss: 0.4669
Train_acc: 0.8756
Valid_loss: 0.3562
Valid_acc: 0.8623


0it [00:00, ?it/s]

Epoch 2
Train_loss: 0.2938
Train_acc: 0.9011
Valid_loss: 0.3087
Valid_acc: 0.8947


0it [00:00, ?it/s]

Epoch 3
Train_loss: 0.2878
Train_acc: 0.9042
Valid_loss: 0.3202
Valid_acc: 0.8855


In [None]:
torch.save(lstm_model.state_dict(), 'lstm.pth')

In [None]:
bi_lstm_model = Bidirectional_LSTM(vocab_size = vocab_size,
                  embedding_dim = 300,
                  hidden_size = 128,
                  output_size = len_classes + 1,
                  num_layers = 10).to(device)

In [None]:
lr=0.001

bi_criterion = nn.CrossEntropyLoss()
bi_optimizer = torch.optim.Adam(bi_lstm_model.parameters(), lr=lr)

In [None]:
train(bi_lstm_model, train_dataloader, test_dataloader, 2)

  0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch 1
Train_loss: 2.8467
Train_acc: 0.8835
Valid_loss: 2.8492
Valid_acc: 0.8623


0it [00:00, ?it/s]

Epoch 2
Train_loss: 2.8467
Train_acc: 0.8835
Valid_loss: 2.8492
Valid_acc: 0.8623


In [None]:
torch.save(bi_lstm_model.state_dict(), 'bi_lstm.pth')