<a href="https://colab.research.google.com/github/tomonari-masada/course-nlp2020/blob/master/08_document_classification_with_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
from torchtext.data import Field, BucketIterator

SEED = 123

random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [2]:
TEXT = Field(tokenize = "spacy",
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)
LABEL = data.LabelField()

In [3]:
train_valid_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [4]:
train_data, valid_data = train_valid_data.split(split_ratio=0.8, random_state = random.seed(SEED))
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 20000
Number of validation examples: 5000
Number of testing examples: 25000


In [5]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25004
Unique tokens in LABEL vocabulary: 2


In [6]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', '<sos>', '<eos>', 'the', ',', '.', 'and', 'a', 'of']


In [7]:
print(TEXT.vocab.freqs.most_common(20))

[('the', 263353), (',', 220515), ('.', 189706), ('and', 130737), ('a', 129741), ('of', 116262), ('to', 108619), ('is', 87772), ('it', 74780), ('in', 74341), ('i', 66293), ('this', 58689), ('that', 58620), ('"', 50102), ("'s", 49819), ('-', 42239), ('/><br', 40900), ('was', 40501), ('as', 37056), ('with', 35260)]


In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
BATCH_SIZE = 100

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)

In [10]:
INPUT_DIM = len(TEXT.vocab)
NUM_CLASS = len(LABEL.vocab)
EMBED_DIM = 64
HIDDEN_DIM = 64
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

### 念のためモデルを定義する前に埋め込み層の動作を確認
* padding用のトークンがどのように埋め込まれるかを確認。
* また、埋め込み層を通ったあとにミニバッチの形がどうなるかを確認。

In [11]:
embed = nn.Embedding(INPUT_DIM, EMBED_DIM, padding_idx=PAD_IDX).to(device)

In [12]:
for batch in valid_iterator:
  pass
print(batch)


[torchtext.data.batch.Batch of size 100]
	[.text]:[torch.cuda.LongTensor of size 1991x100 (GPU 0)]
	[.label]:[torch.cuda.LongTensor of size 100 (GPU 0)]


In [13]:
embed(batch.text).shape

torch.Size([1991, 100, 64])

In [14]:
class RNNTextSentiment(nn.Module):
  def __init__(self, emb_dim, hid_dim,
               num_class, vocab_size, padding_idx, p=0.0):
    super().__init__()

    self.input_dim = vocab_size
    self.emb_dim = emb_dim
    self.hid_dim = hid_dim
    self.dropout = p

    self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=padding_idx)
    self.rnn = nn.LSTM(emb_dim, hid_dim)
    self.fc = nn.Linear(hid_dim * 2, num_class)
    self.dropout = nn.Dropout(p=p)

  def forward(self, src):
    # srcの形は[単語列長, バッチサイズ]

    embedded = self.dropout(self.embedding(src))
    # embeddedの形は[単語列長, バッチサイズ, 埋め込み次元数]

    outputs, (hidden, _) = self.rnn(embedded)
    # outputsの形は[単語列長, バッチサイズ, 隠れ状態の次元数]
    # hiddenの形は[1, バッチサイズ, 隠れ状態の次元数]

    mean_outputs = outputs.mean(0)
    hidden = hidden.squeeze()
    # mean_outputsの形は[バッチサイズ, 隠れ状態の次元数]
    # hiddenの形は[バッチサイズ, 隠れ状態の次元数]
    output = self.fc(torch.cat((mean_outputs, hidden), dim=1))

    return output

In [15]:
def init_weights(m):
  for name, param in m.named_parameters():
    if 'weight' in name:
      nn.init.normal_(param.data, mean=0, std=0.01)
    else:
      nn.init.constant_(param.data, 0)

In [16]:
model = RNNTextSentiment(EMBED_DIM, HIDDEN_DIM, NUM_CLASS, INPUT_DIM,
                         padding_idx=PAD_IDX, p=0.5).to(device)

In [17]:
model.apply(init_weights)

RNNTextSentiment(
  (embedding): Embedding(25004, 64, padding_idx=1)
  (rnn): LSTM(64, 64)
  (fc): Linear(in_features=128, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [18]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [19]:
def count_parameters(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,633,794 trainable parameters


In [20]:
criterion = nn.CrossEntropyLoss().to(device)

In [21]:
def train(model, iterator, optimizer, criterion, clip):
  model.train()

  epoch_loss = 0
  epoch_acc = 0
  for batch in iterator:

    optimizer.zero_grad()
    output = model(batch.text)
    loss = criterion(output, batch.label)
    loss.backward()

    nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += (output.argmax(1) == batch.label).sum().item()

  return epoch_loss / len(iterator), epoch_acc / len(iterator.dataset)

In [22]:
def evaluate(model, iterator, criterion):
  model.eval()

  epoch_loss = 0
  epoch_acc = 0
  with torch.no_grad():
    for batch in iterator:
      output = model(batch.text)
      loss = criterion(output, batch.label)
      epoch_loss += loss.item()
      epoch_acc += (output.argmax(1) == batch.label).sum().item()

  return epoch_loss / len(iterator), epoch_acc / len(iterator.dataset)

In [23]:
def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
  return elapsed_mins, elapsed_secs

In [24]:
N_EPOCHS = 10
CLIP = 1

for epoch in range(1, N_EPOCHS + 1):

  start_time = time.time()
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion, CLIP)
  valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
  end_time = time.time()
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  print('Epoch: %d' %(epoch), " | time in %d minutes, %d seconds" %(epoch_mins, epoch_secs))
  print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
  print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 11 seconds
	Loss: 0.5937(train)	|	Acc: 67.1%(train)
	Loss: 0.4733(valid)	|	Acc: 78.9%(valid)
Epoch: 2  | time in 0 minutes, 11 seconds
	Loss: 0.3863(train)	|	Acc: 85.6%(train)
	Loss: 0.4109(valid)	|	Acc: 82.7%(valid)
Epoch: 3  | time in 0 minutes, 11 seconds
	Loss: 0.3101(train)	|	Acc: 89.3%(train)
	Loss: 0.4228(valid)	|	Acc: 82.5%(valid)
Epoch: 4  | time in 0 minutes, 11 seconds
	Loss: 0.2854(train)	|	Acc: 89.6%(train)
	Loss: 0.5056(valid)	|	Acc: 81.8%(valid)
Epoch: 5  | time in 0 minutes, 11 seconds
	Loss: 0.2450(train)	|	Acc: 91.8%(train)
	Loss: 0.5643(valid)	|	Acc: 81.8%(valid)
Epoch: 6  | time in 0 minutes, 12 seconds
	Loss: 0.2341(train)	|	Acc: 92.7%(train)
	Loss: 0.4690(valid)	|	Acc: 81.6%(valid)
Epoch: 7  | time in 0 minutes, 11 seconds
	Loss: 0.2112(train)	|	Acc: 93.7%(train)
	Loss: 0.5632(valid)	|	Acc: 80.2%(valid)
Epoch: 8  | time in 0 minutes, 12 seconds
	Loss: 0.2332(train)	|	Acc: 92.2%(train)
	Loss: 0.5567(valid)	|	Acc: 82.1%(valid)
Epoch: 9

In [25]:
EMBED_DIM = 32
HIDDEN_DIM = 32
model = RNNTextSentiment(EMBED_DIM, HIDDEN_DIM, NUM_CLASS, INPUT_DIM,
                         padding_idx=PAD_IDX, p=0.5).to(device)
model.apply(init_weights)
optimizer = optim.Adam(model.parameters(), lr=0.0003)

In [26]:
N_EPOCHS = 10
CLIP = 1

for epoch in range(1, N_EPOCHS + 1):

  start_time = time.time()
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion, CLIP)
  valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
  end_time = time.time()
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  print('Epoch: %d' %(epoch), " | time in %d minutes, %d seconds" %(epoch_mins, epoch_secs))
  print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
  print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 11 seconds
	Loss: 0.6931(train)	|	Acc: 50.6%(train)
	Loss: 0.6912(valid)	|	Acc: 67.3%(valid)
Epoch: 2  | time in 0 minutes, 11 seconds
	Loss: 0.6082(train)	|	Acc: 69.9%(train)
	Loss: 0.5431(valid)	|	Acc: 74.7%(valid)
Epoch: 3  | time in 0 minutes, 11 seconds
	Loss: 0.5073(train)	|	Acc: 78.9%(train)
	Loss: 0.4820(valid)	|	Acc: 78.4%(valid)
Epoch: 4  | time in 0 minutes, 11 seconds
	Loss: 0.4405(train)	|	Acc: 83.0%(train)
	Loss: 0.4322(valid)	|	Acc: 81.6%(valid)
Epoch: 5  | time in 0 minutes, 11 seconds
	Loss: 0.3871(train)	|	Acc: 86.1%(train)
	Loss: 0.4101(valid)	|	Acc: 82.9%(valid)
Epoch: 6  | time in 0 minutes, 11 seconds
	Loss: 0.3477(train)	|	Acc: 88.1%(train)
	Loss: 0.4135(valid)	|	Acc: 82.6%(valid)
Epoch: 7  | time in 0 minutes, 11 seconds
	Loss: 0.3128(train)	|	Acc: 89.7%(train)
	Loss: 0.3922(valid)	|	Acc: 83.7%(valid)
Epoch: 8  | time in 0 minutes, 11 seconds
	Loss: 0.2904(train)	|	Acc: 90.7%(train)
	Loss: 0.4027(valid)	|	Acc: 83.6%(valid)
Epoch: 9