# Sequence를 처리하기 위한 RNN 

1. 주어진 데이터를 RNN에 넣을 수 있는 형태로 만듭니다.
2. 기본적인 RNN 사용법 및 적용법을 익힙니다.
3. LSTM, GRU의 사용법 및 적용법을 익힙니다.

In [1]:
from tqdm import tqdm
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import torch

## 데이터 전처리

아래의 sample data를 확인해봅시다.  
전체 단어 수와 pad token의 id도 아래와 같습니다.

In [2]:
# Vocabulary size; every token id in `data` lies in [0, vocab_size).
vocab_size = 100
# Token id used for padding (it does not occur in the raw sequences below).
pad_id = 0

# Toy batch: ten variable-length sequences of token ids.
data = [
    [85, 14, 80, 34, 99, 20, 31, 65, 53, 86, 3, 58, 30, 4, 11, 6, 50, 71, 74, 13],
    [62, 76, 79, 66, 32],
    [93, 77, 16, 67, 46, 74, 24, 70],
    [19, 83, 88, 22, 57, 40, 75, 82, 4, 46],
    [70, 28, 30, 24, 76, 84, 92, 76, 77, 51, 7, 20, 82, 94, 57],
    [58, 13, 40, 61, 88, 18, 92, 89, 8, 14, 61, 67, 49, 59, 45, 12, 47, 5],
    [22, 5, 21, 84, 39, 6, 9, 84, 36, 59, 32, 30, 69, 70, 82, 56, 1],
    [94, 21, 79, 24, 3, 86],
    [80, 80, 33, 63, 34, 63],
    [87, 32, 79, 65, 2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80],
]

Padding 처리를 해주면서 padding 전 길이도 저장합니다.

In [3]:
# The longest sequence sets the common length every row is padded to.
max_len = max(len(seq) for seq in data)
print(f"Maximum sequence length: {max_len}")

# Record each original length before padding; the rows of `data` are
# overwritten in place with their right-padded versions.
valid_lens = []
for row_idx, tokens in enumerate(tqdm(data)):
  n_tokens = len(tokens)
  valid_lens.append(n_tokens)
  n_missing = max_len - n_tokens
  if n_missing > 0:
    data[row_idx] = tokens + [pad_id] * n_missing

Maximum sequence length: 20


100%|██████████| 10/10 [00:00<00:00, 64527.75it/s]


In [4]:
# Show the padded sequences, then the pre-padding lengths, one per line.
print(data, valid_lens, sep="\n")

[[85, 14, 80, 34, 99, 20, 31, 65, 53, 86, 3, 58, 30, 4, 11, 6, 50, 71, 74, 13], [62, 76, 79, 66, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [93, 77, 16, 67, 46, 74, 24, 70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [19, 83, 88, 22, 57, 40, 75, 82, 4, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [70, 28, 30, 24, 76, 84, 92, 76, 77, 51, 7, 20, 82, 94, 57, 0, 0, 0, 0, 0], [58, 13, 40, 61, 88, 18, 92, 89, 8, 14, 61, 67, 49, 59, 45, 12, 47, 5, 0, 0], [22, 5, 21, 84, 39, 6, 9, 84, 36, 59, 32, 30, 69, 70, 82, 56, 1, 0, 0, 0], [94, 21, 79, 24, 3, 86, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [80, 80, 33, 63, 34, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [87, 32, 79, 65, 2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80, 0, 0]]
[20, 5, 8, 10, 15, 18, 17, 6, 6, 18]


In [5]:
# Convert the padded lists to int64 tensors.
# B: batch size, L: maximum sequence length
batch = torch.tensor(data, dtype=torch.long)  # (B, L)
batch_lens = torch.tensor(valid_lens, dtype=torch.long)  # (B)

In [6]:
# Display the padded id tensor together with the pre-padding sequence lengths.
batch, batch_lens

(tensor([[85, 14, 80, 34, 99, 20, 31, 65, 53, 86,  3, 58, 30,  4, 11,  6, 50, 71,
          74, 13],
         [62, 76, 79, 66, 32,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0],
         [93, 77, 16, 67, 46, 74, 24, 70,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0],
         [19, 83, 88, 22, 57, 40, 75, 82,  4, 46,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0],
         [70, 28, 30, 24, 76, 84, 92, 76, 77, 51,  7, 20, 82, 94, 57,  0,  0,  0,
           0,  0],
         [58, 13, 40, 61, 88, 18, 92, 89,  8, 14, 61, 67, 49, 59, 45, 12, 47,  5,
           0,  0],
         [22,  5, 21, 84, 39,  6,  9, 84, 36, 59, 32, 30, 69, 70, 82, 56,  1,  0,
           0,  0],
         [94, 21, 79, 24,  3, 86,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0],
         [80, 80, 33, 63, 34, 63,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0],
         [87, 32, 79, 65,  2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80,
         

In [7]:
# Reorder the batch from longest to shortest sequence. pack_padded_sequence,
# which is called later with its default enforce_sorted=True, expects this order.
batch_lens, sorted_idx = torch.sort(batch_lens, descending=True)
batch = batch.index_select(0, sorted_idx)

In [8]:
# Show the length-sorted batch, then the matching sorted lengths.
print(batch, batch_lens, sep="\n")

tensor([[85, 14, 80, 34, 99, 20, 31, 65, 53, 86,  3, 58, 30,  4, 11,  6, 50, 71,
         74, 13],
        [58, 13, 40, 61, 88, 18, 92, 89,  8, 14, 61, 67, 49, 59, 45, 12, 47,  5,
          0,  0],
        [87, 32, 79, 65,  2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80,
          0,  0],
        [22,  5, 21, 84, 39,  6,  9, 84, 36, 59, 32, 30, 69, 70, 82, 56,  1,  0,
          0,  0],
        [70, 28, 30, 24, 76, 84, 92, 76, 77, 51,  7, 20, 82, 94, 57,  0,  0,  0,
          0,  0],
        [19, 83, 88, 22, 57, 40, 75, 82,  4, 46,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [93, 77, 16, 67, 46, 74, 24, 70,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [94, 21, 79, 24,  3, 86,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [80, 80, 33, 63, 34, 63,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [62, 76, 79, 66, 32,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]])
tensor([2

## RNN 사용해보기

RNN에 넣기 전 word embedding을 위한 embedding layer를 만듭니다.

In [9]:
# Dimensionality of each word vector (d_w).
embedding_size = 256
# Lookup table mapping each of the vocab_size token ids to a d_w-dim vector.
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)

# Embed every token id in the padded batch: (B, L) -> (B, L, d_w).
batch_emb = embedding(batch)

In [10]:
# Display the embedded batch, shape (B, L, d_w).
batch_emb

tensor([[[-0.3489, -1.5471, -0.2449,  ..., -1.6943, -0.6424,  0.6633],
         [-0.3844,  0.0208, -2.1909,  ..., -0.2871,  0.9900,  0.2157],
         [ 0.1978, -0.3223,  0.9422,  ..., -0.3540,  0.2403,  0.2146],
         ...,
         [-0.7779,  0.3056,  0.0847,  ..., -1.8285,  0.2277,  1.0158],
         [-1.1020,  1.4160, -1.9694,  ...,  0.7368, -0.6531,  0.1494],
         [-1.4464,  1.2650, -0.0028,  ...,  0.4699, -0.2929, -1.9076]],

        [[-0.7429, -2.2946,  1.1008,  ...,  0.4090, -0.6459,  1.4167],
         [-1.4464,  1.2650, -0.0028,  ...,  0.4699, -0.2929, -1.9076],
         [-0.1069,  0.1279, -0.1290,  ...,  0.4190, -0.9328,  0.6271],
         ...,
         [-0.8008,  0.6918, -0.3759,  ..., -0.2661,  0.2465, -0.3408],
         [ 0.7561, -0.4155,  0.4712,  ..., -1.0136,  2.5601,  0.3991],
         [ 0.7561, -0.4155,  0.4712,  ..., -1.0136,  2.5601,  0.3991]],

        [[ 0.7491,  1.0150, -0.1220,  ..., -0.3036, -0.5501,  0.6761],
         [-1.6761,  0.2788,  0.1659,  ..., -0

아래와 같이 RNN 모델 및 초기 hidden state를 정의합니다.

In [11]:
hidden_size = 512  # hidden state size of the RNN (d_h)
num_layers = 1  # number of stacked RNN layers
num_dirs = 1  # 1: unidirectional RNN, 2: bidirectional RNN

# batch_first is deliberately left at its default (False). The forward call
# below feeds a sequence-first tensor of shape (L, B, d_w) together with an
# h_0 whose batch dimension is B. With batch_first=True the same input would
# be read as (batch=L, seq=B, d_w) and the call would fail on the h_0 shape.
rnn = nn.RNN(
    input_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    bidirectional=num_dirs > 1,
)

# Zero initial hidden state, one slice per (layer, direction) pair.
h_0 = torch.zeros((num_layers * num_dirs, batch.shape[0], hidden_size))  # (num_layers * num_dirs, B, d_h)

### **Vanilla RNN 활용법**

RNN에 batch data를 넣으면 아래와 같이 2가지 output을 얻습니다.


*   `hidden_states`: 각 time step에 해당하는 hidden state들의 묶음.
*   `h_n`: 모든 sequence를 거치고 나온 마지막 hidden state. (여기서는 packing 없이 padding된 batch를 그대로 넣기 때문에, 길이가 짧은 sequence의 `h_n`은 pad token까지 모두 처리한 뒤의 값입니다.)

In [13]:
# Feed the embeddings sequence-first: (B, L, d_w) -> (L, B, d_w).
# The batch is not packed here, so pad positions are run through the RNN too.
hidden_states, h_n = rnn(batch_emb.permute(1, 0, 2), h_0)

# d_h: hidden size, num_layers: number of layers, num_dirs: number of directions
# hidden_states: (L, B, d_h)
# h_n: (num_layers*num_dirs, B, d_h) = (1, B, d_h)
print(hidden_states.shape, h_n.shape, sep="\n")

torch.Size([20, 10, 512])
torch.Size([1, 10, 512])


In [14]:
# Display the hidden states at the final time step, one row per sequence.
hidden_states[-1]

tensor([[-0.4109,  0.3321,  0.0458,  ..., -0.0407, -0.6261, -0.6126],
        [ 0.1313, -0.5507,  0.0065,  ...,  0.3961, -0.6402, -0.2472],
        [-0.0968, -0.5875,  0.0230,  ...,  0.3421, -0.4615, -0.4243],
        ...,
        [ 0.0296, -0.5039, -0.1297,  ...,  0.2896, -0.6487, -0.2937],
        [ 0.0296, -0.5039, -0.1296,  ...,  0.2897, -0.6487, -0.2937],
        [ 0.0297, -0.5039, -0.1297,  ...,  0.2896, -0.6488, -0.2938]],
       grad_fn=<SelectBackward0>)

In [15]:
# Display the final hidden state returned by the RNN, for comparison with
# hidden_states[-1] shown in the previous cell.
h_n

tensor([[[-0.4109,  0.3321,  0.0458,  ..., -0.0407, -0.6261, -0.6126],
         [ 0.1313, -0.5507,  0.0065,  ...,  0.3961, -0.6402, -0.2472],
         [-0.0968, -0.5875,  0.0230,  ...,  0.3421, -0.4615, -0.4243],
         ...,
         [ 0.0296, -0.5039, -0.1297,  ...,  0.2896, -0.6487, -0.2937],
         [ 0.0296, -0.5039, -0.1296,  ...,  0.2897, -0.6487, -0.2937],
         [ 0.0297, -0.5039, -0.1297,  ...,  0.2896, -0.6488, -0.2938]]],
       grad_fn=<StackBackward0>)

마지막 hidden state를 이용하여 text classification task에 적용할 수 있습니다.

In [16]:
# Sequence-level classification head on top of the final hidden state.
num_classes = 2
classification_layer = nn.Linear(in_features=hidden_size, out_features=num_classes)

# C: number of classes
# Drop the leading (num_layers*num_dirs = 1) axis: (1, B, d_h) -> (B, d_h) -> (B, C).
output = classification_layer(h_n.squeeze(dim=0))
print(output.size())

torch.Size([10, 2])


각 time step에 대한 hidden state를 이용하여 token-level의 task를 수행할 수도 있습니다.

In [29]:
# Token-level head: one prediction per time step.
num_classes = 5
entity_layer = nn.Linear(in_features=hidden_size, out_features=num_classes)

# C: number of classes
# The linear layer acts on the last axis only: (L, B, d_h) -> (L, B, C).
output = entity_layer(hidden_states)
print(output.size())

torch.Size([20, 10, 5])


### **LSTM 활용법**

LSTM에선 cell state가 추가됩니다.  
Cell state의 shape는 hidden state의 그것과 동일합니다.

In [30]:
# Hyperparameters for the LSTM example.
embedding_size = 256  # d_w
hidden_size = 512  # d_h
num_layers = 1
num_dirs = 1  # 1: unidirectional, 2: bidirectional

# A fresh embedding table and a sequence-first LSTM.
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
lstm = nn.LSTM(
    input_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    bidirectional=num_dirs > 1,
)

# Zero initial hidden and cell states; both are (num_layers * num_dirs, B, d_h).
h_0 = torch.zeros(num_layers * num_dirs, batch.size(0), hidden_size)
c_0 = torch.zeros_like(h_0)

In [31]:
# d_w: word embedding size
batch_emb = embedding(batch)  # (B, L, d_w)

# Pack the sequence-first embeddings using the true lengths, so the LSTM
# skips the pad positions.
packed_batch = pack_padded_sequence(batch_emb.permute(1, 0, 2), batch_lens)

packed_outputs, (h_n, c_n) = lstm(packed_batch, (h_0, c_0))
# Print the packed result, the shape of its flat data tensor (the first
# field of the PackedSequence), and the final hidden/cell state shapes.
print(packed_outputs, packed_outputs.data.shape, h_n.shape, c_n.shape, sep="\n")

PackedSequence(data=tensor([[-0.0051, -0.0573,  0.0297,  ...,  0.0953,  0.0757, -0.0379],
        [-0.0528,  0.0441, -0.2874,  ...,  0.1027,  0.1413, -0.0969],
        [-0.0315, -0.0562,  0.0097,  ..., -0.0773, -0.0882, -0.0005],
        ...,
        [ 0.0278, -0.2434, -0.0330,  ...,  0.0811, -0.0304,  0.0084],
        [ 0.1104,  0.1274,  0.1265,  ..., -0.1008,  0.0584, -0.0545],
        [-0.0718,  0.0593,  0.1316,  ...,  0.1445,  0.0004, -0.0046]],
       grad_fn=<CatBackward0>), batch_sizes=tensor([10, 10, 10, 10, 10,  9,  7,  7,  6,  6,  5,  5,  5,  5,  5,  4,  4,  3,
         1,  1]), sorted_indices=None, unsorted_indices=None)
torch.Size([123, 512])
torch.Size([1, 10, 512])
torch.Size([1, 10, 512])


In [32]:
# Turn the packed LSTM outputs back into a padded (L, B, d_h) tensor, along
# with the valid length of each sequence.
outputs, output_lens = pad_packed_sequence(packed_outputs, batch_first=False)
print(outputs.shape, output_lens, sep="\n")

torch.Size([20, 10, 512])
tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])


### **GRU 사용**

GRU는 cell state가 없어 RNN과 동일하게 사용 가능합니다.   
GRU를 이용하여 LM task를 수행해봅시다.

In [34]:
# Sequence-first GRU that reuses the hyperparameters already defined for the
# LSTM example.
gru = nn.GRU(
    input_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    bidirectional=num_dirs > 1,
)

In [35]:
# Projects a GRU hidden state onto one score per vocabulary entry.
output_layer = nn.Linear(in_features=hidden_size, out_features=vocab_size)

In [36]:
# Seed decoding with the first token of every sequence.
input_id = batch[:, 0]  # (B)
# Zero initial hidden state for the GRU.
hidden = torch.zeros(num_layers * num_dirs, batch.size(0), hidden_size)  # (1, B, d_h)

In [37]:
# Greedy decoding: at every step the highest-scoring token becomes the next input.
for t in range(max_len):
  # Embed the current tokens and add a length-1 time axis: (B) -> (1, B, d_w).
  input_emb = embedding(input_id)[None]
  # One GRU step. output: (1, B, d_h), hidden: (1, B, d_h)
  output, hidden = gru(input_emb, hidden)

  # V: vocab size
  output = output_layer(output)  # (1, B, V)
  # NOTE: output_layer is a plain linear layer, so `probs` holds the largest
  # raw score per position rather than a normalized probability.
  probs, top_id = output.max(dim=-1)  # probs: (1, B), top_id: (1, B)

  print(
    "*" * 50,
    f"Time step: {t}",
    output.shape,
    probs.shape,
    top_id.shape,
    sep="\n",
  )

  # Feed the predicted ids back in as the next step's input.
  input_id = top_id.squeeze(0)  # (B)

**************************************************
Time step: 0
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 1
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 2
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 3
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 4
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 5
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 6
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 7
torch.Size([1, 10, 100])
torch.Si

### **양방향 및 여러 layer 사용**

이번엔 양방향 + 2개 이상의 layer를 쓸 때 얻을 수 있는 결과에 대해 알아봅니다.


In [39]:
# Two stacked layers, run in both directions.
num_layers = 2
num_dirs = 2
# Dropout probability handed to nn.GRU; per the PyTorch docs it is applied to
# the outputs of every layer except the last.
dropout = 0.1

gru = nn.GRU(
    input_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout=dropout,
    bidirectional=num_dirs > 1,
)

Bidirectional이 되었고 layer의 개수가 $2$로 늘었기 때문에 hidden state의 shape도 `(4, B, d_h)`가 됩니다.

In [40]:
# d_w: word embedding size, num_layers: number of layers, num_dirs: number of directions
batch_emb = embedding(batch)  # (B, L, d_w)
# One zero initial state per (layer, direction) pair: (num_layers * num_dirs, B, d_h) = (4, B, d_h)
h_0 = torch.zeros(num_layers * num_dirs, batch.size(0), hidden_size)

# Pack the sequence-first embeddings so pad positions are skipped.
packed_batch = pack_padded_sequence(batch_emb.permute(1, 0, 2), batch_lens)

packed_outputs, h_n = gru(packed_batch, h_0)
# Print the packed result, the shape of its flat data tensor (the first
# field of the PackedSequence), and the final hidden state shape.
print(packed_outputs, packed_outputs.data.shape, h_n.shape, sep="\n")

PackedSequence(data=tensor([[-0.0711,  0.0160,  0.0645,  ...,  0.1040, -0.1675,  0.1733],
        [ 0.1313,  0.0846, -0.0115,  ...,  0.0140, -0.0193,  0.0374],
        [ 0.0423, -0.0514,  0.0019,  ...,  0.1278,  0.1063, -0.1231],
        ...,
        [-0.1345, -0.0334,  0.0670,  ...,  0.1298, -0.1048,  0.0155],
        [-0.0156,  0.0632, -0.2120,  ...,  0.1014, -0.2007, -0.0115],
        [ 0.0169,  0.1330, -0.1025,  ...,  0.0990, -0.1478,  0.0286]],
       grad_fn=<CatBackward0>), batch_sizes=tensor([10, 10, 10, 10, 10,  9,  7,  7,  6,  6,  5,  5,  5,  5,  5,  4,  4,  3,
         1,  1]), sorted_indices=None, unsorted_indices=None)
torch.Size([123, 1024])
torch.Size([4, 10, 512])


In [41]:
# Restore the padded layout; the feature axis holds both directions side by
# side: (L, B, num_dirs*d_h).
outputs, output_lens = pad_packed_sequence(packed_outputs, batch_first=False)

print(outputs.shape, output_lens, sep="\n")

torch.Size([20, 10, 1024])
tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])


각각의 결과물의 shape는 다음과 같습니다.

`outputs`: `(max_len, batch_size, num_dirs * hidden_size)`  
`h_n`: `(num_layers*num_dirs, batch_size, hidden_size)`

In [42]:
batch_size = h_n.size(1)
# Split the fused leading axis of h_n into separate layer and direction axes:
# (num_layers*num_dirs, B, d_h) -> (num_layers, num_dirs, B, d_h).
h_n_per_layer_dir = h_n.view(num_layers, num_dirs, batch_size, hidden_size)
print(h_n_per_layer_dir)
print(h_n_per_layer_dir.shape)

tensor([[[[ 0.3989,  0.1724, -0.2259,  ..., -0.0692,  0.3286,  0.1672],
          [ 0.1684,  0.0925, -0.0764,  ..., -0.2898, -0.2372,  0.1887],
          [ 0.1659,  0.0009,  0.0104,  ...,  0.0262, -0.2917, -0.3000],
          ...,
          [ 0.3059, -0.1365,  0.2527,  ...,  0.0679,  0.1679, -0.2213],
          [ 0.2599,  0.1552, -0.1134,  ...,  0.0717, -0.2702,  0.0314],
          [ 0.0945, -0.1085,  0.1190,  ...,  0.3101,  0.2203,  0.2463]],

         [[ 0.2374, -0.1953,  0.2140,  ...,  0.0625,  0.0948, -0.1328],
          [ 0.3287, -0.4926, -0.1708,  ...,  0.0374, -0.0492,  0.1033],
          [ 0.0440, -0.1881,  0.3845,  ...,  0.3745,  0.0447,  0.0108],
          ...,
          [-0.0076, -0.1605, -0.1553,  ...,  0.3674,  0.1845, -0.3679],
          [-0.0451, -0.2429,  0.5334,  ...,  0.2654, -0.5207, -0.4605],
          [ 0.0429, -0.0020, -0.3378,  ...,  0.0215, -0.3116, -0.3582]]],


        [[[ 0.0169,  0.1330, -0.1025,  ..., -0.1606,  0.1019,  0.0182],
          [-0.0993, -0.0041,