# LSTM from Scratch

In [None]:
import math
import torch
import torch.nn as nn

In [None]:
class squash(nn.Module):
    """Scaled tanh activation: maps ``x`` to ``2 * tanh(x / 2)``, i.e. into (-2, 2)."""

    def forward(self, x):
        # Halve the input first, then stretch the tanh output back by the same factor.
        halved = x / 2
        return torch.tanh(halved) * 2

In [None]:
class OriginalLSTM(nn.Module):
  """LSTM layer without a forget gate, unrolled step by step in Python.

  Per time step the layer computes a squashed cell input ``s_t``, an input
  gate ``i_t`` and an output gate ``o_t``, then updates

      c_t = c_{t-1} + s_t * i_t
      h_t = o_t * squash(c_t)

  Args:
    input_size: number of features of each input vector.
    hidden_size: number of hidden (and cell) units.
  """

  def __init__(self, input_size, hidden_size):
    super().__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    # The weights of the squashed cell input, the input gate and the output
    # gate are concatenated along the last axis so that all three
    # pre-activations come out of a single matrix product.
    self.W = nn.Parameter(torch.empty(input_size, 3 * hidden_size))
    self.U = nn.Parameter(torch.empty(hidden_size, 3 * hidden_size))
    self.bias = nn.Parameter(torch.empty(3 * hidden_size))
    # `squash` is an nn.Module: it must be instantiated once and the instance
    # applied to tensors (calling the class with a tensor raises TypeError).
    self.squash = squash()
    self.init_weights()

  def init_weights(self):
    """Fill every parameter from U(-1/sqrt(hidden_size), 1/sqrt(hidden_size)).

    This is the same scheme PyTorch's built-in ``nn.LSTM`` uses.
    """
    stdv = 1.0 / math.sqrt(self.hidden_size)
    for weight in self.parameters():
      # In-place initialiser; `Tensor.uniform` (no underscore) does not exist.
      nn.init.uniform_(weight, -stdv, stdv)

  def forward(self, x, init_states = None):
    """Run the layer over a whole sequence.

    Args:
      x: tensor of shape (batch, sequence, feature).
      init_states: optional ``(h_0, c_0)`` tuple, each of shape
        (batch, hidden_size). Zeros on ``x.device`` are used when omitted.

    Returns:
      ``(hidden_seq, (h_t, c_t))`` where ``hidden_seq`` has shape
      (batch, sequence, hidden_size) and ``h_t`` / ``c_t`` are the states
      after the last time step.
    """
    batch_size, seq_size, _ = x.size()

    if init_states is None:
      h_t = torch.zeros(batch_size, self.hidden_size, device = x.device)
      c_t = torch.zeros(batch_size, self.hidden_size, device = x.device)
    else:
      h_t, c_t = init_states

    hs = self.hidden_size
    hidden_seq = []
    for t in range(seq_size):
      x_t = x[:, t, :]
      gates = x_t @ self.W + h_t @ self.U + self.bias
      s_t = self.squash(gates[:, :hs])
      i_t = torch.sigmoid(gates[:, hs : 2 * hs])
      o_t = torch.sigmoid(gates[:, 2 * hs : 3 * hs])
      # No forget gate: the cell state only accumulates gated input.
      c_t = c_t + s_t * i_t
      h_t = o_t * self.squash(c_t)
      hidden_seq.append(h_t)

    # Stacking on dim 1 yields a contiguous (batch, sequence, hidden) tensor
    # directly, so no transpose/contiguous round trip is needed.
    hidden_seq = torch.stack(hidden_seq, dim = 1)
    return hidden_seq, (h_t, c_t)

In [None]:
class VanillaLSTM(nn.Module):
  """LSTM layer with forget, input and output gates, unrolled step by step.

  Per time step the layer computes a forget gate ``f_t``, a candidate cell
  value ``cc_t``, an input gate ``i_t`` and an output gate ``o_t``, then

      c_t = f_t * c_{t-1} + i_t * cc_t
      h_t = o_t * tanh(c_t)

  Args:
    input_size: number of features of each input vector.
    hidden_size: number of hidden (and cell) units.
  """

  def __init__(self, input_size, hidden_size):
    super().__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    # The weights of the forget gate, the tanh candidate, the input gate and
    # the output gate (in that order) are concatenated along the last axis so
    # that all four pre-activations come out of a single matrix product.
    self.W = nn.Parameter(torch.empty(input_size, 4 * hidden_size))
    self.U = nn.Parameter(torch.empty(hidden_size, 4 * hidden_size))
    self.bias = nn.Parameter(torch.empty(4 * hidden_size))
    self.init_weights()

  def init_weights(self):
    """Fill every parameter from U(-1/sqrt(hidden_size), 1/sqrt(hidden_size)).

    This is the same scheme PyTorch's built-in ``nn.LSTM`` uses.
    """
    stdv = 1.0 / math.sqrt(self.hidden_size)
    for weight in self.parameters():
      # In-place initialiser; `Tensor.uniform` (no underscore) does not exist.
      nn.init.uniform_(weight, -stdv, stdv)

  def forward(self, x, init_states = None):
    """Run the layer over a whole sequence.

    Args:
      x: tensor of shape (batch, sequence, feature).
      init_states: optional ``(h_0, c_0)`` tuple, each of shape
        (batch, hidden_size). Zeros on ``x.device`` are used when omitted.

    Returns:
      ``(hidden_seq, (h_t, c_t))`` where ``hidden_seq`` has shape
      (batch, sequence, hidden_size) and ``h_t`` / ``c_t`` are the states
      after the last time step.
    """
    batch_size, seq_size, _ = x.size()

    if init_states is None:
      h_t = torch.zeros(batch_size, self.hidden_size, device = x.device)
      c_t = torch.zeros(batch_size, self.hidden_size, device = x.device)
    else:
      h_t, c_t = init_states

    hs = self.hidden_size
    hidden_seq = []
    for t in range(seq_size):
      x_t = x[:, t, :]
      gates = x_t @ self.W + h_t @ self.U + self.bias
      f_t = torch.sigmoid(gates[:, :hs])
      cc_t = torch.tanh(gates[:, hs : 2 * hs])
      i_t = torch.sigmoid(gates[:, 2 * hs : 3 * hs])
      o_t = torch.sigmoid(gates[:, 3 * hs : 4 * hs])
      c_t = f_t * c_t + i_t * cc_t
      h_t = o_t * torch.tanh(c_t)
      hidden_seq.append(h_t)

    # Stacking on dim 1 yields a contiguous (batch, sequence, hidden) tensor
    # directly, so no transpose/contiguous round trip is needed.
    hidden_seq = torch.stack(hidden_seq, dim = 1)
    return hidden_seq, (h_t, c_t)

# Tensorflow에 구현되어있는 LSTM

## 데이터 불러오기

In [None]:
# Mount Google Drive (Colab only) so files under /content/drive become readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
from string import punctuation

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the article CSV from the mounted Drive folder and preview the first rows.
df = pd.read_csv("/content/drive/MyDrive/논문 미니프로젝트/LSTM/ArticlesApril2018.csv")
df.head()

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Toronto’s Ri...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...


In [None]:
# Report the (rows, columns) shape and list the column names.
print("The shape of the data:", df.shape)
print(df.columns)

The shape of the data: (1324, 15)
Index(['articleID', 'articleWordCount', 'byline', 'documentType', 'headline',
       'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')


In [None]:
# Count missing values per column.
df.isnull().sum()

articleID           0
articleWordCount    0
byline              0
documentType        0
headline            0
keywords            0
multimedia          0
newDesk             0
printPage           0
pubDate             0
sectionName         0
snippet             0
source              0
typeOfMaterial      0
webURL              0
dtype: int64

In [None]:
# Copy the values of the `headline` column into a plain Python list,
# then preview the first five entries.
headline = list(df.headline.values)
headline[:5]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'Unknown',
 'Unknown']

Unknown값은 불필요하므로 제거

In [None]:
# Headlines recorded as the literal string "Unknown" carry no text, so drop them.
print("before removing Unknown, # of headlines are", len(headline))
headline = list(filter(lambda title: title != "Unknown", headline))
print("after removing Unknown, # of headlines are", len(headline))

before removing Unknown, # of headlines are 1324
after removing Unknown, # of headlines are 1214


In [None]:
# Preview the first five headlines after filtering.
headline[:5]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'How a Bag of Texas Dirt  Became a Times Tradition',
 'Is School a Place for Self-Expression?']

## 전처리 

구두점 제거 및 소문자화

In [None]:
def repreprocessing(raw_sentence):
  """Return `raw_sentence` lower-cased, with non-ASCII and punctuation characters removed."""
  # Round-trip through bytes: anything that cannot be decoded as ASCII is dropped.
  ascii_only = raw_sentence.encode("utf8").decode("ascii", "ignore")
  # Delete every character listed in string.punctuation in a single pass.
  without_punctuation = ascii_only.translate(str.maketrans("", "", punctuation))
  return without_punctuation.lower()

# Clean every headline, then preview the first five results.
preprocessed_headline = list(map(repreprocessing, headline))
preprocessed_headline[:5]

['former nfl cheerleaders settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression']

## 토큰화

In [None]:
tokenizer = Tokenizer()
# fit_on_texts() builds the vocabulary from the given corpus, ranked by word frequency:
# the more frequent a word is, the lower the index it receives.
tokenizer.fit_on_texts(preprocessed_headline)
vocab_size = len(tokenizer.word_index) + 1
# Word indices start at 1, so the largest index equals len(word_index);
# covering every index therefore needs a vocab_size that is one larger.
print("단어 집합의 크기: %d" % vocab_size)

단어 집합의 크기: 3494


In [None]:
# Display the vocabulary size computed above.
vocab_size

3494

## 정수 인코딩

In [None]:
# Build every prefix of length >= 2 of each headline, once as integer ids
# (`sequences`, used for training) and once as words (`seqs`, for inspection).
sequences = list()

for sentence in preprocessed_headline:
  # Integer-encode one headline, e.g. [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116].
  encoded = tokenizer.texts_to_sequences([sentence])[0]
  for i in range(1, len(encoded)):
    sequences.append(encoded[:i + 1])

print(sequences[:11])

seqs = list()

for sentence in preprocessed_headline:
  sent_split = sentence.split()
  # Iterate over the number of WORDS; ranging over len(sentence) counts
  # characters and appends the full headline over and over again.
  for i in range(1, len(sent_split)):
    seqs.append(sent_split[:i + 1])

print(seqs[:11])

[[99, 269], [99, 269, 371], [99, 269, 371, 1115], [99, 269, 371, 1115, 582], [99, 269, 371, 1115, 582, 52], [99, 269, 371, 1115, 582, 52, 7], [99, 269, 371, 1115, 582, 52, 7, 2], [99, 269, 371, 1115, 582, 52, 7, 2, 372], [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10], [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116], [100, 3]]
[['former', 'nfl'], ['former', 'nfl', 'cheerleaders'], ['former', 'nfl', 'cheerleaders', 'settlement'], ['former', 'nfl', 'cheerleaders', 'settlement', 'offer'], ['former', 'nfl', 'cheerleaders', 'settlement', 'offer', '1'], ['former', 'nfl', 'cheerleaders', 'settlement', 'offer', '1', 'and'], ['former', 'nfl', 'cheerleaders', 'settlement', 'offer', '1', 'and', 'a'], ['former', 'nfl', 'cheerleaders', 'settlement', 'offer', '1', 'and', 'a', 'meeting'], ['former', 'nfl', 'cheerleaders', 'settlement', 'offer', '1', 'and', 'a', 'meeting', 'with'], ['former', 'nfl', 'cheerleaders', 'settlement', 'offer', '1', 'and', 'a', 'meeting', 'with', 'goodell'], ['former', 'nfl'

| Samples | X                                       | y            |
|---------|-----------------------------------------|--------------|
| 1       | former nfl                              | cheerleaders |
| 2       | former nfl cheerleaders                 | settlement   |
| 3       | former nfl cheerleaders settlement      | offer        |
| 4       | former nfl cheerleaders settlement offer| 1            |

어떤 정수가 어떤 단어를 의미하는지 알아보기 위해 word_index의 key, value를 반전시킨 dictionary index_to_word 생성

In [None]:
# Invert tokenizer.word_index (word -> index) into an index -> word lookup.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

print("빈도수 상위 582번 단어: {}".format(index_to_word[582]))

빈도수 상위 582번 단어: offer


## 패딩

자연어 처리를 하다보면 각 문장(또는 문서)은 서로 길이가 다른 경우가 많다.   
그런데 기계는 길이가 전부 동일한 문서들에 대해서는 하나의 행렬로 보고, 한꺼번에 묶어서 처리할 수 있기 때문에, 병렬 연산을 위해서 여러 문장의 길이를 임의로 동일하게 맞춰준다.


In [None]:
# Padding targets the longest sequence, so find its length first.
max_len = max(map(len, sequences))
print("제일 긴 문장 길이: {}".format(max_len))

제일 긴 문장 길이: 24


In [None]:
# `padding` chooses whether the zeros go in front of ('pre') or behind ('post') each sequence.
sequences = pad_sequences(sequences, maxlen = max_len, padding = 'pre')
print(sequences[:3])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0   99  269]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0   99  269  371]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0   99  269  371 1115]]


## Feature, Label 분리

sequence의 가장 우측에 위치하는 단어는 label로 분리

In [None]:
sequences = np.array(sequences)
# Every row: all columns but the last are the features, the last column is the label.
X, y = sequences[:, :-1], sequences[:, -1]

In [None]:
# Show the first three feature rows and their labels.
print(X[:3])
print(y[:3])

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0  99]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0  99 269]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0  99 269 371]]
[ 269  371 1115]


In [None]:
# One-hot encode the label ids with one column per vocabulary index.
y = to_categorical(y, num_classes = vocab_size)

In [None]:
# Display the one-hot encoded labels.
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## 모델 설계하기

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM

In [None]:
embedding_dim = 10
hidden_units = 128

# Embedding -> LSTM -> softmax over the vocabulary, declared as a layer list.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    LSTM(hidden_units),
    Dense(vocab_size, activation = "softmax"),
])
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ['accuracy'])
model.fit(X, y, epochs = 200, verbose = 2)

Epoch 1/200
244/244 - 14s - loss: 7.6470 - accuracy: 0.0278 - 14s/epoch - 57ms/step
Epoch 2/200
244/244 - 16s - loss: 7.1068 - accuracy: 0.0317 - 16s/epoch - 65ms/step
Epoch 3/200
244/244 - 10s - loss: 6.9648 - accuracy: 0.0352 - 10s/epoch - 39ms/step
Epoch 4/200
244/244 - 10s - loss: 6.8307 - accuracy: 0.0433 - 10s/epoch - 40ms/step
Epoch 5/200
244/244 - 10s - loss: 6.6710 - accuracy: 0.0460 - 10s/epoch - 40ms/step
Epoch 6/200
244/244 - 10s - loss: 6.4966 - accuracy: 0.0507 - 10s/epoch - 40ms/step
Epoch 7/200
244/244 - 10s - loss: 6.2963 - accuracy: 0.0559 - 10s/epoch - 40ms/step
Epoch 8/200
244/244 - 10s - loss: 6.0827 - accuracy: 0.0607 - 10s/epoch - 40ms/step
Epoch 9/200
244/244 - 10s - loss: 5.8712 - accuracy: 0.0631 - 10s/epoch - 40ms/step
Epoch 10/200
244/244 - 10s - loss: 5.6650 - accuracy: 0.0697 - 10s/epoch - 40ms/step
Epoch 11/200
244/244 - 10s - loss: 5.4715 - accuracy: 0.0761 - 10s/epoch - 40ms/step
Epoch 12/200
244/244 - 10s - loss: 5.2909 - accuracy: 0.0841 - 10s/epoch -

<keras.callbacks.History at 0x7fbc79d1f190>

In [None]:
def sentence_generation(model, tokenizer, current_word, n):
  """Greedily extend `current_word` by up to `n` predicted words.

  Args:
    model: trained Keras model; `model.predict` is called on a padded id
      sequence and its output is arg-maxed over axis 1.
    tokenizer: fitted Keras `Tokenizer` (uses `texts_to_sequences` and
      `index_word`).
    current_word: seed text the generation starts from.
    n: maximum number of words to append.

  Returns:
    The seed text followed by the generated words, separated by spaces.
    Generation stops early if the model predicts an index that has no word
    (for example the padding index 0).
  """
  init_word = current_word
  generated = []

  for _ in range(n):
    encoded = tokenizer.texts_to_sequences([current_word])[0]
    # The label column was split off the training data, hence max_len - 1.
    encoded = pad_sequences([encoded], maxlen = max_len - 1, padding = "pre")

    # Predict the next word id for the text generated so far.
    result = model.predict(encoded, verbose = 0)
    predicted_index = int(np.argmax(result, axis = 1)[0])

    # O(1) reverse lookup; scanning word_index and falling through on a miss
    # would silently emit the last vocabulary word.
    word = tokenizer.index_word.get(predicted_index)
    if word is None:
      break

    # The predicted word becomes part of the input for the next step.
    current_word = current_word + ' ' + word
    generated.append(word)

  return ' '.join([init_word] + generated)

In [None]:
# Generate 15 words starting from the seed word 'how'.
print(sentence_generation(model, tokenizer, 'how', 15))

how do you get your nature fix how far have we come on martin luther king


# Pytorch에 구현되어있는 LSTM

In [None]:
!pip install torch torchvision

In [None]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## 데이터 불러오기

In [None]:
import pandas as pd
import numpy as np
from string import punctuation

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import itertools

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the article CSV from the mounted Drive folder and preview the first rows.
df = pd.read_csv("/content/drive/MyDrive/논문 미니프로젝트/LSTM/ArticlesApril2018.csv")
df.head()

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Toronto’s Ri...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...


In [None]:
# Report the (rows, columns) shape and list the column names.
print("The shape of the data:", df.shape)
print(df.columns)

The shape of the data: (1324, 15)
Index(['articleID', 'articleWordCount', 'byline', 'documentType', 'headline',
       'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')


In [None]:
# Count missing values per column.
df.isnull().sum()

articleID           0
articleWordCount    0
byline              0
documentType        0
headline            0
keywords            0
multimedia          0
newDesk             0
printPage           0
pubDate             0
sectionName         0
snippet             0
source              0
typeOfMaterial      0
webURL              0
dtype: int64

In [None]:
# Copy the values of the `headline` column into a plain Python list,
# then preview the first five entries.
headline = list(df.headline.values)
headline[:5]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'Unknown',
 'Unknown']

Unknown값은 불필요하므로 제거

In [None]:
# Headlines recorded as the literal string "Unknown" carry no text, so drop them.
print("before removing Unknown, # of headlines are", len(headline))
headline = list(filter(lambda title: title != "Unknown", headline))
print("after removing Unknown, # of headlines are", len(headline))

before removing Unknown, # of headlines are 1324
after removing Unknown, # of headlines are 1214


In [None]:
# Preview the first five headlines after filtering.
headline[:5]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'How a Bag of Texas Dirt  Became a Times Tradition',
 'Is School a Place for Self-Expression?']

## 전처리 

구두점 제거 및 소문자화

In [None]:
def repreprocessing(raw_sentence):
  """Return `raw_sentence` lower-cased, with non-ASCII and punctuation characters removed."""
  # Round-trip through bytes: anything that cannot be decoded as ASCII is dropped.
  ascii_only = raw_sentence.encode("utf8").decode("ascii", "ignore")
  # Delete every character listed in string.punctuation in a single pass.
  without_punctuation = ascii_only.translate(str.maketrans("", "", punctuation))
  return without_punctuation.lower()

# Clean every headline, then preview the first five results.
preprocessed_headline = list(map(repreprocessing, headline))
preprocessed_headline[:5]

['former nfl cheerleaders settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression']

## 토큰화

In [None]:
# Tokenise every cleaned headline into a list of words. The loop variable is
# deliberately NOT called `headline`: reusing that name would overwrite the
# global headline list with the last headline string.
sentence = [word_tokenize(title) for title in preprocessed_headline]

# Flatten the per-headline token lists into one long token list.
tokenized_sentence = list(itertools.chain.from_iterable(sentence))

In [None]:
# Sort the unique tokens so that word ids are identical on every run; the
# iteration order of a set of strings changes between Python sessions.
vocab = sorted(set(tokenized_sentence))
print(vocab)
# +1 because word ids start at 1 (see the enumerate(start = 1) mapping below).
vocab_size = len(vocab) + 1
print(vocab_size)

['2002', 'children', 'syndrome', 'gave', 'ronny', 'q', 'massachusetts', 'than', 'europe', 'letter', 'mom', 'fast', 'trumpland', 'salty', 'india', 'shades', 'jumbo', 'bag', 'faulted', 'credit', 'behavior', 'forever', 'braindamaged', 'trauma', '600000', 'boycotts', 'wife', 'answers', 'buy', 'steady', 'receding', 'against', 'bracing', 'gruesome', '8000', 'courted', 'call', 'arms', 'pictures', 'balancing', 'drag', 'lady', 'still', 'dorm', 'camdens', 'dinner', 'retirement', 'overweight', 'supermans', 'blasts', 'convicted', 'fallon', 'midair', 'generation', 'kidnapped', 'valley', 'weight', 'recharge', 'offspring', 'mitzi', 'my', 'fulton', 'missed', 'courses', 'dying', 'thanks', 'stringfield', 'fresh', 'sets', 'karl', 'uk', 'sex', 'bromance', 'rim', 'walking', 'woman', 'gunmans', 'should', 'whisperer', 'military', 'evans', 'chemicals', 'alfie', 'threat', 'arlee', 'foothold', 'became', 'threaten', 'hasty', 'bronx', 'episodes', 'push', 'daunting', 'tried', 'markdowns', 'underwear', 'brain', 'gy

## 정수 인코딩

In [None]:
# Number the vocabulary words 1..len(vocab); id 0 is left unused by this mapping.
word_to_index = dict(zip(vocab, range(1, len(vocab) + 1)))
print(word_to_index)

{'2002': 1, 'children': 2, 'syndrome': 3, 'gave': 4, 'ronny': 5, 'q': 6, 'massachusetts': 7, 'than': 8, 'europe': 9, 'letter': 10, 'mom': 11, 'fast': 12, 'trumpland': 13, 'salty': 14, 'india': 15, 'shades': 16, 'jumbo': 17, 'bag': 18, 'faulted': 19, 'credit': 20, 'behavior': 21, 'forever': 22, 'braindamaged': 23, 'trauma': 24, '600000': 25, 'boycotts': 26, 'wife': 27, 'answers': 28, 'buy': 29, 'steady': 30, 'receding': 31, 'against': 32, 'bracing': 33, 'gruesome': 34, '8000': 35, 'courted': 36, 'call': 37, 'arms': 38, 'pictures': 39, 'balancing': 40, 'drag': 41, 'lady': 42, 'still': 43, 'dorm': 44, 'camdens': 45, 'dinner': 46, 'retirement': 47, 'overweight': 48, 'supermans': 49, 'blasts': 50, 'convicted': 51, 'fallon': 52, 'midair': 53, 'generation': 54, 'kidnapped': 55, 'valley': 56, 'weight': 57, 'recharge': 58, 'offspring': 59, 'mitzi': 60, 'my': 61, 'fulton': 62, 'missed': 63, 'courses': 64, 'dying': 65, 'thanks': 66, 'stringfield': 67, 'fresh': 68, 'sets': 69, 'karl': 70, 'uk': 71

In [None]:
# Replace every token of every tokenised headline by its integer id.
sentence_index = [
    [word_to_index[token] for token in tokens]
    for tokens in sentence
]

In [None]:
# Print the tokenised headlines next to their integer-encoded form.
print(sentence)
print(sentence_index)

[['former', 'nfl', 'cheerleaders', 'settlement', 'offer', '1', 'and', 'a', 'meeting', 'with', 'goodell'], ['epa', 'to', 'unveil', 'a', 'new', 'rule', 'its', 'effect', 'less', 'science', 'in', 'policymaking'], ['the', 'new', 'noma', 'explained'], ['how', 'a', 'bag', 'of', 'texas', 'dirt', 'became', 'a', 'times', 'tradition'], ['is', 'school', 'a', 'place', 'for', 'selfexpression'], ['commuter', 'reprogramming'], ['ford', 'changed', 'leaders', 'looking', 'for', 'a', 'lift', 'its', 'still', 'looking'], ['romney', 'failed', 'to', 'win', 'at', 'utah', 'convention', 'but', 'few', 'believe', 'hes', 'doomed'], ['chain', 'reaction'], ['he', 'forced', 'the', 'vatican', 'to', 'investigate', 'sex', 'abuse', 'now', 'hes', 'meeting', 'with', 'pope', 'francis'], ['in', 'berlin', 'artists', 'find', 'a', 'home'], ['the', 'right', 'stuff'], ['jimmy', 'carter', 'knows', 'what', 'north', 'korea', 'wants'], ['the', 'truth', 'is', 'out', 'there'], ['new', 'jersey', 'ruling', 'could', 'reignite', 'battle', '

In [None]:
# Every prefix of length >= 2 of each encoded headline becomes one training sample.
sequences = [
    ids[:end]
    for ids in sentence_index
    for end in range(2, len(ids) + 1)
]

print(sequences[:11])

[[2224, 767], [2224, 767, 464], [2224, 767, 464, 3242], [2224, 767, 464, 3242, 1001], [2224, 767, 464, 3242, 1001, 1914], [2224, 767, 464, 3242, 1001, 1914, 2626], [2224, 767, 464, 3242, 1001, 1914, 2626, 407], [2224, 767, 464, 3242, 1001, 1914, 2626, 407, 3166], [2224, 767, 464, 3242, 1001, 1914, 2626, 407, 3166, 211], [2224, 767, 464, 3242, 1001, 1914, 2626, 407, 3166, 211, 2026], [2153, 1218]]


| Samples | X                                       | y            |
|---------|-----------------------------------------|--------------|
| 1       | former nfl                              | cheerleaders |
| 2       | former nfl cheerleaders                 | settlement   |
| 3       | former nfl cheerleaders settlement      | offer        |
| 4       | former nfl cheerleaders settlement offer| 1            |

In [None]:
# Reverse lookup table: integer id -> word.
index_to_word = dict(zip(word_to_index.values(), word_to_index.keys()))
print(index_to_word)

{1: '2002', 2: 'children', 3: 'syndrome', 4: 'gave', 5: 'ronny', 6: 'q', 7: 'massachusetts', 8: 'than', 9: 'europe', 10: 'letter', 11: 'mom', 12: 'fast', 13: 'trumpland', 14: 'salty', 15: 'india', 16: 'shades', 17: 'jumbo', 18: 'bag', 19: 'faulted', 20: 'credit', 21: 'behavior', 22: 'forever', 23: 'braindamaged', 24: 'trauma', 25: '600000', 26: 'boycotts', 27: 'wife', 28: 'answers', 29: 'buy', 30: 'steady', 31: 'receding', 32: 'against', 33: 'bracing', 34: 'gruesome', 35: '8000', 36: 'courted', 37: 'call', 38: 'arms', 39: 'pictures', 40: 'balancing', 41: 'drag', 42: 'lady', 43: 'still', 44: 'dorm', 45: 'camdens', 46: 'dinner', 47: 'retirement', 48: 'overweight', 49: 'supermans', 50: 'blasts', 51: 'convicted', 52: 'fallon', 53: 'midair', 54: 'generation', 55: 'kidnapped', 56: 'valley', 57: 'weight', 58: 'recharge', 59: 'offspring', 60: 'mitzi', 61: 'my', 62: 'fulton', 63: 'missed', 64: 'courses', 65: 'dying', 66: 'thanks', 67: 'stringfield', 68: 'fresh', 69: 'sets', 70: 'karl', 71: 'uk'

어떤 정수가 어떤 단어를 의미하는지 알아보기 위해 word_to_index의 key, value를 반전시킨 dictionary index_to_word 생성

## 패딩

자연어 처리를 하다보면 각 문장(또는 문서)은 서로 길이가 다른 경우가 많다.   
그런데 기계는 길이가 전부 동일한 문서들에 대해서는 하나의 행렬로 보고, 한꺼번에 묶어서 처리할 수 있기 때문에, 병렬 연산을 위해서 여러 문장의 길이를 임의로 동일하게 맞춰준다.


In [None]:
# pad_sequence only pads on the right. To pad on the left instead: reverse each
# sequence, pad, then flip the padded matrix back along the time axis.
reversed_rows = [torch.tensor(ids[::-1]) for ids in sequences]
right_padded = torch.nn.utils.rnn.pad_sequence(reversed_rows, batch_first=True)
padded_sequences = right_padded.flip(dims=[1])

In [None]:
# Display the left-padded id matrix.
padded_sequences

tensor([[   0,    0,    0,  ...,    0, 2224,  767],
        [   0,    0,    0,  ..., 2224,  767,  464],
        [   0,    0,    0,  ...,  767,  464, 3242],
        ...,
        [   0,    0,    0,  ..., 1362, 1666, 2627],
        [   0,    0,    0,  ..., 1666, 2627,  407],
        [   0,    0,    0,  ..., 2627,  407, 1740]])

## Feature, Label 분리

sequence의 가장 우측에 위치하는 단어는 label로 분리

In [None]:

# Every row: all columns but the last are the features, the last column is the label.
X, y = padded_sequences[:, :-1], padded_sequences[:, -1]

In [None]:
# Show the first three feature rows and their labels.
print(X[:3])
print(y[:3])

tensor([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 2224],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0, 2224,  767],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0, 2224,  767,  464]])
tensor([ 767,  464, 3242])


In [None]:
def torch_to_categorical(y, num_classes):
    """One-hot encode the class ids in `y` as uint8 rows of length `num_classes`."""
    # Row k of the identity matrix is the one-hot vector of class k, so
    # indexing the identity with `y` picks one row per label.
    identity = np.eye(num_classes, dtype='uint8')
    return identity[y]

In [None]:
# One-hot encode the label ids with one column per vocabulary index.
y = torch_to_categorical(y, num_classes = vocab_size)

In [None]:
# Display the one-hot encoded labels.
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

## 모델 설계하기

In [None]:
embedding_dim = 10
hidden_units = 128

# Same Keras architecture as in the TensorFlow section: Embedding -> LSTM -> softmax.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(LSTM(hidden_units))
model.add(Dense(vocab_size, activation = "softmax"))
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ['accuracy'])
# In this section X is a torch tensor, so hand Keras plain NumPy arrays.
model.fit(np.asarray(X), np.asarray(y), epochs = 200, verbose = 2)



TypeError: ignored

In [None]:
input_size = 10
hidden_units = 128

class Net(nn.Module):
  """Next-word model: embedding -> LSTM -> linear projection onto the vocabulary.

  `forward` returns the projection of the LSTM output at EVERY time step; it
  does not reduce to the last step and applies no softmax.
  """

  def __init__(self, vocab_size = vocab_size, input_size = input_size, hidden_size = hidden_units, batch_first = True):
    super().__init__()
    self.embedding_layer = nn.Embedding(vocab_size, input_size)
    self.lstm_layer = nn.LSTM(input_size, hidden_size, batch_first = batch_first)
    self.linear = nn.Linear(hidden_size, vocab_size)

  def forward(self, x):
    embedded = self.embedding_layer(x)
    # The final (h_n, c_n) state returned by the LSTM is not needed here.
    lstm_out, _ = self.lstm_layer(embedded)
    return self.linear(lstm_out)

In [None]:
# Instantiate the network with its default sizes, a cross-entropy loss
# and an Adam optimiser over all model parameters.
model = Net()
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(params = model.parameters())

In [None]:
# Run the existing model instance on a one-sample batch. Calling the class
# (`Net(...)`) would try to build a new network with the tensor as vocab_size.
outputs = model(torch.as_tensor(X[:1]))

TypeError: ignored

In [None]:
def decode(y):
  """Map an iterable of vocabulary ids to their words.

  Ids without an entry in `index_to_word` (such as the padding id 0) are
  rendered as "<pad>", so the result can always be passed to `str.join`.
  """
  return [index_to_word.get(x, "<pad>") for x in y]

In [None]:
# as_tensor reuses X when it already is a tensor; torch.tensor(X) would copy
# it and emit a UserWarning.
torch.as_tensor(X)

tensor([[   0,    0,    0,  ...,    0,    0, 2224],
        [   0,    0,    0,  ...,    0, 2224,  767],
        [   0,    0,    0,  ..., 2224,  767,  464],
        ...,
        [   0,    0,    0,  ..., 2239, 1362, 1666],
        [   0,    0,    0,  ..., 1362, 1666, 2627],
        [   0,    0,    0,  ..., 1666, 2627,  407]])

In [None]:
# Inputs: the padded id matrix without its last column.
inputs = torch.as_tensor(X)
# Targets: the last token id of every padded sequence. CrossEntropyLoss expects
# integer class ids, not one-hot rows.
Y = padded_sequences[:, -1]

# NOTE(review): this is full-batch training and the model emits logits for
# every time step, which is memory hungry; consider mini-batches.
for step in range(201):
  # Reset the gradients.
  optimizer.zero_grad()
  # Forward pass.
  output = model(inputs)
  # The model returns logits for every time step; only the last step
  # predicts the next word.
  logits = output[:, -1, :]
  # Compute the loss.
  loss = loss_func(logits, Y)
  # Backward pass.
  loss.backward()
  # Update the parameters.
  optimizer.step()
  # Log progress.
  if step % 40 == 0:
    print("[{:02d}/201] {:.4f} ".format(step+1, loss.item()))
    pred = logits.argmax(-1).tolist()
    # decode may yield None for ids without a word; join needs strings.
    words = [word if word is not None else "<pad>" for word in decode(pred)]
    print(" ".join(["Repeat"] + words))
    print()

TypeError: ignored

In [None]:
def sentence_generation(model, tokenizer, current_word, n):
  """Greedily extend `current_word` by up to `n` words using the PyTorch model.

  Args:
    model: trained `nn.Module` that maps a (1, sequence) tensor of word ids to
      logits, either per time step (3-D output) or for the last step (2-D).
    tokenizer: word -> id mapping the model was trained with: a dict such as
      `word_to_index`, or any object exposing that dict as `word_index`.
    current_word: seed text the generation starts from.
    n: maximum number of words to append.

  Returns:
    The seed text followed by the generated words, separated by spaces.
    Generation stops early if the model predicts an id that has no word
    (for example the padding id 0).
  """
  vocab = getattr(tokenizer, "word_index", tokenizer)
  id_to_word = {index: word for word, index in vocab.items()}
  # Same input width as the training features.
  context_len = X.shape[1]
  device = next(model.parameters()).device

  init_word = current_word
  generated = []

  was_training = model.training
  model.eval()
  with torch.no_grad():
    for _ in range(n):
      # Words outside the vocabulary are skipped; keep only the most recent
      # context_len ids and left-pad with zeros like the training data.
      ids = [vocab[word] for word in current_word.split() if word in vocab]
      ids = ids[-context_len:]
      padded = [0] * (context_len - len(ids)) + ids
      encoded = torch.tensor([padded], dtype = torch.long, device = device)

      logits = model(encoded)
      if logits.dim() == 3:
        # Per-time-step output: the last step predicts the next word.
        logits = logits[:, -1, :]
      predicted_index = int(logits.argmax(dim = -1)[0])

      word = id_to_word.get(predicted_index)
      if word is None:
        break

      # The predicted word becomes part of the input for the next step.
      current_word = current_word + ' ' + word
      generated.append(word)
  # Restore whatever train/eval mode the caller had set.
  model.train(was_training)

  return ' '.join([init_word] + generated)

In [None]:
# The PyTorch model was trained on ids from word_to_index, so that mapping
# (not the Keras tokenizer, whose ids differ) has to be used for generation.
print(sentence_generation(model, word_to_index, 'how', 15))