## Download corpus

In [31]:
import nltk
nltk.download("book", quiet = True)

True

In [32]:
# nltk.book에 저장된 다양한 corpus
from nltk.book import *
nltk.book.texts()

text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [33]:
ex_book = nltk.book.text1[:30000]

In [34]:
# tokenize 다 되어 있음음
print(ex_book[:10])

['[', 'Moby', 'Dick', 'by', 'Herman', 'Melville', '1851', ']', 'ETYMOLOGY', '.']


## Stop-words 제거

In [35]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

# stop-words에 특수기호 추가
stopwords  = stopwords + ['.',',','\'','!','?','\"','[',']','(',')','*']

print(f'Stop-words의 개수 : {len(stopwords)}')

Stop-words의 개수 : 190


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
# 불용어 빠진 것을 확인인
ex_book_preprocessing = [[t] for t in ex_book if t not in stopwords]
print(ex_book_preprocessing[:10])

[['Moby'], ['Dick'], ['Herman'], ['Melville'], ['1851'], ['ETYMOLOGY'], ['Supplied'], ['Late'], ['Consumptive'], ['Usher']]


## One-hot Encoding

In [37]:
from sklearn.preprocessing import OneHotEncoder

In [38]:
oe = OneHotEncoder()
document_matrix = oe.fit_transform(ex_book_preprocessing)
print(f'문서의 단의 개수 : {document_matrix.shape[0]}')
print(f'One-Hot vector의 크기 : {document_matrix.shape[1]}')

문서의 단의 개수 : 14990
One-Hot vector의 크기 : 5408


## processing

In [39]:
import torch
import torch.optim as optim
import torch.nn as nn


In [40]:
#GPU 체크
is_cuda = torch.cuda.is_available()
if is_cuda:
  device = torch.device("cuda")
  print("GPU is available")
else:
  device = torch.device("cpu")
  print("CPU is availalbe")

GPU is available


In [51]:
W = torch.rand(document_matrix.shape[1],2).to(device)
W_prime = torch.rand(2, document_matrix.shape[1]).to(device)

In [45]:
train_x = []
train_y = []
for i in range(document_matrix.shape[0] - 4):
  neighbor = []
  neighbor.append(document_matrix[i].toarray())
  neighbor.append(document_matrix[i+1].toarray())
  neighbor.append(document_matrix[i+3].toarray())
  neighbor.append(document_matrix[i+4].toarray())

  train_x.append(neighbor)
  train_y.append(document_matrix[i+2].toarray())

In [46]:
train_x_tensor = torch.Tensor(train_x).view(-1,4,document_matrix.shape[1]).to(device)
train_y_tensor = torch.Tensor(train_y).view(-1,document_matrix.shape[1]).to(device)

print(f'train_x의 shape : {train_x_tensor.shape}') # 단어 개수 * 4 * one_hot
print(f'train_y의 shape : {train_y_tensor.shape}') # 단어 개수 * one_hot

train_x의 shape : torch.Size([14986, 4, 5408])
train_y의 shape : torch.Size([14986, 5408])


In [73]:
# 주변 단어로 중심단어의 예측
num_epochs = 10000
optimizer = optim.Adam([W,W_prime], lr = 0.1)
criterion = nn.CrossEntropyLoss()
softmax = nn.Softmax(dim = 0)

In [91]:
for i in range(num_epochs+1):
  mat = []
  for j in range(train_x_tensor.size(0)):
    y_pred = torch.matmul(train_x_tensor[i], W)
    y_pred = torch.mean(y_pred, dim = 0)
    y_pred = torch.matmul(y_pred, W_prime)
    y_pred = softmax(y_pred)
    mat.append(y_pred.tolist())
  mat = torch.FloatTensor(mat).to(device)
  loss = criterion(mat, train_y_tensor)
  loss.requires_grad_(True)

  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  if i % 1 == 0:
    print(f'Epoch {i} Loss : {loss.item()}')


Epoch 0 Loss : 8.595635414123535
Epoch 1 Loss : 8.595635414123535
Epoch 2 Loss : 8.595634460449219
Epoch 3 Loss : 8.595635414123535


KeyboardInterrupt: ignored