<a href="https://colab.research.google.com/github/castlechoi/summarize_ml/blob/main/NLP/Word2Vec_CBOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

## Download corpus

In [None]:
import torch
import torch.optim as optim
import torch.nn as nn

import nltk
# Download the NLTK "book" collection that the nltk.book cells below read from.
# quiet = True is passed so the downloader does not print its progress log.
nltk.download("book", quiet = True)

True

In [None]:
# Various corpora stored in nltk.book
# NOTE(review): the wildcard import pulls every public nltk.book name into the
# notebook namespace; the cells in this notebook only use `nltk.book.<name>`.
from nltk.book import *
nltk.book.texts()

text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [None]:
# The NLTK text is already tokenized; keep only the first 5000 tokens of text1
ex_book = nltk.book.text1[:5000]

## Stop-words 제거

In [None]:
# Make sure the NLTK stop-word corpus is available, then load the English list
nltk.download('stopwords')
english_stopwords = nltk.corpus.stopwords.words('english')

# Punctuation / symbol tokens (plus the pronoun 'I') that are filtered like stop-words
extra_stopwords = ['.',',','\'','!','?','\"','[',']','(',')','*','I',':',';','-','."','--','<','>']
stopwords = english_stopwords + extra_stopwords

print(f'Stop-words의 개수 : {len(stopwords)}')

Stop-words의 개수 : 198


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Drop every stop-word; each surviving token is wrapped in its own one-element list
ex_book_no_stopwords = []
for token in ex_book:
  if token in stopwords:
    continue
  ex_book_no_stopwords.append([token])
print(f'불용어 제외한 후 문장의 길이 : {len(ex_book_no_stopwords)}')

불용어 제외한 후 문장의 길이 : 2520


## Data preprocessing hyperparameter


In [None]:
# Frequency threshold used when building the vocabulary: words counted
# fewer than `min_count` times are cut from the token list
min_count = 2
# Context window size: number of neighbouring words used on each side of the target
window = 2

## One-hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

In [None]:
# Build the vocabulary: keep only the words that occur at least `min_count` times.
# Counts are taken over the plain word strings (each corpus entry is a one-element
# list) and the cut is a boolean filter, so the result does not depend on a word
# with exactly `min_count - 1` occurrences existing, nor on positional indexing
# into a label-indexed Series.
word_counts = pd.Series([entry[0] for entry in ex_book_no_stopwords]).value_counts()
frequent_words = word_counts[word_counts >= min_count].index.tolist()

# The corpus entries are one-element lists, so the vocabulary keeps the same
# [word] shape to stay comparable with them.
tokens = [[word] for word in frequent_words]
cut_off = len(tokens)  # number of vocabulary entries kept
print(f'Token의 개수 : {len(tokens)}')

Token의 개수 : [['S'], ['whale'], ['THE'], ['OF'], ['The'], ['WHALE'], ['A'], ['sea'], ['whales'], ['AND'], ['great'], ['Whale'], ['And'], ['Leviathan'], ['saw'], ['...'], ['one'], ['take'], ['There'], ['TO'], ['head'], ['ocean'], ['IN'], ['In'], ['It'], ['every'], ['water'], ['ship'], ['Sub'], ['Whales'], ['To'], ['BY'], ['would'], ['NANTUCKET'], ['near'], ['VOYAGE'], ['upon'], ['like'], ['He'], ['!"'], ['animal'], ['shall'], ['?"'], ['see'], ['mouth'], [',"'], ['ON'], ['killed'], ['vast'], ['Sperm'], ['world'], ['D'], ['us'], ['sir'], ['ye'], ['HISTORY'], ['shore'], ['find'], ['So'], ['ever'], ['many'], ['NUEE'], ['open'], ['said'], ['They'], ['king'], ['NARRATIVE'], ['boats'], ['If'], ['HIS'], ['air'], ['This'], ['oil'], ['time'], ['little'], ['years'], ['known'], ['fish'], ['called'], ['God'], ['WHALING'], ['two'], ['GLOBE'], ['jaws'], ['boat'], ['seen'], ['never'], ['long'], ['blows'], ['mast'], ['seas'], ['first'], ['told'], ['whether'], ['FOR'], ['could'], ['Right'], ['vessel'], ['

In [None]:
# Replace every entry that is not part of the vocabulary with the ['<unk>'] placeholder
ex_book_process = []
for entry in ex_book_no_stopwords:
  ex_book_process.append(entry if entry in tokens else ['<unk>'])

In [None]:
# One-hot encode the processed corpus: one row per word, one column per category
oe = OneHotEncoder()
document_matrix = oe.fit_transform(ex_book_process)
num_rows, num_columns = document_matrix.shape
print(f'문서의 단의 개수 : {num_rows}')
print(f'One-Hot vector의 크기 : {num_columns}')

문서의 단의 개수 : 2520
One-Hot vector의 크기 : 359


## GPU 설정

In [None]:
# Pick the compute device: CUDA when PyTorch reports an available GPU, otherwise the CPU
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "CPU is availalbe")

GPU is available


# CBOW Low-level

## Train data preprocessing 

In [None]:
# Build the CBOW training pairs. For every target word, train_x receives the
# 2 * window surrounding one-hot vectors (window words on each side) and
# train_y receives the target's own one-hot vector.
# The neighbour offsets are derived from `window`, so the context always
# follows the hyperparameter (for window = 2: -2, -1, +1, +2).
context_offsets = [offset for offset in range(-window, window + 1) if offset != 0]
train_x = []
train_y = []
for center in range(window, document_matrix.shape[0] - window):
  neighbor = [document_matrix[center + offset].toarray() for offset in context_offsets]

  train_x.append(neighbor)
  train_y.append(document_matrix[center].toarray())

In [None]:
one_hot_dim = document_matrix.shape[1]

# Stack the collected samples into float tensors, then move them to the selected device
context_tensor = torch.FloatTensor(train_x).view(-1,4,one_hot_dim)
target_tensor = torch.FloatTensor(train_y).view(-1,one_hot_dim)
train_x_tensor = context_tensor.to(device)
train_y_tensor = target_tensor.to(device)

print(f'train_x의 shape : {train_x_tensor.shape}') # (number of samples, 4, one-hot size)
print(f'train_y의 shape : {train_y_tensor.shape}') # (number of samples, one-hot size)

train_x의 shape : torch.Size([2516, 4, 359])
train_y의 shape : torch.Size([2516, 359])


## Define Model

In [None]:
# Hyperparameter
# Upper bound of the training loop counter (the loop runs num_epochs + 1 iterations)
num_epochs = 20000
# Learning rate handed to the Adam optimizer
lr = 0.001
# Number of columns of W / rows of W_prime, i.e. the embedding dimensionality
emb_vector_size = 2

In [None]:
# Weight matrices of the two projections (no bias terms), created as trainable tensors
vocab_size = document_matrix.shape[1]
# W: (one-hot size, embedding size)
W = torch.randn(vocab_size, emb_vector_size).to(device).requires_grad_()
# W_prime: (embedding size, one-hot size)
W_prime = torch.randn(emb_vector_size, vocab_size).to(device).requires_grad_()

In [None]:
# Define optimizer and loss
# Each weight matrix has its own Adam optimizer; both read the shared `lr`
# hyperparameter so that changing it affects the two matrices consistently.
CBOW_optimizer = optim.Adam([W], lr = lr)
CBOW_optimizer_p = optim.Adam([W_prime], lr = lr)
# nn.CrossEntropyLoss applies log-softmax internally (it expects unnormalised scores)
criterion = nn.CrossEntropyLoss()

## Train the model

In [None]:
for i in range(num_epochs+1):
  # Project the 4 context one-hot vectors with W and average them per sample
  hidden = torch.mean(train_x_tensor @ W, dim = 1)

  # Map the averaged embedding to vocabulary-sized scores (logits).
  # No softmax is applied here: nn.CrossEntropyLoss applies log-softmax
  # internally, so passing probabilities would normalise twice and
  # flatten the gradients.
  y_pred = hidden @ W_prime

  # compute loss
  loss = criterion(y_pred , train_y_tensor)

  # clear the gradients left over from the previous iteration
  CBOW_optimizer.zero_grad()
  CBOW_optimizer_p.zero_grad()
  # backpropagation and parameter update
  loss.backward()
  CBOW_optimizer.step()
  CBOW_optimizer_p.step()

  # report the loss every 500 iterations
  if i % 500 == 0:
    print(f'epoch {i} : {loss.item()}')


epoch 0 : 6.470300197601318
epoch 500 : 5.733725547790527
epoch 1000 : 5.087561130523682
epoch 1500 : 4.510709762573242
epoch 2000 : 4.055849552154541
epoch 2500 : 3.769590377807617
epoch 3000 : 3.6288301944732666
epoch 3500 : 3.5637311935424805
epoch 4000 : 3.5267300605773926
epoch 4500 : 3.498671770095825
epoch 5000 : 3.473546028137207


In [None]:
# linear2 = nn.Linear(document_matrix.shape[1], 10, bias = False).to(device)

# optimizer = optim.Adam(linear2.parameters(), lr = 0.03)
# linear = nn.Linear(10,document_matrix.shape[1], bias = False).to(device)
# optimizer_li = optim.Adam(linear.parameters(), lr = 0.03)

In [None]:
# Inspect the weights of the low-level CBOW model. `W` and `W_prime` are the
# trainable tensors defined in this section; `linear2` is never created by any
# executed cell, so iterating over it fails on a fresh kernel.
for weight in (W, W_prime):
  print(weight)

Parameter containing:
tensor([[-13.9413, -52.3662, -13.6099,  ..., -18.9126,   5.3579,  -7.0772],
        [ 10.6471,  -6.2604,  26.2865,  ...,  11.2689,  16.2586,   3.5958],
        [-26.5287,   1.1138, -11.1199,  ..., -12.2975,  14.4212,  20.5384],
        ...,
        [ -9.3449,   8.7183,  25.1193,  ...,  13.9652,   2.6227, -46.6823],
        [ 28.1785,  37.6067, -31.2626,  ..., -58.6981, -26.0484,  -0.5406],
        [ 19.5140,   8.3228, -61.3487,  ...,   3.3553,   4.0766, -12.7914]],
       device='cuda:0', requires_grad=True)


# CBOW Using nn.Linear

## Define Model

In [None]:
# Hyperparameter
# Upper bound of the training loop counter (the loop runs num_epochs + 1 iterations)
num_epochs = 20000
# Learning rate handed to the Adam optimizer
lr = 0.001
# Output size of the first nn.Linear layer, i.e. the embedding dimensionality
emb_vector_size = 2

In [None]:
# Define model, optimizer and loss
# CBOW_linear   : one-hot -> embedding projection (no bias)
# CBOW_linear_p : embedding -> vocabulary-sized scores (no bias)
CBOW_linear = nn.Linear(document_matrix.shape[1], emb_vector_size, bias = False).to(device)
CBOW_linear_p = nn.Linear(emb_vector_size,document_matrix.shape[1], bias = False).to(device)

# A single optimizer covers the parameters of BOTH layers. Binding the same
# name to two separate optimizers would keep only the last one and leave the
# first layer without any updates.
CBOW_linear_optimizer = optim.Adam(
    list(CBOW_linear.parameters()) + list(CBOW_linear_p.parameters()), lr = lr)
criterion = nn.CrossEntropyLoss()

## Train the model

In [None]:
for i in range(num_epochs+1):
  # Project the 4 context one-hot vectors of every sample into the embedding space
  context_emb = CBOW_linear(train_x_tensor)
  # Average over the context axis (dim 1) so one embedding per sample remains;
  # averaging over dim 0 would collapse the batch instead.
  hidden = torch.mean(context_emb, dim = 1)

  # Map the averaged embedding to vocabulary-sized scores (logits).
  # No softmax here: nn.CrossEntropyLoss applies log-softmax internally.
  y_pred = CBOW_linear_p(hidden)

  # compute loss
  loss = criterion(y_pred , train_y_tensor)

  # clear the gradients of both layers of this model
  CBOW_linear.zero_grad()
  CBOW_linear_p.zero_grad()
  # backpropagation
  loss.backward()
  # update the parameters registered with this section's optimizer
  # (CBOW_optimizer / CBOW_optimizer_p belong to the low-level W / W_prime model)
  CBOW_linear_optimizer.step()

  # report the loss every 500 iterations
  if i % 500 == 0:
    print(f'epoch {i} : {loss.item()}')

# CBOW using nn.Embedding

In [None]:
class Word2VecCBOW(nn.Module):
  """CBOW word2vec model built on an embedding lookup table.

  The context words are embedded, averaged over the context axis and
  projected to vocabulary-sized scores.

  Args:
    one_hot_dim: vocabulary size (number of rows of the embedding table and
      number of output scores).
    embedding_dim: size of each word embedding.
  """

  def __init__(self, one_hot_dim, embedding_dim):
    # nn.Module must be initialised before sub-modules can be assigned
    super().__init__()
    self.emb = nn.Embedding(num_embeddings = one_hot_dim,
                              embedding_dim = embedding_dim)

    self.linear = nn.Linear(embedding_dim, one_hot_dim)
    # Available for turning scores into probabilities at inference time; it is
    # not applied in forward() because nn.CrossEntropyLoss expects raw scores.
    self.softmax = nn.Softmax(dim = 1)

  def forward(self, x, tokens = None):
    """Return unnormalised vocabulary scores (logits) for each sample.

    Args:
      x: context words, either integer indices of shape (batch, context) or
        floating-point one-hot vectors of shape (batch, context, one_hot_dim).
      tokens: unused; optional so the model can be called with `x` alone.

    Returns:
      Tensor of shape (batch, one_hot_dim) holding raw scores.
    """
    if x.is_floating_point():
      # nn.Embedding looks rows up by integer index: recover the index of the
      # active position from each one-hot vector
      x = x.argmax(dim = -1)
    # average the context embeddings of every sample
    lookuptable = torch.mean(self.emb(x),dim = 1)
    out = self.linear(lookuptable)
    return out

In [None]:
# Hyperparameter
# Upper bound of the training loop counter (the loop runs num_epochs + 1 iterations)
num_epochs = 20000
# Learning rate handed to the Adam optimizer
lr = 0.001
# Embedding dimensionality
# NOTE(review): the model cell below passes a literal 2 rather than this name — confirm intent
emb_vector_size = 2

In [None]:
# Define model, optimizer and loss
# The embedding size comes from the `emb_vector_size` hyperparameter and the
# model is moved to the same device as the training tensors.
model = Word2VecCBOW(train_x_tensor.size(2), emb_vector_size).to(device)

# Optimise the parameters of this model (not those of the nn.Linear variant)
CBOW_emb_optimizer = optim.Adam(model.parameters(), lr = lr)
criterion = nn.CrossEntropyLoss()

In [None]:
# nn.Embedding looks rows up by integer index, so the one-hot context vectors
# are converted to word indices once, outside the loop: (samples, 4, one-hot)
# becomes (samples, 4).
train_x_index = train_x_tensor.argmax(dim = 2)

for i in range(num_epochs+1):
  # CBOW model
  y_pred = model(train_x_index)

  # compute loss
  loss = criterion(y_pred , train_y_tensor)

  # clear the gradients left over from the previous iteration
  CBOW_emb_optimizer.zero_grad()
  # backpropagation and parameter update
  loss.backward()
  CBOW_emb_optimizer.step()

  # report the loss every 500 iterations
  if i % 500 == 0:
    print(f'epoch {i} : {loss.item()}')