<a href="https://colab.research.google.com/github/castlechoi/studyingDL/blob/main/NLP/Word2Vec_CBOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

## Download corpus

In [1]:
import torch
import torch.optim as optim
import torch.nn as nn

import nltk
nltk.download("book", quiet = True)

True

In [2]:
# nltk.book에 저장된 다양한 corpus
from nltk.book import *
nltk.book.texts()

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [3]:
# tokenize 모두 완료 되어 있음
ex_book = nltk.book.text1[:500]

## Stop-words 제거

In [4]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

# stop-words에 특수기호 추가
stopwords  = stopwords + ['.',',','\'','!','?','\"','[',']','(',')','*','I',':',';','-','."','--','<','>']

print(f'Stop-words의 개수 : {len(stopwords)}')

Stop-words의 개수 : 198


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# 불용어 빠진 것을 확인
ex_book_no_stopwords = [[t] for t in ex_book if t not in stopwords]
print(f'불용어 제외한 후 문장의 길이 : {len(ex_book_no_stopwords)}')

불용어 제외한 후 문장의 길이 : 241


## Data preprocessing hyperparameter


In [6]:
min_count = 2
window = 2

## One-hot Encoding

In [7]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

In [8]:
# 토큰 집합 추출 ( 등장횟수가 2이하면 토큰화 안함)
cut_off = 0

tokens = pd.Series(ex_book_no_stopwords).value_counts()
for i in range(len(tokens)):
  if tokens[i] == min_count-1:
    cut_off = i
    break
tokens = tokens[:cut_off].index.tolist()
print(f'Token의 개수 : {len(tokens)}')

Token의 개수 : 24


In [9]:
# token에 없는 데이터 모두 <unk>로 변경
ex_book_process = [t if t in tokens else ['<unk>'] for t in ex_book_no_stopwords]

In [10]:
# One-Hot Encoding
oe = OneHotEncoder()
document_matrix = oe.fit_transform(ex_book_process)
print(f'문서의 단의 개수 : {document_matrix.shape[0]}')
print(f'One-Hot vector의 크기 : {document_matrix.shape[1]}')

문서의 단의 개수 : 241
One-Hot vector의 크기 : 25


## GPU 설정

In [11]:
#GPU 체크
is_cuda = torch.cuda.is_available()
if is_cuda:
  device = torch.device("cuda")
  print("GPU is available")
else:
  device = torch.device("cpu")
  print("CPU is availalbe")

GPU is available


# CBOW Low-level

## Train data preprocessing 

In [12]:
# train_x에 CBOW의 input으로 들어가는 4개의 벡터
train_x = []
train_y = []
for i in range(document_matrix.shape[0] - (window * 2)):
  neighbor = []
  neighbor.append(document_matrix[i].toarray())
  neighbor.append(document_matrix[i+1].toarray())
  neighbor.append(document_matrix[i+3].toarray())
  neighbor.append(document_matrix[i+4].toarray())

  train_x.append(neighbor)
  train_y.append(document_matrix[i+2].toarray())

In [13]:
train_x_tensor = torch.FloatTensor(train_x).view(-1,4,document_matrix.shape[1]).to(device)
train_y_tensor = torch.FloatTensor(train_y).view(-1,document_matrix.shape[1]).to(device)

print(f'train_x의 shape : {train_x_tensor.shape}') # 단어 개수 * 4 * one_hot
print(f'train_y의 shape : {train_y_tensor.shape}') # 단어 개수 * one_hot

  train_x_tensor = torch.FloatTensor(train_x).view(-1,4,document_matrix.shape[1]).to(device)


train_x의 shape : torch.Size([237, 4, 25])
train_y의 shape : torch.Size([237, 25])


## Define Model

In [18]:
# Hyperparameter
num_epochs = 5000
lr = 0.001
emb_vector_size = 2

In [19]:
# Define weights without bias
W = torch.randn(document_matrix.shape[1],emb_vector_size).to(device).requires_grad_()
W_prime = torch.randn(emb_vector_size,document_matrix.shape[1]).to(device).requires_grad_()

In [20]:
# Define optimizer and loss
CBOW_optimizer = optim.Adam([W], lr = lr)
CBOW_optimizer_p = optim.Adam([W_prime], lr = 0.001)
criterion = nn.CrossEntropyLoss()

## Train the model

In [21]:
for i in range(num_epochs+1):
  # Input : 4 neighbor vector
  y_pred = train_x_tensor @ W
  y_pred = torch.mean(y_pred, dim = 1)
  
  # Input : Embedding vector
  # Output : predict one-hot vector
  y_pred = y_pred @ W_prime
  y_pred = y_pred.softmax(dim = 1)

  # compute loss
  loss = criterion(y_pred , train_y_tensor)
  
  # initiate optimizer
  CBOW_optimizer.zero_grad()
  CBOW_optimizer_p.zero_grad()
  # backpropagation
  loss.backward()
  CBOW_optimizer.step()
  CBOW_optimizer_p.step()

  if i % 500 == 0:
    print(f'epoch {i} : {loss.item()}')


epoch 0 : 3.2126963138580322
epoch 500 : 2.796860456466675
epoch 1000 : 2.5478672981262207
epoch 1500 : 2.532090902328491
epoch 2000 : 2.5286431312561035
epoch 2500 : 2.527249574661255
epoch 3000 : 2.5263795852661133
epoch 3500 : 2.5256736278533936
epoch 4000 : 2.5251193046569824
epoch 4500 : 2.5246198177337646
epoch 5000 : 2.5237510204315186


## Print lookuptable

In [22]:
print(W)
print(W_prime)

tensor([[-2.3220,  1.4787],
        [-2.4414,  1.7075],
        [-2.3341,  0.9295],
        [-1.1732,  2.3236],
        [-1.2025,  2.1571],
        [-0.9378,  2.3369],
        [-2.5139,  2.1506],
        [-2.9135,  0.4221],
        [-1.7070,  1.9217],
        [-2.0078,  2.4650],
        [-1.3215, -0.2824],
        [-1.7759,  1.2361],
        [-1.5929,  2.0696],
        [-3.2544,  1.7984],
        [ 1.3931, -4.8613],
        [ 0.5331,  2.3908],
        [-1.8330,  2.5768],
        [-0.4055,  0.9070],
        [-2.1889,  1.1575],
        [-3.0982,  3.2478],
        [-2.6938,  3.3694],
        [-1.4535,  2.2281],
        [-1.0500,  0.6617],
        [-1.4947,  1.4635],
        [-3.1037,  1.7181]], device='cuda:0', requires_grad=True)
tensor([[ 1.8155, -2.3166,  2.1170,  1.1251,  2.6273,  0.3144,  0.4515,  0.5457,
          1.2717,  1.0745,  0.2162,  0.5289,  1.2387,  1.3951,  1.6938,  2.0017,
          1.9484,  0.6135,  2.2504,  1.3866, -0.7047, -0.4065,  1.7640,  1.3886,
          2.3144],


# CBOW Using nn.Linear

## Define Model

In [29]:
# Hyperparameter
num_epochs = 5000
lr = 0.001
emb_vector_size = 2

In [39]:
# Define model, optimizer and loss
CBOW_linear = nn.Linear(document_matrix.shape[1], emb_vector_size, bias = False).to(device)
CBOW_linear_p = nn.Linear(emb_vector_size,document_matrix.shape[1], bias = False).to(device)

CBOW_linear_optimizer = optim.Adam(CBOW_linear.parameters(), lr = lr)
CBOW_linear_optimizer_p = optim.Adam(CBOW_linear_p.parameters(), lr = lr)
criterion = nn.CrossEntropyLoss()

## Train the model

In [40]:
for i in range(num_epochs+1):
  # Input : 4 neighbor vector
  y_pred = CBOW_linear(train_x_tensor)
  y_pred = torch.mean(y_pred, dim = 1)

  # Input : Embedding vector
  # Output : predict one-hot vector
  y_pred = CBOW_linear_p(y_pred)
  y_pred = y_pred.softmax(dim = 1)
  
  # compute loss
  loss = criterion(y_pred , train_y_tensor)
  
  # initiate optimizer
  CBOW_linear_optimizer.zero_grad()
  CBOW_linear_optimizer_p.zero_grad()
  # backpropagation
  loss.backward()
  CBOW_linear_optimizer.step()
  CBOW_linear_optimizer_p.step()

  if i % 500 == 0:
    print(f'epoch {i} : {loss.item()}')

epoch 0 : 3.219789743423462
epoch 500 : 2.796447992324829
epoch 1000 : 2.540860891342163
epoch 1500 : 2.5310275554656982
epoch 2000 : 2.5284292697906494
epoch 2500 : 2.527350425720215
epoch 3000 : 2.5268001556396484
epoch 3500 : 2.5264837741851807
epoch 4000 : 2.526287317276001
epoch 4500 : 2.5261573791503906
epoch 5000 : 2.526066303253174


## Print lookuptable

In [41]:
for emb in CBOW_linear.parameters():
  print(emb)
for emb in CBOW_linear_p.parameters():
  print(emb)

Parameter containing:
tensor([[ 1.6451,  1.5978,  1.4798,  1.7055,  1.4656,  1.4622,  1.4019,  1.4408,
          1.4536,  1.6541,  1.4793,  1.6066,  1.7169,  1.4799, -0.3130,  1.6305,
          1.3817,  1.4672,  1.5839,  1.5130,  1.6139,  1.4886,  1.5120,  1.5042,
          1.4586],
        [-1.6063, -1.4738, -1.6220, -1.4634, -1.5334, -1.6608, -1.7304, -1.6985,
         -1.3077, -1.6038, -1.2266, -1.4555, -1.3957, -1.4720,  0.1532, -1.4922,
         -1.6765, -1.6239, -1.3901, -1.6084, -1.7977, -1.5566, -1.6348, -1.5458,
         -1.5837]], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([[-1.9625,  1.9759],
        [ 2.0504, -2.0034],
        [-1.9161,  1.7455],
        [-1.8670,  1.8951],
        [-2.2532,  1.0594],
        [-1.3523,  1.5523],
        [-1.8164,  1.5042],
        [-1.6907,  1.9174],
        [-1.7495,  1.2905],
        [-1.5355,  1.8914],
        [-1.4297,  1.0106],
        [-1.8764,  1.3475],
        [-1.9775,  1.1070],
        [-1.4960,  1.8240],
  