<a href="https://colab.research.google.com/github/castlechoi/studyingDL/blob/main/NLP/Word2Vec_Skipgram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

## Download corpus

In [96]:
# nltk.book에 저장된 corpus
import nltk
nltk.download("book", quiet = True)

True

In [97]:
# Load nltk.book.text1 
from nltk.book import text1
ex_book = nltk.book.text1[:100]

## Stop-words 제거

In [98]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

# stop-words에 특수기호 추가
stopwords  = stopwords + ['.',',','\'','!','?','\"','[',']','(',')','*','I',':',';','-','."','--','<','>']
print(f'Stop-words의 개수 : {len(stopwords)}')

ex_book_no_stopwords = [[t] for t in ex_book if t not in stopwords]
print(f'불용어 제외한 후 문장의 길이 : {len(ex_book_no_stopwords)}')

Stop-words의 개수 : 198
불용어 제외한 후 문장의 길이 : 50


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data preprocessing hyperparameter

In [99]:
min_count = 2
window = 2

## One-hot Encoding

In [100]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

In [101]:
# 토큰 집합 추출 ( 등장횟수가 2이하면 토큰화 안함)
cut_off = 0

tokens = pd.Series(ex_book_no_stopwords).value_counts()
for i in range(len(tokens)):
  if tokens[i] == min_count-1:
    cut_off = i
    break

tokens = tokens[:cut_off].index.tolist()
print(f'Token의 개수 : {len(tokens)}')


# token에 없는 데이터 모두 '<unk>'로 변경
ex_book_process = [t if t in tokens else ['<unk>'] for t in ex_book_no_stopwords]

Token의 개수 : 4


In [102]:
# One-Hot Encoding
oe = OneHotEncoder()
document_matrix = oe.fit_transform(ex_book_process)
print(f'문서의 단의 개수 : {document_matrix.shape[0]}')
print(f'One-Hot vector의 크기 : {document_matrix.shape[1]}')

문서의 단의 개수 : 50
One-Hot vector의 크기 : 5


# Skipgram

In [103]:
import torch
import torch.nn as nn
import torch.optim as optim

## GPU 설정

In [104]:
#GPU 체크
is_cuda = torch.cuda.is_available()
if is_cuda:
  device = torch.device("cuda")
  print("GPU is available")
else:
  device = torch.device("cpu")
  print("CPU is availalbe")

GPU is available


## Train data preprocessing

In [105]:
# skipgram에 들어갈 input vector 1개
#                   output vector 4개 ( window == 2)
train_x = []
train_y = []
for i in range(document_matrix.shape[0] - (window * 2)):
  neighbor = []
  neighbor.append(document_matrix[i].toarray())
  neighbor.append(document_matrix[i+1].toarray())
  neighbor.append(document_matrix[i+3].toarray())
  neighbor.append(document_matrix[i+4].toarray())

  train_x.append(document_matrix[i+2].toarray())
  train_y.append(neighbor)

In [106]:
train_x_tensor = torch.FloatTensor(train_x).view(-1,document_matrix.shape[1]).to(device)
train_y_tensor = torch.FloatTensor(train_y).view(4,-1,document_matrix.shape[1]).to(device)

print(f'train_x의 shape : {train_x_tensor.shape}') # 단어 개수 * 4 * one_hot
print(f'train_y의 shape : {train_y_tensor.shape}') # 단어 개수 * one_hot

train_x의 shape : torch.Size([46, 5])
train_y의 shape : torch.Size([4, 46, 5])


## Define the model

In [107]:
# Hyperparameter
num_epochs = 5000
lr = 0.001
emb_vector_size = 2

In [108]:
# Define weights without bias
W = torch.randn(document_matrix.shape[1],emb_vector_size).to(device).requires_grad_()
W_prime = torch.empty((4,emb_vector_size,document_matrix.shape[1])).to(device).requires_grad_()
for w in W_prime:
  w = torch.randn(emb_vector_size,document_matrix.shape[1])

In [109]:
# Define optimizer and loss
SG_optimizer = optim.Adam([W], lr = lr)
SG_optimizer_p = optim.Adam([W_prime], lr = lr)
criterion = nn.CrossEntropyLoss()

In [110]:
for i in range(num_epochs+1):
  # Input : 1 input vector
  y_pred = train_x_tensor @ W
  # Input : Embedding vector
  # Output : predict 4 neighbor one-hot vector

  W_prime = torch.transpose(W_prime,1,2)
  y_pred = torch.transpose(y_pred,0,1)
  y_pred = W_prime @ y_pred
  y_pred = torch.transpose(y_pred,1,2)
  W_prime = torch.transpose(W_prime,1,2)
  y_pred = y_pred.softmax(dim = 2)

  # compute loss
  loss = criterion(y_pred , train_y_tensor)
  
  # initiate optimizer
  SG_optimizer.zero_grad()
  SG_optimizer_p.zero_grad()
  # backpropagation
  loss.backward()
  SG_optimizer.step()
  SG_optimizer_p.step()

  if i % 500 == 0:
    print(f'epoch {i} : {loss.item()}')

epoch 0 : 35.328975677490234
epoch 500 : 35.363277435302734
epoch 1000 : 35.2880859375
epoch 1500 : 35.202186584472656
epoch 2000 : 35.19779586791992
epoch 2500 : 35.1953125
epoch 3000 : 35.193721771240234
epoch 3500 : 35.19257736206055
epoch 4000 : 35.19169235229492
epoch 4500 : 35.190956115722656
epoch 5000 : 35.190303802490234


In [111]:
print(W)
print(W_prime)

tensor([[ 2.4469,  1.8018],
        [ 0.0357, -0.4149],
        [-2.4160,  1.5630],
        [ 1.5258, -3.8440],
        [-2.5711, -2.6538]], device='cuda:0', requires_grad=True)
tensor([[[-1.5897e+00, -1.8490e+00,  1.5906e+00, -1.8490e+00, -1.8490e+00],
         [-1.3573e+00, -1.5676e+00,  1.3675e+00, -1.5676e+00, -1.5676e+00]],

        [[ 1.5766e+00,  1.4848e+00, -2.4007e+00,  7.3007e-01, -1.7441e+00],
         [-8.1129e-01, -2.0097e+00,  1.8158e+00,  7.7866e-01, -1.3027e+00]],

        [[ 1.3365e+22,  9.9476e-01,  1.7740e+22, -9.9478e-01,  9.9476e-01],
         [ 0.0000e+00, -1.2290e+00,  2.2421e-44,  1.2290e+00, -1.2290e+00]],

        [[-5.2761e+05,  4.5916e-41,  7.9584e-01,  0.0000e+00, -7.9584e-01],
         [ 2.7774e-02,  1.3417e+22, -3.3859e+00,  1.7702e+22,  3.3859e+00]]],
       device='cuda:0', grad_fn=<TransposeBackward0>)
