## **2. Word2Vec**


---

*   주어진 단어들을 word2vec 모델에 들어갈 수 있는 형태로 만들어본다.
*   CBOW, Skip-gram 모델을 각각 구현한다.





In [1]:
!pip install konlpy



In [2]:
from tqdm import tqdm
from konlpy.tag import Okt
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

import torch
import copy
import numpy as np

In [3]:
# Toy corpus: ten short Korean restaurant reviews used as the training text.
train_data = [
  "정말 맛있습니다. 추천합니다.",
  "기대했던 것보단 별로였네요.",
  "다 좋은데 가격이 너무 비싸서 다시 가고 싶다는 생각이 안 드네요.",
  "완전 최고입니다! 재방문 의사 있습니다.",
  "음식도 서비스도 다 만족스러웠습니다.",
  "위생 상태가 좀 별로였습니다. 좀 더 개선되기를 바랍니다.",
  "맛도 좋았고 직원분들 서비스도 너무 친절했습니다.",
  "기념일에 방문했는데 음식도 분위기도 서비스도 다 좋았습니다.",
  "전반적으로 음식이 너무 짰습니다. 저는 별로였네요.",
  "위생에 조금 더 신경 썼으면 좋겠습니다. 조금 불쾌했습니다."       
]

# Probe words; presumably used later to inspect the learned embeddings
# (not referenced in the cells visible here).
test_words = ["음식", "맛", "서비스", "위생", "가격"]

In [4]:
# KoNLPy Okt analyzer instance; make_tokenized() calls its morphs() method.
tokenizer = Okt()

In [5]:
def make_tokenized(data):
  """Split every sentence in `data` into stemmed morphemes.

  Uses the module-level `tokenizer` and shows a tqdm progress bar while
  iterating. Returns one token list per input sentence, in input order.
  """
  return [tokenizer.morphs(sentence, stem=True) for sentence in tqdm(data)]

In [6]:
# Tokenize the whole corpus: one list of stemmed morphemes per review.
train_tokenized = make_tokenized(train_data)

100%|██████████| 10/10 [00:05<00:00,  1.73it/s]


In [7]:
# Frequency table over all tokens; defaultdict(int) starts unseen tokens at 0.
word_count = defaultdict(int)

for tokens in tqdm(train_tokenized):
  for token in tokens:
    word_count[token] += 1

100%|██████████| 10/10 [00:00<00:00, 1332.58it/s]


In [8]:
# Rebind word_count to a list of (token, count) pairs, most frequent first.
# sorted() is stable, so tokens with equal counts keep their first-seen order.
word_count = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
print(word_count)

[('.', 14), ('도', 7), ('이다', 4), ('좋다', 4), ('별로', 3), ('다', 3), ('이', 3), ('너무', 3), ('음식', 3), ('서비스', 3), ('하다', 2), ('방문', 2), ('위생', 2), ('좀', 2), ('더', 2), ('에', 2), ('조금', 2), ('정말', 1), ('맛있다', 1), ('추천', 1), ('기대하다', 1), ('것', 1), ('보단', 1), ('가격', 1), ('비싸다', 1), ('다시', 1), ('가다', 1), ('싶다', 1), ('생각', 1), ('안', 1), ('드네', 1), ('요', 1), ('완전', 1), ('최고', 1), ('!', 1), ('재', 1), ('의사', 1), ('있다', 1), ('만족스럽다', 1), ('상태', 1), ('가', 1), ('개선', 1), ('되다', 1), ('기르다', 1), ('바라다', 1), ('맛', 1), ('직원', 1), ('분들', 1), ('친절하다', 1), ('기념일', 1), ('분위기', 1), ('전반', 1), ('적', 1), ('으로', 1), ('짜다', 1), ('저', 1), ('는', 1), ('신경', 1), ('써다', 1), ('불쾌하다', 1)]


In [9]:
# Word-to-index vocabulary: ids are handed out in frequency order, so the
# most frequent token gets id 0.
w2i = {}
for pair in tqdm(word_count):
  # len(w2i) is evaluated before insertion, i.e. it is the next free id;
  # setdefault leaves an already-registered token untouched.
  w2i.setdefault(pair[0], len(w2i))

100%|██████████| 60/60 [00:00<00:00, 37712.91it/s]


In [10]:
# Display the tokenized corpus (notebook cell output).
train_tokenized


[['정말', '맛있다', '.', '추천', '하다', '.'],
 ['기대하다', '것', '보단', '별로', '이다', '.'],
 ['다',
  '좋다',
  '가격',
  '이',
  '너무',
  '비싸다',
  '다시',
  '가다',
  '싶다',
  '생각',
  '이',
  '안',
  '드네',
  '요',
  '.'],
 ['완전', '최고', '이다', '!', '재', '방문', '의사', '있다', '.'],
 ['음식', '도', '서비스', '도', '다', '만족스럽다', '.'],
 ['위생',
  '상태',
  '가',
  '좀',
  '별로',
  '이다',
  '.',
  '좀',
  '더',
  '개선',
  '되다',
  '기르다',
  '바라다',
  '.'],
 ['맛', '도', '좋다', '직원', '분들', '서비스', '도', '너무', '친절하다', '.'],
 ['기념일', '에', '방문', '하다', '음식', '도', '분위기', '도', '서비스', '도', '다', '좋다', '.'],
 ['전반', '적', '으로', '음식', '이', '너무', '짜다', '.', '저', '는', '별로', '이다', '.'],
 ['위생', '에', '조금', '더', '신경', '써다', '좋다', '.', '조금', '불쾌하다', '.']]

In [11]:
# Display the word-to-index mapping (notebook cell output).
w2i

{'!': 34,
 '.': 0,
 '가': 40,
 '가격': 23,
 '가다': 26,
 '개선': 41,
 '것': 21,
 '기념일': 49,
 '기대하다': 20,
 '기르다': 43,
 '너무': 7,
 '는': 56,
 '다': 5,
 '다시': 25,
 '더': 14,
 '도': 1,
 '되다': 42,
 '드네': 30,
 '만족스럽다': 38,
 '맛': 45,
 '맛있다': 18,
 '바라다': 44,
 '방문': 11,
 '별로': 4,
 '보단': 22,
 '분들': 47,
 '분위기': 50,
 '불쾌하다': 59,
 '비싸다': 24,
 '상태': 39,
 '생각': 28,
 '서비스': 9,
 '신경': 57,
 '싶다': 27,
 '써다': 58,
 '안': 29,
 '에': 15,
 '완전': 32,
 '요': 31,
 '위생': 12,
 '으로': 53,
 '음식': 8,
 '의사': 36,
 '이': 6,
 '이다': 2,
 '있다': 37,
 '재': 35,
 '저': 55,
 '적': 52,
 '전반': 51,
 '정말': 17,
 '조금': 16,
 '좀': 13,
 '좋다': 3,
 '직원': 46,
 '짜다': 54,
 '최고': 33,
 '추천': 19,
 '친절하다': 48,
 '하다': 10}

word2vec  

*   CBOW (Continuous Bag of Words) : 주변 단어를 통해 중심 단어를 예측
*   Skip-gram : 중심 단어를 보고 주변을 예측



In [12]:
class CBOWDataset(Dataset):
  """(context ids, center id) training pairs for the CBOW model.

  Every token that has `window_size` neighbours on both sides yields one
  sample: `x` holds the 2 * window_size surrounding token ids and `y` the id
  of the token itself. Tokens closer than `window_size` to either end of a
  sentence are skipped. Token ids come from the module-level `w2i` mapping.
  """

  def __init__(self, train_tokenized, window_size=2):
    contexts = []
    centers = []

    for tokens in tqdm(train_tokenized):
      token_ids = [w2i[token] for token in tokens]
      upper = len(token_ids) - window_size
      for pos, center in enumerate(token_ids):
        # Skip positions that lack a full window on either side.
        if not (window_size <= pos < upper):
          continue
        before = token_ids[pos - window_size:pos]
        after = token_ids[pos + 1:pos + window_size + 1]
        contexts.append(before + after)
        centers.append(center)

    self.x = torch.LongTensor(contexts)  # (number of samples, 2 * window_size)
    self.y = torch.LongTensor(centers)  # (number of samples)

  def __len__(self):
    """Number of (context, center) samples."""
    return self.x.shape[0]

  def __getitem__(self, idx):
    """Return the context ids and the center id of sample `idx`."""
    return self.x[idx], self.y[idx]

In [13]:
class SkipGramDataset(Dataset):
  """(center id, context id) training pairs for the Skip-gram model.

  Every token that has `window_size` neighbours on both sides yields
  2 * window_size samples, one per neighbour: `x` is the id of the token
  itself and `y` the id of that neighbour. Tokens closer than `window_size`
  to either end of a sentence are skipped. Token ids come from the
  module-level `w2i` mapping.
  """

  def __init__(self, train_tokenized, window_size=2):
    centers = []
    contexts = []

    for tokens in tqdm(train_tokenized):
      token_ids = [w2i[token] for token in tokens]
      upper = len(token_ids) - window_size
      for pos, center in enumerate(token_ids):
        # Skip positions that lack a full window on either side.
        if not (window_size <= pos < upper):
          continue
        before = token_ids[pos - window_size:pos]
        after = token_ids[pos + 1:pos + window_size + 1]
        contexts.extend(before + after)
        # Repeat the center id once per neighbour so x and y stay aligned.
        centers.extend([center] * 2 * window_size)

    self.x = torch.LongTensor(centers)  # (number of samples)
    self.y = torch.LongTensor(contexts)  # (number of samples)

  def __len__(self):
    """Number of (center, context) samples."""
    return self.x.shape[0]

  def __getitem__(self, idx):
    """Return the center id and the context id of sample `idx`."""
    return self.x[idx], self.y[idx]

In [14]:
# Build both datasets with the default window_size of 2.
cbow_set = CBOWDataset(train_tokenized)
skipgram_set = SkipGramDataset(train_tokenized)
# Dump every (center id, context id) pair of the Skip-gram dataset.
print(list(skipgram_set))

100%|██████████| 10/10 [00:00<00:00, 27165.18it/s]
100%|██████████| 10/10 [00:00<00:00, 3671.48it/s]

[(tensor(0), tensor(17)), (tensor(0), tensor(18)), (tensor(0), tensor(19)), (tensor(0), tensor(10)), (tensor(19), tensor(18)), (tensor(19), tensor(0)), (tensor(19), tensor(10)), (tensor(19), tensor(0)), (tensor(22), tensor(20)), (tensor(22), tensor(21)), (tensor(22), tensor(4)), (tensor(22), tensor(2)), (tensor(4), tensor(21)), (tensor(4), tensor(22)), (tensor(4), tensor(2)), (tensor(4), tensor(0)), (tensor(23), tensor(5)), (tensor(23), tensor(3)), (tensor(23), tensor(6)), (tensor(23), tensor(7)), (tensor(6), tensor(3)), (tensor(6), tensor(23)), (tensor(6), tensor(7)), (tensor(6), tensor(24)), (tensor(7), tensor(23)), (tensor(7), tensor(6)), (tensor(7), tensor(24)), (tensor(7), tensor(25)), (tensor(24), tensor(6)), (tensor(24), tensor(7)), (tensor(24), tensor(25)), (tensor(24), tensor(26)), (tensor(25), tensor(7)), (tensor(25), tensor(24)), (tensor(25), tensor(26)), (tensor(25), tensor(27)), (tensor(26), tensor(24)), (tensor(26), tensor(25)), (tensor(26), tensor(27)), (tensor(26), tens






*   `self.embedding`: `vocab_size` 크기의 one-hot vector를 특정 크기의 `dim` 차원으로 embedding 시키는 layer.
*   `self.linear`: 변환된 embedding vector를 다시 원래 `vocab_size`로 바꾸는 layer.


In [15]:
class CBOW(nn.Module):
  """CBOW model: predicts the center word from the sum of its context embeddings.

  `embedding` maps word ids to `dim`-dimensional vectors (sparse gradients);
  `linear` projects the summed context vector back to `vocab_size` scores.
  """

  def __init__(self, vocab_size, dim):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, dim, sparse=True)
    self.linear = nn.Linear(dim, vocab_size)

  # B: batch size, W: window size, d_w: word embedding size, V: vocab size
  def forward(self, x):  # x: (B, 2W)
    """Return unnormalized vocabulary scores for each batch of context ids."""
    context_sum = self.embedding(x).sum(dim=1)  # (B, 2W, d_w) -> (B, d_w)
    return self.linear(context_sum)  # (B, V)

In [16]:
class SkipGram(nn.Module):
  """Skip-gram model: predicts a context word from the center word's embedding.

  `embedding` maps word ids to `dim`-dimensional vectors (sparse gradients);
  `linear` projects each vector back to `vocab_size` scores.
  """

  def __init__(self, vocab_size, dim):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, dim, sparse=True)
    self.linear = nn.Linear(dim, vocab_size)

  # B: batch size, W: window size, d_w: word embedding size, V: vocab size
  def forward(self, x):  # x: (B)
    """Return unnormalized vocabulary scores for each center word id."""
    center_vectors = self.embedding(x)  # (B, d_w)
    return self.linear(center_vectors)  # (B, V)

In [17]:
# One model of each kind; the vocabulary size is the number of entries in w2i
# and every word gets a 256-dimensional embedding.
cbow = CBOW(vocab_size=len(w2i), dim=256)
skipgram = SkipGram(vocab_size=len(w2i), dim=256)

In [18]:
# Training hyperparameters shared by the CBOW and Skip-gram runs.
batch_size=4
learning_rate = 5e-4
num_epochs = 5
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# No shuffle argument is passed, so the DataLoader default applies.
cbow_loader = DataLoader(cbow_set, batch_size=batch_size)
skipgram_loader = DataLoader(skipgram_set, batch_size=batch_size)

In [19]:
# Train the CBOW model: switch to training mode and move it to the device.
cbow.train()
cbow = cbow.to(device)
# The embedding layer is built with sparse=True; torch.optim.SGD is one of
# the optimizers that accepts sparse gradients.
optim = torch.optim.SGD(cbow.parameters(), lr=learning_rate)
# Cross-entropy between the (B, V) scores and the (B) target word ids.
loss_function = nn.CrossEntropyLoss()

for e in range(1, num_epochs+1):
  print("#" * 50)
  print(f"Epoch: {e}")
  for batch in tqdm(cbow_loader):
    x, y = batch
    x, y = x.to(device), y.to(device) # (B, 2W), (B)
    output = cbow(x)  # (B, V)
 
    # Clear gradients left over from the previous step before backprop.
    optim.zero_grad()
    loss = loss_function(output, y)
    loss.backward()
    optim.step()

    # Loss is reported once per batch.
    print(f"Train loss: {loss.item()}")

print("Finished.")

100%|██████████| 16/16 [00:00<00:00, 137.54it/s]
100%|██████████| 16/16 [00:00<00:00, 516.68it/s]
100%|██████████| 16/16 [00:00<00:00, 558.88it/s]
  0%|          | 0/16 [00:00<?, ?it/s]

##################################################
Epoch: 1
Train loss: 4.5756001472473145
Train loss: 5.033390045166016
Train loss: 4.799264430999756
Train loss: 4.112525939941406
Train loss: 4.375237464904785
Train loss: 5.7022833824157715
Train loss: 5.0057854652404785
Train loss: 5.185102462768555
Train loss: 4.16326904296875
Train loss: 6.442245006561279
Train loss: 4.571232795715332
Train loss: 5.445986747741699
Train loss: 4.945189952850342
Train loss: 5.204785346984863
Train loss: 4.42657470703125
Train loss: 4.478846073150635
##################################################
Epoch: 2
Train loss: 4.413271903991699
Train loss: 4.882137775421143
Train loss: 4.675271987915039
Train loss: 4.003536224365234
Train loss: 4.258169174194336
Train loss: 5.400237083435059
Train loss: 4.836596488952637
Train loss: 5.066331386566162
Train loss: 4.047069549560547
Train loss: 6.234572410583496
Train loss: 4.388415336608887
Train loss: 5.022546291351318
Train loss: 4.817224025726318
Train los

100%|██████████| 16/16 [00:00<00:00, 601.87it/s]
100%|██████████| 16/16 [00:00<00:00, 634.25it/s]

Train loss: 5.835268974304199
Train loss: 4.065903663635254
Train loss: 4.264187812805176
Train loss: 4.565861701965332
Train loss: 4.869189262390137
Train loss: 3.9465930461883545
Train loss: 4.05630350112915
##################################################
Epoch: 5
Train loss: 3.9452064037323
Train loss: 4.442858695983887
Train loss: 4.313515663146973
Train loss: 3.6858983039855957
Train loss: 3.914898157119751
Train loss: 4.567197322845459
Train loss: 4.347438812255859
Train loss: 4.720420837402344
Train loss: 3.7329492568969727
Train loss: 5.644664764404297
Train loss: 3.926304340362549
Train loss: 3.9354019165039062
Train loss: 4.442476272583008
Train loss: 4.763086795806885
Train loss: 3.793854236602783
Train loss: 3.9232168197631836
Finished.





In [20]:
# Train the Skip-gram model: switch to training mode and move it to the device.
skipgram.train()
skipgram = skipgram.to(device)
# Rebinds the global `optim` / `loss_function` names for the Skip-gram run.
# The embedding layer is built with sparse=True; torch.optim.SGD is one of
# the optimizers that accepts sparse gradients.
optim = torch.optim.SGD(skipgram.parameters(), lr=learning_rate)
# Cross-entropy between the (B, V) scores and the (B) target word ids.
loss_function = nn.CrossEntropyLoss()

for e in range(1, num_epochs+1):
  print("#" * 50)
  print(f"Epoch: {e}")
  for batch in tqdm(skipgram_loader):
    x, y = batch
    x, y = x.to(device), y.to(device) # (B), (B)
    output = skipgram(x)  # (B, V)

    # Clear gradients left over from the previous step before backprop.
    optim.zero_grad()
    loss = loss_function(output, y)
    loss.backward()
    optim.step()

    # Loss is reported once per batch.
    print(f"Train loss: {loss.item()}")

print("Finished.")

100%|██████████| 64/64 [00:00<00:00, 609.08it/s]
  0%|          | 0/64 [00:00<?, ?it/s]

##################################################
Epoch: 1
Train loss: 4.593864440917969
Train loss: 4.140463829040527
Train loss: 3.876666784286499
Train loss: 4.2731733322143555
Train loss: 4.54222297668457
Train loss: 4.357439994812012
Train loss: 3.871335029602051
Train loss: 4.991520881652832
Train loss: 3.970647096633911
Train loss: 4.631829738616943
Train loss: 4.052855968475342
Train loss: 4.532125473022461
Train loss: 4.1504082679748535
Train loss: 4.162090301513672
Train loss: 4.544067859649658
Train loss: 5.159430503845215
Train loss: 4.218289852142334
Train loss: 4.5225372314453125
Train loss: 4.241033554077148
Train loss: 4.432467937469482
Train loss: 4.3886237144470215
Train loss: 4.705142498016357
Train loss: 3.9580307006835938
Train loss: 4.83726692199707
Train loss: 3.9119443893432617
Train loss: 4.144645690917969
Train loss: 4.106499671936035
Train loss: 3.6046371459960938
Train loss: 3.765714645385742
Train loss: 4.21817684173584
Train loss: 4.602576732635498
Train 

100%|██████████| 64/64 [00:00<00:00, 735.64it/s]
  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 3.8763046264648438
Train loss: 4.091361999511719
Train loss: 4.062480449676514
Train loss: 3.5852649211883545
Train loss: 3.7334656715393066
Train loss: 4.188929557800293
Train loss: 4.56707239151001
Train loss: 3.6164755821228027
Train loss: 4.212521553039551
Train loss: 4.067230224609375
Train loss: 4.036425590515137
Train loss: 4.61393928527832
Train loss: 4.360342502593994
Train loss: 4.304457187652588
Train loss: 4.132419109344482
Train loss: 4.130619049072266
Train loss: 4.469022750854492
Train loss: 3.880309820175171
Train loss: 4.260460376739502
Train loss: 4.035883903503418
Train loss: 4.220916271209717
Train loss: 4.261582851409912
Train loss: 4.496060371398926
Train loss: 4.003221035003662
Train loss: 4.244897842407227
Train loss: 4.397211074829102
Train loss: 4.076559066772461
Train loss: 4.0608015060424805
Train loss: 3.3963139057159424
Train loss: 4.086193084716797
Train loss: 3.9168648719787598
Train loss: 3.7436137199401855
Train loss: 3.987658977508545
Trai

100%|██████████| 64/64 [00:00<00:00, 703.59it/s]
  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 4.547548294067383
Train loss: 4.030247211456299
Train loss: 3.8191850185394287
Train loss: 4.164943695068359
Train loss: 4.473720550537109
Train loss: 4.278623104095459
Train loss: 3.8122105598449707
Train loss: 4.910906791687012
Train loss: 3.9089601039886475
Train loss: 4.57113790512085
Train loss: 3.980062484741211
Train loss: 4.472718238830566
Train loss: 4.0886921882629395
Train loss: 4.112665176391602
Train loss: 4.481298446655273
Train loss: 5.09743595123291
Train loss: 4.1646409034729
Train loss: 4.462230205535889
Train loss: 4.18549919128418
Train loss: 4.375158309936523
Train loss: 4.178377151489258
Train loss: 4.515623092651367
Train loss: 3.8779876232147217
Train loss: 4.761026382446289
Train loss: 3.841055154800415
Train loss: 4.038808822631836
Train loss: 4.018852710723877
Train loss: 3.5663390159606934
Train loss: 3.7015931606292725
Train loss: 4.159965991973877
Train loss: 4.531736373901367
Train loss: 3.5891473293304443
Train loss: 4.185333728790283
Train l

100%|██████████| 64/64 [00:00<00:00, 672.23it/s]
  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 4.226255893707275
Train loss: 4.209903240203857
Train loss: 4.078884124755859
Train loss: 4.068451881408691
Train loss: 4.392324447631836
Train loss: 3.807922124862671
Train loss: 4.118447780609131
Train loss: 3.918874502182007
Train loss: 3.9759647846221924
Train loss: 4.052424430847168
Train loss: 4.318543434143066
Train loss: 3.921677827835083
Train loss: 4.186767578125
Train loss: 4.33734130859375
Train loss: 4.007908821105957
Train loss: 3.990023136138916
Train loss: 3.337123394012451
Train loss: 4.048334121704102
Train loss: 3.861300468444824
Train loss: 3.6891255378723145
Train loss: 3.8825736045837402
Train loss: 4.283895969390869
Train loss: 4.742170810699463
Train loss: 3.6784887313842773
Train loss: 4.54584264755249
Train loss: 4.149921894073486
Train loss: 4.092756271362305
Train loss: 4.834184169769287
##################################################
Epoch: 5
Train loss: 4.502539157867432
Train loss: 3.9214422702789307
Train loss: 3.762481451034546
Train loss

100%|██████████| 64/64 [00:00<00:00, 768.46it/s]

Train loss: 4.830989360809326
Train loss: 3.848078727722168
Train loss: 4.510808944702148
Train loss: 3.908432960510254
Train loss: 4.413948059082031
Train loss: 4.029417991638184
Train loss: 4.063640594482422
Train loss: 4.419077396392822
Train loss: 5.036476135253906
Train loss: 4.111546516418457
Train loss: 4.402413368225098
Train loss: 4.131351470947266
Train loss: 4.318504333496094
Train loss: 3.973139762878418
Train loss: 4.334989547729492
Train loss: 3.7994232177734375
Train loss: 4.685384750366211
Train loss: 3.771754503250122
Train loss: 3.936025857925415
Train loss: 3.932798147201538
Train loss: 3.5298008918762207
Train loss: 3.639000654220581
Train loss: 4.102899551391602
Train loss: 4.461578369140625
Train loss: 3.535172700881958
Train loss: 4.131433010101318
Train loss: 3.959528923034668
Train loss: 3.9286890029907227
Train loss: 4.501170635223389
Train loss: 4.160750389099121
Train loss: 4.165005683898926
Train loss: 4.052703857421875
Train loss: 4.037938117980957
Train l


