<a href="https://colab.research.google.com/github/bbandbass/Projects/blob/main/cbow_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

In [None]:
window_size = 2  # number of context words taken on EACH side of the center word

sentence = """
Regrets, I've had a few.
But then again, too few to mention.
I did what I had to do.
And saw it through without exemption.
I planned each charted course.
Each careful step along the byway.
And more, much more than this, I did it my way.
"""

# Whitespace tokenization only: punctuation stays attached and case is kept,
# so e.g. "few." and "few" are separate vocabulary entries.
words = sentence.split()

vocab = set(words)
vocab_size = len(vocab)

# Enumerate the vocabulary in sorted order so that every word receives the
# same index on every run. Iterating the bare set follows string-hash order,
# which differs between interpreter sessions.
word_to_idx = {word: idx for idx, word in enumerate(sorted(vocab))}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Build (context words, center word) training pairs. Positions closer than
# window_size to either end are skipped so every context has exactly
# 2 * window_size words.
data = []
for i in range(window_size, len(words) - window_size):
  context = words[i - window_size : i] + words[i + 1 : i + window_size + 1]
  center = words[i]
  data.append((context, center))

In [None]:
# Display the generated (context words, center word) training pairs.
data

[(['Regrets,', "I've", 'a', 'few.'], 'had'),
 (["I've", 'had', 'few.', 'But'], 'a'),
 (['had', 'a', 'But', 'then'], 'few.'),
 (['a', 'few.', 'then', 'again,'], 'But'),
 (['few.', 'But', 'again,', 'too'], 'then'),
 (['But', 'then', 'too', 'few'], 'again,'),
 (['then', 'again,', 'few', 'to'], 'too'),
 (['again,', 'too', 'to', 'mention.'], 'few'),
 (['too', 'few', 'mention.', 'I'], 'to'),
 (['few', 'to', 'I', 'did'], 'mention.'),
 (['to', 'mention.', 'did', 'what'], 'I'),
 (['mention.', 'I', 'what', 'I'], 'did'),
 (['I', 'did', 'I', 'had'], 'what'),
 (['did', 'what', 'had', 'to'], 'I'),
 (['what', 'I', 'to', 'do.'], 'had'),
 (['I', 'had', 'do.', 'And'], 'to'),
 (['had', 'to', 'And', 'saw'], 'do.'),
 (['to', 'do.', 'saw', 'it'], 'And'),
 (['do.', 'And', 'it', 'through'], 'saw'),
 (['And', 'saw', 'through', 'without'], 'it'),
 (['saw', 'it', 'without', 'exemption.'], 'through'),
 (['it', 'through', 'exemption.', 'I'], 'without'),
 (['through', 'without', 'I', 'planned'], 'exemption.'),
 (['

In [None]:
def make_context_vector(context, word_to_idx):
    """Convert context words into a 1-D ``torch.long`` tensor of vocabulary indices.

    Args:
        context: iterable of words; each one must be a key of ``word_to_idx``.
        word_to_idx: mapping from word to integer index.

    Returns:
        torch.Tensor: the indices, in the same order as ``context``.

    Raises:
        KeyError: if a word is missing from ``word_to_idx``.
    """
    indices = []
    for word in context:
        indices.append(word_to_idx[word])
    return torch.tensor(indices, dtype=torch.long)

In [None]:
def make_center_vector(center, word_to_idx):
  """Return the vocabulary index of ``center`` as a 0-dim ``torch.long`` tensor.

  Raises:
    KeyError: if ``center`` is not a key of ``word_to_idx``.
  """
  center_index = word_to_idx[center]
  return torch.tensor(center_index, dtype=torch.long)

# CBOW

In [None]:
class CBOW(nn.Module):
  """Continuous bag-of-words model: predict the center word from its context.

  The context-word embeddings are averaged and passed through one linear layer
  that yields a score per vocabulary entry. The scores are raw logits; no
  softmax is applied here.

  Args:
    vocab_size: number of distinct words (embedding rows / output scores).
    projection_size: dimensionality of the word embeddings.
  """

  def __init__(self, vocab_size, projection_size):
    super().__init__()
    self.projection = nn.Embedding(vocab_size, projection_size)
    self.linear = nn.Linear(projection_size, vocab_size)

  def forward(self, input):
    """Score every vocabulary word as the center word.

    Args:
      input: integer tensor of context-word indices, either ``(n_context,)``
        for one example or ``(batch, n_context)`` for several.

    Returns:
      Logits of shape ``(vocab_size,)`` or ``(batch, vocab_size)``.
    """
    projection = self.projection(input)
    # Average over the context-word axis (second to last). Using a fixed
    # axis 0 would average across the batch when a batch is supplied.
    projection_mean = projection.mean(dim=-2)
    return self.linear(projection_mean)

In [None]:
# CBOW emits raw scores, so the loss used is CrossEntropyLoss.
criterion = nn.CrossEntropyLoss()

cbow = CBOW(vocab_size=vocab_size, projection_size=500)  # 500-dimensional embeddings
optimizer = optim.SGD(cbow.parameters(), lr=0.01)

In [None]:
# Full-batch gradient descent: each epoch sums the loss over every training
# pair and takes a single optimizer step on that total.
# context / center / context_vector are read by later cells, so they are left
# bound to the last training pair when the loop ends.
for epoch in range(5000):

  optimizer.zero_grad()

  loss = 0
  for context, center in data:
    context_vector = make_context_vector(context, word_to_idx)
    center_vector = make_center_vector(center, word_to_idx)
    y_hat = cbow(context_vector)
    loss += criterion(y_hat, center_vector)

  # Report the summed loss every 10th epoch (value from before this epoch's update).
  if (epoch + 1) % 10 == 0:
    print(f'Epoch: {epoch + 1:04d} loss = {loss:.6f}')

  loss.backward()
  optimizer.step()

Epoch: 0010 loss = 10.682316
Epoch: 0020 loss = 4.289672
Epoch: 0030 loss = 2.684982
Epoch: 0040 loss = 1.958663
Epoch: 0050 loss = 1.544149
Epoch: 0060 loss = 1.275847
Epoch: 0070 loss = 1.087830
Epoch: 0080 loss = 0.948654
Epoch: 0090 loss = 0.841419
Epoch: 0100 loss = 0.756222
Epoch: 0110 loss = 0.686878
Epoch: 0120 loss = 0.629324
Epoch: 0130 loss = 0.580774
Epoch: 0140 loss = 0.539262
Epoch: 0150 loss = 0.503353
Epoch: 0160 loss = 0.471982
Epoch: 0170 loss = 0.444335
Epoch: 0180 loss = 0.419782
Epoch: 0190 loss = 0.397831
Epoch: 0200 loss = 0.378086
Epoch: 0210 loss = 0.360230
Epoch: 0220 loss = 0.344002
Epoch: 0230 loss = 0.329188
Epoch: 0240 loss = 0.315612
Epoch: 0250 loss = 0.303123
Epoch: 0260 loss = 0.291594
Epoch: 0270 loss = 0.280921
Epoch: 0280 loss = 0.271008
Epoch: 0290 loss = 0.261779
Epoch: 0300 loss = 0.253163
Epoch: 0310 loss = 0.245102
Epoch: 0320 loss = 0.237544
Epoch: 0330 loss = 0.230443
Epoch: 0340 loss = 0.223758
Epoch: 0350 loss = 0.217454
Epoch: 0360 loss = 

In [None]:
# context and center still hold the last pair visited by the training loop.
print(context, center, sep='\n')

['I', 'did', 'my', 'way.']
it


In [None]:
# Run the trained CBOW model on context_vector, which still holds the last
# context built by the training loop.
for_test = cbow(context_vector)

In [None]:
# Map the highest-scoring vocabulary index back to its word and print the word
# itself (wrapping it in braces would build and print a one-element set).
print(idx_to_word[torch.argmax(for_test).item()])

{'it'}


# Skip-Gram

In [None]:
class SkipGram(nn.Module):
  """Skip-gram model with a single output layer shared by all context positions.

  The center-word embedding is mapped to ``2 * window_size * vocab_size`` scores,
  which are reshaped into one row of vocabulary scores per context position.

  Args:
    vocab_size: number of distinct words.
    projection_size: dimensionality of the word embeddings.
    window_size: number of context words on each side of the center word.
  """

  def __init__(self, vocab_size, projection_size, window_size):
    super().__init__()
    # Keep the sizes on the instance so forward() does not depend on
    # module-level globals that merely happen to share these names.
    self.vocab_size = vocab_size
    self.window_size = window_size
    self.projection = nn.Embedding(vocab_size, projection_size)
    self.linear = nn.Linear(projection_size, 2 * window_size * vocab_size)
    # Normalize over the vocabulary axis (last), so every context position
    # gets its own distribution over words. dim=0 would normalize across the
    # context positions, which is not what NLLLoss expects.
    self.activation = nn.LogSoftmax(dim=-1)

  def forward(self, input):
    """Return log-probabilities over the vocabulary for each context position.

    Args:
      input: integer tensor holding the center-word index (0-dim), or a 1-D
        tensor of several center-word indices.

    Returns:
      Tensor of shape ``(2 * window_size, vocab_size)`` for a 0-dim input, with
      any leading input dimensions preserved in front.
    """
    projection = self.projection(input)
    scores = self.linear(projection)
    output = scores.view(*scores.shape[:-1], 2 * self.window_size, self.vocab_size)
    return self.activation(output)

In [None]:
# SkipGram ends in a LogSoftmax layer, so the loss used is NLLLoss.
# Note: this rebinds the criterion / optimizer names used for the CBOW model.
criterion = nn.NLLLoss()
skipgram = SkipGram(vocab_size=vocab_size, projection_size=500, window_size=2)
optimizer = optim.SGD(skipgram.parameters(), lr=0.01)

In [None]:
# Full-batch gradient descent for the skip-gram model: the center word is the
# input and the context-word indices are the targets (one per output row).
# context / center / center_vector are read by later cells, so they are left
# bound to the last training pair when the loop ends.
for epoch in range(5000):

  optimizer.zero_grad()

  loss = 0
  for context, center in data:
    context_vector = make_context_vector(context, word_to_idx)
    center_vector = make_center_vector(center, word_to_idx)
    y_hat = skipgram(center_vector)
    loss += criterion(y_hat, context_vector)

  # Report the summed loss every 10th epoch (value from before this epoch's update).
  if (epoch + 1) % 10 == 0:
    print(f'Epoch: {epoch + 1:04d} cost = {loss:.6f}')

  loss.backward()
  optimizer.step()

Epoch: 0010 cost = 5.286719
Epoch: 0020 cost = 3.707929
Epoch: 0030 cost = 3.230443
Epoch: 0040 cost = 3.000097
Epoch: 0050 cost = 2.864495
Epoch: 0060 cost = 2.775162
Epoch: 0070 cost = 2.711872
Epoch: 0080 cost = 2.664686
Epoch: 0090 cost = 2.628153
Epoch: 0100 cost = 2.599030
Epoch: 0110 cost = 2.575270
Epoch: 0120 cost = 2.555518
Epoch: 0130 cost = 2.538839
Epoch: 0140 cost = 2.524566
Epoch: 0150 cost = 2.512215
Epoch: 0160 cost = 2.501421
Epoch: 0170 cost = 2.491908
Epoch: 0180 cost = 2.483461
Epoch: 0190 cost = 2.475909
Epoch: 0200 cost = 2.469119
Epoch: 0210 cost = 2.462979
Epoch: 0220 cost = 2.457400
Epoch: 0230 cost = 2.452310
Epoch: 0240 cost = 2.447647
Epoch: 0250 cost = 2.443358
Epoch: 0260 cost = 2.439402
Epoch: 0270 cost = 2.435740
Epoch: 0280 cost = 2.432341
Epoch: 0290 cost = 2.429178
Epoch: 0300 cost = 2.426227
Epoch: 0310 cost = 2.423466
Epoch: 0320 cost = 2.420880
Epoch: 0330 cost = 2.418450
Epoch: 0340 cost = 2.416165
Epoch: 0350 cost = 2.414009
Epoch: 0360 cost = 2

In [None]:
# context and center still hold the last pair visited by the training loop.
print(context, center, sep='\n')

['I', 'did', 'my', 'way.']
it


In [None]:
# Run the trained skip-gram model on center_vector, which still holds the last
# center word seen by the training loop.
test = skipgram(center_vector)

In [None]:
# For each context position (one row of `test`), look up the best-scoring word.
predicted_context = []
for position_scores in test:
  predicted_context.append(idx_to_word[position_scores.argmax().item()])
print(predicted_context)

['I', 'saw', 'my', 'way.']


In [None]:
class SkipGram2(nn.Module):
  """Skip-gram variant with a separate linear output layer per context position.

  Args:
    vocab_size: number of distinct words.
    projection_size: dimensionality of the word embeddings.
    window_size: number of context words on each side; ``2 * window_size``
      output layers are created.
  """

  def __init__(self, vocab_size, projection_size, window_size):
    super().__init__()
    self.projection = nn.Embedding(vocab_size, projection_size)
    self.linear = nn.ModuleList(
        [nn.Linear(projection_size, vocab_size) for _ in range(2 * window_size)]
    )
    # dim=0 is the vocabulary axis of the 1-D score vector each layer
    # produces for a single (0-dim) center-word index.
    self.activation = nn.LogSoftmax(dim=0)

  def forward(self, input):
    """Return a list with one log-softmax-normalized score tensor per output layer."""
    hidden = self.projection(input)
    return [self.activation(head(hidden)) for head in self.linear]

In [None]:
# SkipGram2 ends in a LogSoftmax layer, so the loss used is NLLLoss.
# Note: this rebinds the criterion / optimizer names used for the earlier models.
criterion = nn.NLLLoss()

skipgram2 = SkipGram2(vocab_size=vocab_size, projection_size=500, window_size=2)
optimizer = optim.SGD(skipgram2.parameters(), lr=0.01)

In [None]:
# Display the module's repr (its layer summary) as the cell output.
skipgram2

SkipGram2(
  (projection): Embedding(39, 500)
  (linear): ModuleList(
    (0): Linear(in_features=500, out_features=39, bias=True)
    (1): Linear(in_features=500, out_features=39, bias=True)
    (2): Linear(in_features=500, out_features=39, bias=True)
    (3): Linear(in_features=500, out_features=39, bias=True)
  )
  (activation): LogSoftmax(dim=0)
)

In [None]:
# Inspect center_vector, still bound from an earlier training loop
# (a 0-dim tensor holding one vocabulary index).
center_vector

tensor(30)

In [None]:
# Full-batch gradient descent for SkipGram2: output layer k is trained to
# predict the k-th context word of each center word.
# context / center / center_vector are read by later cells, so they are left
# bound to the last training pair when the loop ends.
for epoch in range(5000):

  optimizer.zero_grad()

  loss = 0
  for context, center in data:
    target_vectors = make_context_vector(context, word_to_idx)
    center_vector = make_center_vector(center, word_to_idx)
    y_hat = skipgram2(center_vector)
    # One loss term per context position, all added to the epoch total.
    for position, target in enumerate(target_vectors):
      loss += criterion(y_hat[position], target)

  # Report the summed loss every 10th epoch (value from before this epoch's update).
  if (epoch + 1) % 10 == 0:
    print(f'Epoch: {epoch + 1:04d} cost = {loss:.6f}')

  loss.backward()
  optimizer.step()

Epoch: 0010 cost = 172.291519
Epoch: 0020 cost = 181.407959
Epoch: 0030 cost = 169.161606
Epoch: 0040 cost = 168.770111
Epoch: 0050 cost = 163.374573
Epoch: 0060 cost = 169.604935
Epoch: 0070 cost = 155.532181
Epoch: 0080 cost = 162.100311
Epoch: 0090 cost = 154.394669
Epoch: 0100 cost = 153.096313
Epoch: 0110 cost = 147.935806
Epoch: 0120 cost = 130.192886
Epoch: 0130 cost = 141.927216
Epoch: 0140 cost = 133.493256
Epoch: 0150 cost = 130.702866
Epoch: 0160 cost = 130.910324
Epoch: 0170 cost = 121.419235
Epoch: 0180 cost = 120.677597
Epoch: 0190 cost = 118.969635
Epoch: 0200 cost = 118.649361
Epoch: 0210 cost = 115.416573
Epoch: 0220 cost = 115.792038
Epoch: 0230 cost = 113.226593
Epoch: 0240 cost = 114.023605
Epoch: 0250 cost = 110.929916
Epoch: 0260 cost = 109.394470
Epoch: 0270 cost = 110.049133
Epoch: 0280 cost = 109.279243
Epoch: 0290 cost = 107.798264
Epoch: 0300 cost = 106.940117
Epoch: 0310 cost = 106.009865
Epoch: 0320 cost = 105.017769
Epoch: 0330 cost = 103.981071
Epoch: 034

In [None]:
# context and center still hold the last pair visited by the training loop.
print(context, center, sep='\n')

['I', 'did', 'my', 'way.']
it


In [None]:
# Run the trained SkipGram2 model on center_vector (the last center word seen
# by the training loop); the result is a list with one tensor per output layer.
for_test = skipgram2(center_vector)

In [None]:
# For each output layer's scores, look up the best-scoring word.
predicted_context = []
for head_scores in for_test:
  predicted_context.append(idx_to_word[head_scores.argmax().item()])
print(predicted_context)

['I', 'did', 'through', 'way.']


# Gensim

In [None]:
# KoNLPy provides the Okt tokenizer imported below. %pip installs into the
# running kernel's environment; the version is pinned to the one this notebook
# was last run with.
%pip install -q konlpy==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
Collecting JPype1>=0.7.0
  Downloading JPype1-1.4.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.6/465.6 KB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


In [None]:
# Explicit %pip magic so the install targets the running kernel's environment
# and does not depend on IPython's automagic being enabled.
%pip install -q tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
from gensim.models.word2vec import Word2Vec
from konlpy.tag import Okt
from tqdm import tqdm

In [None]:
# Download the unlabeled comment corpus into the local file comments.txt.
urllib.request.urlretrieve(
    url="https://raw.githubusercontent.com/kocohub/korean-hate-speech/master/unlabeled/unlabeled_comments_1.txt",
    filename="comments.txt",
)

('comments.txt', <http.client.HTTPMessage at 0x7f011068da00>)

In [None]:
# Load the tab-separated file without a header row, skipping lines pandas
# cannot parse, and label the single resulting column.
train_data = pd.read_csv('comments.txt', sep='\t', on_bad_lines='skip', header=None)
train_data = train_data.set_axis(['댓글'], axis=1)

In [None]:
# Preview the first rows of the loaded comments.
train_data.head()

Unnamed: 0,댓글
0,지드래곤은 난봉꾼이란...댓글도 달렸네 ㅋㅋ 이주연 학창시절 사진 보고 와라. 요즘...
1,이주연은 알겠는데 지디는 뭐하는 듣보잡여
2,부럽네요. 나도 불과 한달전까진 허니문베이비를 꿈꿨는데 이제 다 부질없네요. 당연히...
3,이주연을 모르는 애들이 많네. 해체된 애프터스쿨 멤버로 당시는 주연이 예명. 인기나...
4,겨론했으면


In [None]:
# Number of rows loaded.
print(len(train_data))

499995


In [None]:
# Report whether any value is missing, drop the affected rows, then confirm
# that none remain and show the new row count.
print("결측치 존재:", train_data.isna().to_numpy().any())
train_data = train_data.dropna(axis=0, how='any')
print("결측치 존재:", train_data.isna().to_numpy().any())
print(len(train_data.index))

결측치 존재: True
결측치 존재: False
499915


In [None]:
# Remove every character that is not Hangul (jamo or syllable) or a space.
# regex=True must be passed explicitly: without it, newer pandas versions
# treat the pattern as a literal string and nothing would be removed.
train_data['댓글'] = train_data['댓글'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
train_data.head()

  train_data['댓글'] = train_data['댓글'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")


Unnamed: 0,댓글
0,지드래곤은 난봉꾼이란댓글도 달렸네 ㅋㅋ 이주연 학창시절 사진 보고 와라 요즘 웬만한...
1,이주연은 알겠는데 지디는 뭐하는 듣보잡여
2,부럽네요 나도 불과 한달전까진 허니문베이비를 꿈꿨는데 이제 다 부질없네요 당연히 순...
3,이주연을 모르는 애들이 많네 해체된 애프터스쿨 멤버로 당시는 주연이 예명 인기나 포...
4,겨론했으면


In [None]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
# Hashed copy for constant-time membership tests inside the loop.
stopword_lookup = frozenset(stopwords)

okt = Okt()

# Tokenize every comment into stemmed morphemes and drop the stop words.
# The loop variable is deliberately not called `sentence`, so the lyric text
# defined at the top of the notebook is not overwritten.
tokenized_data = []
for comment in tqdm(train_data['댓글']):
    morphemes = okt.morphs(comment, stem=True)
    tokenized_data.append([token for token in morphemes if token not in stopword_lookup])

100%|██████████| 499915/499915 [52:40<00:00, 158.17it/s]


In [None]:
# Preview only the first few tokenized comments; displaying the whole list
# would write every comment's tokens into the notebook output.
tokenized_data[:5]

[['지드래곤',
  '난봉',
  '꾼',
  '이란',
  '댓글',
  '달리다',
  'ㅋㅋ',
  '이주연',
  '학창시절',
  '사진',
  '보고',
  '오다',
  '요즘',
  '웬만하다',
  '여자',
  '연예인',
  '하고',
  '붙이다',
  '놓다',
  '미모',
  '최고',
  '이다',
  'ㅋ',
  '대다',
  '얼짱',
  '출신'],
 ['이주연', '알다', '지', '디', '뭐', '듣보잡', '여'],
 ['부럽다',
  '나다',
  '불과',
  '달전',
  '까진',
  '허니문',
  '베이비',
  '꿈꾸다',
  '이제',
  '다',
  '부질없다',
  '당연하다',
  '순결하다',
  '믿다',
  '그래서',
  '첫날',
  '밤',
  '까지',
  '기다리다',
  '주다',
  '배신',
  '감',
  '만',
  '듭니',
  '다',
  '첫날',
  '밤',
  '와이프',
  '피',
  '안',
  '흘리다',
  '처가',
  '집',
  '식구',
  '일부러',
  '절',
  '속이다',
  '생각',
  '화도',
  '나다',
  '어제',
  '처가',
  '지다',
  '안',
  '가다',
  '혼자',
  '울',
  '고',
  '가다',
  '오다',
  '지금',
  '까지',
  '한마디',
  '안해',
  '요',
  '이혼',
  '하고',
  '싶다'],
 ['이주연',
  '을',
  '모르다',
  '애',
  '많다',
  '해체',
  '되다',
  '애프터스쿨',
  '멤버',
  '로',
  '당시',
  '주연',
  '예명',
  '인기',
  '나',
  '포텐',
  '안',
  '터지다',
  '순',
  '수',
  '미모',
  '만으로는',
  '애프터스쿨',
  '에서',
  '원',
  '탑',
  '이다',
  '진짜',
  '자연미인',
  '이다'],
 ['겨론'],
 ['이주연', '아깝다', '

In [None]:
from gensim.models import Word2Vec

# Train skip-gram (sg=1) embeddings of dimension 100 on the tokenized comments.
# gensim 4 renamed the `size` argument to `vector_size`; try the current name
# first and fall back to the old one so the cell runs on either version.
w2v_params = dict(window = 5, min_count = 5, workers = 4, sg = 1)
try:
    model = Word2Vec(sentences = tokenized_data, vector_size = 100, **w2v_params)
except TypeError:
    model = Word2Vec(sentences = tokenized_data, size = 100, **w2v_params)


In [None]:
# Print the five nearest neighbours (word, similarity) of a few probe words.
for query in ("배우", "소녀시대", "대한민국", "김유정"):
    print(model.wv.most_similar(query, topn = 5))

[('연기자', 0.8405822515487671), ('여배우', 0.7437516450881958), ('조연', 0.722025990486145), ('다작', 0.7164419889450073), ('주연', 0.7147039175033569)]
[('소시', 0.8357353210449219), ('포미닛', 0.8037329912185669), ('애프터스쿨', 0.8019363284111023), ('카라', 0.7952439188957214), ('원걸', 0.7767716646194458)]
[('우리나라', 0.7869008183479309), ('이나라', 0.761174201965332), ('헬조선', 0.7139056921005249), ('전세계', 0.6928060054779053), ('은나라', 0.6830878257751465)]
[('김소현', 0.8455526232719421), ('김새론', 0.8036403059959412), ('진지희', 0.7982784509658813), ('남주혁', 0.7837553024291992), ('송재림', 0.777401864528656)]
