<a href="https://colab.research.google.com/github/bbandbass/Projects/blob/main/gensim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

In [None]:
# Number of context words taken on each side of the center word
# (2 to the left and 2 to the right).
window_size = 2

sentence = """
Regrets, I've had a few.
But then again, too few to mention.
I did what I had to do.
And saw it through without exemption.
I planned each charted course.
Each careful step along the byway.
And more, much more than this, I did it my way.
"""

# Whitespace tokenization: punctuation stays attached to the tokens.
words = sentence.split()

vocab = set(words)
vocab_size = len(vocab)

# Enumerate the vocabulary in sorted order. Iterating the raw set would give
# an order that can change between kernel restarts (string hashing is
# randomized per process), so the word <-> index mapping would not be
# reproducible from one run to the next.
word_to_idx = {word: idx for idx, word in enumerate(sorted(vocab))}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

data = []

# Build (context words, center word) pairs. Positions closer than
# `window_size` to either end are skipped so every context has exactly
# 2 * window_size words.
for i in range(window_size, len(words) - window_size):
  context = words[i - window_size : i] + words[i + 1 : i + window_size + 1]
  center = words[i]
  data.append((context, center))

In [None]:
# Display the (context words, center word) pairs collected in `data`.
data

[(['Regrets,', "I've", 'a', 'few.'], 'had'),
 (["I've", 'had', 'few.', 'But'], 'a'),
 (['had', 'a', 'But', 'then'], 'few.'),
 (['a', 'few.', 'then', 'again,'], 'But'),
 (['few.', 'But', 'again,', 'too'], 'then'),
 (['But', 'then', 'too', 'few'], 'again,'),
 (['then', 'again,', 'few', 'to'], 'too'),
 (['again,', 'too', 'to', 'mention.'], 'few'),
 (['too', 'few', 'mention.', 'I'], 'to'),
 (['few', 'to', 'I', 'did'], 'mention.'),
 (['to', 'mention.', 'did', 'what'], 'I'),
 (['mention.', 'I', 'what', 'I'], 'did'),
 (['I', 'did', 'I', 'had'], 'what'),
 (['did', 'what', 'had', 'to'], 'I'),
 (['what', 'I', 'to', 'do.'], 'had'),
 (['I', 'had', 'do.', 'And'], 'to'),
 (['had', 'to', 'And', 'saw'], 'do.'),
 (['to', 'do.', 'saw', 'it'], 'And'),
 (['do.', 'And', 'it', 'through'], 'saw'),
 (['And', 'saw', 'through', 'without'], 'it'),
 (['saw', 'it', 'without', 'exemption.'], 'through'),
 (['it', 'through', 'exemption.', 'I'], 'without'),
 (['through', 'without', 'I', 'planned'], 'exemption.'),
 (['

In [None]:
def make_context_vector(context, word_to_idx):
    """Convert a sequence of context words into a 1-D tensor of vocabulary indices.

    Args:
        context: iterable of words; each one is looked up in ``word_to_idx``.
        word_to_idx: mapping from word to integer index.

    Returns:
        A ``torch.long`` tensor holding the indices in the same order as
        ``context``.

    Raises:
        KeyError: if a word is missing from a plain-dict ``word_to_idx``.
    """
    indices = []
    for token in context:
        indices.append(word_to_idx[token])
    return torch.tensor(indices, dtype=torch.long)

In [None]:
def make_center_vector(center, word_to_idx):
  """Look up the center word and return its index as a 0-dim ``torch.long`` tensor.

  Args:
    center: the word to look up.
    word_to_idx: mapping from word to integer index.

  Raises:
    KeyError: if ``center`` is missing from a plain-dict ``word_to_idx``.
  """
  center_index = word_to_idx[center]
  return torch.tensor(center_index, dtype=torch.long)

# Gensim

In [None]:
pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
Collecting JPype1>=0.7.0
  Downloading JPype1-1.4.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.6/465.6 KB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


In [None]:
pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
from gensim.models.word2vec import Word2Vec
from konlpy.tag import Okt
from tqdm import tqdm

In [None]:
# Download the unlabeled comment file to ./comments.txt.
# urlretrieve returns a (local filename, headers) tuple, which the cell displays.
urllib.request.urlretrieve(
    url="https://raw.githubusercontent.com/kocohub/korean-hate-speech/master/unlabeled/unlabeled_comments_1.txt",
    filename="comments.txt",
)

('comments.txt', <http.client.HTTPMessage at 0x7f011068da00>)

In [None]:
# Read the downloaded file as tab-separated text with no header row, skipping
# lines the parser cannot handle, then label the single column '댓글'.
train_data = pd.read_csv(
    'comments.txt',
    sep='\t',
    header=None,
    on_bad_lines='skip',
)
train_data = train_data.set_axis(['댓글'], axis=1)

In [None]:
# Preview the first rows of the loaded comments.
train_data.head()

Unnamed: 0,댓글
0,지드래곤은 난봉꾼이란...댓글도 달렸네 ㅋㅋ 이주연 학창시절 사진 보고 와라. 요즘...
1,이주연은 알겠는데 지디는 뭐하는 듣보잡여
2,부럽네요. 나도 불과 한달전까진 허니문베이비를 꿈꿨는데 이제 다 부질없네요. 당연히...
3,이주연을 모르는 애들이 많네. 해체된 애프터스쿨 멤버로 당시는 주연이 예명. 인기나...
4,겨론했으면


In [None]:
# Number of rows currently in train_data.
print(len(train_data))

499995


In [None]:
# Report whether any cell is missing, drop the rows that contain a missing
# value, then report again together with the remaining row count.
print("결측치 존재:", train_data.isna().to_numpy().any())
train_data = train_data.dropna(axis=0, how="any")
print("결측치 존재:", train_data.isna().to_numpy().any())
print(len(train_data.index))

결측치 존재: True
결측치 존재: False
499915


In [None]:
# Remove every character that is not Hangul (jamo or syllable) or a space.
# regex=True is passed explicitly: pandas warns when the pattern is given
# without it, and from pandas 2.0 the default is regex=False, which would
# treat the pattern as a literal string and remove nothing.
train_data['댓글'] = train_data['댓글'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
train_data.head()

  train_data['댓글'] = train_data['댓글'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")


Unnamed: 0,댓글
0,지드래곤은 난봉꾼이란댓글도 달렸네 ㅋㅋ 이주연 학창시절 사진 보고 와라 요즘 웬만한...
1,이주연은 알겠는데 지디는 뭐하는 듣보잡여
2,부럽네요 나도 불과 한달전까진 허니문베이비를 꿈꿨는데 이제 다 부질없네요 당연히 순...
3,이주연을 모르는 애들이 많네 해체된 애프터스쿨 멤버로 당시는 주연이 예명 인기나 포...
4,겨론했으면


In [None]:
# Stopwords to drop after tokenization. The list is kept for any later use;
# a frozenset copy gives constant-time membership tests inside the loop.
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
stopword_set = frozenset(stopwords)

okt = Okt()

# One list of tokens per comment, in the same order as train_data['댓글'].
tokenized_data = []
# The loop variable is named `comment` so it does not overwrite the `sentence`
# string defined in the first cell of the notebook.
for comment in tqdm(train_data['댓글']):
    # Tokenize into morphemes with stem=True.
    morphemes = okt.morphs(comment, stem=True)
    # Drop stopwords.
    tokenized_data.append([word for word in morphemes if word not in stopword_set])

100%|██████████| 499915/499915 [52:40<00:00, 158.17it/s]


In [None]:
# Show only the first few tokenized comments; displaying the whole list
# would write every tokenized comment into the notebook output.
tokenized_data[:3]

[['지드래곤',
  '난봉',
  '꾼',
  '이란',
  '댓글',
  '달리다',
  'ㅋㅋ',
  '이주연',
  '학창시절',
  '사진',
  '보고',
  '오다',
  '요즘',
  '웬만하다',
  '여자',
  '연예인',
  '하고',
  '붙이다',
  '놓다',
  '미모',
  '최고',
  '이다',
  'ㅋ',
  '대다',
  '얼짱',
  '출신'],
 ['이주연', '알다', '지', '디', '뭐', '듣보잡', '여'],
 ['부럽다',
  '나다',
  '불과',
  '달전',
  '까진',
  '허니문',
  '베이비',
  '꿈꾸다',
  '이제',
  '다',
  '부질없다',
  '당연하다',
  '순결하다',
  '믿다',
  '그래서',
  '첫날',
  '밤',
  '까지',
  '기다리다',
  '주다',
  '배신',
  '감',
  '만',
  '듭니',
  '다',
  '첫날',
  '밤',
  '와이프',
  '피',
  '안',
  '흘리다',
  '처가',
  '집',
  '식구',
  '일부러',
  '절',
  '속이다',
  '생각',
  '화도',
  '나다',
  '어제',
  '처가',
  '지다',
  '안',
  '가다',
  '혼자',
  '울',
  '고',
  '가다',
  '오다',
  '지금',
  '까지',
  '한마디',
  '안해',
  '요',
  '이혼',
  '하고',
  '싶다'],
 ['이주연',
  '을',
  '모르다',
  '애',
  '많다',
  '해체',
  '되다',
  '애프터스쿨',
  '멤버',
  '로',
  '당시',
  '주연',
  '예명',
  '인기',
  '나',
  '포텐',
  '안',
  '터지다',
  '순',
  '수',
  '미모',
  '만으로는',
  '애프터스쿨',
  '에서',
  '원',
  '탑',
  '이다',
  '진짜',
  '자연미인',
  '이다'],
 ['겨론'],
 ['이주연', '아깝다', '

In [None]:
import gensim
from gensim.models import Word2Vec

# gensim 4.0 renamed the embedding-dimension keyword from `size` to
# `vector_size`; passing `size` there raises a TypeError. Choose the keyword
# from the installed major version so the cell runs on both 3.x and 4.x.
_dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"

# Skip-gram (sg=1) Word2Vec with 100-dimensional vectors, a context window of
# 5, ignoring tokens that occur fewer than 5 times, trained with 4 workers.
model = Word2Vec(sentences = tokenized_data, window = 5,
                 min_count = 5, workers = 4, sg = 1, **{_dim_kwarg: 100})


In [None]:
# Print the five nearest neighbours in the embedding space for each query word.
for query in ("배우", "소녀시대", "대한민국", "김유정"):
    print(model.wv.most_similar(query, topn = 5))

[('연기자', 0.8405822515487671), ('여배우', 0.7437516450881958), ('조연', 0.722025990486145), ('다작', 0.7164419889450073), ('주연', 0.7147039175033569)]
[('소시', 0.8357353210449219), ('포미닛', 0.8037329912185669), ('애프터스쿨', 0.8019363284111023), ('카라', 0.7952439188957214), ('원걸', 0.7767716646194458)]
[('우리나라', 0.7869008183479309), ('이나라', 0.761174201965332), ('헬조선', 0.7139056921005249), ('전세계', 0.6928060054779053), ('은나라', 0.6830878257751465)]
[('김소현', 0.8455526232719421), ('김새론', 0.8036403059959412), ('진지희', 0.7982784509658813), ('남주혁', 0.7837553024291992), ('송재림', 0.777401864528656)]
