In [1]:
from konlpy.tag import Komoran
import numpy as np

In [4]:
# Sample sentence used to demonstrate noun extraction.
text = "오늘 날씨는 구름이 많아요, 그리고 컴퓨터와 마우스를 샀어요"

# Komoran morphological analyser; nouns() returns the nouns found in the text.
komoran = Komoran()
nouns = komoran.nouns(text)
print(nouns)

['오늘', '날씨', '구름', '컴퓨터', '마우스']


### One-Hot Encoding

In [5]:
# Give every distinct noun the next free integer index, in order of first
# appearance.
dics = {}
for word in nouns:
    # setdefault leaves an existing entry untouched, so a repeated noun keeps
    # the index it received the first time it was seen.
    dics.setdefault(word, len(dics))
print(dics)

{'오늘': 0, '날씨': 1, '구름': 2, '컴퓨터': 3, '마우스': 4}


In [6]:
# Row i of the identity matrix is the one-hot vector for class index i, so
# indexing it with the list of word indices yields one one-hot row per word.
nb_classes = len(dics)
targets = [*dics.values()]
one_hot_targets = np.identity(nb_classes)[targets]
print(one_hot_targets)

[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]


### Get Review Data

In [7]:
def read_review_data(filename):
    """Load a tab-separated review file into a list of rows.

    The first line of the file is treated as a header and discarded. Every
    remaining line is split on tab characters into a list of strings.

    Records are delimited by real newlines only. ``str.splitlines()`` is
    deliberately avoided: it also breaks on characters such as form feed,
    NEL or the Unicode line/paragraph separators, which can legitimately
    occur inside review text and would silently cut one review into several
    malformed rows.

    Args:
        filename: Path of the UTF-8 encoded, tab-separated file to read.

    Returns:
        A list with one entry per data line (header excluded); each entry is
        the list of tab-separated fields of that line, as strings.
    """
    with open(filename, 'r', encoding='UTF-8') as f:
        # Text mode with default newline handling already normalises "\r\n"
        # and "\r" to "\n", so stripping the trailing "\n" is sufficient.
        data = [line.rstrip('\n').split('\t') for line in f]
    return data[1:]  # drop the header row

In [8]:
# Load the ratings corpus, then show the row count and the first parsed row.
review_data = read_review_data('ratings.txt')
print(len(review_data), review_data[0], sep='\n')

200000
['8112052', '어릴때보고 지금다시봐도 재밌어요ㅋㅋ', '1']


In [9]:
# Extract the nouns of every review; field 1 of each row is the review text.
komoran = Komoran()
docs = list(map(komoran.nouns, (row[1] for row in review_data)))

In [15]:
# Sanity check: number of tokenised reviews, then one raw row next to its nouns.
print(len(docs), review_data[1], docs[1], sep='\n')

200000
['8132799', '디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산업이 부러웠는데. 사실 우리나라에서도 그 어려운시절에 끝까지 열정을 지킨 노라노 같은 전통이있어 저와 같은 사람들이 꿈을 꾸고 이뤄나갈 수 있다는 것에 감사합니다.', '1']
['디자인', '학생', '외국', '디자이너', '전통', '발전', '문화', '산업', '사실', '우리나라', '시절', '끝', '열정', '노라', '노', '전통', '사람', '꿈', '수', '것', '감사']


### Word2Vec

In [17]:
from gensim.models import Word2Vec

# Train word embeddings on the noun lists extracted from the reviews.
model = Word2Vec(
    sentences=docs,
    vector_size=200,  # dimensionality of each word vector
    window=4,         # maximum distance between the current and a context word
    hs=1,             # enable hierarchical softmax
    min_count=2,      # ignore words that occur fewer than 2 times in total
    sg=1,             # 1 selects skip-gram (0 would be CBOW)
)

In [18]:
# Report the corpus statistics stored on the trained model: the number of
# sentences it was given and the total number of word tokens it saw.
print('corpus_count : ', model.corpus_count)
print('corpus_total_words : ', model.corpus_total_words)

corpus_count :  200000
corpus_total_words :  1076896


In [20]:
# Look up the learned vector for '사랑'. Its length is expected to equal the
# vector_size the model was trained with.
print(len(model.wv['사랑']))
print('사랑 : ', model.wv['사랑'])

200
사랑 :  [ 0.30014455 -0.37896034  0.26605713 -0.17693354 -0.10161664 -0.17711829
 -0.00385801  0.10621081 -0.00718793  0.0518977  -0.32260352 -0.06926456
  0.08531252  0.19559318 -0.2608176   0.5120619  -0.17823951 -0.05510797
 -0.02805732 -0.12204951  0.04775128 -0.15520588 -0.33768624  0.13396065
 -0.10914315  0.1755054   0.07545558  0.16227409 -0.22151384 -0.19070497
  0.10798252  0.27316388  0.01630103 -0.13870671  0.1736989   0.24475129
  0.33972216  0.14895688 -0.01407773  0.07725001  0.06777284  0.17228822
 -0.03721326 -0.30804497  0.13141695  0.03635028 -0.03311909  0.06359546
  0.3737605   0.12644094  0.14257959 -0.181179   -0.08510809 -0.11070123
  0.3858862   0.0415813   0.20929544 -0.11078872  0.19035281 -0.05330751
 -0.13880314  0.26767528 -0.21716571  0.15777165  0.0079304   0.05708935
 -0.18285291  0.1320555   0.06916089  0.494985    0.00289714 -0.26790208
  0.21780553 -0.04715472 -0.1162746   0.0395189   0.2305515  -0.01472676
 -0.47162125  0.16116352  0.04299794 -0.0

In [24]:
# Cosine similarity for a related word pair, then for an unrelated pair.
print("일요일 = 월요일\t", model.wv.similarity('일요일', '월요일'))
print("일요일 != 삼성\t", model.wv.similarity('일요일', '삼성'))

일요일 = 월요일	 0.6234343
일요일 != 삼성	 0.18766749


In [26]:
# Five nearest neighbours of "게임" in the embedding space, with similarity scores.
print(model.wv.most_similar(positive=["게임"], topn=5))

[('헝거', 0.6348849534988403), ('대항해시대', 0.6286454796791077), ('라이어', 0.6095319986343384), ('오락실', 0.5812275409698486), ('포탈', 0.5726288557052612)]


In [27]:
# Five nearest neighbours of "시리즈" in the embedding space, with similarity scores.
print(model.wv.most_similar(positive=["시리즈"], topn=5))

[('더 울버린', 0.6704960465431213), ('캐리비안의 해적', 0.6648099422454834), ('비포 선셋', 0.6556674838066101), ('나니아 연대기', 0.6548028588294983), ('엑스맨', 0.6525648236274719)]
