<br>
# **Naver 리뷰분석**
by Gensim [Blog](https://medium.com/@hoho0443/konlpy-nltk-gensim%EC%9D%84-%ED%99%9C%EC%9A%A9%ED%95%98%EC%97%AC-%EB%AC%B8%EC%9E%A5-%EA%B8%8D%EC%A0%95-%EB%B6%80%EC%A0%95-%EB%B6%84%EB%A5%98%ED%95%98%EA%B8%B0-6e58ca9203cc) [박은정 PPT](https://www.lucypark.kr/docs/2015-pyconkr/#60)

<br>
## **1 Nltk_run.py**
nltk 모듈을 활용하여 베이지안 모델을 생성한다

In [1]:
from konlpy.tag import Twitter
import nltk

twitter = Twitter()

print(twitter.morphs(u'한글형태소분석기 테스트 중 입니다')) # ??
print(twitter.nouns(u'한글형태소분석기 테스트 중 입니다!')) #명사
print(twitter.pos(u'한글형태소분석기 테스트 중 입니다.')) #형태소

['한글', '형태소', '분', '석기', '테스트', '중', '입니', '다']
['한글', '형태소', '석기', '테스트', '중']
[('한글', 'Noun'), ('형태소', 'Noun'), ('분', 'Suffix'), ('석기', 'Noun'), ('테스트', 'Noun'), ('중', 'Noun'), ('입니', 'Adjective'), ('다', 'Eomi'), ('.', 'Punctuation')]


In [2]:
# 전체 데이터중 1/50 의 데이터를 대상으로 학습한다
def read_data(filename):
    with open(filename, 'r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        
    from random import randint
    random_data = [data[randint(1, len(data))]  for no in range(int(len(data)/50)) ]
    return random_data

def tokenize(doc):
  # norm, stem은 optional
  return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]

def term_exists(doc):
    return {'exists({})'.format(word): (word in set(doc)) for word in selected_words}

In [3]:
# 트래이닝 데이터와 테스트 데이터를 읽기
train_data = read_data('data/ratings_train.txt')
test_data  = read_data('data/ratings_test.txt')

# row, column의 수가 제대로 읽혔는지 확인
print(len(train_data))      # nrows: 150000
print(len(train_data[0]))   # ncols: 3
print(len(test_data))       # nrows: 50000
print(len(test_data[0]))     # ncols: 3

3000
3
1000
3


In [4]:
# 형태소 분류
train_docs = [(tokenize(row[1]), row[2]) for row in train_data[1:]]
test_docs = [(tokenize(row[1]), row[2]) for row in test_data[1:]]
#Training data의 token 모으기
tokens = [t for d in train_docs for t in d[0]]
print(len(tokens))

44795


In [5]:
# Load tokens with nltk.Text()
text = nltk.Text(tokens, name='NMSC')
print(text.vocab().most_common(10))

[('./Punctuation', 1347), ('하다/Verb', 1052), ('영화/Noun', 1030), ('이/Josa', 806), ('보다/Verb', 716), ('../Punctuation', 616), ('의/Josa', 605), ('가/Josa', 553), ('에/Josa', 549), ('도/Josa', 482)]


In [6]:
%%time
# 텍스트간의 연어 빈번하게 등장하는 단어 구하기
# text.collocations()
# term이 존재하는지에 따라서 문서를 분류
selected_words = [f[0] for f in text.vocab().most_common(2000)] # 여기서는 최빈도 단어 2000개를 피쳐로 사용
train_docs     = train_docs[:10000] # 시간 단축을 위한 꼼수로 training corpus의 일부만 사용할 수 있음
train_xy       = [(term_exists(d), c) for d, c in train_docs]
test_xy        = [(term_exists(d), c) for d, c in test_docs]
# nltk의 NaiveBayesClassifier으로 데이터를 트래이닝 시키고, test 데이터로 확인
classifier     = nltk.NaiveBayesClassifier.train(train_xy) #Naive Bayes classifier 적용
print(nltk.classify.accuracy(classifier, test_xy))
# => 0.80418

0.8078078078078078
CPU times: user 18.7 s, sys: 304 ms, total: 19 s
Wall time: 19 s


In [7]:
classifier.show_most_informative_features(10)
#nltk.polarity_scores("i love you")

Most Informative Features
         exists(최악/Noun) = True                0 : 1      =     28.1 : 1.0
        exists(쓰레기/Noun) = True                0 : 1      =     16.9 : 1.0
         exists(실망/Noun) = True                0 : 1      =     16.2 : 1.0
          exists(냐/Josa) = True                0 : 1      =     12.1 : 1.0
        exists(0/Number) = True                0 : 1      =     11.6 : 1.0
         exists(알바/Noun) = True                0 : 1      =     11.6 : 1.0
         exists(짜증/Noun) = True                0 : 1      =     11.3 : 1.0
  exists(재미없다/Adjective) = True                0 : 1      =     10.6 : 1.0
         exists(최고/Noun) = True                1 : 0      =     10.1 : 1.0
         exists(보지/Noun) = True                0 : 1      =      9.7 : 1.0


<br>
## **2 doc2Vec_train**
nltk 모듈을 활용하여 베이지안 모델을 생성한다

In [8]:
from collections import namedtuple
from gensim.models import doc2vec
from konlpy.tag import Twitter
import multiprocessing
from pprint import pprint

In [9]:
twitter = Twitter()
def read_data(filename):
    with open(filename, 'r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    from random import randint
    random_data = [data[randint(1, len(data))]  for no in range(int(len(data)/50)) ]
    return random_data

def tokenize(doc):
  # norm, stem은 optional
  return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]

In [10]:
#doc2vec parameters
cores = multiprocessing.cpu_count()
vector_size = 300
window_size = 15
word_min_count = 2
sampling_threshold = 1e-5
negative_size = 5
train_epoch = 100
dm = 1
worker_count = cores

In [11]:
# 트래이닝 데이터 읽기
train_data = read_data('data/ratings_train.txt')
# 형태소 분류
train_docs = [(tokenize(row[1]), row[2]) for row in train_data[1:]]
# doc2vec 에서 필요한 데이터 형식으로 변경
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]

In [12]:
# 사전 구축
doc_vectorizer = doc2vec.Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizer.build_vocab(tagged_train_docs)

# Train document vectors!
for epoch in range(10):
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002  # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha  # fix the learning rate, no decay

  import sys


In [13]:
#To save
doc_vectorizer.save('data/doc2vec.model')
pprint(doc_vectorizer.wv.most_similar('공포/Noun'))

[('중요하다/Adjective', 0.7721543312072754),
 ('판타지/Noun', 0.7557740211486816),
 ('성/Noun', 0.7301464080810547),
 ('에겐/Josa', 0.7294729351997375),
 ('코미디/Noun', 0.7204558849334717),
 ('주제/Noun', 0.7145422101020813),
 ('설득/Noun', 0.7118157744407654),
 ('공포영화/Noun', 0.7100128531455994),
 ('다큐/Noun', 0.7097640037536621),
 ('섹스/Noun', 0.7035242915153503)]


  if np.issubdtype(vec.dtype, np.int):


In [14]:
pprint(doc_vectorizer.wv.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle'))

0.044130173


  if np.issubdtype(vec.dtype, np.int):


<br>
## **3 doc2Vec_Test**
nltk 모듈을 활용하여 베 모델을 생성한다

In [15]:
from collections import namedtuple
from gensim.models import doc2vec
from konlpy.tag import Twitter
import multiprocessing
from pprint import pprint
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression
import numpy
import pickle

In [16]:
twitter = Twitter()
def read_data(filename):
    with open(filename, 'r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    from random import randint
    random_data = [data[randint(1, len(data))]  for no in range(int(len(data)/50)) ]
    return random_data

def tokenize(doc):
  # norm, stem은 optional
  return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]

In [17]:
# 테스트 데이터를 읽기
train_data = read_data('data/ratings_train.txt')
test_data = read_data('data/ratings_test.txt')
# 형태소 분류
train_docs = [(tokenize(row[1]), row[2]) for row in train_data[1:]]
test_docs = [(tokenize(row[1]), row[2]) for row in test_data[1:]]

In [18]:
# doc2vec 에서 필요한 데이터 형식으로 변경
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]
tagged_test_docs = [TaggedDocument(d, [c]) for d, c in test_docs]
# load train data
doc_vectorizer = Doc2Vec.load('data/doc2vec.model')

In [19]:
# 분류를 위한 피쳐 생성
train_x = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_train_docs]
train_y = [doc.tags[0] for doc in tagged_train_docs]
test_x = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_test_docs]
test_y = [doc.tags[0] for doc in tagged_test_docs]

In [20]:
classifier = LogisticRegression(random_state=1234)
classifier.fit(train_x, train_y)
# 테스트 socre 확인
print( classifier.score(test_x, test_y) )
# 0.63904

0.7037037037037037


In [21]:
# save the model to disk
filename = 'data/finalized_model.sav'
pickle.dump(classifier, open(filename, 'wb'))

<br>
## **4 doc2Vec_run**
nltk 모듈을 활용하여 베이지안 모델을 생성한다

In [22]:
from collections import namedtuple
from gensim.models import doc2vec
from konlpy.tag import Twitter
import multiprocessing
from pprint import pprint
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression
import numpy
import pickle

In [23]:
twitter = Twitter()
def read_data(filename):
    with open(filename, 'r') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    return data

def tokenize(doc):
  # norm, stem은 optional
  return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]

In [24]:
# 실제 구동 데이터를 읽기
run_data = read_data('data/ratings_run.txt')
# 형태소 분류
run_docs = [(tokenize(row[1]), row[2]) for row in run_data[1:]]
# doc2vec 에서 필요한 데이터 형식으로 변경
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_run_docs = [TaggedDocument(d, [c]) for d, c in run_docs]
# load train data
doc_vectorizer = Doc2Vec.load('data/doc2vec.model')

In [25]:
# 분류를 위한 피쳐 생성
run_x = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_run_docs]
run_y = [doc.tags[0] for doc in tagged_run_docs]
# load the model from disk
filename = 'data/finalized_model.sav'

In [26]:
# 실제 분류 확인
loaded_model = pickle.load(open(filename, 'rb'))
print(loaded_model.predict(run_x[0].reshape(1, -1)))
print(loaded_model.predict(run_x[1].reshape(1, -1)))

['1']
['1']
