# bag of words를 이용한 문장 긍/부정 분류

### bag of words
- 단어의 분포로 문서를 판단하는 방법이다.
- 그래서 어떤 단어가 얼마나 자주 나오는지는 고려하지만, 단어가 나오는 순서는 고려하지 않는다.

## TEXT CLASSIFICATION FOR SENTIMENT ANALYSIS – NAIVE BAYES CLASSIFIER
http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/

In [5]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
 
def word_feats(words):
    """Build a bag-of-words presence dict from an iterable of tokens.

    Each item yielded by ``words`` becomes a key mapped to ``True``;
    repeated items collapse into one key. Note that a plain ``str`` is
    iterated character by character, so pass a sequence of words.
    """
    return dict.fromkeys(words, True)
 
# Build one (features, label) pair per review in the NLTK movie_reviews corpus.
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

# Train on the first three quarters of each class and test on the rest.
# Floor division keeps the cutoffs integral: on Python 3 `len(x)*3/4` is a
# float, and slicing a list with a float raises TypeError.
negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4

trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print('train on ', len(trainfeats), ' instances, test on ', len(testfeats), ' instances')

# Fit the Naive Bayes model and report held-out accuracy plus the
# features with the most skewed class ratios.
classifier = NaiveBayesClassifier.train(trainfeats)
print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
classifier.show_most_informative_features()

train on  1500  instances, test on  500  instances
accuracy: 0.728
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


### (임의의) 문장으로 테스트를 해보고 싶다!!

In [6]:
# Try the trained classifier on an arbitrary sentence.
# NOTE: `test` is a plain str, so word_feats() iterates it character by
# character instead of word by word.
test = 'the quick brown fox jumps over the lazy dog'
classifier.classify(word_feats(test))

'neg'

In [7]:
# Two words that the informative-features list marks as 'pos' indicators,
# passed to word_feats() as one raw string.
pos = 'outstanding fascination'
classifier.classify(word_feats(pos))

'neg'

### 왜죠?

In [8]:
# Inspect the features actually generated from the raw string.
word_feats(pos)

{' ': True,
 'a': True,
 'c': True,
 'd': True,
 'f': True,
 'g': True,
 'i': True,
 'n': True,
 'o': True,
 's': True,
 't': True,
 'u': True}

### 직접 dict 형태로 만들어보면 제대로 나올까?

In [9]:
# Build the feature dict by hand so that each key is a whole word.
pos = {'outstanding':True, 'fascination':True}
classifier.classify(pos)

'pos'

In [10]:
# Mix a 'neg'-leaning word (ludicrous) with a 'pos'-leaning one (fascination).
neg = {'ludicrous':True, 'fascination':True}
classifier.classify(neg)

'neg'

### 그럼 좀 전의 test = 'the quick brown fox jumps over the lazy dog' 를 제대로 분류해 보고 싶다면?

In [11]:
# Split the sentence on single spaces first so every feature key is a whole word.
t = dict.fromkeys(test.split(' '), True)
print(t)

{'jumps': True, 'dog': True, 'over': True, 'the': True, 'fox': True, 'brown': True, 'quick': True, 'lazy': True}


In [12]:
# Classify the word-level feature dict built in the previous cell.
classifier.classify(t)

'neg'

### 그렇다면 한글은...?

### thanks to
## 한국어와 NLTK, Gensim의 만남
https://www.lucypark.kr/slides/2015-pyconkr/

### 위에 했던 프로세스 그대로 해보겠습니다.

In [13]:
def read_data(filename):
    """Read a tab-separated text file and return its rows without the header.

    Args:
        filename: Path to a tab-separated text file whose first line is a
            header. The file is decoded as UTF-8.

    Returns:
        A list with one entry per remaining line, each entry being the list
        of tab-separated string fields of that line.
    """
    # Decode explicitly: the default encoding of open() is platform
    # dependent, which breaks Korean text on non-UTF-8 locales.
    # NOTE(review): assumes the ratings files are UTF-8 encoded - confirm.
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    return data[1:]   # skip the header row

# Load both splits; the paths are resolved relative to the working directory.
train_data = read_data('ratings_train.txt')
test_data = read_data('ratings_test.txt')

print ('train on ', len(train_data), ' instances, test on ', len(test_data), ' instances')

train on  150000  instances, test on  50000  instances


### 사이즈를 좀 줄여서 진행하겠습니다.

In [14]:
# Keep only the first 1500 training rows and the first 500 test rows
# so the rest of the notebook runs on a smaller data set.
train_data = train_data[:1500]
test_data = test_data[:500]

from konlpy.tag import Twitter
# Single tagger instance shared by every tokenize() call.
# NOTE(review): newer KoNLPy releases appear to rename Twitter to Okt -
# confirm against the installed version.
pos_tagger = Twitter()
def tokenize(doc):
    """Tag ``doc`` with the shared tagger and return 'a/b' joined tokens.

    Each item returned by ``pos_tagger.pos`` (called with ``norm=True`` and
    ``stem=True``) is joined with '/' into one string.
    """
    tagged = pos_tagger.pos(doc, norm=True, stem=True)
    return ['/'.join(pair) for pair in tagged]

# Build (tokens, label) pairs: row[1] is tokenized and row[2] is kept as the
# label. Presumably the columns are (id, document, label) - verify against
# the header of the ratings files.
train_docs = [(tokenize(row[1]), row[2]) for row in train_data]
test_docs = [(tokenize(row[1]), row[2]) for row in test_data]

In [15]:
def term_exists(doc):
    """Turn a tokenized document into a presence-feature dict.

    Every token in ``doc`` becomes a key mapped to ``True``; duplicate
    tokens collapse into a single key.
    """
    return dict.fromkeys(doc, True)

# Pair each document's presence features with its label.
train_xy = [(term_exists(tokens), label) for tokens, label in train_docs]
test_xy = [(term_exists(tokens), label) for tokens, label in test_docs]

In [16]:
# Train a new Naive Bayes model on the Korean features. This rebinds
# `classifier`, so the model trained on the English movie reviews is no
# longer reachable under that name.
classifier = nltk.NaiveBayesClassifier.train(train_xy)
print ('accuracy:', nltk.classify.util.accuracy(classifier, test_xy))
classifier.show_most_informative_features()

accuracy: 0.786
Most Informative Features
       ㅡㅡ/KoreanParticle = True                0 : 1      =     11.6 : 1.0
           아깝다/Adjective = True                0 : 1      =     11.5 : 1.0
                 최고/Noun = True                1 : 0      =     11.1 : 1.0
          재미없다/Adjective = True                0 : 1      =     10.6 : 1.0
                쓰레기/Noun = True                0 : 1      =     10.4 : 1.0
           괜찮다/Adjective = True                1 : 0      =      9.5 : 1.0
           멋지다/Adjective = True                1 : 0      =      9.5 : 1.0
           재밌다/Adjective = True                1 : 0      =      9.1 : 1.0
                  냐/Josa = True                0 : 1      =      8.5 : 1.0
                 최악/Noun = True                0 : 1      =      7.7 : 1.0


### 한글도 임의의 문장으로 테스트 해봐야겠죠?

In [18]:
# Korean rendering of the earlier "quick brown fox" test sentence.
t = '빠른 갈색 여우는 게으른 개를 뛰어 넘습니다.'

In [19]:
# Same pipeline as for the training data: tokenize, convert to presence
# features, then classify. `t` is rebound at every step.
t = tokenize(t)
t = term_exists(t)
classifier.classify(t)

'0'

In [20]:
# A sentence using inflected forms of 멋지다 / 재밌다 / 괜찮다; presumably
# stemming maps them onto the label-1 features listed above - verify with
# tokenize().
t = '오늘 이 시간 정말 멋지고 재밌고 괜찮다!!'

In [21]:
# Tokenize, convert to presence features, then classify; `t` is rebound at
# every step.
t = tokenize(t)
t = term_exists(t)
classifier.classify(t)

'1'