# 영화 리뷰데이터를 이용한 정서분류
## 1. 가설
- 나이브 베이스 분류기와 SVM 분류기를 사용하여 영화 리뷰데이터를 긍정 부정 감정으로 분류할 수 있다.

## 2. 연구방법

### 2.1 데이터 수집 
 - 데이터 출처 : 네이버 영화 평가 댓글 https://github.com/e9t/nsmc/
 - This is a movie review dataset in the Korean language. Reviews were scraped from Naver Movies.
 - The dataset construction is based on the method noted in Large movie review dataset from Maas et al., 2011
 - Each file is consisted of three columns:  id ,  document ,  label  ◦ id : The review id, provieded by Naver
     > - id : The review id, provieded by Naver 
     > - document : The actual review
     > - label : The sentiment class of the review. (0: negative, 1: positive)
     > - Columns are delimited with tabs (i.e.,  .tsv  format; but the file extension is  .txt  for easy access for novices)
 - Quick peek
<img src= 'dataSample.PNG' width=600 height=500>

### 2.2 데이터 읽기
 - 텍스트 파일에서 헤더를 제거하고 document 부분과 label 부분을 분리하여 리스트로 저장

In [22]:
def loadDataSet(file):
    with open(file,'r',encoding="utf8") as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[1:] #header 제외 : 텍스트 맨 위 id, document, label 
        dataSet = [inst[1] for inst in data] # document 
        labels = [int(inst[2]) for inst in data] # label
        
    dataSet500 = []; labels500 = []
    for i in range(500):
        dataSet500.append(dataSet[i])
        labels500.append(labels[i])
    return dataSet500, labels500

train_dataSet, train_labels = loadDataSet('ratings_test.txt')

In [23]:
print(len(train_dataSet))
print(len(train_labels))

500
500


### 2.3 형태소 분리
 - 각 document를 형태소 단위로 tokenizing하기
 - KoNLPy API 사용
 - 참고 사이트 : http://konlpy-ko.readthedocs.org/ko/latest/api/konlpy.tag/#module-konlpy.tag._twitter
 - Twitter Class의 nouns 함수 사용

In [24]:
from konlpy.tag import Twitter
tagger = Twitter()
def tokenize(doc):
    return tagger.nouns(doc)

train_doc = [tokenize(row) for row in train_dataSet]

### 2.4 나이브 베이즈 분류기 생성

In [25]:
from numpy import *

# function createVocabList: 유일한 단어 목록 생성
def createVocabList(dataSet):
    vocabSet = set([]) 
    for document in dataSet:
        vocabSet = vocabSet|set(document) 
    return list(vocabSet)

# function setOfWords2Vec: 주어진 문서 내에 어휘 목록에 있는 단어가 존재하는지 아닌지를 확인
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList) 
    for word in inputSet:
        if word in vocabList: 
            returnVec[vocabList.index(word)] = 1 
    return returnVec

# function trainNB0: 나이브 베이스 분류기 훈련
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    p0Num = ones(numWords); p1Num = ones(numWords)
    p0Denom=2.0; p1Denom=2.0

    for i in range(numTrainDocs):
        if trainCategory[i]==1:
            p1Num += (trainMatrix[i])
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += (trainMatrix[i])
            p0Denom += sum(trainMatrix[i])

    p1Vect = p1Num/p1Denom
    p0Vect = p0Num/p0Denom

    return p0Vect, p1Vect, pAbusive

In [26]:
trainMat = []
myVocabList = createVocabList(train_doc)

for postinDoc in train_doc:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
print (trainMat[0])

# trainNB0 example
p0V, p1V, pAb = trainNB0(trainMat, train_labels)
print ("\ntrainNB0 example:")
print (p0V)
print (p1V)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

### 2.5 SVM 분류기 생성

In [27]:
from svmutil import *
svm_model.predict = lambda self, x: svm_predict([0], [x], self)[0][0]
prob = svm_problem(train_labels, trainMat)
param = svm_parameter()
param.kernel_type = LINEAR
param.C = 10
    
m = svm_train(prob, param)

### 2.6 나이브 베이즈 분류기 평가

In [39]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec)
    p0 = sum(vec2Classify * p0Vec)
    if p1>p0:
        return 1
    else:
        return 0

def testingNB():
    listOPosts, listClasses = loadDataSet('ratings_test.txt')
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    # training
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V,p1V,pAb = trainNB0(array(trainMat),array(train_labels))
    # test
    errorCount = 0
    for testIndex in range(0, len(trainMat)):
        thisDoc = array(setOfWords2Vec(myVocabList, listOPosts[testIndex]))    
        if classifyNB(thisDoc,p0V,p1V,pAb) != train_labels[testIndex]:
            errorCount += 1  
    errorRate = float(errorCount)/len(listOPosts)*100
    print ("Accuracy = ", 100-errorRate,"%")

In [40]:
testingNB()

Accuracy =  74.6 %


### 2.7 SVM 분류기 평가

In [28]:
svm_predict(train_labels, trainMat, m)

Accuracy = 97.4% (487/500) (classification)


([1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  1.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,

## 3. 결론

- 나이브 베이즈 분류기와 SVM 분류기의 정서 분류 테스트 결과 나이브 베이즈 분류기의 정확도는 74.6%인 반면, SVM 분류기의 정확도는 97.4%로 분류 정확도가 22.8%나 상승하였음

- 한글 데이터를 사용할 경우 나이브 베이즈 알고리즘을 사용한 분류기 보다는 SVM을 사용하는 것이 더 적합하다고 판단됨

