# SentiwordNet
- 대규모의 어휘 데이터베이스(lexical database)인 워드넷(wordnet)을 확장해 각 단어의 긍정/부정 척도를 더함
- nltk.download()를 통해 sentiwordnet corpora를 다운받는다

In [2]:
import nltk
from nltk.corpus import sentiwordnet as swn

## 단어의 synset (유의어 집합) 찾기
- senti_synsets()의 결과는 filter object이므로 내용을 보고싶으면 리스트로 변환한다
- 품사에 따라 같은 단어라도 다른 유의어 집합이 존재한다

In [3]:
# The bare call returns a filter object, so only its repr is displayed.
swn.senti_synsets('hate')

<filter at 0xf294080>

In [4]:
# Convert the filter object to a list to see every SentiSynset for 'hate'.
list(swn.senti_synsets('hate'))

[SentiSynset('hate.n.01'), SentiSynset('hate.v.01')]

In [5]:
# Passing 'v' as the second argument keeps only the verb synsets.
list(swn.senti_synsets('hate', 'v'))

[SentiSynset('hate.v.01')]

## 단어의 긍정/부정 척도
- pos_score(): 단어의 긍정 척도
- neg_score(): 단어의 부정 척도

In [6]:
# Positive score of the first verb synset of 'hate'.
list(swn.senti_synsets('hate', 'v'))[0].pos_score()

0.0

In [7]:
# Negative score of the first verb synset of 'hate'.
list(swn.senti_synsets('hate', 'v'))[0].neg_score()

0.75

## 특정 단어의 긍정/부정 지수를 계산하는 함수 정의
- 특정 단어가 속한 유의어 집합의 positive score와 negative score를 모두 합한 후 평균을 낸다
- Positive score와 negative score를 tuple로 표현하여 반환

In [8]:
def word_sentiment_calculator(word, tag):
    """Return the average (positive, negative) SentiWordNet scores of a word.

    The POS tag selects which part of speech is looked up: a tag containing
    'NN' maps to nouns ('n'), 'VB' to verbs ('v'), 'JJ' to adjectives ('a')
    and 'RB' to adverbs ('r').  The scores of every synset found for that
    part of speech are summed and divided by the number of synsets.

    Args:
        word: Token to look up.
        tag: POS tag string of the token (e.g. 'NN', 'VBP').

    Returns:
        A ``(pos_score, neg_score)`` tuple, or ``(0, 0)`` when the tag maps
        to no supported part of speech or no synset is found.
    """
    # Checked in this order; the first matching marker that yields synsets wins.
    tag_to_wordnet_pos = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))

    for marker, wordnet_pos in tag_to_wordnet_pos:
        if marker not in tag:
            continue
        # Query the corpus once and reuse the result, instead of looking it
        # up a second time after the emptiness check.
        syn_set = list(swn.senti_synsets(word, wordnet_pos))
        if syn_set:
            break
    else:
        # No marker matched, or every matching lookup came back empty.
        return (0, 0)

    pos_score = sum(syn.pos_score() for syn in syn_set)
    neg_score = sum(syn.neg_score() for syn in syn_set)
    return (pos_score / len(syn_set), neg_score / len(syn_set))

In [10]:
# POS-tag the single token 'love' to see which tag it receives on its own.
nltk.pos_tag(['love'])

[('love', 'NN')]

In [11]:
# Average (positive, negative) scores of 'love' looked up as a noun.
word_sentiment_calculator('love', 'NN')

(0.22916666666666666, 0.0)

In [12]:
# Average (positive, negative) scores of 'love' looked up as a verb.
word_sentiment_calculator('love', 'VB')

(0.625, 0.03125)

## 특정 문장의 긍정/부정 지수 계산하기
- 문장을 토큰화한 후 품사 태깅을 한다
- 각 토큰의 부정 지수와 긍정 지수를 모두 합한다

In [13]:
# Example sentence scored in the following cells.
sent = 'I hate you'

In [14]:
# Split the sentence into tokens, then attach a POS tag to each token.
tokens = nltk.word_tokenize(sent)
pos_tags = nltk.pos_tag(tokens)
# Display the (token, tag) pairs.
pos_tags

[('I', 'PRP'), ('hate', 'VBP'), ('you', 'PRP')]

In [15]:
# Running totals of the sentence's positive and negative scores.
pos_score = 0
neg_score = 0

In [16]:
# Add every token's scores to the running totals.
for word, tag in pos_tags:
    # Score each token once and unpack both components; calling the scorer
    # separately for each component doubled the SentiWordNet lookups.
    word_pos, word_neg = word_sentiment_calculator(word, tag)
    pos_score += word_pos
    neg_score += word_neg
print(pos_score)
print(neg_score)

0.0
0.75


문장의 감성 지수를 계산하기 위한 함수

In [17]:
def sentence_sentiment_calculator(sent):
    """Return the summed (positive, negative) sentiment scores of a text.

    The text is tokenized and POS-tagged with NLTK, each token is scored with
    ``word_sentiment_calculator`` and the per-token scores are added up.

    Args:
        sent: Text to score.

    Returns:
        A ``(pos_score, neg_score)`` tuple of the accumulated scores.
    """
    tokens = nltk.word_tokenize(sent)
    pos_tags = nltk.pos_tag(tokens)

    pos_score = 0
    neg_score = 0
    for word, tag in pos_tags:
        # Score each token once and unpack both components; calling the
        # scorer separately for each component doubled the lookups.
        word_pos, word_neg = word_sentiment_calculator(word, tag)
        pos_score += word_pos
        neg_score += word_neg
    return (pos_score, neg_score)

## 실습 4-1-1. 센티워드넷을 활용한 문장 감성분석
- 아래 문장의 감성분석을 수행해 본다
    - 문장1: “In the Echo Dot, Amazon has created a near perfect blend of hardware and software.”
    - 문장2: “The author does a good job of presenting a wide range of psychological traps and irrational tendencies to which humans fall prey”
    - 문장3: “Pulp Fiction is inane, self-indulgent, and bloated”

In [1]:
def word_sentiment_calculator(word, tag):
    """Return the average (positive, negative) SentiWordNet scores of a word.

    The POS tag selects which part of speech is looked up: a tag containing
    'NN' maps to nouns ('n'), 'VB' to verbs ('v'), 'JJ' to adjectives ('a')
    and 'RB' to adverbs ('r').  The scores of every synset found for that
    part of speech are summed and divided by the number of synsets.

    Args:
        word: Token to look up.
        tag: POS tag string of the token (e.g. 'NN', 'VBP').

    Returns:
        A ``(pos_score, neg_score)`` tuple, or ``(0, 0)`` when the tag maps
        to no supported part of speech or no synset is found.
    """
    # Checked in this order; the first matching marker that yields synsets wins.
    tag_to_wordnet_pos = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))

    for marker, wordnet_pos in tag_to_wordnet_pos:
        if marker not in tag:
            continue
        # Query the corpus once and reuse the result, instead of looking it
        # up a second time after the emptiness check.
        syn_set = list(swn.senti_synsets(word, wordnet_pos))
        if syn_set:
            break
    else:
        # No marker matched, or every matching lookup came back empty.
        return (0, 0)

    pos_score = sum(syn.pos_score() for syn in syn_set)
    neg_score = sum(syn.neg_score() for syn in syn_set)
    return (pos_score / len(syn_set), neg_score / len(syn_set))

In [2]:
def sentence_sentiment_calculator(sent):
    """Return the summed (positive, negative) sentiment scores of a text.

    The text is tokenized and POS-tagged with NLTK, each token is scored with
    ``word_sentiment_calculator`` and the per-token scores are added up.

    Args:
        sent: Text to score.

    Returns:
        A ``(pos_score, neg_score)`` tuple of the accumulated scores.
    """
    tokens = nltk.word_tokenize(sent)
    pos_tags = nltk.pos_tag(tokens)

    pos_score = 0
    neg_score = 0
    for word, tag in pos_tags:
        # Score each token once and unpack both components; calling the
        # scorer separately for each component doubled the lookups.
        word_pos, word_neg = word_sentiment_calculator(word, tag)
        pos_score += word_pos
        neg_score += word_neg
    return (pos_score, neg_score)

In [7]:
# The three exercise sentences to be scored.
sent1 = 'In the Echo Dot, Amazon has created a near perfect blend of hardware and software.'
sent2 = 'The author does a good job of presenting a wide range of psychological traps and irrational tendencies to which humans fall prey'
sent3 = 'Pulp Fiction is inane, self-indulgent, and bloated'

In [8]:
# Print the (positive, negative) score tuple of each exercise sentence in order.
for example in (sent1, sent2, sent3):
    print(sentence_sentiment_calculator(example))

(1.1239035087719298, 0.2609649122807018)
(1.1782757173382172, 0.882884043040293)
(0.7788461538461539, 0.9942307692307693)


## 실습 4-1-2. 센티워드넷을 활용한 영화 리뷰 감성분석 (1)
- 실습 1-3-4에서 수집했던 영화 ‘다크 나이트’의 첫 번째 리뷰를 불러와 감성분석을 수행해 본다
- 센티워드넷을 활용한다

In [9]:
# Read only the first line of the file.
with open('result-1-3-4.txt', 'r', encoding='utf-8') as f:
    # No explicit close(): the with-block closes the file on exit.
    review = f.readline()

In [10]:
# Score the loaded review; the cell displays the (positive, negative) tuple.
sentence_sentiment_calculator(review)

(11.405738623218884, 6.483186195266968)

## 실습 4-1-3. 센티워드넷을 활용한 영화 리뷰 감성분석 (2)
- 실습 1-3-4에서 수집했던 영화 ‘다크 나이트’의 모든 리뷰의 감성분석을 수행한다
- 모든 리뷰의 감성 지수를 2차원 NumPy 배열에 저장한 후 긍정/부정 지수의 평균을 계산한다
- np.mean() 함수를 활용한다(axis 인자값을 활용할 것)

In [17]:
import numpy as np

In [14]:
# Read every line of the file into a list, one entry per line.
with open('result-1-3-4.txt', 'r', encoding='utf-8') as f:
    # No explicit close(): the with-block closes the file on exit.
    all_reviews = f.readlines()

In [None]:
# Score every review; each result is a (positive, negative) pair, so the
# array gets one row per review.
scores = np.array([sentence_sentiment_calculator(review)
                   for review in all_reviews])
scores

In [19]:
# Average along axis 0 (across reviews) to get the mean positive and mean
# negative score.
mean_scores = np.mean(scores, axis = 0)

In [20]:
# Display the [mean positive, mean negative] array.
mean_scores

array([ 13.4369763 ,  10.19789012])

## 실습 4-1-4. 센티워드넷을 활용한 영화 리뷰 감성분석 (3)
- 실습 1-3-5에서 수집했던 임의의 영화의 모든 리뷰의 감성분석을 수행한다
- 모든 리뷰의 감성 지수를 2차원 NumPy 배열에 저장한다
- 이를 open()함수를 활용해 텍스트 파일에 저장한다
    - 각 리뷰당 한 줄에 출력되고, 긍정과 부정 지수의 구분은 탭(tab)으로 한다

In [21]:
# Read every line of the file into a list, one entry per line.
with open('result-1-3-5-inception.txt', 'r', encoding='utf-8') as f:
    # No explicit close(): the with-block closes the file on exit.
    all_reviews = f.readlines()

In [22]:
# Score every review; each result is a (positive, negative) pair, so the
# array gets one row per review.
scores = np.array([sentence_sentiment_calculator(review)
                   for review in all_reviews])

In [24]:
# Write one review per line: positive score, a tab, then negative score.
with open('result-4-1-4.txt', 'w', encoding='utf-8') as f:
    for score in scores:
        # Terminate each row with '\n'; a bare '\r' is not a line break on
        # most platforms and would leave every review on a single line.
        f.write(str(score[0]) + '\t' + str(score[1]) + '\n')
    # No explicit close(): the with-block closes the file on exit.

## 실습 4-1-5. 센티워드넷을 활용한 IMDb Large Movie Review Dataset 감성분석 (1)
- IMDb Large Movie Review Dataset에서 첫 번째 긍정적인 학습 리뷰 데이터를 불러와 감성 분석을 해본다
- [aclImdb] > [train] > [pos]> 0_9.txt의 데이터를 분석해 본다

In [25]:
import os
# os.listdir() returns names in arbitrary order; sort them so that index 0 is
# deterministic rather than whatever the file system happens to return first.
files = sorted(os.listdir('aclImdb/train/pos'))

In [26]:
# Load the whole text of the first listed positive training review.
first_file = files[0]
with open('aclImdb/train/pos/{}'.format(first_file), 'r', encoding='utf-8') as f:
    # No explicit close(): the with-block closes the file on exit.
    review = f.read()

In [27]:
# Print the review text, then score it; the cell displays the returned
# (positive, negative) tuple.
print(review)
sentence_sentiment_calculator(review)

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


(5.110205982026105, 3.502637966031258)

## 실습 4-1-6. 센티워드넷을 활용한 IMDb Large Movie Review Dataset 감성분석 (2)
- IMDb Large Movie Review Dataset에서 긍정적인 학습 리뷰 데이터 10개와 부정적인 학습 리뷰 데이터 10개를 불러와 감성 분석을 해본다
- 총 20개 중에 정확도(accuracy)가 몇이나 되는지 확인해 본다
    - 긍정 척도가 부정 척도보다 높으면 positive로, 부정 척도가 긍정 척도보다 높으면 negative로 분류한다

In [28]:
import os
# os.listdir() returns names in arbitrary order; sort before slicing so the
# same 10 positive and 10 negative reviews are picked on every run.
pos_files = sorted(os.listdir('aclImdb/train/pos'))[:10]
neg_files = sorted(os.listdir('aclImdb/train/neg'))[:10]

In [29]:
# Ground-truth labels: 1 for each positive review, 0 for each negative one.
# Sized from the file lists so the labels always match the predictions.
actual = [1] * len(pos_files) + [0] * len(neg_files)
predicted = []

# Positive files are processed first so that predictions line up with `actual`.
for directory, file_names in (('aclImdb/train/pos', pos_files),
                              ('aclImdb/train/neg', neg_files)):
    for file in file_names:
        # The with-block closes the file; no explicit close() is needed.
        with open('{}/{}'.format(directory, file), 'r', encoding='utf-8') as f:
            scores = sentence_sentiment_calculator(f.read())
        # Predict positive (1) when the positive score is at least the
        # negative score, so a tie counts as positive.
        predicted.append(1 if scores[0] >= scores[1] else 0)

In [None]:
# Tally matches and mismatches between labels and predictions. Pairing the
# two lists with zip() avoids a hard-coded instance count.
correct = 0
incorrect = 0
for truth, guess in zip(actual, predicted):
    if truth == guess:
        correct += 1
    else:
        incorrect += 1

In [None]:
# Show the ground-truth labels and the predictions for comparison.
print(actual)
print(predicted)

In [None]:
# Report how many predictions matched and did not match the labels.
print('Number of correct instances: ', correct)
print('Number of incorrect instances: ', incorrect)

## 실습 4-1-7. 센티워드넷을 활용한 IMDb Large Movie Review Dataset 감성분석 (3)
- IMDb Large Movie Review Dataset에서 긍정적인 검증 리뷰 데이터와 부정적인 검증 리뷰 데이터를 1000개씩 불러와 감성 분석을 해본다
- 총 2000개 중에 정확도(accuracy)가 얼마나 되는지 살펴본다
    - 긍정 척도가 부정 척도보다 높으면 positive로, 부정 척도가 긍정 척도보다 높으면 negative로 분류한다

In [40]:
import os
# os.listdir() returns names in arbitrary order; sort before slicing so the
# same 1000 positive and 1000 negative reviews are picked on every run.
pos_files = sorted(os.listdir('aclImdb/test/pos'))[:1000]
neg_files = sorted(os.listdir('aclImdb/test/neg'))[:1000]

In [41]:
# Ground-truth labels: 1 for each positive review, 0 for each negative one.
# Sized from the file lists so the labels always match the predictions.
actual = [1] * len(pos_files) + [0] * len(neg_files)
predicted = []

# Positive files are processed first so that predictions line up with `actual`.
for directory, file_names in (('aclImdb/test/pos', pos_files),
                              ('aclImdb/test/neg', neg_files)):
    for file in file_names:
        # The with-block closes the file; no explicit close() is needed.
        with open('{}/{}'.format(directory, file), 'r', encoding='utf-8') as f:
            scores = sentence_sentiment_calculator(f.read())
        # Predict positive (1) when the positive score is at least the
        # negative score, so a tie counts as positive.
        predicted.append(1 if scores[0] >= scores[1] else 0)

In [44]:
# Tally matches and mismatches between labels and predictions. Pairing the
# two lists with zip() avoids a hard-coded instance count.
correct = 0
incorrect = 0
for truth, guess in zip(actual, predicted):
    if truth == guess:
        correct += 1
    else:
        incorrect += 1

In [45]:
# Report how many predictions matched and did not match the labels.
print('Number of correct instances: ', correct)
print('Number of incorrect instances: ', incorrect)

Number of correct instances:  1425
Number of incorrect instances:  575


## 실습 4-1-8. 센티워드넷을 활용한 IMDb Large Movie Review Dataset 감성분석 (4)
- IMDb Large Movie Review Dataset에서 긍정적인 검증 리뷰 데이터와 부정적인 검증 리뷰 데이터 전체를 불러와 감성 분석을 해본다
- 총 25000개 중에 정확도(accuracy)가 얼마나 되는지 살펴본다
    - 긍정 척도가 부정 척도보다 높으면 positive로, 부정 척도가 긍정 척도보다 높으면 negative로 분류한다

In [6]:
import os
# List every positive and every negative test review file name.
pos_files, neg_files = (
    os.listdir('aclImdb/test/pos'),
    os.listdir('aclImdb/test/neg'),
)

In [7]:
# Ground-truth labels: 1 for each positive review, 0 for each negative one.
# Sized from the file lists so the labels always match the predictions.
actual = [1] * len(pos_files) + [0] * len(neg_files)
predicted = []

# Positive files are processed first so that predictions line up with `actual`.
for directory, file_names in (('aclImdb/test/pos', pos_files),
                              ('aclImdb/test/neg', neg_files)):
    for file in file_names:
        # The with-block closes the file; no explicit close() is needed.
        with open('{}/{}'.format(directory, file), 'r', encoding='utf-8') as f:
            scores = sentence_sentiment_calculator(f.read())
        # Predict positive (1) when the positive score is at least the
        # negative score, so a tie counts as positive.
        predicted.append(1 if scores[0] >= scores[1] else 0)

In [8]:
# Tally matches and mismatches between labels and predictions. Pairing the
# two lists with zip() avoids a hard-coded instance count.
correct = 0
incorrect = 0
for truth, guess in zip(actual, predicted):
    if truth == guess:
        correct += 1
    else:
        incorrect += 1

In [9]:
# Report how many predictions matched and did not match the labels.
print('Number of correct instances: ', correct)
print('Number of incorrect instances: ', incorrect)

Number of correct instances:  17057
Number of incorrect instances:  7943
