# 비지도학습 감성분석 - Lexicon 기반

In [1]:
import numpy as np
import pandas as pd

In [87]:
df = pd.read_csv('labeledTrainData.tsv', sep = '\t', quoting=3)  #  = quote_none
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


## Wordnet Synset

## Sentiwordnet SentiSynet

In [3]:
from nltk.corpus import wordnet

In [4]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [10]:
term = 'love'
synsets = wordnet.synsets(term)

In [11]:
type(synsets), len(synsets)

(list, 10)

In [12]:
for synset in synsets:
    print(f'##### name : {synset.name()} #####')
    print('POS : ', synset.lexname())
    print('정의 : ', synset.definition())
    print('표제어 : ', synset.lemma_names())
    print()

##### name : love.n.01 #####
POS :  noun.feeling
정의 :  a strong positive emotion of regard and affection
표제어 :  ['love']

##### name : love.n.02 #####
POS :  noun.cognition
정의 :  any object of warm affection or devotion; 
표제어 :  ['love', 'passion']

##### name : beloved.n.01 #####
POS :  noun.person
정의 :  a beloved person; used as terms of endearment
표제어 :  ['beloved', 'dear', 'dearest', 'honey', 'love']

##### name : love.n.04 #####
POS :  noun.feeling
정의 :  a deep feeling of sexual desire and attraction
표제어 :  ['love', 'sexual_love', 'erotic_love']

##### name : love.n.05 #####
POS :  noun.quantity
정의 :  a score of zero in tennis or squash
표제어 :  ['love']

##### name : sexual_love.n.02 #####
POS :  noun.act
정의 :  sexual activities (often including sexual intercourse) between two people
표제어 :  ['sexual_love', 'lovemaking', 'making_love', 'love', 'love_life']

##### name : love.v.01 #####
POS :  verb.emotion
정의 :  have a great affection or liking for
표제어 :  ['love']

##### name : love.

### 어휘 간의 유사도

In [47]:
# 품사를 모를 경우에는 synsets()로 알아냄
for synset in wordnet.synsets('love'):
    print(synset.name(), synset.definition())

love.n.01 a strong positive emotion of regard and affection
love.n.02 any object of warm affection or devotion; 
beloved.n.01 a beloved person; used as terms of endearment
love.n.04 a deep feeling of sexual desire and attraction
love.n.05 a score of zero in tennis or squash
sexual_love.n.02 sexual activities (often including sexual intercourse) between two people
love.v.01 have a great affection or liking for
love.v.02 get pleasure from
love.v.03 be enamored or in love with
sleep_together.v.01 have sexual intercourse with


In [37]:
# 품사를 아는 경우에는 synset()
hope = wordnet.synset('hope.n.01')
disappointment  = wordnet.synset('disappointment.n.01')
love = wordnet.synset('love.n.01')
expectation = wordnet.synset('expectation.n.01')
sadness = wordnet.synset('sadness.n.01 ')

In [26]:
# 단어 간의 유사도
hope.path_similarity(love), hope.path_similarity(sadness)

(0.16666666666666666, 0.2)

In [38]:
# 5개 단어 간의 유사도
simil = []
entities = [hope, disappointment, love, expectation, sadness]
for entity in entities:
    simiilaryity = [entity.path_similarity(another) for another in entities]
    simil.append(simiilaryity)

In [39]:
df = pd.DataFrame(simil, columns = ['hope', 'disappointment', 'love', 'expectation', 'sadness'],
                  index = ['hope', 'disappointment', 'love', 'expectation', 'sadness'])
df

Unnamed: 0,hope,disappointment,love,expectation,sadness
hope,1.0,0.111111,0.166667,0.083333,0.2
disappointment,0.111111,1.0,0.125,0.071429,0.142857
love,0.166667,0.125,1.0,0.090909,0.25
expectation,0.083333,0.071429,0.090909,1.0,0.1
sadness,0.2,0.142857,0.25,0.1,1.0


In [30]:
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.


True

In [32]:
from nltk.corpus import sentiwordnet
senti_syn = list(sentiwordnet.senti_synsets('slow'))

In [33]:
senti_syn

[SentiSynset('decelerate.v.01'),
 SentiSynset('slow.v.02'),
 SentiSynset('slow.v.03'),
 SentiSynset('slow.a.01'),
 SentiSynset('slow.a.02'),
 SentiSynset('dense.s.04'),
 SentiSynset('slow.a.04'),
 SentiSynset('boring.s.01'),
 SentiSynset('dull.s.08'),
 SentiSynset('slowly.r.01'),
 SentiSynset('behind.r.03')]

In [40]:
# father 단어의 긍부정 객관성 지수
father = sentiwordnet.senti_synset('father.n.01')
father.pos_score(), father.neg_score(), father.obj_score()

(0.0, 0.0, 1.0)

In [41]:
love = sentiwordnet.senti_synset('love.n.01')
love.pos_score(), love.neg_score(), love.obj_score()

(0.625, 0.0, 0.375)

In [42]:
sadness = sentiwordnet.senti_synset('sadness.n.01')
sadness.pos_score(), sadness.neg_score(), sadness.obj_score()

(0.0, 0.75, 0.25)

In [46]:
love = sentiwordnet.senti_synset('love.v.01')
love.pos_score(), love.neg_score(), love.obj_score()

(0.5, 0.0, 0.5)

In [48]:
wordnet.NOUN, wordnet.ADJ, wordnet.ADV, wordnet.VERB

('n', 'a', 'r', 'v')

### 감성지수 계산

In [52]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [56]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [58]:
from nltk import word_tokenize, pos_tag
sentence = "i love you so much but it's too tough for me"
word_list = word_tokenize(sentence)
word_list

['i',
 'love',
 'you',
 'so',
 'much',
 'but',
 'it',
 "'s",
 'too',
 'tough',
 'for',
 'me']

In [59]:
pos_tag(word_list)

[('i', 'NN'),
 ('love', 'VBP'),
 ('you', 'PRP'),
 ('so', 'RB'),
 ('much', 'RB'),
 ('but', 'CC'),
 ('it', 'PRP'),
 ("'s", 'VBZ'),
 ('too', 'RB'),
 ('tough', 'JJ'),
 ('for', 'IN'),
 ('me', 'PRP')]

In [60]:
# ('n', 'a', 'r', 'v')
def penn_to_wordnet(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('R'):
        return wordnet.ADV
    if tag.startswith('V'):
        return wordnet.VERB     

In [64]:
for word, pos in pos_tag(word_list):
    print(word, '\t', penn_to_wordnet(pos))

i 	 n
love 	 v
you 	 None
so 	 r
much 	 r
but 	 None
it 	 None
's 	 v
too 	 r
tough 	 a
for 	 None
me 	 None


- 문장으로부터 Senti_Synset

In [67]:
sentence = "I love you so much but it's so hard I want to stop now"
word_list = [word for word in word_tokenize(sentence) if len(word) > 2]

In [68]:
for word, pos in pos_tag(word_list):
    print(word, '\t', penn_to_wordnet(pos))

love 	 v
you 	 None
much 	 r
but 	 None
hard 	 a
want 	 v
stop 	 n
now 	 r


In [73]:
for word, pos in pos_tag(word_list):
    wn_tag = penn_to_wordnet(pos)
    if wn_tag:
        synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
        synset = synsets[0]
        print(synset)

<love.v.01: PosScore=0.5 NegScore=0.0>
<much.r.01: PosScore=0.125 NegScore=0.0>
<difficult.a.01: PosScore=0.0 NegScore=0.75>
<desire.v.01: PosScore=0.25 NegScore=0.0>
<stop.n.01: PosScore=0.0 NegScore=0.0>
<now.r.01: PosScore=0.0 NegScore=0.0>


In [75]:
sentiment = 0
for word, pos in pos_tag(word_list):
    wn_tag = penn_to_wordnet(pos)
    if wn_tag:
        synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
        synset = synsets[0]
        sentiment += synset.pos_score() - synset.neg_score()

sentiment

0.125

- 표제어 추출

In [76]:
from nltk import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [77]:
sentiment = 0
for word, pos in pos_tag(word_list):
    wn_tag = penn_to_wordnet(pos)
    if wn_tag:
        lemma = lemmatizer.lemmatize(word, wn_tag)
        synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
        synset = synsets[0]
        sentiment += synset.pos_score() - synset.neg_score()

In [78]:
sentiment

0.125

- document에서 감성지수를 계산하는 과정 및 함수

In [79]:
from nltk import sent_tokenize
document = '''I went to see this film with my anime and Diana Wynne Jones-loving teenage daughter. 
And while I enjoyed the film immensely due to its excellent animation, story and overall sense of fun, it was also interesting to see how much my daughter hated it! 
It was as if we saw two entirely different films. 
Why? Well, she is a huge Diana Wynne Jones fan and has read and re-read just about everything she ever wrote. 
And, according to her, the story was so different and so inferior to the book that she disliked the film and said some very nasty things about director Miyazaki. 
However, my advice is DON'T read the book and just go and see the movie. 
Then, after enjoying it, read the book ONLY if you are 100% sure you can do this without freaking out because they are so different!! 
I didn't know the difference and had a great time seeing the film!'''

In [82]:
sentiment = 0.0
for sentence in sent_tokenize(document):
    word_list = [word for word in word_tokenize(sentence) if len(word) > 2]

    for word, pos in pos_tag(word_list):
        wn_tag = penn_to_wordnet(pos)

        if wn_tag:
            lemma = lemmatizer.lemmatize(word, wn_tag)
            synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
            if not synsets:
                print(word)
                continue
            synset = synsets[0]
            sentiment += synset.pos_score() - synset.neg_score()
print('긍정' if sentiment >= 0 else '부정')

Wynne
Jones-loving
teenage
Wynne
re-read
everything
inferior
Miyazaki
advice
N'T
ONLY
n't
긍정


In [92]:
def swn_polarity(text):
    lemmatizer = WordNetLemmatizer()
    sentiment = 0.0
    for sentence in sent_tokenize(text):
        word_list = [word for word in word_tokenize(sentence) if len(word) > 2]

        for word, pos in pos_tag(word_list):
            wn_tag = penn_to_wordnet(pos)

            if wn_tag:
                lemma = lemmatizer.lemmatize(word, wn_tag)
                synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
                if not synsets:
                    continue
                synset = synsets[0]
                sentiment += synset.pos_score() - synset.neg_score()
    return 1 if sentiment >=0 else 0

In [85]:
text = '''A very cinematically beautiful film with characters you just fall in love with. I won't lie, the plot is very hard to follow. If you've read the book, you can tell that miyazaki focused less on following the plot, more on making the film beautiful to watch, and as a result we are left with a story that has huge chunks missing and frankly doesn't make a lot of sense in places. But I'm not mad, because it is very, very beautiful to watch.'''
swn_polarity(text)

cinematically
n't
've
miyazaki
n't


1

### IMDB 영화평 감성분석

In [88]:
df.review = df.review.str.replace('<br />', ' ')

In [89]:
df.review = df.review.str.replace('[^A-Za-z]', ' ',).str.strip()

  """Entry point for launching an IPython kernel.


In [90]:
df = df.head(10000)

In [93]:
%time df['pred'] = df.review.apply(lambda x : swn_polarity(x))

CPU times: user 3min 20s, sys: 1.11 s, total: 3min 21s
Wall time: 3min 22s


In [94]:
df.head(10)

Unnamed: 0,id,sentiment,review,pred
0,"""5814_8""",1,With all this stuff going down at the moment w...,1
1,"""2381_9""",1,The Classic War of the Worlds by Timothy Hin...,1
2,"""7759_3""",0,The film starts with a manager Nicholas Bell ...,0
3,"""3630_4""",0,It must be assumed that those who praised this...,0
4,"""9495_8""",1,Superbly trashy and wondrously unpretentious ...,0
5,"""8196_8""",1,I dont know why people think this is such a ba...,0
6,"""7166_2""",0,This movie could have been very good but come...,1
7,"""10633_1""",0,I watched this video at a friend s house I m ...,0
8,"""319_1""",0,A friend of mine bought this film for and ...,0
9,"""8713_10""",1,This movie is full of references Like Mad M...,1


In [95]:
# 정확도 계산
from sklearn.metrics import accuracy_score
accuracy_score(df.sentiment, df.pred)

0.6309

### VADER Lexicon을 이용한 감성분석

In [97]:
 nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [98]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
senti_anal = SentimentIntensityAnalyzer()
senti_anal.polarity_scores(df.review[0])

{'compound': -0.7943, 'neg': 0.13, 'neu': 0.743, 'pos': 0.127}

In [99]:
def vader_polarity(doc, threshold = 0.1):
    score = senti_anal.polarity_scores(doc)
    return 1 if score['compound'] >=0 else 0

In [100]:
%time df['vader'] = df.review.apply(lambda x: vader_polarity(x, 0.1))

CPU times: user 32.8 s, sys: 272 ms, total: 33.1 s
Wall time: 34.9 s


In [101]:
accuracy_score(df.sentiment, df.vader)

0.6956