# 비지도학습 감성분석 - Lexicon 기반

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

### Wordnet Synset 및 SentiWordnet SentiSynset 클래스

In [2]:
from nltk.corpus import wordnet

term = 'present'
synsets = wordnet.synsets(term)

In [3]:
type(synsets), len(synsets)

(list, 18)

In [5]:
# synsets는 객체들의 list이다

print(synsets)

[Synset('present.n.01'), Synset('present.n.02'), Synset('present.n.03'), Synset('show.v.01'), Synset('present.v.02'), Synset('stage.v.01'), Synset('present.v.04'), Synset('present.v.05'), Synset('award.v.01'), Synset('give.v.08'), Synset('deliver.v.01'), Synset('introduce.v.01'), Synset('portray.v.04'), Synset('confront.v.03'), Synset('present.v.12'), Synset('salute.v.06'), Synset('present.a.01'), Synset('present.a.02')]


In [6]:
for synset in synsets[:5]:
    print(f'#### name: {synset.name()} ####')
    print('POS :', synset.lexname())
    print('정의 :', synset.definition())
    print('표제어 :', synset.lemma_names())

#### name: present.n.01 ####
POS : noun.time
정의 : the period of time that is happening now; any continuous stretch of time including the moment of speech
표제어 : ['present', 'nowadays']
#### name: present.n.02 ####
POS : noun.possession
정의 : something presented as a gift
표제어 : ['present']
#### name: present.n.03 ####
POS : noun.communication
정의 : a verb tense that expresses actions or states at the time of speaking
표제어 : ['present', 'present_tense']
#### name: show.v.01 ####
POS : verb.perception
정의 : give an exhibition of to an interested audience
표제어 : ['show', 'demo', 'exhibit', 'present', 'demonstrate']
#### name: present.v.02 ####
POS : verb.communication
정의 : bring forward and present to the mind
표제어 : ['present', 'represent', 'lay_out']


- 어휘간의 유사도

In [7]:
# 단어를 입력할 때는 synsets()

for synset in wordnet.synsets('tiger'):
    print(synset.name(), synset.definition())

tiger.n.01 a fierce or audacious person
tiger.n.02 large feline of forests in most of Asia having a tawny coat with black stripes; endangered


In [12]:
# 단어, 품사를 아는 경우에는 synset()

tiger = wordnet.synset('tiger.n.02')
tree = wordnet.synset('tree.n.01')
lion = wordnet.synset('lion.n.01')
cat = wordnet.synset('cat.n.01')
dog = wordnet.synset('dog.n.01')
tire = wordnet.synset('tire.n.01')


In [14]:
# 단어간의 유사도

tiger.path_similarity(lion), tiger.path_similarity(dog), tiger.path_similarity(tree)

(0.3333333333333333, 0.16666666666666666, 0.07142857142857142)

In [15]:
# 5개 단어간의 유사도
similarities = []
entities = [tree, lion, tiger, cat, dog]
for entity in entities:
    similarity = [entity.path_similarity(another) for another in entities]
    similarities.append(similarity)

In [16]:
df = pd.DataFrame(similarities, columns=['tree', 'lion', 'tiger', 'cat', 'dog'])
df

Unnamed: 0,tree,lion,tiger,cat,dog
0,1.0,0.071429,0.071429,0.076923,0.125
1,0.071429,1.0,0.333333,0.25,0.166667
2,0.071429,0.333333,1.0,0.25,0.166667
3,0.076923,0.25,0.25,1.0,0.2
4,0.125,0.166667,0.166667,0.2,1.0


* SentiSynset 객체

In [18]:
from nltk.corpus import sentiwordnet as swn

senti_synsets = list(swn.senti_synsets('slow'))  # list 붙여주기

In [19]:
print(type(senti_synsets))
print(len(senti_synsets))
print(senti_synsets)

<class 'list'>
11
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'), SentiSynset('slow.v.03'), SentiSynset('slow.a.01'), SentiSynset('slow.a.02'), SentiSynset('dense.s.04'), SentiSynset('slow.a.04'), SentiSynset('boring.s.01'), SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')]


In [20]:
senti_synsets = list(swn.senti_synsets('father'))
print(type(senti_synsets))
print(len(senti_synsets))
print(senti_synsets)

<class 'list'>
9
[SentiSynset('father.n.01'), SentiSynset('forefather.n.01'), SentiSynset('father.n.03'), SentiSynset('church_father.n.01'), SentiSynset('father.n.05'), SentiSynset('father.n.06'), SentiSynset('founder.n.02'), SentiSynset('don.n.03'), SentiSynset('beget.v.01')]


In [22]:
# father 단어의 긍정 감성지수, 부정 감성지수, 객관성 지수(중립 감성지수)

father = swn.senti_synset('father.n.01')
father.pos_score(), father.neg_score(), father.obj_score()

(0.0, 0.0, 1.0)

In [43]:
# mother 단어의 긍정 감성지수, 부정 감성지수, 객관성 지수

mother = swn.senti_synset('mother.n.01')
mother.pos_score(), mother.neg_score(), mother.obj_score()

(0.0, 0.0, 1.0)

In [30]:
# fabulous 단어의 긍정 감성지수, 부정 감성지수, 객관성 지수

fabulous = swn.senti_synset('fabulous.a.01')
fabulous.pos_score(), fabulous.neg_score(), fabulous.obj_score()

(0.875, 0.125, 0.0)

In [45]:
# precisely 단어의 긍정 감성지수, 부정 감성지수, 객관성 지수

precisely = swn.senti_synset('precisely.r.01')
precisely.pos_score(), precisely.neg_score(), precisely.obj_score()

(0.125, 0.0, 0.875)

In [49]:
# love 단어의 긍정 감성지수, 부정 감성지수, 객관성 지수

love = swn.senti_synset('love.v.01')
love.pos_score(), love.neg_score(), love.obj_score()

(0.5, 0.0, 0.5)

In [47]:
wordnet.NOUN, wordnet.ADJ, wordnet.ADV, wordnet.VERB

('n', 'a', 'r', 'v')

* 감성지수 계산

In [53]:
from nltk import word_tokenize, pos_tag

sentence = "It's good to see you again."
word_list = word_tokenize(sentence)
word_list

['It', "'s", 'good', 'to', 'see', 'you', 'again', '.']

In [54]:
pos_tag(word_list)

[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('good', 'JJ'),
 ('to', 'TO'),
 ('see', 'VB'),
 ('you', 'PRP'),
 ('again', 'RB'),
 ('.', '.')]

In [58]:
tag = ('good', 'JJ')
tag[1].startswith('J')

True

In [59]:
def penn_to_wn(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    elif tag.startswith('V'):
        return wordnet.VERB
    return None

In [60]:
for word, tag in pos_tag(word_list):
    print(word, penn_to_wn(tag))

It None
's v
good a
to None
see v
you None
again r
. None


In [61]:
# Sentence로부터 Senti_Synset 객체를 만드는 과정

sentence = "It's good to see you again"
word_list = [word for word in word_tokenize(sentence) if len(word) > 2]
word_list

['good', 'see', 'you', 'again']

In [62]:
for word, tag in pos_tag(word_list):
    wn_tag = penn_to_wn(tag)
    if wn_tag:          # 품사가 None으로 나오는 경우 함수 통과 못함
        synsets = list(swn.senti_synsets(word, wn_tag))
        synset = synsets[0]
        print(synset)

<good.a.01: PosScore=0.75 NegScore=0.0>
<see.n.01: PosScore=0.0 NegScore=0.0>
<again.r.01: PosScore=0.0 NegScore=0.0>


In [64]:
sentiment = 0
for word, tag in pos_tag(word_list):
    wn_tag = penn_to_wn(tag)
    if wn_tag:          # 품사가 None으로 나오는 경우 함수 통과 못함
        synsets = list(swn.senti_synsets(word, wn_tag))  # 나오는 의미를 한정시키기 위해서 wn_tag를 쓸 수 있다
        synset = synsets[0]
        sentiment += synset.pos_score() - synset.neg_score()

sentiment

0.75

In [66]:
from nltk import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [67]:
sentiment = 0
for word, tag in pos_tag(word_list):
    wn_tag = penn_to_wn(tag)
    if wn_tag:          # 품사가 None으로 나오는 경우 함수 통과 못함
        lemma = lemmatizer.lemmatize(word, wn_tag)
        synsets = list(swn.senti_synsets(lemma, wn_tag))
        synset = synsets[0]
        sentiment += synset.pos_score() - synset.neg_score()

sentiment

0.75

In [68]:
from nltk import sent_tokenize

document = "I watched this video at a friend's house. I'm glad I did not waste money buying this one. The video cover has a scene from the 1975 movie Capricorn One. The movie starts out with several clips of rocket blow-ups, most not related to manned flight. Sibrel's smoking gun is a short video clip of the astronauts preparing a video broadcast. He edits in his own voice-over instead of letting us listen to what the crew had to say. The video curiously ends with a showing of the Zapruder film. His claims about radiation, shielding, star photography, and others lead me to believe is he extremely ignorant or has some sort of ax to grind against NASA, the astronauts, or American in general. His science is bad, and so is this video."


In [70]:
sentiment = 0

for sentence in sent_tokenize(document):
    word_list = [word for word in word_tokenize(sentence) if len(word) > 2]     # 이 word_list에서 앞선 for문을 써야한다
    for word, tag in pos_tag(word_list):
        wn_tag = penn_to_wn(tag)
        if wn_tag:          # 품사가 None으로 나오는 경우 함수 통과 못함
            lemma = lemmatizer.lemmatize(word, wn_tag)
            synsets = list(swn.senti_synsets(lemma, wn_tag))
            if not synsets:         # 어떤 단어의 경우 synsets가 없을 수 있다(고유명사 등)
                print(word)
                continue
            synset = synsets[0]
            sentiment += synset.pos_score() - synset.neg_score()
    
print('긍정' if sentiment >= 0 else '부정')

scene
blow-ups
Sibrel
voice-over
Zapruder
others
부정


* 감성을 계산해주는 함수

In [78]:
def swn_polarity(text):
    # 감성 지수 초기화 
    sentiment = 0.0
    tokens_count = 0
    
    lemmatizer = WordNetLemmatizer()
    raw_sentences = sent_tokenize(text)
    # 분해된 문장별로 단어 토큰 -> 품사 태깅 후에 SentiSynset 생성 -> 감성 지수 합산 
    for raw_sentence in raw_sentences:
        # NTLK 기반의 품사 태깅 문장 추출  
        word_list = [word for word in word_tokenize(raw_sentence) if len(word) > 2]
        tagged_sentence = pos_tag(word_list)
        for word, tag in tagged_sentence:
            # WordNet 기반 품사 태깅과 어근 추출
            wn_tag = penn_to_wn(tag)
            if wn_tag not in (wordnet.NOUN, wordnet.ADJ, wordnet.ADV, wordnet.VERB):
                continue                   
            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue
            # 어근을 추출한 단어와 WordNet 기반 품사 태깅을 입력해 Synset 객체를 생성. 
            synsets = wordnet.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue
            # sentiwordnet의 감성 단어 분석으로 감성 synset 추출
            # 모든 단어에 대해 긍정 감성 지수는 +로 부정 감성 지수는 -로 합산해 감성 지수 계산. 
            synset = synsets[0]
            swn_synset = swn.senti_synset(synset.name())
            sentiment += (swn_synset.pos_score() - swn_synset.neg_score())           
            tokens_count += 1
    
    if not tokens_count:
        return 0
    
    # 총 score가 0 이상일 경우 긍정(Positive) 1, 그렇지 않을 경우 부정(Negative) 0 반환
    return 1 if sentiment >= 0 else 0

* IMDB 영화평 감성분석

In [101]:
df = pd.read_csv('data/labeledTrainData.tsv', sep = '\t', quoting=3)   # quoting = 3은 quote - None(quote하지 말라)는 의미
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [102]:
# <br /> 태그는 공백으로 변환 (데이터프레임에서)
df['review'] = df['review'].str.replace('<br />', ' ')

In [103]:
# 구둣점, 숫자 제거 - 영문자가 아닌 글자는 공백으로 변환
df['review'] = df['review'].str.replace('[^A-Za-z]', ' ').str.strip()  # 좌우 공백 제거할 경우 str.strip()으로 다시 한 번 붙여주기

In [104]:
df.shape

(25000, 3)

In [77]:
# df = df.iloc[:1000, :]
# df.shape

(1000, 3)

In [105]:
%time df['pred'] = df['review'].apply(lambda x : swn_polarity(x))

Wall time: 6min 22s


In [106]:
from sklearn.metrics import accuracy_score

accuracy_score(df['sentiment'], df['pred'])

0.62568

# VADER Lexicon을 이용한 감성 분석

In [107]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

senti_analyzer = SentimentIntensityAnalyzer()
senti_score = senti_analyzer.polarity_scores(df['review'][0])
senti_score

{'neg': 0.13, 'neu': 0.743, 'pos': 0.127, 'compound': -0.7943}

In [108]:
def vader_polarity(document, threshold):
    score = senti_analyzer.polarity_scores(document)
    return 1 if score['compound'] >= threshold else 0

In [109]:
%time df['vader pred'] = df['review'].apply(lambda x: vader_polarity(x, 0.1))

Wall time: 55.4 s


In [110]:
accuracy_score(df['sentiment'], df['vader pred'])

0.69556

In [111]:
df

Unnamed: 0,id,sentiment,review,pred,vader pred
0,"""5814_8""",1,With all this stuff going down at the moment w...,1,0
1,"""2381_9""",1,The Classic War of the Worlds by Timothy Hin...,1,1
2,"""7759_3""",0,The film starts with a manager Nicholas Bell ...,0,0
3,"""3630_4""",0,It must be assumed that those who praised this...,0,1
4,"""9495_8""",1,Superbly trashy and wondrously unpretentious ...,0,1
...,...,...,...,...,...
24995,"""3453_3""",0,It seems like more consideration has gone into...,0,1
24996,"""5064_1""",0,I don t believe they made this film Completel...,1,1
24997,"""10905_3""",0,Guy is a loser Can t get girls needs to buil...,1,1
24998,"""10194_3""",0,This minute documentary Bu uel made in the ...,0,0
