# 비지도 학습 감성분석 - LExicon 기반

In [83]:
import numpy as np 
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

### Wordnet Synset 및 Sentiwordnet SentiSynset 클래스

In [84]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [85]:
from nltk.corpus import wordnet

term = 'present' # 여러가지 단어의 뜻
synsets = wordnet.synsets(term)


In [86]:
type(synsets), len(synsets)

(list, 18)

In [87]:
print(synsets)

[Synset('present.n.01'), Synset('present.n.02'), Synset('present.n.03'), Synset('show.v.01'), Synset('present.v.02'), Synset('stage.v.01'), Synset('present.v.04'), Synset('present.v.05'), Synset('award.v.01'), Synset('give.v.08'), Synset('deliver.v.01'), Synset('introduce.v.01'), Synset('portray.v.04'), Synset('confront.v.03'), Synset('present.v.12'), Synset('salute.v.06'), Synset('present.a.01'), Synset('present.a.02')]


In [88]:
# wordnet에 있는 synsets의 자료구조

for synset in synsets[:5]:
    print(f'#### name: {synset.name()}####')
    print('POS:', synset.lexname())
    print('정의:', synset.definition())
    print('표제어:', synset.lemma_names())

#### name: present.n.01####
POS: noun.time
정의: the period of time that is happening now; any continuous stretch of time including the moment of speech
표제어: ['present', 'nowadays']
#### name: present.n.02####
POS: noun.possession
정의: something presented as a gift
표제어: ['present']
#### name: present.n.03####
POS: noun.communication
정의: a verb tense that expresses actions or states at the time of speaking
표제어: ['present', 'present_tense']
#### name: show.v.01####
POS: verb.perception
정의: give an exhibition of to an interested audience
표제어: ['show', 'demo', 'exhibit', 'present', 'demonstrate']
#### name: present.v.02####
POS: verb.communication
정의: bring forward and present to the mind
표제어: ['present', 'represent', 'lay_out']


- 어휘간의 유사도 파악 가능

In [89]:
# 단어를 입력할때는 synsets()
for synset in wordnet.synsets('tiger'):
    print(synset.name(), synset.definition())

tiger.n.01 a fierce or audacious person
tiger.n.02 large feline of forests in most of Asia having a tawny coat with black stripes; endangered


In [90]:
# 단어, 품사를 아는 경우에는 synset()
tiger = wordnet.synset('tiger.n.02')
tree = wordnet.synset('tree.n.01')
lion = wordnet.synset('lion.n.01')
cat = wordnet.synset('cat.n.01')
dog = wordnet.synset('dog.n.01')

In [91]:
# 단어간의 유사도

tiger.path_similarity(lion), tiger.path_similarity(dog), tiger.path_similarity(tree)
# 호랑이와 사자의 유사도가 더 비슷하다

(0.3333333333333333, 0.16666666666666666, 0.07142857142857142)

In [92]:
# 5개 단어간의 유사도 
similarities = []
entities = [tree, lion, tiger, cat, dog]
for entity in entities:
    similarities = [entity.path_similarity(another) for another in entities]
    similarities.append(similarity)

NameError: name 'similarity' is not defined

In [None]:
df = pd.DataFrame(similarities, columns=['tree', 'lion', 'tiger', 'cat', 'dog'])
df

ValueError: Shape of passed values is (5, 1), indices imply (5, 5)

- SentiSynset 객체

In [None]:
import nltk
nltk.download('sentiwordnet') 

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\sentiwordnet.zip.


True

In [None]:
from nltk.corpus import sentiwordnet

senti_synsets = list(sentiwordnet.senti_synsets('slow'))

In [None]:
print(type(senti_synsets))
print(len(senti_synsets))
print(senti_synsets)

<class 'list'>
11
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'), SentiSynset('slow.v.03'), SentiSynset('slow.a.01'), SentiSynset('slow.a.02'), SentiSynset('dense.s.04'), SentiSynset('slow.a.04'), SentiSynset('boring.s.01'), SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')]


In [None]:
senti_synsets = list(sentiwordnet.senti_synsets('father'))
print(type(senti_synsets))
print(len(senti_synsets))
print(senti_synsets)

<class 'list'>
9
[SentiSynset('father.n.01'), SentiSynset('forefather.n.01'), SentiSynset('father.n.03'), SentiSynset('church_father.n.01'), SentiSynset('father.n.05'), SentiSynset('father.n.06'), SentiSynset('founder.n.02'), SentiSynset('don.n.03'), SentiSynset('beget.v.01')]


In [None]:
# father 단어의 긍정감성 지수, 부정감성지수, 객관성 지수
father = sentiwordnet.senti_synset('father.n.01')
father.pos_score(), father.neg_score(), father.obj_score()

(0.0, 0.0, 1.0)

In [None]:
# mother 단어의 긍정감성 지수, 부정감성지수, 객관성 지수
mother = sentiwordnet.senti_synset('mother.n.01')
mother.pos_score(), mother.neg_score(), mother.obj_score()

(0.0, 0.0, 1.0)

In [None]:
# fabulous 단어의 긍정감성 지수, 부정감성지수, 객관성 지수
fabulous = sentiwordnet.senti_synset('fabulous.a.01')
fabulous.pos_score(), fabulous.neg_score(), fabulous.obj_score()

(0.875, 0.125, 0.0)

In [None]:
# precisely 단어의 긍정감성 지수, 부정감성지수, 객관성 지수
precisely = sentiwordnet.senti_synset('precisely.r.01')
precisely.pos_score(), precisely.neg_score(), precisely.obj_score()

(0.125, 0.0, 0.875)

In [None]:
# lvoe 단어의 긍정감성 지수, 부정감성지수, 객관성 지수
love = sentiwordnet.senti_synset('love.v.01')
love.pos_score(), love.neg_score(),love.obj_score()

(0.5, 0.0, 0.5)

In [None]:
wordnet.NOUN, wordnet.ADJ, wordnet.ADV, wordnet.VERB

('n', 'a', 'r', 'v')

- 감정지수 계산

In [None]:
from nltk import word_tokenize, pos_tag
sentence = "It's good to see you again"
word_tokenize(sentence)
word_list

NameError: name 'word_list' is not defined

In [None]:
pos_tag(word_list)


NameError: name 'word_list' is not defined

In [None]:
def penn_to_wn(tag):
    if tag.startwith('J'):
        return wordnet.ADJ
    elif tag.startwith('N'):
        return wordnet.NOUN
    elif tag.startwith('R'):
        return wordnet.ADV
    elif tag.startwith('V'):
        return wordnet.VERB
    return None

In [None]:
for word, tag in pos_tag(world_list):
    print(word, tag)

In [None]:
# Sentence로부터 SEnti_Synset 객체를 만드는 과정
sentence = "It's good to see you again."
word_list = [word for word in word_tokenize(sentence) if len(word) >2]
word_list

['good', 'see', 'you', 'again']

In [None]:
for word, tag in pos_tag(word_list):
    wn_tag = penn_to_wn(tag)
    if wn_tag:
        synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
        synsets = synsets[0]
        print(synset)
    

LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle[0m

  Searched in:
    - 'C:\\Users\\MSI/nltk_data'
    - 'C:\\Users\\MSI\\Downloads\\Anaconda\\nltk_data'
    - 'C:\\Users\\MSI\\Downloads\\Anaconda\\share\\nltk_data'
    - 'C:\\Users\\MSI\\Downloads\\Anaconda\\lib\\nltk_data'
    - 'C:\\Users\\MSI\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
sentiment = 0.
for word, tag in pos_tag(word_list):
    wn_tag = penn_to_wn(tag)
    if wn_tag:
        synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
        synset = synsets[0]
        sentiment +=synset.pos_score() - synset.neg_score()
sentiment

AttributeError: 'str' object has no attribute 'startwith'

In [None]:
from nltk import WordNetLemmatizer
lemma = WordNetLemmatizer()

In [None]:
sentiment = 0.
for word, tag in pos_tag(word_list):
    wn_tag = penn_to_wn(tag)
    if wn_tag:
        lemma = lemmatizer.lemmatize(word, wn_tag)
        synsets = list(sentiwordnet.senti_synsets(lemma, wn_tag))
        synset = synsets[0]
        sentiment += synset.pos_score() - synset.neg_score()
sentiment

AttributeError: 'str' object has no attribute 'startwith'

In [None]:
from nltk import sent_tokenize
document = "I watched this video at a friend's house. I'm glad I did not waste money buying this one. The video cover has a scene from the 1975 movie Capricorn One. The movie starts out with several clips of rocket blow-ups, most not related to manned flight. Sibrel's smoking gun is a short video clip of the astronauts preparing a video broadcast. He edits in his own voice-over instead of letting us listen to what the crew had to say. The video curiously ends with a showing of the Zapruder film. His claims about radiation, shielding, star photography, and others lead me to believe is he extremely ignorant or has some sort of ax to grind against NASA, the astronauts, or American in general. His science is bad, and so is this video."


In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [None]:
sentiment = 0.0 
for sentence in sent_tokenize(document):
    word_list = [word for word in word_tokenize(sentence) if len(word) >2]
    for word, tag in pos_tag(word_list):
        wn_tag = penn_to_wn(tag)
        if wn_tag:
            lemma = lemmatizer.lemmatize(word, wn_tag)
            synsets = list(sentiwordnet.senti_synsets(word, wn_tag))
            if not synsets:
                continue 
            synset = synsets[0]
            sentiment +=synset.pos_score() - synset.neg_score()
print('긍정' if sentiment >= 0 else '부정')
    

AttributeError: 'str' object has no attribute 'startwith'

- 감성을 계산해주는 함수

In [None]:
def swn_polarity(text):
    # 감성 지수 초기화 
    sentiment = 0.0
    tokens_count = 0
    
    lemmatizer = WordNetLemmatizer()
    raw_sentences = sent_tokenize(text)
    # 분해된 문장별로 단어 토큰 -> 품사 태깅 후에 SentiSynset 생성 -> 감성 지수 합산 
    for raw_sentence in raw_sentences:
        # NTLK 기반의 품사 태깅 문장 추출  
        word_list = [word for word in word_tokenize(raw_sentence) if len(word) > 2]
        tagged_sentence = pos_tag(word_list)
        for word, tag in tagged_sentence:
            # WordNet 기반 품사 태깅과 어근 추출
            wn_tag = penn_to_wn(tag)
            if wn_tag not in (wordnet.NOUN, wordnet.ADJ, wordnet.ADV, wordnet.VERB):
                continue                   
            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue
            # 어근을 추출한 단어와 WordNet 기반 품사 태깅을 입력해 Synset 객체를 생성. 
            synsets = wordnet.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue
            # sentiwordnet의 감성 단어 분석으로 감성 synset 추출
            # 모든 단어에 대해 긍정 감성 지수는 +로 부정 감성 지수는 -로 합산해 감성 지수 계산. 
            synset = synsets[0]
            swn_synset = sentiwordnet.senti_synset(synset.name())
            sentiment += (swn_synset.pos_score() - swn_synset.neg_score())           
            tokens_count += 1
    
    if not tokens_count:
        return 0


### - IDBM

In [None]:
df = pd.read_csv('data/labeledTrainData.tsv', sep='\t', quoting=3)      # 3: QUOTE_NONE
df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [None]:
df.review = df.review.str.replace('<br />',' ')

In [None]:
df.review = df.review.str.replace('[^A-Za-z]',' ').str.strip()
df.review[0][:1000]

'With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him   The actual feature film bit when it finally starts is only on for  

In [None]:
df.shape

(25000, 3)

In [None]:
#df = df.iloc[:1000, :]
#df.shape

(1000, 3)

In [None]:
%time df['pred'] = df.review.apply(lambda x: swn_polarity(x))

AttributeError: 'str' object has no attribute 'startwith'

### VADER Lexicon을 이용한 감성 분석

In [None]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

senti_analyzer = SentimentIntensityAnalyzer()
senti_score = senti_analyzer.polarity_scores(df.review[0])
senti_score

{'neg': 0.13, 'neu': 0.743, 'pos': 0.127, 'compound': -0.7943}

In [None]:
def vader_polarity(document, threshold):
    score = senti_analyzer.polarity_scores(document)
    return 1 if score['compound'] >= threshold else 0


In [None]:
%time df['vader pred'] = df.review.apply(lambda x: varder_polarity(x, 0.1))

NameError: name 'varder_polarity' is not defined

- 예측 비교

In [None]:
cdf = df[['sentiment', 'pred','varder pred']]
cdf.head(10)

KeyError: "['pred', 'varder pred'] not in index"