In [5]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

In [6]:
# Load the review CSV and inspect the first few rows.
file_path = './data/amazon.csv'
review_df = pd.read_csv(file_path)

# Trailing expression: rendered as the cell output.
review_df.head()

Unnamed: 0,class_index,review_title,review_text
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


In [22]:
# Print column dtypes, non-null counts and memory usage of the loaded frame.
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   class_index   400000 non-null  int64 
 1   review_title  399976 non-null  object
 2   review_text   400000 non-null  object
dtypes: int64(1), object(2)
memory usage: 9.2+ MB


In [23]:
# Keep a copy of the frame as loaded, then work on the first 10,000 rows only.
# NOTE: re-running this cell after review_df has already been truncated makes
# review_org the truncated frame as well; reload the CSV first in that case.
review_org = review_df.copy()

# .copy() turns the head() slice into an independent DataFrame, so columns
# added to review_df later do not trigger SettingWithCopyWarning.
review_df = review_org.head(10000).copy()
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   class_index   10000 non-null  int64 
 1   review_title  9998 non-null   object
 2   review_text   10000 non-null  object
dtypes: int64(1), object(2)
memory usage: 234.5+ KB


In [24]:
# Replace every character that is not an ASCII letter with a space and store
# the result in a new `review` column.
# The vectorized .str.replace applies the same regular expression without a
# Python-level lambda, and .assign returns a new frame instead of writing a
# column into the head() slice (which raises SettingWithCopyWarning).
review_df = review_df.assign(
    review=review_df['review_text'].str.replace(r"[^a-zA-Z]", " ", regex=True)
)

In [25]:
# Recode the labels from {1, 2} to {0, 1}; any other value is left as is.
# Presumably 1 = negative and 2 = positive in the source data - TODO confirm.
label_recode = {1: 0, 2: 1}
class_df = review_df['class_index'].replace(label_recode)

In [26]:
# Show the first five recoded labels.
class_df[:5]

0    1
1    1
2    0
3    1
4    1
Name: class_index, dtype: int64

In [27]:
# Build a one-column feature frame from the raw review text. This selects the
# original `review_text` column, not the letters-only `review` column.
feature_df = review_df.loc[:, ['review_text']]
feature_df.head()

Unnamed: 0,review_text
0,My lovely Pat has one of the GREAT voices of h...
1,Despite the fact that I have only played a sma...
2,I bought this charger in Jul 2003 and it worke...
3,Check out Maha Energy's website. Their Powerex...
4,Reviewed quite a bit of the combo players and ...


In [28]:
# Print the feature frame's columns, non-null counts and dtypes.
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_text  10000 non-null  object
dtypes: object(1)
memory usage: 78.3+ KB


In [29]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    class_df,
    test_size=0.3,
    random_state=156,
)

# Trailing expression: shapes of the two feature partitions.
(X_train.shape, X_test.shape)

((7000, 1), (3000, 1))

In [30]:
from nltk.corpus import wordnet as wn

def penn_to_wn(tag):
    """Translate an NLTK Penn Treebank POS tag into a WordNet POS constant.

    Only the leading letter of the tag is examined:
    ``J`` -> adjective, ``N`` -> noun, ``R`` -> adverb, ``V`` -> verb.

    Parameters
    ----------
    tag : str
        Penn Treebank tag (e.g. ``'NN'``, ``'JJR'``, ``'VBD'``).

    Returns
    -------
    ``wn.ADJ``, ``wn.NOUN``, ``wn.ADV`` or ``wn.VERB`` for a matching tag,
    ``None`` for every other tag.
    """
    # (prefix, attribute name on `wn`) pairs, checked in order. The attribute
    # is looked up only after a prefix matches.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
        ('V', 'VERB'),
    )
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wn, attr_name)
    return None

In [31]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag

def swn_polarity(text):
    """Classify ``text`` as positive (1) or negative (0) with SentiWordNet.

    The text is split into sentences, each sentence is word-tokenized and
    POS-tagged, and every noun, adjective or adverb that WordNet knows is
    scored as ``pos_score - neg_score`` of its first synset. The scores are
    summed over the whole text.

    Parameters
    ----------
    text : str
        Raw review text. A value that is not a string (``None``, ``NaN``) or
        a blank string is treated as containing no scorable tokens.

    Returns
    -------
    int
        1 when at least one token was scored and the summed score is >= 0
        (a sum of exactly 0 counts as positive); otherwise 0.
    """
    # Missing values and blank strings have nothing to score. Returning here
    # gives the same result as the "no scored tokens" case below and keeps
    # non-string input (e.g. NaN from pandas) away from sent_tokenize.
    if not isinstance(text, str) or not text.strip():
        return 0

    # Running sentiment sum and the number of tokens that contributed to it.
    sentiment = 0.0
    tokens_count = 0

    # Lemmatizer: reduces an English word to its lemma, i.e. the base form of
    # an inflected word (e.g. am, is, are, was, were -> be).
    # This lemma-based preprocessing is done for English; for Korean, because
    # of the nature of the language, only stem separation is usually done.
    lemmatizer = WordNetLemmatizer()

    raw_sentences = sent_tokenize(text)
    # Per sentence: word tokens -> POS tags -> SentiSynset -> accumulate score.
    for raw_sentence in raw_sentences:
        # POS-tagged tokens of the sentence (NLTK tagger).
        tagged_sentence = pos_tag(word_tokenize(raw_sentence))
        for word, tag in tagged_sentence:

            # Convert to a WordNet POS tag; only nouns, adjectives and
            # adverbs are scored, every other tag (verbs included) is skipped.
            wn_tag = penn_to_wn(tag)
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue
            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue
            # Look up the synsets for the lemma under that part of speech.
            synsets = wn.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue
            # Score with the first synset returned: positive score adds to
            # the sum, negative score subtracts from it.
            synset = synsets[0]
            swn_synset = swn.senti_synset(synset.name())
            sentiment += swn_synset.pos_score() - swn_synset.neg_score()
            tokens_count += 1

    # Nothing could be scored: report negative (0).
    if not tokens_count:
        return 0

    # Sum >= 0 -> positive (1), otherwise negative (0).
    return 1 if sentiment >= 0 else 0

In [32]:
# Ground-truth labels as a NumPy array, used as the reference in the metric cell.
# (An earlier attempt called feature_df.apply(...) with swn_polarity; DataFrame.apply
# hands each column, not each review, to the function, so predictions are built
# from the review_text Series instead.)
y_target = class_df.values

In [33]:
# Predict a 0/1 polarity for every review by applying swn_polarity element-wise.
preds = review_df['review_text'].apply(swn_polarity)

In [35]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score 
from sklearn.metrics import recall_score, f1_score, roc_auc_score
import numpy as np

# Confusion matrix first, then each scalar metric rounded to four decimals.
print(confusion_matrix(y_target, preds))

# (printed label, metric function) pairs, reported in this order.
metric_table = (
    ("정확도:", accuracy_score),
    ("정밀도:", precision_score),
    ("재현율:", recall_score),
    ("F-1 Score:", f1_score),
)
for metric_label, metric_fn in metric_table:
    print(metric_label, np.round(metric_fn(y_target, preds), 4))

[[2841 2034]
 [1305 3820]]
정확도: 0.6661
정밀도: 0.6525
재현율: 0.7454
F-1 Score: 0.6959
