In [89]:
#library loading for preprocessing
import re
import os
import sys
import json

from pykospacing import spacing #의미에 맞게 띄어쓰기 정리해주는 것
import kss
from konlpy.tag import Kkma #konlpy는 형태소 분석기
from konlpy.tag import Okt
import soynlp 

kkma = Kkma()
okt = Okt()

In [142]:
#read review files
iphone12_reviews = []
for i in range(2):
    f = open("data0" + str(i+1) + ".txt", "r")
    lines = f.readlines()
    raw_data = ""
    for line in lines:
        raw_data += line
    iphone12_reviews.append(raw_data)

In [150]:
#cleansing function
def cleansing(review):
    cleansed_data = re.sub(r'[0-9]+:[0-9]+', '', review) #remove timestamp
    cleansed_data = re.sub(r'\s+', ' ', cleansed_data)
    cleansed_data = cleansed_data.replace(" ", "")
    cleansed_data = spacing(cleansed_data)
    review = ''
    for sentence in kss.split_sentences(cleansed_data):
        if sentence[-1] == '?':
            sentence += ' '
        elif sentence[-1] != '.':
            sentence += '. '
        else:
            sentence += ' '
        review += sentence
    return review

In [152]:
cleansed_reviews=[]
for review in iphone12_reviews:
    cleansed_reviews.append(cleansing(review))

In [160]:
#customized sentiment dictionary for sentimental analysis
#original categories: 서비스 / 맛 / 가격 / 분위기 / 방문

color_feature = {'색':['로즈골드', '퍼플', '레드', '유광', '무광'],
                '컬러':['로즈골드', '퍼플', '레드', '유광', '무광'],
                '칼라':['로즈골드', '퍼플', '레드', '유광', '무광']}
color_good_feature = {'색':['고급', '좋', '괜찮', '최고', '짱', '훌륭', '추천', '최고', '만족', '세련', '최고', '감동', '특별', '새로운'],
                     '컬러':['고급', '좋', '괜찮', '최고', '짱', '훌륭', '추천', '최고', '만족', '세련', '최고', '감동', '특별', '새로운'],
                     '칼라':['고급', '좋', '괜찮', '최고', '짱', '훌륭', '추천', '최고', '만족', '세련', '최고', '감동', '특별', '새로운']}
color_bad_feature = {'색':['별로', '최악', '질리'],
                    '컬러':['별로', '최악', '질리'],
                    '칼라':['별로', '최악', '질리']}

size_big_feature = {'크기':['크','큰','시원', '부담'], '사이즈':['크','큰','시원', '부담']}
size_small_feature = {'크기':['작', '아담', '조그', '쪼그', '미니', '스몰'], '사이즈':['작', '아담', '조그', '쪼그', '미니', '스몰']}

cap_big_feature = {'용량':['크','큰'], '기가':['크','큰']}
cap_small_feature = {'용량':['작', '적'], '기가':['작', '적']}

weight_heavy_feature = {'무게':['증가', '무거', '무겁']}
weight_light_feature = {'무게':['감소', '가벼', '가볍']}

display_good_feature = {'디스플레이':['만족', '괜찮', '얇'], '베젤':['만족', '괜찮', '얇']}
display_bad_feature = {'디스플레이':['별로', '두껍', '거슬'], '베젤':['별로', '두껍', '거슬']}

price_good_feature = {'가격': ['괜찮','착하다','저렴','적당','싸다','좋다','합리적','훌륭','최고','만족','알맞다', '무난','괜춘','최상','최상','굿', '추천', '뛰어난'],
                     '가성비':['괜찮','착하다','저렴','적당','싸다','좋다','합리적','훌륭','최고','만족','알맞다', '무난','괜춘','최상', '굿', '추천', '뛰어난']}
price_bad_feature ={'가격':['비싸','있는','있다','나쁘','사악','비효율','높다','부담','아쉽','별로','그닥','그다지','쎄','높','거품'],
                   '가성비':['비싸','있는','있다','나쁘','사악','비효율','높다','부담','아쉽','별로','그닥','그다지','쎄','높','거품']}

charge_good_feature = {'배터리':['만족','괜찮','상향','대폭','오래','길','충분','빠른'],
                      '충전':['만족','괜찮','상향','대폭','오래','길','충분','빠른']}
charge_bad_feature = {'배터리':['별로','부족'],
                     '충전':['별로','부족']}

negative_word_emotion = ['안','않','못','없','아닌','아니']

In [107]:
#특징 키워드를 추출하는 함수
#특징 키워드가 속한 문장에서 뒤로 단어 + 키워드 + 단어 / 키워드 + 단어 + 단어 를 추출
def get_feature_keywords(feature_keywords, review):
    feature_temp = []
    for keyword in feature_keywords:
        if re.findall(keyword, review):
            sub_list = ['게','고','음','며','데','만','도','면']
            
            for sub in sub_list:
                if sub+' ' in review:
                    review = re.sub(sub+' ', sub+',', review)
                
            a = re.findall(keyword +'+[ㄱ-ㅎ|ㅏ-ㅣ|가-힣]+\s+[ㄱ-ㅎ|ㅏ-ㅣ|가-힣]+\s+[ㄱ-ㅎ|ㅏ-ㅣ|가-힣]+',review) # K한 한 한글
            b = re.findall(keyword + '+\s+[ㄱ-ㅎ|ㅏ-ㅣ|가-힣]+\s+[ㄱ-ㅎ|ㅏ-ㅣ|가-힣]+',review) # K 한 한글 
            c = re.findall('[ㄱ-ㅎ|ㅏ-ㅣ|가-힣]+\s+' + keyword +'[ㄱ-ㅎ|ㅏ-ㅣ|가-힣]+',review) # 한 K한글 예쁜 분위기가
                
            for ngram in a:
                t = ()
                feature_temp.append(t + (ngram,keyword))
            for ngram in b:
                t = ()
                feature_temp.append(t + (ngram,keyword))
            for ngram in c:
                t = ()
                feature_temp.append(t + (ngram,keyword))     
    return feature_temp

In [108]:
def get_feature_emotions(feature_good_dict,feature_bad_dict,feature_temp):
    good_feature_emotion_list = []
    bad_feature_emotion_list = []
    
    for ngrams in feature_temp:
        keyword = ngrams[1]
        ngram = ngrams[0]
        is_bad_feature = None
        
        good_emotion_list = feature_good_dict[keyword]
        bad_emotion_list = feature_bad_dict[keyword]
        for emotion in good_emotion_list:
            if re.findall(emotion, ngram):
                is_bad_feature = False  
        for emotion in bad_emotion_list:
            if re.findall(emotion, ngram):
                is_bad_feature = True    
        for negative in negative_word_emotion:
            if re.findall(negative, ngram):
                if is_bad_feature == True:
                    is_bad_feature = False
                    break
                elif is_bad_feature == False:
                    is_bad_feature = True
                    break
                else:
                    is_bad_feature = True
                    break   
        if is_bad_feature:
            bad_feature_emotion_list.append(ngram)
        elif is_bad_feature == False:
            good_feature_emotion_list.append(ngram)
        else:
            pass
    return good_feature_emotion_list, bad_feature_emotion_list

In [166]:
good_color_cnt = 0
bad_color_cnt = 0
big_size_cnt = 0
small_size_cnt = 0
big_cap_cnt = 0
small_cap_cnt = 0
heavy_cnt = 0
light_cnt = 0
good_display_cnt = 0
bad_display_cnt = 0
good_price_cnt = 0
bad_price_cnt = 0

for review in cleansed_reviews: 
    color_temp = get_feature_keywords(color_good_feature.keys(), review)
    good_color, bad_color = get_feature_emotions(color_good_feature, color_bad_feature, color_temp)

    size_temp = get_feature_keywords(size_big_feature.keys(), review)
    big_size, small_size = get_feature_emotions(size_big_feature, size_small_feature, size_temp)
    
    cap_temp = get_feature_keywords(cap_big_feature.keys(), review)
    big_cap, small_cap = get_feature_emotions(cap_big_feature, cap_small_feature, cap_temp)
    
    weight_temp = get_feature_keywords(weight_heavy_feature.keys(), review)
    heavy_weight, light_weight = get_feature_emotions(weight_heavy_feature, weight_light_feature, weight_temp)
    
    display_temp = get_feature_keywords(display_good_feature.keys(), review)
    good_display, bad_display = get_feature_emotions(display_good_feature, display_bad_feature, display_temp)
    
    price_temp = get_feature_keywords(price_good_feature.keys(), review)
    good_price, bad_price = get_feature_emotions(price_good_feature, price_bad_feature, price_temp)
    
if len(good_color) > len(bad_color):
    good_color_cnt += 1
elif len(good_color) < len(bad_color):
    bad_color_cnt +=1
else:
    pass

if len(big_size) > len(small_size):
    big_size_cnt += 1
elif len(big_size) < len(small_size):
    small_size_cnt +=1
else:
    pass

if len(big_cap) > len(small_cap):
    big_cap_cnt += 1
elif len(big_cap) < len(small_cap):
    small_cap_cnt +=1
else:
    pass

if len(heavy_weight) > len(light_weight):
    heavy_cnt += 1
elif len(heavy_weight) < len(light_weight):
    light_cnt +=1
else:
    pass

if len(good_display) > len(bad_display):
    good_display_cnt += 1
elif len(good_display) < len(bad_display):
    bad_display_cnt +=1
else:
    pass

if len(good_price) > len(bad_price):
    good_price_cnt += 1
elif len(good_price) < len(bad_price):
    bad_price_cnt +=1
else:
    pass

In [169]:
check_division = lambda x, y: y if y ==0 else round((x / float(y)),2)
print('color is good? {}/{} = {}%'.format(
    good_color_cnt, good_color_cnt + bad_color_cnt,
    100*check_division(good_color_cnt, good_color_cnt + bad_color_cnt)))
print('size is big? {}/{} = {}%'.format(
    big_size_cnt, big_size_cnt + small_size_cnt,
    100*check_division(big_size_cnt, big_size_cnt + small_size_cnt)))
print('capacity is big? {}/{} = {}%'.format(
    big_cap_cnt, big_cap_cnt + small_cap_cnt,
    100*check_division(big_cap_cnt, big_cap_cnt + small_cap_cnt)))
print('weight is heavy? {}/{} = {}%'.format(
    heavy_cnt, heavy_cnt + light_cnt,
    100*check_division(heavy_cnt, heavy_cnt + light_cnt)))
print('display is good? {}/{} = {}%'.format(
    good_display_cnt, good_display_cnt + bad_display_cnt,
    100*check_division(good_display_cnt, good_display_cnt + bad_display_cnt)))
print('price is good? {}/{} = {}%'.format(
    good_price_cnt, good_price_cnt + bad_price_cnt,
    100*check_division(good_price_cnt, good_price_cnt + bad_price_cnt)))

color is good? 1/1 = 100.0%
size is big? 1/1 = 100.0%
capacity is big? 0/1 = 0.0%
weight is heavy? 0/0 = 0%
display is good? 1/1 = 100.0%
price is good? 0/1 = 0.0%
