In [11]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np
from konlpy.tag import Okt
from tqdm import tqdm
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk import sent_tokenize

In [12]:
#detokenize
def detokenize(tokenized_index):
    detokenized_doc = []
    for i in tokenized_index:
        t = ' '.join(document[i])
        detokenized_doc.append(t)

    return detokenized_doc
    
#data preprocessing
def preprocess(origin_data,colname):
   #remove all except Hangul and spaces
    origin_data[colname] = origin_data[colname].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 | . ]","",regex=True)
    ##remove null rows 
    origin_data = origin_data.dropna(how = 'any')
    return origin_data[colname]
    

##Sentence Tokenization
def tokenize_sentence(origin_data,colname):
    sentence_tokenized=[]
    for text in tqdm(origin_data['Title']):
        sent_tokenized_text=sent_tokenize(text)
        for pp in sent_tokenized_text:
            sentence_tokenized.append(pp)
    return sentence_tokenized

#Word Tokenization
def tokenized_word(sentence_data):
    ## korean stopwords
    stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','이다','가다','오다','가면','일본','그것','되다','니다','후지산',
            '가야','위해','할','갈','수','또','을','로','때','것','월','시','모','후','고','무','조금','.']

    okt = Okt()

    for sentence in tqdm(sentence_data):
        pos_tokenized=okt.pos(sentence) ##POS tagging
        removed=[]
        removed2=[]
        for word in pos_tokenized:
            if(word[0] not in stopwords):
                removed.append(word[0])
                removed2.append([word[0]])

        if(len(removed)>=1):
                document.append(removed)
                result.append(removed2)
            
    return document, result

##extract sentences associated with specific keywords
def extract_sentence(keywords_all):
    ex_sentence=[]
    ex_words=[]
    flag=False
    index=0
    for sentence in tqdm(document):
        w=[]
        ## save the keywords array
        for word in sentence:
            if(word in keywords_all): #if containing specific keywords
                flag=True
                w.append(word)
        if(flag==True):
            ##save index of sentence(recall result[index])
            ex_sentence.append(index) 
            ex_words.append(w)
            flag=False
        index+=1
    
    return ex_sentence, ex_words

##calculate
#DTM, word dictionary mapped to DTM, emotional score dictionary, weight dictionary by topic
def calScore(arr,re_vocab,sent_dict,weight):
    score=[]
    for arr_idx in range(len(arr)):
        dtm=arr[arr_idx][0]
        voca=re_vocab[arr_idx] 
        score.append(0) ##Initialize the score
        for dtm_idx in range(len(dtm)):
            value=voca[dtm_idx] #
            if(value in sent_dict): ##If DTM has a word for emotion,
                score[arr_idx]+=dtm[dtm_idx]*sent_dict[value]*weight[value]
    return score

##DTM
def getDTM(detokenized_data):
    j=0
    arr=[]
    re_vocab=[]

    for sentence in detokenized_data:
        print(j)
        corpus=[sentence]
        vector=CountVectorizer()
        try:
            # Record the frequency of each word from the corpus
            arr.append(vector.fit_transform(corpus).toarray())
            vocab=vector.vocabulary_ # vocabularies for each review
            #reverse vocabulary 
            re_vocab.append({v:k for k,v in vocab.items()})

            print('bag of words vector :', arr[j]) 
            print('vocabulary :',re_vocab[j])

            print()
            j+=1
        except:
            arr.append([[1,1]])
            re_vocab.append({0:'테스트',1:'단어'})
            print("Error")
    return arr, re_vocab

##mapping from pandas table as python dictionary
def getDict(pd_table,colname1, colname2):
    dict={}
    for idx in range(len(pd_table[colname1])):
        if(pd_table[colname2][idx]!=0):##save only if the data!=0
            dict[pd_table[colname1][idx]]=pd_table[colname2][idx]
    return dict

def showScore(score):
    cnt=0;
    sum=0;
    for i in score:
        if(i!=0):
            sum+=i
            cnt+=1
    return sum,cnt

In [13]:
#load data
origin_data=pd.read_table(r'japan_fujimountain_origin.csv',sep=",")

In [14]:
## data preprocessing
origin_data['Title']=preprocess(origin_data,'Title')

##change word as prototype
word_origin_data=pd.read_table(r'word\origin_restore.csv',sep=",")
word_origin_list=word_origin_data['단어']
word_origin_list= word_origin_list.dropna(how = 'any')

for i in range(len(origin_data['Title'])):
    sen=origin_data['Title'][i]
    
    for word in word_origin_list:
        candidate=word_origin_data[word]
        candidate= candidate.dropna(how = 'any')
        for ww in candidate:
            if ww in sen:
                origin_data['Title'][i]=origin_data['Title'][i].replace(ww,word)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin_data['Title'][i]=origin_data['Title'][i].replace(ww,word)


In [15]:
#sentence tokenization
sentence_tokenized=tokenize_sentence(origin_data,'Title')

#word tokenization
document=[] # save as 1-demensional array
result=[] # save as 2-demensional array
document,result=tokenized_word(sentence_tokenized)

100%|█████████████████████████████████████████████████████████████████████████████| 453/453 [00:00<00:00, 12249.02it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1601/1601 [00:02<00:00, 568.28it/s]


In [16]:
##load topic keyword data
keyword_data=pd.read_table(r'word\keyword.csv',sep=",")

cost_keywords=list(keyword_data['money'])
land_keywords=list(keyword_data['landscape'])
fun_keywords=list(keyword_data['fun'])

##load topic-word weight data
topic_word_weight_data=pd.read_table(r'word\topic-word_weight.csv',sep=",")

#load word sentiment score vocabulary
sentdic_cost_data=pd.read_table(r'word\money_sentdic.csv',sep=",")
sentdic_land_data=pd.read_table(r'word\landscape_sentdic.csv',sep=",")
sentdic_fun_data=pd.read_table(r'word\fun_sentdic.csv',sep=",")

sentdic_cost=getDict(sentdic_cost_data,'word','score')
sentdic_land=getDict(sentdic_land_data,'word','score')
sentdic_fun=getDict(sentdic_fun_data,'word','score')

cost_weight=getDict(topic_word_weight_data,'word','money')
land_weight=getDict(topic_word_weight_data,'word','landscape')
fun_weight=getDict(topic_word_weight_data,'word','fun')


In [21]:
#xx_sentence : array that stores sentence of index associated with xx keywords
#xx_words : keywords contained by each sentence
cost_sentence, cost_words=extract_sentence(cost_keywords)
land_sentence, land_words=extract_sentence(land_keywords)
fun_sentence, fun_words=extract_sentence(fun_keywords)
print("Number of 'cost' related reviews :  " ,len(cost_sentence))
print("Number of 'landscape' related reviews :  " ,len(land_sentence))
print("Number of 'fun' related reviews :  " ,len(fun_sentence))

100%|██████████████████████████████████████████████████████████████████████████| 1567/1567 [00:00<00:00, 174206.81it/s]
100%|██████████████████████████████████████████████████████████████████████████| 1567/1567 [00:00<00:00, 195965.13it/s]
100%|██████████████████████████████████████████████████████████████████████████| 1567/1567 [00:00<00:00, 174169.87it/s]

Number of 'cost' related reviews :   35
Number of 'landscape' related reviews :   632
Number of 'fun' related reviews :   78





In [18]:
#DTM
# detokenize
detokenized_cost = detokenize(cost_sentence)
detokenized_land=detokenize(land_sentence)
detokenized_fun=detokenize(fun_sentence)

##DTM
arr_cost,re_vocab_cost=getDTM(detokenized_cost)
arr_land,re_vocab_land=getDTM(detokenized_land)
arr_fun,re_vocab_fun=getDTM(detokenized_fun)

##calculate score
cost_score=calScore(arr_cost,re_vocab_cost,sentdic_cost,cost_weight)
land_score=calScore(arr_land,re_vocab_land,sentdic_land,land_weight)
fun_score=calScore(arr_fun,re_vocab_fun,sentdic_fun,fun_weight)

sentence_origin=cost_sentence
cal_score=cost_score
sss=0
for sd in sentence_origin:
    print(document[sd]," : ",cal_score[sss])
    print()
    sss+=1

0
bag of words vector : [[1 1 1 1 1 1 1 1 1 1 1 1 1 1]]
vocabulary : {0: '거기', 3: '도착', 13: '하는', 4: '매우', 12: '편리', 9: '자동차', 7: '요금', 5: '비밀리', 6: '비싸다', 2: '날씨', 10: '좋다', 1: '구매', 11: '티켓', 8: '있습니다'}

1
bag of words vector : [[1 1 1 1 2 1 1 1]]
vocabulary : {1: '경치', 3: '아름답다', 6: '풍경', 4: '재미', 2: '비용', 7: '효과', 5: '저렴', 0: '가격'}

2
bag of words vector : [[1 1 3 1 1 1 1 1 1 1]]
vocabulary : {8: '호텔', 2: '매우', 3: '비용', 9: '효율', 6: '이며', 0: '객실', 1: '깨끗하며', 4: '서비스', 5: '세심', 7: '합니다'}

3
bag of words vector : [[2 1 1 1 1 1]]
vocabulary : {0: '가격', 3: '시간', 2: '동안', 1: '고정', 5: '패키지', 4: '입니다'}

4
bag of words vector : [[1 1 1 1 1 1 1]]
vocabulary : {3: '약시', 0: '가격', 5: '잠겨', 4: '있으므로', 1: '당황', 6: '하지', 2: '않습니다'}

5
bag of words vector : [[1 1 1 1 1 1 1 1 1 1 1 1 1 1]]
vocabulary : {11: '펀치', 10: '카드', 9: '적합', 2: '매우', 8: '재미', 7: '장소', 3: '비용', 12: '효율', 5: '이고', 13: '흥미', 0: '기회', 6: '있으면', 1: '많이', 4: '야합니다'}

6
bag of words vector : [[1 1 1 1 1 2 1 1]]
vocabulary : {6: '풍경'

bag of words vector : [[1 1 1 1 1]]
vocabulary : {1: '내려가는', 0: '구름', 4: '점차', 2: '사라지고', 3: '있습니다'}

397
bag of words vector : [[1 1 1 1 1 1 1]]
vocabulary : {3: '우리', 2: '빨리', 4: '주차', 6: '하고', 0: '마침내', 5: '파노라마', 1: '보았습니다'}

398
bag of words vector : [[1 1 1 1 1 1]]
vocabulary : {5: '자연', 1: '대칭', 3: '산이', 4: '있는', 0: '경우', 2: '드물다'}

399
bag of words vector : [[1 1]]
vocabulary : {0: '여전히', 1: '추웠다'}

400
bag of words vector : [[1 1 1 1 1 1 1 1]]
vocabulary : {7: '중턱', 6: '있는', 2: '물건', 0: '매우', 3: '비싸다', 5: '위안', 4: '옥수수', 1: '먹었습니다'}

401
bag of words vector : [[1 1 1 1 1 1 1 1 2 1 1 1 1 1 2]]
vocabulary : {14: '후지', 2: '그룹', 4: '보고', 10: '첫날', 7: '야마나시', 8: '온천', 13: '호텔', 9: '정상', 3: '둘째', 0: '가와구치', 12: '호수', 1: '고텐', 6: '아울렛', 5: '실행', 11: '했습니다'}

402
bag of words vector : [[1 1 1 1 1 1 2]]
vocabulary : {5: '정상', 4: '온천', 0: '기분', 6: '좋다', 1: '담그지', 2: '않아도', 3: '야경'}

403
bag of words vector : [[1 2 1 2 1 1 1 2 1]]
vocabulary : {8: '주변', 2: '명소', 0: '대한', 3: '설명', 1: '매우'

In [20]:
print("Total 'cost' property score : ",showScore(cost_score)[0])
print("Average 'cost' property score : ",showScore(cost_score)[0]/showScore(cost_score)[1])
print()

        
print("Total 'landscape' property score : ",showScore(land_score)[0])
print("Average 'landscape' property score : ",showScore(land_score)[0]/showScore(land_score)[1])
print()

print("Total 'fun' property score : ",showScore(fun_score)[0])
print("Average 'fun' property score : ",showScore(fun_score)[0]/showScore(fun_score)[1])
print()

Total 'cost' property score :  -55.20000000000001
Average 'cost' property score :  -1.672727272727273

Total 'landscape' property score :  1601.7999999999993
Average 'landscape' property score :  4.086224489795916

Total 'fun' property score :  421.5
Average 'fun' property score :  6.909836065573771



In [None]:
#0~1 scaling MinMaxScaler()
from sklearn.preprocessing import MinMaxScaler
# scorelist=[-2.567045,-4.126865,-2.45609,-1.8,-2.5023,-1.038095,-2.914705,-3.4028]
scorelist=[1,2,3.5,4]
X_train=pd.DataFrame({'score':scorelist})
minmax_scaler=MinMaxScaler()

minmax_scaler.fit(X_train)
X_train_scaled=minmax_scaler.transform(X_train)

print(X_train_scaled)