In [None]:
"""
knu로 추출한 상위 기사에서, 새로운 긍정/부정 단어장을 생성한다. 
tf-idf로 word와 polarity를 형성한다. 
tf-idf의 기준을 0.2에서부터 다양하게 바꾸어 본다.
setiment_score에서 사용 시에는 polarity를 count로 나누도록 한다. 
"""


import os
import pandas as pd
import json
import glob

def generic_regroup(values, keys):
    """Merge records that share the same grouping key(s), summing the other fields.

    Parameters
    ----------
    values : list of dict
        Records to merge. The fields to sum are taken from the first record:
        every field of ``values[0]`` that is not a grouping key.
    keys : str or iterable of str
        Field name(s) that identify a group. A bare string is treated as one
        field name, so ``"word"`` and ``("word",)`` are equivalent. (Note that
        ``("word")`` without a trailing comma is just the string ``"word"``.)

    Returns
    -------
    list of dict
        One record per distinct key, in order of first appearance. Each result
        is a shallow copy of the first record of its group, so the input
        records are not rebound.
    """
    # Normalise the grouping keys: iterating a plain string would yield its
    # characters and fail with KeyError on the record lookup below.
    if isinstance(keys, str):
        keys = (keys,)
    else:
        keys = tuple(keys)

    # Nothing to group; also avoids IndexError on values[0].
    if not values:
        return []

    valkeys = [k for k in values[0] if k not in keys]
    groups = {}
    for d in values:
        key = tuple(d[k] for k in keys)
        group = groups.get(key)
        if group is None:
            # First record of this group: copy so accumulation does not
            # modify the caller's dict.
            groups[key] = d.copy()
        else:
            for k in valkeys:
                group[k] += d[k]
    return list(groups.values())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from konlpy.tag import Okt

# Build the positive word dictionary: for every article in the positive
# KNU-score sheet, keep the words whose TF-IDF weight exceeds a threshold and
# sum their weights (polarity) and occurrences (count) across articles.
okt = Okt()

df_pos_name = os.path.join(os.path.dirname(os.getcwd()), 'xlsx_data', '#3_knu_score', 'knu_score_pos.xlsx')
df_pos = pd.read_excel(df_pos_name)

# Minimum TF-IDF weight for a word to enter the dictionary
# (the original note lists 0.02 as an alternative value).
p_threshold = 0.2

p_corpus, p_result = [], []

for i in range(len(df_pos)):
    text = df_pos.loc[i, 'text'].split('.')
    # okt.pos yields (word, tag) pairs; keep only the word of every pair whose
    # tag is not a particle, ending or punctuation mark.
    p_corpus.append(". ".join(
        " ".join(word for word, tag in okt.pos(e)
                 if tag not in ('Josa', 'Eomi', 'Punctuation'))
        for e in text))

vectorizer = TfidfVectorizer()
sp_matrix = vectorizer.fit_transform(p_corpus)

# term -> column index of sp_matrix. Unknown tokens are skipped below instead
# of silently being mapped to column 0.
word2id = vectorizer.vocabulary_
# Tokenise exactly as the vectorizer did, so tokens match the vocabulary
# (a plain str.split() would keep the trailing '.' on sentence-final words).
tokenize = vectorizer.build_analyzer()

for i, sent in enumerate(p_corpus):
    seen = set()  # words already recorded for this article
    for token in tokenize(sent):
        idx = word2id.get(token)
        if idx is None or token in seen:
            continue
        seen.add(token)
        weight = float(sp_matrix[i, idx])
        if weight > p_threshold:
            # Accumulate over all articles rather than keeping only the last one.
            p_result.append({'word': token, 'polarity': weight, 'count': 1})

p_res = generic_regroup(p_result, ("word",))

p_out_name = os.path.join(os.path.dirname(os.getcwd()), 'xlsx_data', '#3_new_dict_score', 'pos_dict.json')
os.makedirs(os.path.dirname(p_out_name), exist_ok=True)
with open(p_out_name, 'w') as json_file:
    json.dump(p_res, json_file)

In [None]:
# Build the negative word dictionary: same procedure as for the positive
# sheet, but the TF-IDF weight is stored negated as the polarity.
# Uses the `okt` tagger created in the preceding cell.
df_neg_name = os.path.join(os.path.dirname(os.getcwd()), 'xlsx_data', '#3_knu_score', 'knu_score_neg.xlsx')
df_neg = pd.read_excel(df_neg_name)

# Minimum TF-IDF weight for a word to enter the dictionary
# (the original note lists 0.02 as an alternative value).
n_threshold = 0.2

n_corpus, n_result = [], []

for i in range(len(df_neg)):
    text = df_neg.loc[i, 'text'].split('.')
    # okt.pos yields (word, tag) pairs; keep only the word of every pair whose
    # tag is not a particle, ending or punctuation mark.
    n_corpus.append(". ".join(
        " ".join(word for word, tag in okt.pos(e)
                 if tag not in ('Josa', 'Eomi', 'Punctuation'))
        for e in text))

vectorizer = TfidfVectorizer()
# Fit on the negative corpus (the rows of sp_matrix must line up with n_corpus).
sp_matrix = vectorizer.fit_transform(n_corpus)

# term -> column index of sp_matrix. Unknown tokens are skipped below instead
# of silently being mapped to column 0.
word2id = vectorizer.vocabulary_
# Tokenise exactly as the vectorizer did, so tokens match the vocabulary
# (a plain str.split() would keep the trailing '.' on sentence-final words).
tokenize = vectorizer.build_analyzer()

for i, sent in enumerate(n_corpus):
    seen = set()  # words already recorded for this article
    for token in tokenize(sent):
        idx = word2id.get(token)
        if idx is None or token in seen:
            continue
        seen.add(token)
        weight = float(sp_matrix[i, idx])
        if weight > n_threshold:
            # Accumulate over all articles rather than keeping only the last one.
            n_result.append({'word': token, 'polarity': -weight, 'count': 1})

n_res = generic_regroup(n_result, ("word",))

n_out_name = os.path.join(os.path.dirname(os.getcwd()), 'xlsx_data', '#3_new_dict_score', 'neg_dict.json')
os.makedirs(os.path.dirname(n_out_name), exist_ok=True)
with open(n_out_name, 'w') as json_file:
    json.dump(n_res, json_file)

In [None]:
# Merge every per-polarity dictionary JSON in the output folder into one file.
dict_dir = os.path.join(os.path.dirname(os.getcwd()), 'xlsx_data', '#3_new_dict_score')
merged_name = 'pos_neg_dict.json'

final = []
# sorted() makes the order of the merged entries deterministic.
for f in sorted(glob.glob(os.path.join(dict_dir, '*.json'))):
    # Skip the merged file written by a previous run, otherwise re-running
    # this cell would nest the old output inside the new one.
    if os.path.basename(f) == merged_name:
        continue
    with open(f, "rb") as infile:
        # Each file's content is appended as one element (one list per file).
        final.append(json.load(infile))

# json.dump writes str, so the file must be opened in text mode ("w", not "wb").
with open(os.path.join(dict_dir, merged_name), "w") as outfile:
    json.dump(final, outfile)