In [None]:
"""
knu로 추출한 상위 기사에서, 새로운 긍정/부정 단어장을 생성한다. 
tf-idf로 word와 polarity를 형성한다. 
tf-idf의 기준을 0.2에서부터 다양하게 바꾸어 본다.
sentiment_score에서 사용 시에는 polarity를 count로 나누도록 한다.
"""


import glob
import json
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from eunjeon import Mecab
from sklearn.feature_extraction.text import TfidfVectorizer

mecab = Mecab()
stopword = ['NNBC', 'SF', 'SE', 'SSO', 'SSC', 'SC', 'SY', 'SL', 'SH', 'SN', 'JKS', 'JKC', 'JKG',
           'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC', 'NNP']


def generic_regroup(values, keys):
    """Merge dicts that share the same ``keys`` fields by summing their other fields.

    Args:
        values: Sequence of dicts. The fields to be summed are taken from the
            first dict (every field of ``values[0]`` that is not in ``keys``),
            so all dicts are expected to carry the same fields.
        keys: Names of the fields whose combined values identify a group.

    Returns:
        A list with one dict per distinct key tuple, in first-seen order.
        Each dict starts as a copy of the first member of its group and has
        every non-key field accumulated with ``+=`` over the remaining
        members. The input dicts themselves are not modified. An empty
        ``values`` yields an empty list.
    """
    # Nothing to group; also avoids the IndexError from values[0] below.
    if len(values) == 0:
        return []

    groups = dict()
    valkeys = [k for k in values[0] if k not in keys]
    for d in values:
        key = tuple(d[k] for k in keys)
        group = groups.get(key)
        if group is None:
            # First member of the group: copy so the caller's dict stays intact.
            groups[key] = d.copy()
        else:
            for k in valkeys:
                group[k] += d[k]
    return list(groups.values())

In [None]:
# Build the positive lexicon: one entry per word with its summed TF-IDF
# (as 'polarity') and the number of articles it was selected from ('count').
df_pos_name = os.path.join(os.path.dirname(os.getcwd()), 'xlsx_data', '#3_knu_score', f"{'knu_score_pos'}.xlsx")
df_pos = pd.read_excel(df_pos_name)

p_corpus, p_result = list(), list()

# One document per article: title + body, split on ".\n", each piece reduced
# to the surface forms (f[0]) whose tag (f[1]) is not listed in `stopword`.
for i in range(len(df_pos)):
    text = (str(df_pos.loc[i, 'title'])+".\n"+str(df_pos.loc[i, 'text'])).split('.\n')
    p_corpus.append(". ".join([" ".join([f[0] for f in mecab.pos(e) if not f[1] in stopword]) for e in text]))

vectorizer = TfidfVectorizer()
sp_matrix = vectorizer.fit_transform(p_corpus)

# vocabulary_ is the fitted term -> column-index mapping. Using it directly
# avoids get_feature_names() (removed in scikit-learn 1.2) and, unlike a
# defaultdict defaulting to 0, never maps an unknown token to column 0.
word2id = vectorizer.vocabulary_

for i, sent in enumerate(p_corpus):
    seen = set()  # words already recorded for this article
    for token in sent.split():
        # Skip repeats and tokens the vectorizer did not keep in its vocabulary.
        if token in seen or token not in word2id:
            continue
        score = sp_matrix[i, word2id[token]]  # read the sparse cell once
        # Keep mid-range TF-IDF only (the original note mentions 0.02 as an
        # alternative lower bound).
        if 0.05 < score < 0.5:
            seen.add(token)
            p_result.append({'word': token, 'polarity': score, 'count': 1})

# Sum polarity/count per word; guard the empty case so the cell still runs
# when no token passed the filter.
p_res = generic_regroup(p_result, ["word"]) if p_result else []
# Keep words selected from more than 10 articles. This must filter p_res;
# the previous version filtered n_res here.
p_res = [x for x in p_res if x['count'] > 10]

In [None]:
# Build the negative lexicon: one entry per word with its summed negated
# TF-IDF (as 'polarity') and the number of articles it was selected from ('count').
df_neg_name = os.path.join(os.path.dirname(os.getcwd()), 'xlsx_data', '#3_knu_score', f"{'knu_score_neg'}.xlsx")
df_neg = pd.read_excel(df_neg_name)

n_corpus, n_result = list(), list()

# One document per article: title + body, split on ".\n", each piece reduced
# to the surface forms (f[0]) whose tag (f[1]) is not listed in `stopword`.
for i in range(len(df_neg)):
    text = (str(df_neg.loc[i, 'title'])+".\n"+str(df_neg.loc[i, 'text'])).split('.\n')
    n_corpus.append(". ".join([" ".join([f[0] for f in mecab.pos(e) if not f[1] in stopword]) for e in text]))

vectorizer = TfidfVectorizer()
sp_matrix = vectorizer.fit_transform(n_corpus)

# vocabulary_ is the fitted term -> column-index mapping. Using it directly
# avoids get_feature_names() (removed in scikit-learn 1.2) and, unlike a
# defaultdict defaulting to 0, never maps an unknown token to column 0.
word2id = vectorizer.vocabulary_

for i, sent in enumerate(n_corpus):
    seen = set()  # words already recorded for this article
    for token in sent.split():
        # Skip repeats and tokens the vectorizer did not keep in its vocabulary.
        if token in seen or token not in word2id:
            continue
        score = sp_matrix[i, word2id[token]]  # read the sparse cell once
        # Keep mid-range TF-IDF only (the original note mentions 0.02 as an
        # alternative lower bound).
        if 0.05 < score < 0.5:
            seen.add(token)
            # Negative lexicon: store the score with a minus sign.
            n_result.append({'word': token, 'polarity': -score, 'count': 1})

# Sum polarity/count per word; guard the empty case so the cell still runs
# when no token passed the filter.
n_res = generic_regroup(n_result, ["word"]) if n_result else []
# Keep words selected from more than 10 articles.
n_res = [x for x in n_res if x['count'] > 10]

In [None]:
#조사 불용어에 포함
#f오류 해결
#e딕셔너리 h해결
#긍정/부정에 동시에 나오는 단어 제거
#대명사 제거

In [None]:
# Resolve words that appear in both lexicons, then write the merged lexicon to JSON.
pos_value = [p['word'] for p in p_res]
neg_value = [n['word'] for n in n_res]
z = set(pos_value).intersection(set(neg_value))  # words present on both sides

# Word -> entry lookups replace the repeated list.index() scans. Iterating in
# reverse keeps the first occurrence of a word, which is what list.index() returned.
pos_by_word = {p['word']: p for p in reversed(p_res)}
neg_by_word = {n['word']: n for n in reversed(n_res)}


def _strength(entry):
    """Return |polarity| / sqrt(count) for one lexicon entry."""
    return np.abs(entry['polarity']) / np.sqrt(entry['count'])


# A shared word stays only on the side where its strength is strictly larger;
# on an exact tie it is dropped from both lists.
pos_json = [x for x in p_res
            if x['word'] not in z or _strength(x) > _strength(neg_by_word[x['word']])]

neg_json = [x for x in n_res
            if x['word'] not in z or _strength(x) > _strength(pos_by_word[x['word']])]


total = pos_json + neg_json

with open(os.path.join(os.path.dirname(os.getcwd()), 'xlsx_data', '#3_new_dict_score', f"{'pos_neg_dict'}.json"), "w", encoding='utf-8') as outfile:
    json.dump(total, outfile)

In [None]:
# Load the merged lexicon JSON from ../xlsx_data/#3_new_dict_score into a DataFrame.
dict_json_path = os.path.join(
    os.path.dirname(os.getcwd()),
    'xlsx_data',
    '#3_new_dict_score',
    f"{'pos_neg_dict'}.json",
)
dff = pd.read_json(dict_json_path)

In [None]:
# Add a 'divided' column (polarity scaled by 1/sqrt(count)) and export the table to Excel.
count_root = np.sqrt(dff['count'])
dff['divided'] = dff['polarity'] / count_root
dff.to_excel("neg_pos_dict.xlsx")