In [4]:
import joblib
import pandas as pd
import numpy as np
import jieba

# ============================== Category ==============================
baidu_emotions = ['angry', 'disgusting', 'fearful',
                  'happy', 'sad', 'neutral', 'pessimistic', 'optimistic']
baidu_emotions.sort()

baidu_emotions_2_index = dict(zip(baidu_emotions, [i for i in range(len(baidu_emotions))]))


def baidu_arr(emotions_dict):
    arr = np.zeros(len(baidu_emotions))

    if emotions_dict is None:
        return arr

    for k, v in emotions_dict.items():
        # like -> happy
        if k == 'like':
            arr[baidu_emotions_2_index['happy']] += v
        else:
            arr[baidu_emotions_2_index[k]] += v

    return arr

# ============================== Lexicon and Intensity ==============================


# load negation words
negation_words = []
with open('../../resources/Chinese/others/negative/negationWords.txt', 'r') as src:
    lines = src.readlines()
    for line in lines:
        negation_words.append(line.strip())

print('\nThe num of negation words: ', len(negation_words))


# load degree words
how_words_dict = dict()
with open('../../resources/Chinese/HowNet/intensifierWords.txt', 'r') as src:
    lines = src.readlines()
    for line in lines:
        how_word = line.strip().split()
        how_words_dict[' '.join(how_word[:-1])] = float(how_word[-1])

print('The num of degree words: ', len(how_words_dict),
      '. eg: ', list(how_words_dict.items())[0])



# negation value and degree value
def get_not_and_how_value(cut_words, i, windows):
    
    # not_cnt means negation value
    # how_v means degree value
    
    not_cnt = 0
    how_v = 1

    left = 0 if (i - windows) < 0 else (i - windows)
    window_text = ' '.join(cut_words[left:i])

    for w in negation_words:
        if w in window_text:
            not_cnt += 1
    for w in how_words_dict.keys():
        if w in window_text:
            how_v *= how_words_dict[w]

    # for w in cut_words[left:i]:
    #     if w in negation_words:
    #         not_cnt += 1
    #     if w in how_words_dict:
    #         how_v *= how_words_dict[w]

    return (-1) ** not_cnt, how_v


_, words2array = joblib.load('../../resources/Chinese/大连理工大学情感词汇本体库/preprocess/words2array_27351.pkl')

print('[Dalianligong]\tThere are {} words, the dimension is {}'.format(
    len(words2array), words2array['快乐'].shape))


def dalianligong_arr(cut_words, windows=2):
    arr = np.zeros(29)

    for i, word in enumerate(cut_words):
        if word in words2array:
            not_v, how_v = get_not_and_how_value(cut_words, i, windows)
            arr += not_v * how_v * words2array[word]

    return arr

# ============================== Sentiment Scores ==============================


boson_words_dict = dict()
with open('../../resources/Chinese/BosonNLP/BosonNLP_sentiment_score.txt', 'r') as src:
    lines = src.readlines()
    for line in lines:
        boson_word = line.strip().split()
        if len(boson_word) != 2:
            # print(line)
            continue
        else:
            boson_words_dict[boson_word[0]] = float(boson_word[1])
print('[BosonNLP]\t There are {} words'.format(len(boson_words_dict)))


def boson_value(cut_words, windows=2):
    value = 0

    for i, word in enumerate(cut_words):
        if word in boson_words_dict:
            not_v, how_v = get_not_and_how_value(cut_words, i, windows)
            value += not_v * how_v * boson_words_dict[word]

    return value

# ============================== Auxilary Features ==============================


# Emotion
emoticon_df = pd.read_csv(
    '../../resources/Chinese/others/emoticon/emoticon.csv')
emoticons = emoticon_df['emoticon'].tolist()
emoticon_types = list(set(emoticon_df['label'].tolist()))
emoticon_types.sort()
emoticon2label = dict(
    zip(emoticon_df['emoticon'].tolist(), emoticon_df['label'].tolist()))
emoticon2index = dict(
    zip(emoticon_types, [i for i in range(len(emoticon_types))]))

print('[Emoticon]\tThere are {} emoticons, including {} categories'.format(
    len(emoticons), len(emoticon_types)))


def emoticon_arr(text, cut_words):
    arr = np.zeros(len(emoticon_types))

    if len(cut_words) == 0:
        return arr

    for i, emoticon in enumerate(emoticons):
        if emoticon in text:
            arr[emoticon2index[emoticon2label[emoticon]]
                ] += text.count(emoticon)

    return arr / len(cut_words)


# Punctuation
def symbols_count(text):
    excl = (text.count('!') + text.count('！')) / len(text)
    ques = (text.count('?') + text.count('？')) / len(text)
    comma = (text.count(',') + text.count('，')) / len(text)
    dot = (text.count('.') + text.count('。')) / len(text)
    ellip = (text.count('..') + text.count('。。')) / len(text)

    return excl, ques, comma, dot, ellip


# Sentimental Words
def init_words(file):
    with open(file, 'r', encoding='utf-8') as src:
        words = src.readlines()
        words = [l.strip() for l in words]
    # print('File: {}, Words_sz = {}'.format(file.split('/')[-1], len(words)))
    return list(set(words))


pos_words = init_words('../../resources/Chinese/HowNet/正面情感词语（中文）.txt')
pos_words += init_words('../../resources/Chinese/HowNet/正面评价词语（中文）.txt')
neg_words = init_words('../../resources/Chinese/HowNet/负面情感词语（中文）.txt')
neg_words += init_words('../../resources/Chinese/HowNet/负面评价词语（中文）.txt')

pos_words = set(pos_words)
neg_words = set(neg_words)
print('[HowNet]\tThere are {} positive words and {} negative words'.format(
    len(pos_words), len(neg_words)))


def sentiment_words_count(cut_words):
    if len(cut_words) == 0:
        return [0, 0, 0, 0]

    # positive and negative words
    sentiment = []
    for words in [pos_words, neg_words]:
        c = 0
        for word in words:
            if word in cut_words:
                # print(word)
                c += 1
        sentiment.append(c)
    sentiment = [c / len(cut_words) for c in sentiment]

    # degree words
    degree = 0
    for word in how_words_dict:
        if word in cut_words:
            # print(word)
            degree += how_words_dict[word]

    # negation words
    negation = 0
    for word in negation_words:
        negation += cut_words.count(word)
    negation /= len(cut_words)

    sentiment += [degree, negation]

    return sentiment


# Personal Pronoun
first_pronoun = init_words(
    '../../resources/Chinese/others/pronoun/1-personal-pronoun.txt')
second_pronoun = init_words(
    '../../resources/Chinese/others/pronoun/2-personal-pronoun.txt')
third_pronoun = init_words(
    '../../resources/Chinese/others/pronoun/3-personal-pronoun.txt')
pronoun_words = [first_pronoun, second_pronoun, third_pronoun]


def pronoun_count(cut_words):
    if len(cut_words) == 0:
        return [0, 0, 0]

    pronoun = []
    for words in pronoun_words:
        c = 0
        for word in words:
            c += cut_words.count(word)
        pronoun.append(c)

    return [c / len(cut_words) for c in pronoun]


# Auxilary Features
def auxilary_features(text, cut_words):
    arr = np.zeros(17)

    arr[:5] = emoticon_arr(text, cut_words)
    arr[5:10] = symbols_count(text)
    arr[10:14] = sentiment_words_count(cut_words)
    arr[14:17] = pronoun_count(cut_words)

    return arr


The num of negation words:  44
The num of degree words:  214 . eg:  ('百分之百', 2.0)
[Dalianligong]	There are 27351 words, the dimension is (29,)
[BosonNLP]	 There are 114766 words
[Emoticon]	There are 104 emoticons, including 5 categories
[HowNet]	There are 4528 positive words and 4320 negative words


In [5]:
# ============================== Main ==============================

# tokenize by jieba
def cut_words_from_text(text):
    return list(jieba.cut(text))

# 新聞發布者的情緒
def extract_publisher_emotion(content, content_words, emotions_dict):
    text, cut_words = content, content_words

    arr = np.zeros(55)
    arr[:8] = baidu_arr(emotions_dict)
    #arr[8:37] = dalianligong_arr(cut_words)
    #arr[37:38] = boson_value(cut_words)
    #arr[38:55] = auxilary_features(text, cut_words)

    return arr


def extract_dual_emotion(piece):
    publisher_emotion = extract_publisher_emotion(piece['content'], 
                                                  piece['content_words'], 
                                                  piece['content_emotions'])
    return publisher_emotion

In [4]:
import os
import json
from tqdm import tqdm
import time
import numpy as np
import sys
sys.path.append('../emotion')
import extract_emotion_ch

save_dir = './temp'
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

datasets_ch = ["Taiwan"]

for dataset in datasets_ch :
    print('\n\n{} [{}]\tProcessing the dataset: {} {}\n'.format(
        '-'*20, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), dataset, '-'*20))

    if dataset in datasets_ch:
        extract_pkg = extract_emotion_ch

    data_dir = os.path.join('../../dataset', dataset)
    output_dir = os.path.join(save_dir, dataset)
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    emotion_dir = os.path.join(output_dir, 'emotions')
    if not os.path.exists(emotion_dir):
        os.mkdir(emotion_dir)

    split_datasets = [json.load(open(os.path.join(
        data_dir, '{}.json'.format(t)), 'r')) for t in ['train', 'test']]
    split_datasets = dict(zip(['train', 'test'], split_datasets))

    for t, pieces in split_datasets.items():
        arr_is_saved = False
        json_is_saved = False
        for f in os.listdir(output_dir):
            if '.npy' in f and t in f:
                arr_is_saved = True
            if t in f:
                json_is_saved = True

        if arr_is_saved:
            continue

        if json_is_saved:
            pieces = json.load(
                open(os.path.join(output_dir, '{}.json'.format(t)), 'r'))

        # words cutting
        if 'content_words' not in pieces[0].keys():
            print('[{}]\tWords Cutting...'.format(
                time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))
            for p in tqdm(pieces):
                p['content_words'] = extract_pkg.cut_words_from_text(p['content'])
                
            # add content_words in dataset and save it in output_dir
            with open(os.path.join(output_dir, '{}.json'.format(t)), 'w') as f:
                json.dump(pieces, f, indent=4, ensure_ascii=False)
                
                # indent=4 prettify json format
       
        emotion_arr = [extract_pkg.extract_dual_emotion(p) for p in tqdm(pieces)]
        emotion_arr = np.array(emotion_arr)
        print(emotion_arr)
        print('{} dataset: got a {} emotion arr'.format(t, emotion_arr.shape))
        np.save(os.path.join(emotion_dir, '{}_{}.npy'.format( t, emotion_arr.shape)), emotion_arr)


The num of negation words:  44
The num of degree words:  214 . eg:  ('百分之百', 2.0)
[Dalianligong]	There are 27351 words, the dimension is (29,)
[BosonNLP]	 There are 114766 words
[Emoticon]	There are 104 emoticons, including 5 categories
[HowNet]	There are 4528 positive words and 4320 negative words


-------------------- [2021-07-10 16:14:36]	Processing the dataset: Weibo-20 --------------------



  0%|          | 0/3816 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


[2021-07-10 16:14:36]	Words Cutting...


Loading model cost 0.530 seconds.
Prefix dict has been built successfully.
100%|██████████| 3816/3816 [00:02<00:00, 1804.72it/s]
100%|██████████| 3816/3816 [00:19<00:00, 194.14it/s]
 19%|█▉        | 248/1274 [00:00<00:00, 2471.70it/s]

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.02325581 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.01219512 0.        ]
 [0.         0.         0.         ... 0.         0.         0.015625  ]]
train dataset: got a (3816, 55) emotion arr
[2021-07-10 16:15:07]	Words Cutting...


100%|██████████| 1274/1274 [00:00<00:00, 2394.64it/s]
100%|██████████| 1274/1274 [00:06<00:00, 191.05it/s]

[[0.         0.         0.         ... 0.         0.         0.02222222]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.04411765 0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.886281   0.         ... 0.         0.01123596 0.        ]]
test dataset: got a (1274, 55) emotion arr





In [9]:
_, words2array = joblib.load('../../resources/Chinese/大连理工大学情感词汇本体库/preprocess/words2array_27351.pkl')

print('[Dalianligong]\tThere are {} words, the dimension is {}'.format(
    len(words2array), words2array['快乐'].shape))

[Dalianligong]	There are 27351 words, the dimension is (29,)


In [11]:
_

{'NA': 0,
 'NB': 1,
 'NC': 2,
 'ND': 3,
 'NE': 4,
 'NG': 5,
 'NH': 6,
 'NI': 7,
 'NJ': 8,
 'NK': 9,
 'NL': 10,
 'NN': 11,
 'PA': 12,
 'PB': 13,
 'PC': 14,
 'PD': 15,
 'PE': 16,
 'PF': 17,
 'PG': 18,
 'PH': 19,
 'PK': 20}

In [10]:
words2array

{'脏乱': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 7., 0., 0., 0., 2., 0., 0., 0.]),
 '糟报': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 5., 0., 0., 0., 2., 0., 0., 0.]),
 '早衰': array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 5., 0., 0., 0., 2., 0., 0., 0.]),
 '责备': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 5., 0., 0., 0., 2., 0., 0., 0.]),
 '贼眼': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 5., 0., 0., 0., 2., 0., 0., 0.]),
 '战祸': array([0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 5., 5., 0., 0., 2., 2., 0., 0.]),
 '招灾': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 5., 0., 0., 0., 2., 0., 0., 0.]),
 '折辱': array([0., 0.