In [91]:
import json
import pickle
from collections import Counter
from senticnet.senticnet import SenticNet
from nltk.stem.snowball import SnowballStemmer
import numpy as np
import os

# Names of the five SenticNet scores this notebook works with: the polarity
# intensity plus the four "sentics" dimensions.
sn_labels = [
    "polarity_intense",
    "pleasantness",
    "attention",
    "sensitivity",
    "aptitude",
]

# SenticNet knowledge base; later cells read `sn.data` and call `sn.concept(...)`.
sn = SenticNet()

# English Snowball stemmer, used as a fallback lookup form for words that are
# not SenticNet keys themselves.
stemmer = SnowballStemmer("english")

In [19]:
# Number of concepts in the SenticNet knowledge base; denominator for the means below.
num_keys = len(sn.data)

# The five scores tracked per concept, in the order the result dicts expose them.
stat_labels = ("polarity_intense", "pleasantness", "attention", "sensitivity", "aptitude")

# Running extrema start from sentinels chosen to lie outside the range of the scores.
values_max = {label: -100.0 for label in stat_labels}
values_min = {label: 100.0 for label in stat_labels}

# Accumulates per-dimension totals in the loop and is converted to means afterwards.
sums = {label: 0 for label in stat_labels}

for concept_key in sn.data.keys():
    entry = sn.concept(concept_key)
    for label in stat_labels:
        # polarity_intense is stored at the top level of the entry; the other
        # four scores are nested under the "sentics" key.
        container = entry if label == "polarity_intense" else entry["sentics"]
        score = float(container[label])

        values_max[label] = max(score, values_max[label])
        values_min[label] = min(score, values_min[label])
        sums[label] += score

# Turn the accumulated totals into per-dimension means (in place).
for label in stat_labels:
    sums[label] /= num_keys

In [16]:
# Largest value seen for each of the five scores across all SenticNet concepts.
values_max

{'polarity_intense': 1.0,
 'pleasantness': 1.0,
 'attention': 1.0,
 'sensitivity': 1.0,
 'aptitude': 1.0}

In [17]:
# Smallest value seen for each of the five scores across all SenticNet concepts.
values_min

{'polarity_intense': -1.0,
 'pleasantness': -1.0,
 'attention': -1.0,
 'sensitivity': -1.0,
 'aptitude': -1.0}

In [20]:
# Despite its name, `sums` holds per-score means at this point: every total
# was divided by num_keys at the end of the statistics cell.
sums

{'polarity_intense': 0.003717920000000197,
 'pleasantness': -0.0033726700000000194,
 'attention': -0.0069745999999990356,
 'sensitivity': 0.14064387000000453,
 'aptitude': -0.0446053700000004}

In [89]:
def build_vocab(min_frequency=1, features_dir="../data/features"):
    """Build concept <-> id lookup tables from the extracted concept files.

    Concepts are collected from the face/object/action visual concept files,
    the audio concept file, the text concept file and the module-level
    ``sn_labels`` list, and counted with a ``Counter``.

    Args:
        min_frequency: Minimum number of occurrences a concept needs to get
            its own id. Concepts seen fewer times are mapped to the
            ``'<unk>'`` id (0). The default of 1 keeps every concept.
        features_dir: Directory holding ``visual_features/face_concepts.json``,
            ``visual_features/object_concepts.json``,
            ``visual_features/action_concepts.json``, ``audio_concepts.json``
            and ``text_concepts.json``.

    Returns:
        A ``(word2id, id2word)`` tuple. ``word2id`` maps every concept (and
        ``'<unk>'``) to an integer id; ``id2word`` maps each assigned id back
        to its concept. Ids of kept concepts are contiguous, starting at 1, in
        first-seen order.

    Raises:
        KeyError: If a key of the face concept file is missing from the
            object or action concept file.
    """
    def _load_json(relative_path):
        # The context manager closes the handle as soon as parsing is done,
        # instead of leaving it to the garbage collector.
        with open(os.path.join(features_dir, relative_path)) as handle:
            return json.load(handle)

    faces_concepts = _load_json("visual_features/face_concepts.json")
    obj_concepts = _load_json("visual_features/object_concepts.json")
    action_concepts = _load_json("visual_features/action_concepts.json")
    audio_concepts = _load_json("audio_concepts.json")
    text_concepts = _load_json("text_concepts.json")

    # A Counter is used to mirror the original implementation; a plain dict
    # would work just as well when no frequency filtering is requested.
    words_frequency = Counter()

    # Visual concepts: the three visual files are keyed alike, and the face
    # file's keys drive the iteration.
    for key in faces_concepts:
        words_frequency.update(faces_concepts[key] + obj_concepts[key] + action_concepts[key])

    # Each audio entry is a single concept, so wrap it in a list to count it
    # as one item rather than character by character.
    for audio_concept in audio_concepts.values():
        words_frequency.update([audio_concept])

    for items in text_concepts.values():
        words_frequency.update(items)

    # Add the SenticNet score labels to the vocabulary as well.
    words_frequency.update(sn_labels)

    word2id = {'<unk>': 0}
    id2word = {0: '<unk>'}  # not needed without sampling; kept for later use

    # Apply the frequency filter while assigning ids, so kept concepts get
    # gap-free ids regardless of how many rare concepts are dropped.
    next_id = 1
    for word, count in words_frequency.items():
        if count >= min_frequency:
            word2id[word] = next_id
            id2word[next_id] = word
            next_id += 1
        else:
            # Too rare: fold the concept into <unk>.
            word2id[word] = 0

    return word2id, id2word

In [90]:
# Build the concept vocabulary. build_vocab() reads its JSON inputs from
# ../data/features, resolved relative to the current working directory.
concept2id, id2concept = build_vocab()

In [104]:
# Maps each vocabulary concept found in SenticNet (directly or via its stem)
# to the Euclidean norm of its five-score vector.
sn_items = dict()

# Running extrema of those norms. The sentinels are safe as long as every
# score stays within [-1, 1] (the range recorded by the statistics cell),
# because a five-component norm then cannot exceed sqrt(5).
v_max, v_min = -100, 100

for word in concept2id.keys():
    # Prefer the surface form; fall back to the Snowball stem only when the
    # word itself is not a SenticNet key. The stem is computed once.
    if word in sn.data:
        lookup = word
    else:
        lookup = stemmer.stem(word)
        if lookup not in sn.data:
            continue  # neither form is known to SenticNet

    item = sn.concept(lookup)
    sentics = item["sentics"]
    scores = np.array([
        float(item["polarity_intense"]),
        float(sentics["pleasantness"]),
        float(sentics["attention"]),
        float(sentics["sensitivity"]),
        float(sentics["aptitude"]),
    ])
    norm = np.linalg.norm(scores)

    # Results are keyed by the original vocabulary word, not by the stem.
    sn_items[word] = norm

    # Track the range of norms; without this the pair printed in the next
    # cell would still hold the sentinel values.
    v_max = max(v_max, norm)
    v_min = min(v_min, norm)

1.222604187789327 1.222604187789327
1.8056525136359984 1.222604187789327
1.8056525136359984 1.222604187789327
1.8056525136359984 0.9333809511662428
1.8056525136359984 0.9333809511662428
1.8056525136359984 0.9333809511662428
1.8056525136359984 0.9333809511662428
1.8056525136359984 0.12947586647711612
1.8056525136359984 0.12947586647711612
1.8056525136359984 0.12947586647711612
1.8056525136359984 0.066783231428256
1.8056525136359984 0.066783231428256
1.8056525136359984 0.066783231428256
1.8056525136359984 0.066783231428256
1.8056525136359984 0.066783231428256
1.8056525136359984 0.066783231428256
1.8056525136359984 0.066783231428256
1.8056525136359984 0.066783231428256
1.8056525136359984 0.066783231428256
1.8056525136359984 0.066783231428256
1.8056525136359984 0.066783231428256
1.8056525136359984 0.066783231428256
1.8056525136359984 0.066783231428256
1.8056525136359984 0.066783231428256
1.8056525136359984 0.066783231428256
1.8056525136359984 0.066783231428256
1.8056525136359984 0.06678323

1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.05385

1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.05385

1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.053851648071345036
1.935024806042548 0.05385

1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036


1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036
1.9421369673635276 0.053851648071345036


1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665


1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665
1.9421369673635276 0.048948953002081665


In [105]:
# Show the v_max / v_min pair defined in the sn_items cell.
print(v_max, v_min)

1.9421369673635276 0.048948953002081665


In [110]:
# Scratch check: (value - lower) / (upper - lower) with value = the printed
# v_min and bounds 0.04894 / 1.9422.
# NOTE(review): the bounds look like hand-rounded versions of the printed
# v_min / v_max, i.e. a min-max normalisation sanity check — confirm, and
# derive them from v_min / v_max instead of hard-coding.
(0.048948953002081665 - 0.04894) / (1.9422 - 0.04894)

4.728881443471693e-06