In [1]:
from tqdm import tqdm
import pandas as pd
import pickle
import spacy
import json
import numpy as np
import csv
from collections import defaultdict
from ast import literal_eval
import pandas as pd
from senticnet.senticnet import SenticNet

# Load the `en_core_web_sm` spaCy pipeline once; the cells below only call
# its tokenizer (`spacy_en.tokenizer`).
spacy_en = spacy.load("en_core_web_sm")

### Labels

#### Labels of text: utterance tokens

In [2]:
# TEXT: collect every sentence stored in data.json

# Read the file through a context manager so the handle is closed right away
# (a bare open() inside json.load() leaves it open until garbage collection),
# and pin the encoding so the result does not depend on the platform default.
with open('../data/data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the per-entry 'sentences' lists into one list, keeping a running
# count so the flattening can be sanity-checked below.
sentences = list()
num_sentences = 0
for key, item in data.items():
    sentences.extend(item['sentences'])
    num_sentences += len(item['sentences'])

assert len(sentences) == num_sentences

# Tokenization: lower-case every sentence and split it with spaCy's tokenizer,
# keeping only the token text (these tokens are later looked up in ConceptNet).
sentences_cleaned = [
    [token.text for token in spacy_en.tokenizer(raw_sentence.lower())]
    for raw_sentence in sentences
]
    
# Build the set of n-grams over all tokenized sentences.
# For every end position, the start position walks backwards over at most `n`
# tokens, so all 1..n-grams ending there are produced, joined with "_".
# With n = 1 this yields plain unigrams.
n = 1
ngrams_all = [
    "_".join(tokens[start: end + 1])
    for tokens in sentences_cleaned
    for end in range(len(tokens))
    for start in range(end, max(end - n, -1), -1)
]

ngrams_set = set(ngrams_all)

In [3]:
# The text-modality label vocabulary is the n-gram set built from the sentences.
labels_text_set = ngrams_set

#### Labels of audio: emotions

In [4]:
# Emotion names used as the audio-modality labels.
emotions_audio = [
    'neutral',
    'calm',
    'happy',
    'sad',
    'angry',
    'fearful',
]

In [6]:
labels_audio_set = {*emotions_audio}
# Augment the audio labels with the knowledge carried by the text labels.
labels_audio_set_aug = labels_text_set | labels_audio_set

#### Labels of video: objects, face expressions, actions

In [7]:
# prepare object labels

# Load the index -> name mapping through a context manager so the file handle
# is closed immediately instead of being leaked by a bare open().
with open('../data/labels/labels_name.json') as f:
    file_name = json.load(f)

# label_name   : int index -> underscore-joined object name
# label_object : the same names in file order
# NOTE(review): each `name` is joined with '_', so it is presumably a list of
# words (e.g. ["measuring", "cup"]) - confirm against labels_name.json.
label_name = dict()
label_object = list()
for index, name in file_name.items():
    joined_name = '_'.join(name)  # build the joined form once, use it twice
    label_name[int(index)] = joined_name
    label_object.append(joined_name)

labels_object_set = set(label_object)
    
# prepare action labels

# Context manager closes the annotation file as soon as it is parsed
# (a bare open() inside json.load() leaks the handle).
with open('../data/labels/memor_2.json') as f:
    file_action = json.load(f)

label_action = list()
# NOTE(review): `flag` is not read anywhere in the visible notebook; it is kept
# only in case another cell depends on the name.
flag = False
for item in file_action:
    # Each item holds a list of clips; every clip carries one action label.
    # Spaces become underscores so multi-word actions match the "_"-joined
    # naming used for the other label sets.
    label_action.extend(segment['label'].replace(' ', '_') for segment in item['clips'])

labels_action_set = set(label_action)

# prepare face expressions: fixed list of expression names and its set form
expressions_face = [
    'angry',
    'disgust',
    'fear',
    'happy',
    'sad',
    'surprise',
    'neutral',
]

labels_expression = {*expressions_face}

In [24]:
# Video labels = object names + action names.
# Face-expression labels are currently left out; the three-way alternative is
#   set.union(labels_expression, labels_object_set, labels_action_set)
labels_video_set = labels_object_set | labels_action_set

In [25]:
# Display the combined object + action label set as a visual sanity check.
labels_video_set

{'starfish',
 'mitten',
 'batter',
 'bell',
 'measuring_cup',
 'parchment',
 'sushi',
 'bowler_hat',
 'license_plate',
 'candy_bar',
 'chocolate_bar',
 'stew',
 'perfume',
 'blowing_glass',
 'grits',
 'water_gun',
 'dining',
 'cream_pitcher',
 'wagon_wheel',
 'tiger',
 'crouton',
 'mascot',
 'bookmark',
 'owl',
 'blinker',
 'ski_boot',
 'cooking_sausages',
 'dancing_charleston',
 'cylinder',
 'taillight',
 'water_heater',
 'sail',
 'clarinet',
 'record_player',
 'Sharpie',
 'pear',
 'bulldozer',
 'boot',
 'sausage',
 'scratcher',
 'napkin',
 'boom_microphone',
 'cooking_utensil',
 'green_onion',
 'breakdancing',
 'dodgeball',
 'cassette',
 'peach',
 'sombrero',
 'necklace',
 'mint_candy',
 'pelican',
 'quiche',
 'soap',
 'poncho',
 'dress',
 'projector',
 'can',
 'life_jacket',
 'kitten',
 'cooking_chicken',
 'fire_extinguisher',
 'mat',
 'flower_arrangement',
 'strawberry',
 'igniter',
 'gun',
 'turban',
 'crutch',
 'armband',
 'tennis_ball',
 'tarp',
 'towel_rack',
 'hamburger',
 'do

### ConceptNet

In [26]:
# Labels to look up in ConceptNet: text n-grams plus video labels.
# The audio labels are currently left out; the three-way alternative is
#   set.union(labels_text_set, labels_audio_set, labels_video_set)
labels_set = labels_text_set | labels_video_set

In [27]:
# Build a neighbour map from the ConceptNet assertion dump:
#   concept_dict[label] -> {(neighbour_term, edge_weight), ...}
# Only edges whose two endpoints are both English are considered, and an edge
# is recorded under an endpoint only when that endpoint is one of our labels.
# Edges are stored in both directions, so the map is effectively undirected.
concept_dict = defaultdict(set)

# The dump is opened in a context manager so the handle is closed when parsing
# ends (the original bare open() leaked it), and UTF-8 is pinned because the
# dump is multilingual and must not depend on the platform default encoding.
with open('../data/kb/conceptnet-assertions-5.7.0.csv', 'r', encoding='utf-8') as f:
    conceptnet = csv.reader(f, delimiter='\t')

    for i, row in enumerate(conceptnet):
        if i % 1000000 == 0:
            print("Processed {0} rows".format(i))

        # Columns 2 and 3 hold the start/end concept URIs. The indexing below
        # relies on the URI shape "/<type>/<lang>/<term>[/...]"; split once.
        start_parts = row[2].split("/")
        end_parts = row[3].split("/")
        lang1 = start_parts[2]
        lang2 = end_parts[2]
        if lang1 == 'en' and lang2 == 'en':
            c1 = start_parts[3]
            c2 = end_parts[3]
            # The last column is a JSON object with edge metadata; parse it as
            # JSON rather than evaluating it as a Python literal.
            weight = json.loads(row[-1])["weight"]
            if c1 in labels_set:
                concept_dict[c1].add((c2, weight))
            if c2 in labels_set:
                concept_dict[c2].add((c1, weight))

Processed 0 rows
Processed 1000000 rows
Processed 2000000 rows
Processed 3000000 rows
Processed 4000000 rows
Processed 5000000 rows
Processed 6000000 rows
Processed 7000000 rows
Processed 8000000 rows
Processed 9000000 rows
Processed 10000000 rows
Processed 11000000 rows
Processed 12000000 rows
Processed 13000000 rows
Processed 14000000 rows
Processed 15000000 rows
Processed 16000000 rows
Processed 17000000 rows
Processed 18000000 rows
Processed 19000000 rows
Processed 20000000 rows
Processed 21000000 rows
Processed 22000000 rows
Processed 23000000 rows
Processed 24000000 rows
Processed 25000000 rows
Processed 26000000 rows
Processed 27000000 rows
Processed 28000000 rows
Processed 29000000 rows
Processed 30000000 rows
Processed 31000000 rows
Processed 32000000 rows
Processed 33000000 rows
Processed 34000000 rows


In [28]:
# Number of labels that received at least one English ConceptNet neighbour.
len(concept_dict)

11565

In [29]:
from collections import Counter

# Histogram of neighbourhood sizes: key = number of neighbours a concept has,
# value = how many concepts have exactly that many.
cntr = Counter(len(neighbours) for neighbours in concept_dict.values())

In [31]:
# Show the neighbourhood-size histogram (size -> number of concepts).
cntr

Counter({50: 58,
         24: 118,
         15: 134,
         101: 26,
         41: 67,
         178: 15,
         132: 20,
         111: 23,
         152: 14,
         349: 4,
         374: 4,
         127: 19,
         88: 36,
         473: 2,
         126: 16,
         199: 10,
         83: 27,
         333: 8,
         143: 21,
         176: 10,
         238: 9,
         52: 55,
         197: 9,
         164: 15,
         81: 36,
         328: 8,
         187: 13,
         134: 12,
         432: 4,
         184: 11,
         273: 7,
         45: 66,
         58: 38,
         138: 12,
         173: 15,
         48: 58,
         38: 58,
         461: 1,
         272: 6,
         248: 6,
         148: 15,
         87: 31,
         42: 69,
         284: 1,
         56: 36,
         359: 6,
         26: 98,
         216: 10,
         91: 28,
         656: 1,
         956: 1,
         222: 6,
         121: 15,
         320: 4,
         123: 13,
         519: 4,
         902: 3,
         

In [34]:
# Build one stop-word set from NLTK's and spaCy's English lists.
# The NLTK corpus reader is imported under an alias so that `stopwords` only
# ever names the final set; previously the imported reader was rebound to a
# set inside the same cell, giving one name two different meanings.
from nltk.corpus import stopwords as nltk_stopwords_corpus
from spacy.lang.en.stop_words import STOP_WORDS as spacy_stopwords

nltk_stopwords = nltk_stopwords_corpus.words('english')
# (older spaCy versions expose the list as spacy.lang.en.stop_words.STOP_WORDS)
stopwords = set(nltk_stopwords).union(spacy_stopwords)

In [50]:
# Stop-word-filtered copy of the neighbour map:
#   - a concept is kept only if it is a label and not a stop word;
#   - within each kept concept, neighbours that are stop words are dropped.
# A kept concept whose neighbours are all stop words maps to an empty set.
filtered_conceptnet = {
    concept: {
        (neighbour, weight)
        for neighbour, weight in neighbours
        if neighbour not in stopwords
    }
    for concept, neighbours in concept_dict.items()
    if concept in labels_set and concept not in stopwords
}

In [51]:
# Number of concepts left after removing stop-word concepts.
len(filtered_conceptnet)

11276

In [52]:
# Neighbourhood-size histogram for the stop-word-filtered map
# (size -> number of concepts with that many neighbours).
cntr2 = Counter(len(neighbours) for neighbours in filtered_conceptnet.values())

In [53]:
# List the concepts that still have more than 300 neighbours after filtering.
for concept, neighbours in filtered_conceptnet.items():
    degree = len(neighbours)
    if degree <= 300:
        continue
    print(concept, degree)

gain 335
join 362
ability 468
normal 331
fix 320
present 407
leave 445
purpose 348
acid 655
base 941
peace 310
act 877
real 320
action 1239
activity 768
rest 694
future 338
remove 557
speaker 337
biological 644
birth 313
child 1180
children 721
adult 308
childish 325
advance 317
marriage 420
relationship 492
negative 410
stop 576
away 425
age 528
grow 385
human 1919
number 955
old 517
young 379
agent 544
patient 357
defeat 328
agreement 351
air 991
earth 1121
fire 1127
ground 877
land 1055
water 2699
window 611
alcoholic 335
dead 507
free 560
crowd 499
people 2912
short 453
red 699
analysis 505
order 943
ancient 427
new 346
anger 376
animal 2281
bird 3529
man 1142
mineral 5626
person 9992
plant 5674
vegetable 505
separate 423
problem 406
question 369
wrong 322
post 624
low 411
pattern 458
sense 340
social 646
specific 323
house 1364
hide 389
appearance 409
apple 758
religious 646
fall 608
lay 304
lie 436
set 970
sleep 741
arm 561
leg 722
clothing 672
army 400
bow 404
art 1053
logic 725

In [54]:
# The 50 most frequent neighbourhood sizes as (size, number_of_concepts) pairs.
cntr2.most_common(50)

[(1, 473),
 (2, 348),
 (3, 281),
 (4, 270),
 (6, 243),
 (5, 216),
 (7, 215),
 (9, 187),
 (8, 185),
 (11, 182),
 (10, 181),
 (17, 156),
 (12, 155),
 (20, 152),
 (13, 150),
 (16, 147),
 (18, 139),
 (15, 137),
 (14, 134),
 (21, 132),
 (19, 128),
 (29, 122),
 (24, 119),
 (22, 117),
 (27, 109),
 (26, 98),
 (25, 96),
 (35, 91),
 (30, 90),
 (23, 90),
 (28, 88),
 (33, 80),
 (31, 77),
 (45, 75),
 (32, 74),
 (36, 71),
 (43, 70),
 (40, 69),
 (38, 67),
 (34, 66),
 (41, 64),
 (48, 63),
 (37, 63),
 (39, 63),
 (49, 57),
 (42, 55),
 (44, 55),
 (46, 52),
 (50, 50),
 (71, 48)]

In [42]:
# Spot-check one concept: its (neighbour, weight) pairs after filtering.
filtered_conceptnet['football']

{('11', 1.0),
 ('ba', 1.0),
 ('ball', 1.0),
 ('ball', 2.0),
 ('bent', 1.0),
 ('bladder', 1.0),
 ('bladder', 2.0),
 ('boot', 1.0),
 ('canada', 1.0),
 ('casual', 1.0),
 ('center', 2.0),
 ('complete', 2.0),
 ('completed', 2.0),
 ('corner', 1.0),
 ('cross', 1.0),
 ('crowd', 1.015),
 ('dresser', 1.0),
 ('end', 2.0),
 ('foot', 1.0),
 ('forward', 1.0),
 ('fun', 1.0),
 ('game', 1.0),
 ('game', 1.672),
 ('game', 9.165),
 ('general', 1.0),
 ('goal', 1.0),
 ('ground', 2.0),
 ('hack', 1.0),
 ('half', 2.0),
 ('hearts', 1.0),
 ('kick', 1.0),
 ('kick', 2.0),
 ('leather', 1.0),
 ('package', 1.0),
 ('pass', 1.0),
 ('pass', 2.999),
 ('passing', 2.0),
 ('pigskin', 1.0),
 ('projectile', 1.0),
 ('quarter', 2.0),
 ('rangers', 1.0),
 ('return', 2.0),
 ('roof', 1.0),
 ('rough', 1.0),
 ('round', 1.0),
 ('rugby', 1.0),
 ('rugby', 2.0),
 ('run', 1.0),
 ('running', 2.0),
 ('rush', 1.0),
 ('safety', 1.0),
 ('snap', 1.0),
 ('soccer_ball', 1.0),
 ('solo', 1.0),
 ('spike', 1.0),
 ('sport', 8.0),
 ('strip', 1.0),
 ('s

In [41]:
# Persist the label -> {(neighbour, weight)} map for later use.
# NOTE(review): this pickles the unfiltered `concept_dict`, not the
# stop-word-filtered `filtered_conceptnet` - confirm that saving the
# unfiltered map is what is intended.
with open('../data/kb/conceptnet_base.pkl', 'wb') as f:
    pickle.dump(concept_dict, f)

In [35]:
# Persist the label vocabulary (`labels_set`) that was used for the
# ConceptNet lookup.
with open('../data/kb/labels_base.pkl', 'wb') as f:
    pickle.dump(labels_set, f)

### NRC

In [23]:
# Read the tab-separated NRC VAD lexicon.
# keep_default_na=False stops pandas from converting words that look like
# missing-value markers (e.g. "null", "nan", "NA") into NaN, which would
# otherwise end up as a NaN key instead of the word itself.
# NOTE(review): assumes no score cell in the file is empty - with NA parsing
# disabled an empty cell would turn its column into strings.
df = pd.read_csv("../data/kb/NRC-VAD-Lexicon.txt", sep='\t', keep_default_na=False)

In [27]:
# Map the first column of every lexicon row (the word) to a tuple of the
# remaining column values.
# itertuples(index=False, name=None) yields plain positional tuples. This
# avoids integer `row[0]` / `row[1:]` lookups on a label-indexed Series, which
# recent pandas releases deprecate, and is much faster than iterrows().
NRC_dict = {
    record[0]: tuple(record[1:])
    for record in df.itertuples(index=False, name=None)
}

with open('../data/kb/NRC_base.pkl', 'wb') as f:
    pickle.dump(NRC_dict, f)

### SenticNet

In [28]:
# Mirror of the NRC step for SenticNet (original note: "everything in NRC,
# copy one from senticnet").
# For every concept keep five values; per the original note these are
# pleasantness, attention, sensitivity, aptitude (positions 0-3) and
# polarity (position 7).
# NOTE(review): the position -> field mapping comes from that note; confirm it
# against the installed senticnet package.
sn = SenticNet()

senticnet_dict = {
    concept: (fields[0], fields[1], fields[2], fields[3], fields[7])
    for concept, fields in sn.data.items()
}

with open('../data/kb/senticnet_base.pkl', 'wb') as f:
    pickle.dump(senticnet_dict, f)

### Test

In [42]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # These pickles are written by this notebook; never unpickle untrusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Reload the four knowledge-base artefacts to check that they round-trip.
concepts = _load_pickle('../data/kb/conceptnet_base.pkl')
NRCs = _load_pickle('../data/kb/NRC_base.pkl')
labels_test = _load_pickle('../data/kb/labels_base.pkl')
senticnets = _load_pickle('../data/kb/senticnet_base.pkl')

In [46]:
# Spot-check the reloaded ConceptNet map with one concept.
concepts['birthday']

{('28th_february_1980', 1.0),
 ('about', 0.145),
 ('about_age', 0.145),
 ('ad_günü', 1.0),
 ('added', 0.129),
 ('adlawng_natawhan', 1.0),
 ('afmælisdagur', 1.0),
 ('again', 0.223),
 ('age', 0.102),
 ('age', 4.073),
 ('age', 8.203),
 ('age_anniversary', 1.234),
 ('age_celebration', 3.47),
 ('age_commemoration', 0.189),
 ('age_counter', 0.227),
 ('age_counting', 0.239),
 ('age_date', 0.609),
 ('age_day', 0.713),
 ('age_event', 0.381),
 ('age_holiday', 0.476),
 ('age_marker', 0.186),
 ('age_party', 1.134),
 ('age_progression', 0.194),
 ('age_related', 0.641),
 ('age_tracking', 0.193),
 ('ageing', 0.422),
 ('ageing_celebration', 0.278),
 ('ages', 0.22),
 ('aging', 1.811),
 ('aging_celebration', 0.293),
 ('ago', 0.134),
 ('alive', 0.183),
 ('aniversare', 1.0),
 ('aniversari', 1.0),
 ('aniversari', 2.0),
 ('aniversario', 1.0),
 ('aniversário', 2.0),
 ('anivèsè', 1.0),
 ('anniversaire', 1.0),
 ('anniversari', 1.0),
 ('anniversarie', 1.0),
 ('anniversariu', 1.0),
 ('anniversary', 0.217),
 ('an

In [32]:
# Spot-check the reloaded NRC map: the tuple of score columns for one word.
NRCs['picture']

(0.7140000000000001, 0.308, 0.462)

In [25]:
# Spot-check the reloaded SenticNet map: the five kept values for one word.
senticnets['picture']

('0.034', '0.075', '0', '0.143', '0.084')