In [3]:
from tqdm import tqdm
import pandas as pd
import pickle
import spacy
import json
import numpy as np
import csv
from collections import defaultdict
from ast import literal_eval
import pandas as pd
from senticnet.senticnet import SenticNet

# Load spaCy's small English pipeline; its tokenizer is used below to split utterances into tokens.
spacy_en = spacy.load("en_core_web_sm")

### Labels

#### Labels of text: utterance tokens

In [4]:
# TEXT: collect every sentence from data.json into one flat list.
# The context manager closes the file handle (json.load(open(...)) left it open).
with open('../data/data.json', 'r') as f:
    data = json.load(f)

sentences = list()
num_sentences = 0
for key, item in data.items():
    sentences.extend(item['sentences'])
    num_sentences += len(item['sentences'])

# Sanity check: the flat list holds exactly as many sentences as were counted per entry.
assert len(sentences) == num_sentences

# Tokenization: lower-case each sentence and split it with the spaCy tokenizer
# so that the resulting tokens can be looked up in ConceptNet.
sentences_cleaned = [
    [token.text for token in spacy_en.tokenizer(sentence.lower())]
    for sentence in sentences
]

# Build the n-gram vocabulary. For every end position i, all k-grams (k <= n)
# ending at i are emitted, with tokens joined by "_". With n = 1 only the
# single tokens (unigrams) are produced.
n = 1
ngrams_all = list()
for sentence in sentences_cleaned:
    for i in range(len(sentence)):
        # j walks backwards from i; max(..., -1) keeps the start index from going below 0.
        for j in range(i, max(i - n, -1), -1):
            ngrams_all.append("_".join(sentence[j: i + 1]))

ngrams_set = set(ngrams_all)

In [8]:
# The text labels are the n-gram vocabulary itself; this binds a second name to the same set object (no copy is made).
labels_text_set = ngrams_set

#### Labels of audio: emotions

In [10]:
# Emotion categories used as the audio label vocabulary.
emotions_audio = [
    'neutral',
    'calm',
    'happy',
    'sad',
    'angry',
    'fearful',
]

In [11]:
# Audio labels are the emotion categories listed in `emotions_audio`.
# (The original read the undefined name `emotions`, which raises NameError on a fresh kernel.)
labels_audio_set = set(emotions_audio)
# Augment the audio labels with the text labels so that the knowledge carried by the text is also available for audio.
labels_audio_set_aug = set.union(labels_text_set, labels_audio_set)

#### Labels of video: objects, facial expressions, actions

In [15]:
# Prepare object labels.
# labels_name.json maps an index (as a string) to a name; the name is joined
# with "_" — presumably it is a list of words (a plain string would be joined
# character by character); confirm against the file.
# The context manager closes the file handle that json.load(open(...)) left open.
with open('../data/labels/labels_name.json') as f:
    file_name = json.load(f)

label_name = dict()
label_object = list()
for index, name in file_name.items():
    joined_name = '_'.join(name)
    label_name[int(index)] = joined_name
    label_object.append(joined_name)

labels_object_set = set(label_object)

# Prepare action labels.
# Each item carries a list of clips; every clip's label has its spaces
# replaced by "_" so that it matches the underscore-joined label format.
with open('../data/labels/memor_2.json') as f:
    file_action = json.load(f)

label_action = list()
for item in file_action:
    for segment in item['clips']:
        label_action.append('_'.join(segment['label'].split(' ')))

labels_action_set = set(label_action)

# Prepare facial-expression labels.
expressions_face = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']

labels_expression = set(expressions_face)

In [16]:
# Video labels are the union of facial expressions, objects and actions.
# The object set is named `labels_object_set`; the original referenced the
# undefined `label_object_set`, which raises NameError on a fresh kernel.
labels_video_set = set.union(labels_expression, labels_object_set, labels_action_set)

In [17]:
# Display the combined video label set for inspection (the whole set is rendered, so the output is long).
labels_video_set

{'pumpkin',
 'cast',
 'pot',
 'drum',
 'grape',
 'bicycle',
 'cleaning_shoes',
 'identity_card',
 'limousine',
 'stop_sign',
 'crossing_river',
 'racket',
 'pie',
 'toothpaste',
 'cartwheeling',
 'school_bus',
 'soccer_ball',
 'tiara',
 'bolo_tie',
 'fire_hose',
 'wrench',
 'pillow',
 'crouton',
 'pigeon',
 'bobbin',
 'steering_wheel',
 'combination_lock',
 'legging',
 'shawl',
 'pepper',
 'papaya',
 'bartending',
 'assembling_computer',
 'cork',
 'clarinet',
 'French_toast',
 'waffle',
 'champagne',
 'pepper_mill',
 'thumbtack',
 'strawberry',
 'sofa',
 'bread-bin',
 'armchair',
 'surprise',
 'drone',
 'dining',
 'paddle',
 'garbage',
 'candle_holder',
 'knob',
 'pew',
 'tape_measure',
 'coin',
 'diskette',
 'musical_instrument',
 'record_player',
 'apron',
 'apple',
 'scratcher',
 'step_stool',
 'tapestry',
 'salami',
 'costume',
 'bird',
 'boot',
 'corkscrew',
 'pan',
 'bouncing_on_trampoline',
 'clutch_bag',
 'potholder',
 'baby',
 'parrot',
 'sugar_bowl',
 'snowman',
 'helmet',
 '

### ConceptNet

In [18]:
# Full label vocabulary: every text, audio and video label in one set.
labels_set = labels_text_set.union(labels_audio_set, labels_video_set)

In [44]:
# Build a lookup from each known label to the ConceptNet concepts it is linked to:
# concept_dict[label] is a set of (neighbour_concept, weight) pairs.
concept_dict = defaultdict(set)

# The context manager closes the (very large) assertions dump once the scan is done.
# newline='' is what the csv module asks for; the dump contains non-ASCII terms,
# so it is read explicitly as UTF-8 rather than with the platform default.
with open('../data/kb/conceptnet-assertions-5.7.0.csv', 'r', encoding='utf-8', newline='') as f:
    conceptnet = csv.reader(f, delimiter='\t')

    for i, row in enumerate(conceptnet):
        if i % 1000000 == 0:
            print("Processed {0} rows".format(i))

        # row[2] / row[3] are the start / end node URIs; after splitting on "/",
        # element 2 is the language code and element 3 is the concept term.
        start_parts = row[2].split("/")
        # NOTE(review): only the start node's language is checked, so end nodes
        # in other languages are still collected — confirm this is intended.
        if start_parts[2] != 'en':
            continue

        c1 = start_parts[3]
        c2 = row[3].split("/")[3]
        # The last column is a JSON object; json.loads parses it directly
        # (literal_eval was slower and would fail on JSON true/false/null).
        weight = json.loads(row[-1])["weight"]

        # Record the edge in both directions, but only under keys that are known labels.
        if c1 in labels_set:
            concept_dict[c1].add((c2, weight))
        if c2 in labels_set:
            concept_dict[c2].add((c1, weight))

Processed 0 rows
Processed 1000000 rows
Processed 2000000 rows
Processed 3000000 rows
Processed 4000000 rows
Processed 5000000 rows
Processed 6000000 rows
Processed 7000000 rows
Processed 8000000 rows
Processed 9000000 rows
Processed 10000000 rows
Processed 11000000 rows
Processed 12000000 rows
Processed 13000000 rows
Processed 14000000 rows
Processed 15000000 rows
Processed 16000000 rows
Processed 17000000 rows
Processed 18000000 rows
Processed 19000000 rows
Processed 20000000 rows
Processed 21000000 rows
Processed 22000000 rows
Processed 23000000 rows
Processed 24000000 rows
Processed 25000000 rows
Processed 26000000 rows
Processed 27000000 rows
Processed 28000000 rows
Processed 29000000 rows
Processed 30000000 rows
Processed 31000000 rows
Processed 32000000 rows
Processed 33000000 rows
Processed 34000000 rows


In [45]:
# Spot-check the neighbours collected for one label.
# concept_dict is a defaultdict, so looking up a missing key would insert an empty set instead of raising.
concept_dict['birthday_card']

{('2012', 1.0),
 ('belated_birthday_card', 1.0),
 ('birthday', 1.0),
 ('birthday_card_for_men', 1.0),
 ('birthday_card_for_women', 1.0),
 ('birthday_cards', 1.0),
 ('birthdaycard', 1.0),
 ('card', 1.0),
 ('carte_d_anniversaire', 1.0),
 ('cartolina_di_compleanno', 1.0),
 ('cartão_de_aniversário', 1.0),
 ('family_birthday_card', 1.0),
 ('fødselsdagskort', 1.0),
 ('geburtstagskarte', 1.0),
 ('greeting_card', 1.0),
 ('greeting_card', 2.0),
 ('kad_hari_jadi', 1.0),
 ("kartolina_t'għeluq_sninek", 1.0),
 ('love_birthday_card', 1.0),
 ('mail', 1.0),
 ('mail_box', 1.0),
 ('occasion_card', 1.0),
 ('synttärikortti', 1.0),
 ('syntymäpäiväkortti', 1.0),
 ('umbel', 1.0),
 ('verjaardagskaart', 1.0),
 ('wiki', 0.25),
 ('wn31', 1.0),
 ('καρτα_γενεθλιων', 1.0),
 ('поздравительная_открытка', 1.0),
 ('バースデーカード', 1.0),
 ('生日卡', 1.0),
 ('生日賀卡', 1.0),
 ('生日贺卡', 1.0)}

In [41]:
# Persist the label -> {(concept, weight)} mapping.
# It is pickled as a defaultdict(set), so it is still a defaultdict when loaded back.
with open('../data/kb/conceptnet_base.pkl', 'wb') as concept_file:
    pickle.dump(concept_dict, concept_file)

In [35]:
# Persist the full label vocabulary (text + audio + video labels).
with open('../data/kb/labels_base.pkl', 'wb') as labels_file:
    pickle.dump(labels_set, labels_file)

### NRC

In [23]:
# Load the NRC VAD lexicon (tab-separated; the first row is taken as the header).
# keep_default_na=False: by default pandas converts tokens such as "null", "NA"
# or "nan" to float NaN, which would corrupt lexicon words spelled that way.
# Assumes the lexicon has no genuinely missing cells — TODO confirm.
df = pd.read_csv("../data/kb/NRC-VAD-Lexicon.txt", sep='\t', keep_default_na=False)

In [27]:
# Map each lexicon word (first column) to a tuple of the remaining columns
# (presumably valence, arousal, dominance — confirm against the file header).
# itertuples(index=False, name=None) yields plain positional tuples, which avoids
# integer-keyed lookups (row[0]) on a label-indexed Series — a positional
# fallback that is deprecated in recent pandas — and is much faster than iterrows.
NRC_dict = {
    row[0]: tuple(row[1:])
    for row in df.itertuples(index=False, name=None)
}

with open('../data/kb/NRC_base.pkl', 'wb') as f:
    pickle.dump(NRC_dict, f)

### SenticNet

In [28]:
# Build a SenticNet lookup alongside the NRC one.
# Per the original note, the five kept fields are: pleasantness, attention,
# sensitivity, aptitude, polarity — i.e. positions 0, 1, 2, 3 and 7 of each
# entry (TODO confirm against the senticnet package's data layout).

sn = SenticNet()

senticnet_dict = {
    concept: (fields[0], fields[1], fields[2], fields[3], fields[7])
    for concept, fields in sn.data.items()
}

with open('../data/kb/senticnet_base.pkl', 'wb') as sentic_file:
    pickle.dump(senticnet_dict, sentic_file)

### Test

In [42]:
# Reload the four pickled knowledge bases written above to verify they round-trip.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles this notebook produced.
with open('../data/kb/conceptnet_base.pkl', 'rb') as concept_file:
    concepts = pickle.load(concept_file)

with open('../data/kb/NRC_base.pkl', 'rb') as nrc_file:
    NRCs = pickle.load(nrc_file)

with open('../data/kb/labels_base.pkl', 'rb') as labels_file:
    labels_test = pickle.load(labels_file)

with open('../data/kb/senticnet_base.pkl', 'rb') as sentic_file:
    senticnets = pickle.load(sentic_file)

In [46]:
# Spot-check the reloaded ConceptNet mapping for one label.
concepts['birthday']

{('28th_february_1980', 1.0),
 ('about', 0.145),
 ('about_age', 0.145),
 ('ad_günü', 1.0),
 ('added', 0.129),
 ('adlawng_natawhan', 1.0),
 ('afmælisdagur', 1.0),
 ('again', 0.223),
 ('age', 0.102),
 ('age', 4.073),
 ('age', 8.203),
 ('age_anniversary', 1.234),
 ('age_celebration', 3.47),
 ('age_commemoration', 0.189),
 ('age_counter', 0.227),
 ('age_counting', 0.239),
 ('age_date', 0.609),
 ('age_day', 0.713),
 ('age_event', 0.381),
 ('age_holiday', 0.476),
 ('age_marker', 0.186),
 ('age_party', 1.134),
 ('age_progression', 0.194),
 ('age_related', 0.641),
 ('age_tracking', 0.193),
 ('ageing', 0.422),
 ('ageing_celebration', 0.278),
 ('ages', 0.22),
 ('aging', 1.811),
 ('aging_celebration', 0.293),
 ('ago', 0.134),
 ('alive', 0.183),
 ('aniversare', 1.0),
 ('aniversari', 1.0),
 ('aniversari', 2.0),
 ('aniversario', 1.0),
 ('aniversário', 2.0),
 ('anivèsè', 1.0),
 ('anniversaire', 1.0),
 ('anniversari', 1.0),
 ('anniversarie', 1.0),
 ('anniversariu', 1.0),
 ('anniversary', 0.217),
 ('an

In [32]:
# Spot-check the reloaded NRC mapping: the value is the tuple of lexicon columns for this word.
NRCs['picture']

(0.7140000000000001, 0.308, 0.462)

In [25]:
# Spot-check the reloaded SenticNet mapping; the captured output below shows the five values are stored as strings.
senticnets['picture']

('0.034', '0.075', '0', '0.143', '0.084')