In [1]:
import json
import numpy as np
from tqdm import tqdm

import sys
sys.path.append("..")


In [2]:
from helper_functions import extract_text,load_data,apply_negation,get_entity

In [3]:
# Relative locations of the raw PIZZA dataset files, one per split.
TRAIN_PATH, DEV_PATH, TEST_PATH = (
    '../data/raw/PIZZA_train.json',
    '../data/raw/PIZZA_dev.json',
    '../data/raw/PIZZA_test.json',
)

In [5]:
def _load_split_columns(path, split):
    """Read one dataset split and return every extracted field as its own array.

    ``extract_text`` is applied to each record yielded by ``load_data``; the
    per-record results are then transposed with ``zip`` so that each field
    becomes one ``np.array``.
    """
    per_record = [extract_text(record, split) for record in load_data(path, split)]
    return tuple(np.array(column) for column in zip(*per_record))


# The first field of every split is discarded here, hence the `_` placeholder.
_, explanations_train, topics_train, decoupled_topics_train = _load_split_columns(TRAIN_PATH, 'train')
_, explanations_dev, topics_dev, decoupled_topics_dev = _load_split_columns(DEV_PATH, 'dev')
_, explanations_test, topics_test, decoupled_topics_test = _load_split_columns(TEST_PATH, 'test')

In [6]:
def _bracket_labels(topics):
    """Collect the upper-case labels that open a bracket (e.g. ``(SIZE``) in the given topic strings.

    Each topic is split on whitespace; a token counts as a label only when it
    starts with ``(`` and the remainder is upper-case. The returned set holds
    the labels without the leading parenthesis.
    """
    # The leading "(" is required explicitly: testing only `isupper()` would
    # also accept any all-caps word and then silently drop its first character.
    return {
        token[1:]
        for topic in topics
        for token in topic.split()
        if token.startswith('(') and token[1:].isupper()
    }


entities_dev = _bracket_labels(topics_dev)
entities_train = _bracket_labels(topics_train)

# The label inventory is the union of the labels seen in the dev and train topics.
full_entities = entities_dev | entities_train

# NOTE: the misspelled name is kept on purpose because later cells reference it.
enitities_exclude_not = full_entities - {'NOT'}

# Every label except 'NOT' itself is mapped to a "NOT_"-prefixed counterpart.
negate_mapping = {entity: "NOT_" + entity for entity in enitities_exclude_not}

In [7]:
# Element-wise wrapper around apply_negation, built once and reused for every split.
_negate_topics = np.vectorize(
    lambda topic: apply_negation(topic, negate_mapping, enitities_exclude_not)
)

negated_topics_train = _negate_topics(topics_train)
negated_topics_dev = _negate_topics(topics_dev)
negated_topics_test = _negate_topics(topics_test)

In [8]:
# Work on a copy so that full_entities itself stays untouched.
final_entities = set(full_entities)
# set.remove raises KeyError if one of these labels is absent from the set.
for dropped_label in ('NOT', 'COMPLEX_TOPPING', 'PIZZAORDER', 'DRINKORDER', 'ORDER'):
    final_entities.remove(dropped_label)
print(final_entities)

{'SIZE', 'VOLUME', 'STYLE', 'QUANTITY', 'CONTAINERTYPE', 'NUMBER', 'TOPPING', 'DRINKTYPE'}


In [9]:
# Pizza label set: final_entities minus the three labels removed below.
pizza_entities = set(final_entities)
# set.remove raises KeyError if one of these labels is absent from the set.
for dropped_label in ('CONTAINERTYPE', 'DRINKTYPE', 'VOLUME'):
    pizza_entities.remove(dropped_label)
print(pizza_entities)

{'SIZE', 'STYLE', 'QUANTITY', 'NUMBER', 'TOPPING'}


In [10]:
# Map every pizza label (other than 'NOT') to its "NOT_"-prefixed counterpart.
negate_mapping_pizza = {
    label: "NOT_" + label
    for label in pizza_entities
    if label != 'NOT'
}

print(negate_mapping_pizza)

{'SIZE': 'NOT_SIZE', 'STYLE': 'NOT_STYLE', 'QUANTITY': 'NOT_QUANTITY', 'NUMBER': 'NOT_NUMBER', 'TOPPING': 'NOT_TOPPING'}


In [11]:
# Drink label set: final_entities minus the three labels removed below.
drink_entities = set(final_entities)
# set.remove raises KeyError if one of these labels is absent from the set.
for dropped_label in ('TOPPING', 'STYLE', 'QUANTITY'):
    drink_entities.remove(dropped_label)
print(drink_entities)

{'SIZE', 'VOLUME', 'CONTAINERTYPE', 'NUMBER', 'DRINKTYPE'}


In [12]:
# Map every drink label (other than 'NOT') to its "NOT_"-prefixed counterpart.
negate_mapping_drink = {
    label: "NOT_" + label
    for label in drink_entities
    if label != 'NOT'
}

print(negate_mapping_drink)

{'SIZE': 'NOT_SIZE', 'VOLUME': 'NOT_VOLUME', 'CONTAINERTYPE': 'NOT_CONTAINERTYPE', 'NUMBER': 'NOT_NUMBER', 'DRINKTYPE': 'NOT_DRINKTYPE'}


In [13]:
# Each final label set holds the plain labels together with their "NOT_" counterparts.
final_negated_entities_pizza = pizza_entities.union(negate_mapping_pizza.values())
final_negated_entities_drink = drink_entities.union(negate_mapping_drink.values())

print(final_negated_entities_pizza)
print(final_negated_entities_drink)

{'SIZE', 'NOT_STYLE', 'STYLE', 'NOT_NUMBER', 'QUANTITY', 'NOT_TOPPING', 'NUMBER', 'TOPPING', 'NOT_QUANTITY', 'NOT_SIZE'}
{'SIZE', 'VOLUME', 'NOT_DRINKTYPE', 'NOT_NUMBER', 'NOT_CONTAINERTYPE', 'CONTAINERTYPE', 'NOT_VOLUME', 'NUMBER', 'DRINKTYPE', 'NOT_SIZE'}


In [14]:
# Persist the label inventories in sorted order. Iteration order of a set of
# strings can differ from one interpreter run to the next (string hashing is
# randomized), so saving `list(a_set)` directly would write the labels in a
# non-reproducible order; sorting makes the saved arrays stable across runs.
np.save('../data/processed/final_negated_entities_pizza.npy',
        np.array(sorted(final_negated_entities_pizza)))
np.save('../data/processed/final_negated_entities_drink.npy',
        np.array(sorted(final_negated_entities_drink)))

In [21]:
# Spot-check: print one negated training topic, then the result of get_entity for it.
number = 2023
sample_topic = negated_topics_train[number]
print(sample_topic)

mp = get_entity(sample_topic, final_negated_entities_pizza)
print(mp)

(ORDER i want (PIZZAORDER (NUMBER one ) (SIZE personal size ) pizza without (NOT (NOT_TOPPING bbq pulled pork ) ) ) )
['O', 'O', 'B-NUMBER', 'B-SIZE', 'I-SIZE', 'O', 'O', 'B-NOT_TOPPING', 'I-NOT_TOPPING', 'I-NOT_TOPPING']


In [22]:
def _tag_topics(topics, entity_set, desc):
    """Call ``get_entity`` on every topic, with a tqdm progress bar labelled ``desc``."""
    return [get_entity(topic, entity_set) for topic in tqdm(topics, desc=desc)]


# Every split is processed twice: once with the pizza label set, once with the drink one.
words_entities_pizza_train = _tag_topics(negated_topics_train, final_negated_entities_pizza, "train pizza")
words_entities_drink_train = _tag_topics(negated_topics_train, final_negated_entities_drink, "train drink")

words_entities_pizza_dev = _tag_topics(negated_topics_dev, final_negated_entities_pizza, "dev pizza")
words_entities_drink_dev = _tag_topics(negated_topics_dev, final_negated_entities_drink, "dev drink")

words_entities_pizza_test = _tag_topics(negated_topics_test, final_negated_entities_pizza, "test pizza")
words_entities_drink_test = _tag_topics(negated_topics_test, final_negated_entities_drink, "test drink")

train pizza: 100%|██████████| 2456446/2456446 [00:24<00:00, 102268.74it/s]
train drink: 100%|██████████| 2456446/2456446 [00:23<00:00, 105776.77it/s]
dev pizza: 100%|██████████| 348/348 [00:00<00:00, 102042.63it/s]
dev drink: 100%|██████████| 348/348 [00:00<00:00, 85920.52it/s]
test pizza: 100%|██████████| 1357/1357 [00:00<00:00, 102787.83it/s]
test drink: 100%|██████████| 1357/1357 [00:00<00:00, 106432.12it/s]


In [31]:
# (destination file, per-topic results) pairs, written in the order listed.
_tag_outputs = (
    ('../data/processed/train/words_entities_pizza_train.npy', words_entities_pizza_train),
    ('../data/processed/train/words_entities_drink_train.npy', words_entities_drink_train),
    ('../data/processed/dev/words_entities_pizza_dev.npy', words_entities_pizza_dev),
    ('../data/processed/dev/words_entities_drink_dev.npy', words_entities_drink_dev),
    ('../data/processed/test/words_entities_pizza_test.npy', words_entities_pizza_test),
    ('../data/processed/test/words_entities_drink_test.npy', words_entities_drink_test),
)

# The arrays are built with dtype=object before saving; reading them back
# with np.load needs allow_pickle=True.
for out_path, tag_lists in _tag_outputs:
    np.save(out_path, np.array(tag_lists, dtype=object))
