In [1]:
import json
import pandas as pd
import re
import spacy
import numpy as np

In [2]:
def read_food_json(filepath):
    """Load a conversation JSON file and flatten it into per-utterance lists.

    The file must contain a list of conversations, each carrying an
    ``'utterances'`` list whose items have a ``'text'`` key. Every utterance
    contributes exactly one entry to each returned list, so the three lists
    stay index-aligned.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        A ``(texts, foods, labels)`` tuple of equal-length lists.
        ``texts[i]`` is the utterance text, ``foods[i]`` holds the ``'text'``
        of each of its segments, and ``labels[i]`` holds the ``'name'`` of
        every annotation on those segments. An utterance that has no
        ``'segments'`` key, or whose segments miss an expected key, gets two
        empty lists.

    Raises:
        KeyError: If a conversation lacks ``'utterances'`` or an utterance
            lacks ``'text'``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale
    # encoding when decoding the file.
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)

    texts = []
    foods = []
    labels = []
    for conversation in data:
        for utterance in conversation['utterances']:
            texts.append(utterance['text'])

            try:
                segment_texts = []
                annotation_names = []
                for segment in utterance['segments']:
                    segment_texts.append(segment['text'])
                    annotation_names.extend(
                        annotation['name'] for annotation in segment['annotations']
                    )
            except KeyError:
                # Missing or incomplete segment data: record the utterance as
                # un-annotated instead of keeping a partially collected result.
                segment_texts = []
                annotation_names = []

            foods.append(segment_texts)
            labels.append(annotation_names)

    assert len(texts) == len(foods) == len(labels)
    return texts, foods, labels

In [3]:
# The two JSON inputs read by this notebook.
food_ordering_file = "food-ordering.json"
restauraunt_search_file = "restaurant-search.json"

# Parse each file into index-aligned (texts, foods, labels) lists.
(
    food_ordering_texts,
    food_ordering_foods,
    food_ordering_labels,
) = read_food_json(food_ordering_file)
(
    restauraunt_search_texts,
    restauraunt_search_foods,
    restauraunt_search_labels,
) = read_food_json(restauraunt_search_file)

# Concatenate both files, food-ordering entries first.
texts = [*food_ordering_texts, *restauraunt_search_texts]
foods = [*food_ordering_foods, *restauraunt_search_foods]
labels = [*food_ordering_labels, *restauraunt_search_labels]

# The merged lists must still line up one-to-one.
assert len(texts) == len(foods) == len(labels)

In [4]:
# One row per utterance: its text, its segment texts and its annotation names.
df = pd.DataFrame(
    data=list(zip(texts, foods, labels)),
    columns=["Utterances", "Segments", "Annotations"],
)

In [5]:
# Preview the first 20 rows of the combined utterance table.
df.head(20)

Unnamed: 0,Utterances,Segments,Annotations
0,Hi.,[],[]
1,How can I help you?,[],[]
2,What would you like to order in Barbecue?,[Barbecue],[food_order.type.food]
3,"Yeah, I'd like to get a rack of ribs.",[a rack of ribs],[food_order.name.item]
4,What kind of sides can I get with that?,[],[]
5,"With rack of ribs, I would suggest Barbecue sa...","[With rack of ribs, Barbecue sauce]","[food_order.name.item, food_order.name.item]"
6,"Yeah. Of course, I want the barbecue sauce and...","[barbecue sauce, ribs, slaw or macaroni and ch...","[food_order.name.item, food_order.name.item, f..."
7,Would you like to add extra?,[],[]
8,"No, I'd like to get a salad with that. No toma...","[No tomatoes, with ranch dressing, a salad]","[food_order.other_description.item, food_order..."
9,Salad with no tomatoes and ranch dressing has ...,"[Salad, with no tomatoes and ranch dressing]","[food_order.name.item, food_order.other_descri..."


In [6]:
# Inspect the last 20 rows of the combined utterance table.
df.tail(20)

Unnamed: 0,Utterances,Segments,Annotations
80138,"Okay, I found a few places you might like.",[],[]
80139,How about Spinners Pizza or Kings Subs and Piz...,"[Spinners Pizza, Kings Subs and Pizza, Depot H...","[restaurant.name.restaurant, restaurant.name.r..."
80140,"I'll go with House of Pizza, can you tell me w...",[House of Pizza],[restaurant.name.restaurant]
80141,"It is located at 53 Essex St, Andover, MA 01810.","[53 Essex St, Andover, MA 01810]",[restaurant.location]
80142,"Thank you, I'll head over there now.",[],[]
80143,You're welcome.,[],[]
80144,I'm hungry.,[],[]
80145,"Hi, how can I help you?",[],[]
80146,Sure I can help. Any particular food you are l...,[],[]
80147,I really like American food. I wonder if there...,"[American, burger]","[restaurant.type.food, restaurant.type.food]"


In [7]:
# Load the food vocabulary. Passing names= labels the single column "foods"
# and makes pandas treat every line of the CSV as data (no header row).
df_foods = pd.read_csv("food_related.csv", names=["foods"])

In [8]:
# Preview the first 20 entries of the food vocabulary.
df_foods.head(20)

Unnamed: 0,foods
0,a-yeast
1,aai
2,abalone
3,abba-zaba
4,abbreviations
5,abietate
6,above
7,absorbed
8,absorbent
9,absorbic


In [9]:
# Load the spaCy pipeline "en_core_web_lg"; the cells below use it to tokenize
# utterances and to look up per-token vectors.
nlp = spacy.load("en_core_web_lg")

In [10]:
# NOTE(review): this rebinds `labels`, which earlier held the annotation names
# used to build `df`; re-running the DataFrame cell after this one would pick
# up these token labels instead.
labels = []

# Build the lookup once. Testing membership against the raw NumPy array would
# rescan the entire food list for every single token.
food_vocab = set(df_foods["foods"])

for text in df["Utterances"]:
    doc = nlp(text)
    # 1 = the token text appears verbatim in the food list, 0 = it does not.
    # NOTE(review): the comparison is exact and case-sensitive, so a
    # capitalized token only matches an identically capitalized list entry —
    # confirm that this is intended.
    labels.append([1 if token.text in food_vocab else 0 for token in doc])

In [11]:
# Number of label sequences; one was appended per entry of df["Utterances"].
len(labels)

80158

In [12]:
# Length of the longest label sequence (0 when there are no sequences).
max_length = max(map(len, labels), default=0)

In [13]:
# Longest label sequence length; used below as the padding target.
max_length

158

In [14]:
# Right-pad every label sequence to max_length with the value 2, which is
# distinct from the 0/1 token labels. Sequences already at full length are
# reused as-is.
labels_padded = [
    label + [2] * (max_length - len(label)) if len(label) < max_length else label
    for label in labels
]

In [15]:
# Sanity check: padding must not change the number of sequences.
len(labels_padded)

80158

In [16]:
def get_placeholder_vector(d):
    """Return a float64 array of shape ``d`` in which every entry is 1.0.

    Used as a stand-in embedding for tokens that have no vector of their own.
    """
    return np.full(d, 1.0)

In [17]:
def spacy_parser(nlp, text):
    """Tokenize ``text`` and collect one vector per token.

    Args:
        nlp: A loaded spaCy pipeline (https://spacy.io/api/doc).
        text: The string to tokenize.

    Returns:
        A ``(tokens, vectors)`` tuple of equal-length lists. ``tokens`` holds
        each token's text. ``vectors`` holds ``token.vector`` when the token
        has one, otherwise an all-ones placeholder from
        ``get_placeholder_vector``.

    See https://spacy.io/usage/linguistic-features#vectors-similarity for how
    spaCy exposes token vectors.
    """
    doc = nlp(text)

    # Size placeholders from the pipeline's own vector table so they match the
    # 1-D shape of real token vectors; fall back to 300 when the pipeline
    # reports no vectors (width 0).
    placeholder_dim = nlp.vocab.vectors_length or 300

    tokens = []
    vectors = []
    for token in doc:
        if token.has_vector:
            vector = token.vector
        else:
            vector = get_placeholder_vector(placeholder_dim)

        tokens.append(token.text)
        vectors.append(vector)

    # If many tokens end up without a vector, see
    # https://spacy.io/usage/linguistic-features#adding-vectors
    return tokens, vectors

In [18]:
# Per-utterance token texts and token vectors, aligned with the rows of df.
tokens = []
embedings = []

for utterance in df["Utterances"]:
    utterance_tokens, utterance_vectors = spacy_parser(nlp, utterance)
    tokens.append(utterance_tokens)
    embedings.append(utterance_vectors)

In [19]:
# Number of embedding sequences; one was appended per row of df.
len(embedings)

80158

In [20]:
# Length of the first token vector of the first utterance.
len(embedings[0][0])

300

In [21]:
# Width of one token vector; used below to size the zero padding rows.
embeding_size = 300

In [None]:
padded_embedings = []

for embeding in embedings:
    # Never truncate: a sequence longer than max_length keeps all its rows.
    n_rows = max(len(embeding), max_length)

    # Start from an all-zero matrix so the rows after the last token are the
    # padding. float32 is the dtype of spaCy token vectors and represents the
    # all-ones placeholders exactly.
    # NOTE: the result is float32 rather than the float64 that padding with
    # Python ints used to promote to, which halves memory.
    padded = np.zeros((n_rows, embeding_size), dtype=np.float32)

    # An utterance with no tokens stays pure padding; concatenating an empty
    # list with the padding block would fail on mismatched dimensions.
    if embeding:
        padded[:len(embeding)] = embeding

    padded_embedings.append(padded)

In [None]:
# Number of padded embedding sequences.
len(padded_embedings)

In [None]:
# Convert the list of padded per-utterance sequences into a single NumPy array.
X = np.array(padded_embedings)

In [None]:
# Inspect the dimensions of the stacked array.
X.shape