# Setup

## Imports

In [1]:
# General
import numpy as np
import pandas as pd
import re
import gc

from nltk.tokenize import regexp_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
import os
import sys
# Add the path to the utils folder
sys.path.append(os.path.abspath('../..'))
import importlib
# Custom modules
from utils import memory_usage, load_json, process_parquet_in_chunks, file_exists, save_np, load_np
from config import run_config, token_pattern, DATASET_PATH, PROCESSED_DATA_PATH
importlib.reload(sys.modules['utils'])
importlib.reload(sys.modules['config'])

<module 'config' from 'e:\\College\\4- Senior 2\\Semester 1\\NLP\\Project\\config.py'>

## Config

In [3]:
run_config()

# Load Dataset

In [4]:
train_filename = DATASET_PATH + '/PIZZA_train.json'
df_train = load_json(train_filename, cols=['train.SRC', 'train.TOP'])
df_train.head()

Unnamed: 0,train.SRC,train.TOP
0,can i have a large bbq pulled pork,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ) )
1,large pie with green pepper and with extra peperonni,(ORDER (PIZZAORDER (SIZE large ) pie with (TOPPING green pepper ) and with (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING peperonni ) ) ) )
2,i'd like a large vegetarian pizza,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE large ) (STYLE vegetarian ) pizza ) )
3,party size stuffed crust pie with american cheese and with mushroom,(ORDER (PIZZAORDER (SIZE party size ) (STYLE stuffed crust ) pie with (TOPPING american cheese ) and with (TOPPING mushroom ) ) )
4,can i have one personal sized artichoke,(ORDER can i have (PIZZAORDER (NUMBER one ) (SIZE personal sized ) (TOPPING artichoke ) ) )


In [5]:
df_dev = pd.read_json(DATASET_PATH + '/PIZZA_dev.json', lines=True)[['dev.SRC', 'dev.TOP', 'dev.PCFG_ERR']]
df_dev.head()

Unnamed: 0,dev.SRC,dev.TOP,dev.PCFG_ERR
0,i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage,(ORDER i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) ) ),False
1,five medium pizzas with tomatoes and ham,(ORDER (PIZZAORDER (NUMBER five ) (SIZE medium ) pizzas with (TOPPING tomatoes ) and (TOPPING ham ) ) ),False
2,i need to order one large vegetarian pizza with extra banana peppers,(ORDER i need to order (PIZZAORDER (NUMBER one ) (SIZE large ) (STYLE vegetarian ) pizza with (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING banana peppers ) ) ) ),False
3,i'd like to order a large onion and pepper pizza,(ORDER i'd like to order (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING onion ) and (TOPPING pepper ) pizza ) ),False
4,i'll have one pie along with pesto and ham but avoid olives,(ORDER i'll have (PIZZAORDER (NUMBER one ) pie along with (TOPPING pesto ) and (TOPPING ham ) but avoid (NOT (TOPPING olives ) ) ) ),False


In [6]:
memory_usage()

1253.41796875

# Preprocessing

## Column names

- **SRC** 
    - The source text of the pizza order as given by the user.
- **EXR** 
    - The expected representation of the pizza order in a structured format (likely a parse tree or similar structure).
- **TOP** 
    - The top-level representation of the pizza order, possibly a normalized or tokenized version of the source text.
- **PCFG_ERR** 
    - A boolean indicating whether there was an error in parsing the pizza order using a Probabilistic Context-Free Grammar (PCFG).

In [7]:
df_dev.rename(columns={
    'dev.SRC': 'src', 
    'dev.EXR': 'exr',
    'dev.TOP': 'top',
    'dev.PCFG_ERR': 'pcfg_err',
}, inplace=True)
df_train.rename(columns={
    'train.SRC': 'src', 
    'train.EXR': 'exr',
    'train.TOP': 'top',
    'train.TOP-DECOUPLED': 'decoupled',
}, inplace=True)

## Merge train and dev sets

In [8]:
df_dev = df_dev[df_dev['pcfg_err'] == 'False'][['src', 'top']]
print(df_dev.shape)
df_dev.head(1)

(242, 2)


Unnamed: 0,src,top
0,i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage,(ORDER i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) ) )


In [9]:
df = pd.concat([df_train, df_dev], ignore_index=True)

## Split Data

In [10]:
# Split the data into train, dev and test sets (Fn. shuffles the data by default)
df_train, df_test = train_test_split(df, test_size=0.05, random_state=0)
df_train, df_dev = train_test_split(df_train, test_size=0.1, random_state=0)

In [11]:
# Save the data
df_train.to_parquet(PROCESSED_DATA_PATH + '/train.parquet')
df_dev.to_parquet(PROCESSED_DATA_PATH + '/dev.parquet')
df_test.to_parquet(PROCESSED_DATA_PATH + '/test.parquet')

In [12]:
del df
gc.collect()

0

## Features

### Tokenization

In [13]:
update_tokenization = False
if update_tokenization or not file_exists(PROCESSED_DATA_PATH + '/X_train.npy'):
    # Tokenize the data
    df_train["tokenized"] = df_train["src"].progress_apply(lambda x: regexp_tokenize(x, token_pattern))
    df_dev["tokenized"] = df_dev["src"].progress_apply(lambda x: regexp_tokenize(x, token_pattern))
    df_test["tokenized"] = df_test["src"].progress_apply(lambda x: regexp_tokenize(x, token_pattern))    

    # Pad the sequences
    max_len = df_train["tokenized"].apply(lambda x: len(x)).max()
    df_train["tokenized"].progress_apply(lambda x: x.extend(['<PAD>'] * (max_len - len(x))))
    df_dev["tokenized"].progress_apply(lambda x: x.extend(['<PAD>'] * (max_len - len(x))))
    df_test["tokenized"].progress_apply(lambda x: x.extend(['<PAD>'] * (max_len - len(x))))

    X_train = df_train['tokenized'].to_numpy()
    X_dev = df_dev['tokenized'].to_numpy()
    X_test = df_test['tokenized'].to_numpy()

    # Save the tokenized data
    np.save(PROCESSED_DATA_PATH + '/X_train.npy', X_train)
    np.save(PROCESSED_DATA_PATH + '/X_dev.npy', X_dev)
    np.save(PROCESSED_DATA_PATH + '/X_test.npy', X_test)
else:
    print('Loading tokenized data')
    X_train = np.load(PROCESSED_DATA_PATH + '/X_train.npy', allow_pickle=True)
    X_dev = np.load(PROCESSED_DATA_PATH + '/X_dev.npy', allow_pickle=True)
    X_test = np.load(PROCESSED_DATA_PATH + '/X_test.npy', allow_pickle=True)
    max_len = len(X_train[0])

Loading tokenized data


In [15]:
X_train[0]

['three',
 'pizzas',
 'no',
 'american',
 'cheese',
 'and',
 'a',
 'water',
 'and',
 'one',
 'ginger',
 'ale',
 'and',
 'a',
 'san',
 'pellegrino',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>']

In [16]:
del X_train, X_dev, X_test
gc.collect()

0

In [17]:
memory_usage()

1447.9296875

## Targets

We remove the leading *ORDER* constructor from the target output sequences since it is a **universal top-level constructor** and **there is nothing to be learned from it**.

In [20]:
# train.SRC	train.EXR	train.TOP	train.TOP-DECOUPLED
def remove_order(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """
    Remove the string '(ORDER' from the beginning of the columns in the DataFrame.
    And remove the string ')' from the end of the columns in the DataFrame.
    
    df: pd.DataFrame
        The DataFrame to remove the string from.
    cols: list[str]
        The columns to remove the string from.
    """
    for col in cols:
        df[col] = df[col].str.replace(r"^\(ORDER\s?", "", regex=True)
        df[col] = df[col].str.replace(r"\)$", "", regex=True)
    
    return df

df_train = remove_order(df_train, ['top'])
df_dev = remove_order(df_dev, ['top'])
df_test = remove_order(df_test, ['top'])

In [21]:
# Update the data
df_train.to_parquet(PROCESSED_DATA_PATH + '/train.parquet')
df_dev.to_parquet(PROCESSED_DATA_PATH + '/dev.parquet')
df_test.to_parquet(PROCESSED_DATA_PATH + '/test.parquet')

In [22]:
df_train.head(1)

Unnamed: 0,src,top
1726538,three pizzas no american cheese and a water and one ginger ale and a san pellegrino,(PIZZAORDER (NUMBER three ) pizzas no (NOT (TOPPING american cheese ) ) ) and (DRINKORDER (NUMBER a ) (DRINKTYPE water ) ) and (DRINKORDER (NUMBER one ) (DRINKTYPE ginger ale ) ) and (DRINKORDER (NUMBER a ) (DRINKTYPE san pellegrino ) )


### Extract Labels

In [23]:
if not file_exists(PROCESSED_DATA_PATH + '/entities.npy'):
    full_text = " ".join(df_train['top'].to_list())
    entities = [x.group() for x in re.finditer("(?<=\()[A-Z]+(_[A-Z]+)*", full_text)]
    entities = list(set(entities)) # Unique
    np.save(PROCESSED_DATA_PATH + '/entities.npy', entities)
    del full_text
    gc.collect()
else:
    entities = np.load(PROCESSED_DATA_PATH + '/entities.npy', allow_pickle=True).tolist()
entities

['CONTAINERTYPE',
 'SIZE',
 'NUMBER',
 'STYLE',
 'VOLUME',
 'DRINKORDER',
 'TOPPING',
 'PIZZAORDER',
 'DRINKTYPE',
 'NOT',
 'QUANTITY',
 'COMPLEX_TOPPING']

In [24]:
# Using BIO Tagging
if not file_exists(PROCESSED_DATA_PATH + '/bio_entities.npy') or not file_exists(PROCESSED_DATA_PATH + '/full_entities.npy'):
    full_entities = entities.copy()
    full_entities.remove("NOT")
    for entity in ["TOPPING", "STYLE"]: # Look at EDA
        full_entities.append(f"NOT_{entity}") 
    bio_entities = [f"{letter}-{entity}" for entity in full_entities for letter in "BI"]
    bio_entities.append('O')
    np.save(PROCESSED_DATA_PATH + '/bio_entities.npy', bio_entities)
    np.save(PROCESSED_DATA_PATH + '/full_entities.npy', full_entities)
else:
    full_entities = np.load(PROCESSED_DATA_PATH + '/full_entities.npy', allow_pickle=True)
    bio_entities = np.load(PROCESSED_DATA_PATH + '/bio_entities.npy', allow_pickle=True)
bio_entities

array(['B-CONTAINERTYPE', 'I-CONTAINERTYPE', 'B-SIZE', 'I-SIZE',
       'B-NUMBER', 'I-NUMBER', 'B-STYLE', 'I-STYLE', 'B-VOLUME',
       'I-VOLUME', 'B-DRINKORDER', 'I-DRINKORDER', 'B-TOPPING',
       'I-TOPPING', 'B-PIZZAORDER', 'I-PIZZAORDER', 'B-DRINKTYPE',
       'I-DRINKTYPE', 'B-QUANTITY', 'I-QUANTITY', 'B-COMPLEX_TOPPING',
       'I-COMPLEX_TOPPING', 'B-NOT_TOPPING', 'I-NOT_TOPPING',
       'B-NOT_STYLE', 'I-NOT_STYLE', 'O'], dtype='<U17')

In [37]:
update_encoder = False
if update_encoder or not file_exists(PROCESSED_DATA_PATH + '/label_encoder.npy'):
    label_encoder = LabelEncoder()
    label_encoder.fit(list(bio_entities) + ['<PAD>'])
    # label_encoder.classes_ = np.append(label_encoder.classes_, '<PAD>')
    np.save(PROCESSED_DATA_PATH + '/label_encoder.npy', label_encoder.classes_)
else:
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.load(PROCESSED_DATA_PATH + '/label_encoder.npy')

In [38]:
label_encoder.transform(['B-PIZZAORDER'])

array([8])

In [39]:
def encode_labels(labels: list[str]):
    return label_encoder.transform(labels)
def decode_labels(labels: list[str]):
    return label_encoder.inverse_transform(labels)

In [42]:
def extract_IS_labels(row, entities):
    top = row['top']
    # Extract words and parenthesis
    pattern = r"\b\w+(?:'\w+)?(?:-\w+)*\b|[()]"
    tokens = regexp_tokenize(top, pattern)
    
    labels: list[str] = []
    count: int = 0
    # print(tokens)
    is_beginning = True
    order_type = "PIZZAORDER"
    for i, token in enumerate(tokens):
        # print(token, count)
        # Skip all entities except ["PIZZAORDER", "DRINKORDER"]
        if token in entities and token not in ["PIZZAORDER", "DRINKORDER"]:
            continue
        elif token == "(":
            count += 1
        elif token == ")":
            count -= 1
        elif token == "PIZZAORDER":
            order_type = "PIZZAORDER"
        elif token == "DRINKORDER":
            order_type = "DRINKORDER"
        
        elif count == 0:
            labels.append("O")
            is_beginning = True
        else:
            if is_beginning == True:
                labels.append("B-" + order_type)
                is_beginning = False
                continue
            if is_beginning == False:
                labels.append("I-" + order_type)
    labels = labels + ["<PAD>"] * (max_len - len(labels)) # Pad the sequence
    row['IS_encoded_labels'] = encode_labels(labels)
    return row

In [59]:
def extract_NER_labels(row, entities):
    top = row['top']
    # Extract words and parenthesis
    pattern = r"\b\w+(?:'\w+)?(?:-\w+)*\b|[()]"
    tokens = regexp_tokenize(top, pattern)
    
    labels: list[str] = []
    count: int = 0
    # print(tokens)
    is_beginning = True
    not_count = 0
    label = ""
    for i, token in enumerate(tokens):
        # print(token, count)
        if token in entities:
            # Skip ["PIZZAORDER", "DRINKORDER"] entities
            if token in ["PIZZAORDER", "DRINKORDER"]:
                continue
            elif token == "NOT":
                not_count = count
            else:
                label = token
        elif token == "(":
            count += 1
        elif token == ")":
            count -= 1
            if count == not_count:
                not_count = 0
        elif count > 1:
            if is_beginning == True:
                add_not = "NOT_" if not_count else ""
                labels.append("B-" + add_not + label)
                is_beginning = False
                continue
            if is_beginning == False:
                add_not = "NOT_" if not_count else ""
                labels.append("I-" + add_not + label)
        # First level of parenthesis (PIZZAORDER, DRINKORDER)
        else:
            labels.append("O")
            is_beginning = True

    labels = labels + ["<PAD>"] * (max_len - len(labels)) # Pad the sequence
    row['NER_encoded_labels'] = encode_labels(labels)
    return row

In [65]:
#* Note: chunk: 100k -> 30m 27.8s
#* Note: chunk: 500k -> ~19m
update_IS_labels = False
if update_IS_labels or not file_exists(PROCESSED_DATA_PATH + '/train_preprocessing.parquet'):
    process_parquet_in_chunks(
        PROCESSED_DATA_PATH + '/train.parquet',
        PROCESSED_DATA_PATH + '/train_preprocessing.parquet',
        500000,
        extract_IS_labels,
        args=tuple([entities]),
    )

100%|██████████| 500000/500000 [04:31<00:00, 1842.25it/s]


Chunk 0 processed and saved to ../../data/saved/data/train_preprocessing.parquet_0.parquet


100%|██████████| 500000/500000 [04:21<00:00, 1910.76it/s]


Chunk 1 processed and saved to ../../data/saved/data/train_preprocessing.parquet_1.parquet


100%|██████████| 500000/500000 [04:44<00:00, 1760.47it/s]


Chunk 2 processed and saved to ../../data/saved/data/train_preprocessing.parquet_2.parquet


100%|██████████| 500000/500000 [04:44<00:00, 1758.09it/s]


Chunk 3 processed and saved to ../../data/saved/data/train_preprocessing.parquet_3.parquet


100%|██████████| 100467/100467 [00:57<00:00, 1744.16it/s]


Chunk 4 processed and saved to ../../data/saved/data/train_preprocessing.parquet_4.parquet
Merging processed chunks into a single Parquet file...
Merged file saved to ../../data/saved/data/train_preprocessing.parquet


In [45]:
if update_IS_labels or not file_exists(PROCESSED_DATA_PATH + '/dev_preprocessing.parquet'):
    df_dev = df_dev.progress_apply(extract_IS_labels, axis=1, args=tuple([entities]))
    df_dev.to_parquet(PROCESSED_DATA_PATH + '/dev_preprocessing.parquet')
else:
    df_dev = pd.read_parquet(PROCESSED_DATA_PATH + '/dev.parquet')

if update_IS_labels or not file_exists(PROCESSED_DATA_PATH + '/test_preprocessing.parquet'):
    df_test = df_test.progress_apply(extract_IS_labels, axis=1, args=tuple([entities]))
    df_test.to_parquet(PROCESSED_DATA_PATH + '/test_preprocessing.parquet')
else:
    df_test = pd.read_parquet(PROCESSED_DATA_PATH + '/test.parquet')

100%|██████████| 233386/233386 [02:06<00:00, 1848.46it/s]
100%|██████████| 122835/122835 [01:06<00:00, 1840.98it/s]


In [55]:
"""
(PIZZAORDER 
    (NUMBER three ) pizzas no 
    (NOT (TOPPING american cheese ) ) ) and 
(DRINKORDER 
    (NUMBER a ) (DRINKTYPE water ) ) and 
(DRINKORDER 
    (NUMBER one ) (DRINKTYPE ginger ale ) ) and 
(DRINKORDER 
    (NUMBER a ) (DRINKTYPE san pellegrino ) )

['B-NUMBER', 'O', 'O', 'B-NOT_TOPPING', 'I-NOT_TOPPING', 'O', 'B-NUMBER', 'I-DRINKTYPE', 'O', 'B-NUMBER', 'I-DRINKTYPE', 'I-DRINKTYPE', 'O', 'B-NUMBER', 'I-DRINKTYPE', 'I-DRINKTYPE']

"""

''

In [58]:
df_train[df_train['src'].str.find("large pie with green pepper and with extra peperonni") > 0]

Unnamed: 0,src,top,IS_encoded_labels
237730,a large pie with green pepper and with extra peperonni,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) pie with (TOPPING green pepper ) and with (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING peperonni ) ) ) ),"[7, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]"
470155,can i have one large pie with green pepper and with extra peperonni,(ORDER can i have (PIZZAORDER (NUMBER one ) (SIZE large ) pie with (TOPPING green pepper ) and with (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING peperonni ) ) ) ),"[7, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]"
920373,i'd like a large pie with green pepper and with extra peperonni,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE large ) pie with (TOPPING green pepper ) and with (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING peperonni ) ) ) ),"[7, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]"


In [67]:
#* Note: chunk: 100k -> 30m 27.8s
update_NER_labels = False
if update_NER_labels or not file_exists(PROCESSED_DATA_PATH + '/train_preprocessing.parquet'):
    process_parquet_in_chunks(
        PROCESSED_DATA_PATH + '/train_preprocessing.parquet',
        PROCESSED_DATA_PATH + '/train_preprocessing.parquet',
        500000,
        extract_NER_labels,
        args=tuple([entities]),
    )

100%|██████████| 500000/500000 [04:33<00:00, 1826.87it/s]


Chunk 0 processed and saved to ../../data/saved/data/train_preprocessing.parquet_0.parquet


100%|██████████| 500000/500000 [04:22<00:00, 1903.58it/s]


Chunk 1 processed and saved to ../../data/saved/data/train_preprocessing.parquet_1.parquet


100%|██████████| 500000/500000 [04:23<00:00, 1894.17it/s]


Chunk 2 processed and saved to ../../data/saved/data/train_preprocessing.parquet_2.parquet


100%|██████████| 500000/500000 [04:41<00:00, 1777.95it/s]


Chunk 3 processed and saved to ../../data/saved/data/train_preprocessing.parquet_3.parquet


100%|██████████| 100467/100467 [01:00<00:00, 1660.39it/s]


Chunk 4 processed and saved to ../../data/saved/data/train_preprocessing.parquet_4.parquet
Merging processed chunks into a single Parquet file...
Merged file saved to ../../data/saved/data/train_preprocessing.parquet


In [61]:
if update_NER_labels or not file_exists(PROCESSED_DATA_PATH + '/dev_preprocessing.parquet'):
    df_dev = df_dev.progress_apply(extract_NER_labels, axis=1, args=tuple([entities]))
    df_dev.to_parquet(PROCESSED_DATA_PATH + '/dev_preprocessing.parquet')
else:
    df_dev = pd.read_parquet(PROCESSED_DATA_PATH + '/dev.parquet')

if update_NER_labels or not file_exists(PROCESSED_DATA_PATH + '/test_preprocessing.parquet'):
    df_test = df_test.progress_apply(extract_NER_labels, axis=1, args=tuple([entities]))
    df_test.to_parquet(PROCESSED_DATA_PATH + '/test_preprocessing.parquet')
else:
    df_test = pd.read_parquet(PROCESSED_DATA_PATH + '/test.parquet')

100%|██████████| 233386/233386 [02:07<00:00, 1832.35it/s]
100%|██████████| 122835/122835 [01:05<00:00, 1884.71it/s]


In [68]:
df_train = pd.read_parquet(PROCESSED_DATA_PATH + '/train_preprocessing.parquet')
df_train.head()

Unnamed: 0,src,top,IS_encoded_labels,NER_encoded_labels
0,three pizzas no american cheese and a water and one ginger ale and a san pellegrino,(PIZZAORDER (NUMBER three ) pizzas no (NOT (TOPPING american cheese ) ) ) and (DRINKORDER (NUMBER a ) (DRINKTYPE water ) ) and (DRINKORDER (NUMBER one ) (DRINKTYPE ginger ale ) ) and (DRINKORDER (NUMBER a ) (DRINKTYPE san pellegrino ) ),"[8, 21, 21, 21, 21, 27, 3, 16, 27, 3, 16, 16, 27, 3, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[7, 27, 27, 6, 19, 27, 7, 17, 27, 7, 17, 17, 27, 7, 17, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,three large pizzas with balsamic glaze and three party - sized pies with just a little cherry tomato,(PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING balsamic glaze ) ) and (PIZZAORDER (NUMBER three ) (SIZE party - sized ) pies with (COMPLEX_TOPPING (QUANTITY just a little ) (TOPPING cherry tomato ) ) ),"[8, 21, 21, 21, 21, 21, 27, 8, 21, 21, 21, 21, 21, 21, 21, 21, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[7, 23, 27, 27, 12, 25, 27, 7, 23, 23, 27, 27, 9, 22, 22, 25, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,i'd like a lunch sized pizza with parsley fried onions and parmesan,i'd like (PIZZAORDER (NUMBER a ) (SIZE lunch sized ) pizza with (TOPPING parsley ) (TOPPING fried onions ) and (TOPPING parmesan ) ),"[27, 27, 8, 21, 21, 21, 21, 21, 21, 21, 21, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[27, 27, 7, 23, 23, 27, 27, 12, 25, 25, 27, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,two regular pizzas without any caramelized onions,(PIZZAORDER (NUMBER two ) (SIZE regular ) pizzas without any (NOT (TOPPING caramelized onions ) ) ),"[8, 21, 21, 21, 21, 21, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[7, 23, 27, 27, 27, 6, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,i'd like a pizza with parmesan cheese beef and roasted tomatoes,i'd like (PIZZAORDER (NUMBER a ) pizza with (TOPPING parmesan cheese ) (TOPPING beef ) and (TOPPING roasted tomatoes ) ),"[27, 27, 8, 21, 21, 21, 21, 21, 21, 21, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[27, 27, 7, 27, 27, 12, 25, 25, 27, 12, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [66]:
df_train.iloc[237730]

src                                                                                                         a large pie with green pepper and with extra peperonni
top                   (PIZZAORDER (NUMBER a ) (SIZE large ) pie with (TOPPING green pepper ) and with (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING peperonni ) ) ) 
NER_encoded_labels                                        [7, 23, 27, 27, 12, 25, 27, 27, 9, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Name: 237730, dtype: object

### Save Targets

In [69]:
y1_train, y2_train = df_train['IS_encoded_labels'].to_frame(), df_train['NER_encoded_labels'].to_frame()

y1_dev, y2_dev = df_dev['IS_encoded_labels'].to_frame(), df_dev['NER_encoded_labels'].to_frame()

y1_test, y2_test = df_test['IS_encoded_labels'].to_frame(), df_test['NER_encoded_labels'].to_frame()

In [71]:
np.save(PROCESSED_DATA_PATH + '/y1_train.npy', y1_train)
np.save(PROCESSED_DATA_PATH + '/y2_train.npy', y2_train)
np.save(PROCESSED_DATA_PATH + '/y1_dev.npy', y1_dev)
np.save(PROCESSED_DATA_PATH + '/y2_dev.npy', y2_dev)
np.save(PROCESSED_DATA_PATH + '/y1_test.npy', y1_test)
np.save(PROCESSED_DATA_PATH + '/y2_test.npy', y2_test)

In [70]:
y1_train.to_parquet(PROCESSED_DATA_PATH + '/y1_train.parquet')
y2_train.to_parquet(PROCESSED_DATA_PATH + '/y2_train.parquet')
y1_dev.to_parquet(PROCESSED_DATA_PATH + '/y1_dev.parquet')
y2_dev.to_parquet(PROCESSED_DATA_PATH + '/y2_dev.parquet')
y1_test.to_parquet(PROCESSED_DATA_PATH + '/y1_test.parquet')
y2_test.to_parquet(PROCESSED_DATA_PATH + '/y2_test.parquet')