# Setup

## Imports

In [1]:
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn_crfsuite-0.5.0


In [2]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import os
import tqdm

import spacy
# Load spacy model for lemmatization
nlp = spacy.load("en_core_web_sm")

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import regexp_tokenize

# Models
from sklearn_crfsuite import CRF
import torch

# Evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

## Config

In [4]:
pd.set_option('display.max_colwidth', 1000) # Show all content of the cells
# # Undo with 
# pd.reset_option('display.max_colwidth')

# config tqdm for pandas
tqdm.tqdm.pandas()

In [5]:
DATASET_PATH = "/kaggle/input/pizza-dataset"
OUTPUT_ROOT_PATH = "/kaggle/working"
MODELS_PATH = OUTPUT_ROOT_PATH + "/models"
PYTORCH_MODELS_PATH = MODELS_PATH + "/checkpoints"

In [6]:
!mkdir {MODELS_PATH} {PYTORCH_MODELS_PATH}
# !rmdir {MODELS_PATH} {PYTORCH_MODELS_PATH}

In [7]:
# token_pattern=r"(?u)\b\w+\b"
token_pattern=r"(?u)\b\w+(?:'\w+)?(?:-\w+)*\b"

# Load Dataset

In [8]:
df_dev = pd.read_json(DATASET_PATH + '/dataset/PIZZA_dev.json', lines=True)
df_dev.head()

Unnamed: 0,dev.SRC,dev.EXR,dev.TOP,dev.PCFG_ERR
0,i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage,(ORDER (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) (COMPLEX_TOPPING (QUANTITY EXTRA ) (TOPPING CHEESE ) ) (TOPPING PEPPERONI ) ) (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) (TOPPING OLIVES ) (TOPPING SAUSAGE ) ) (PIZZAORDER (NUMBER 3 ) (SIZE LARGE ) (TOPPING PEPPERONI ) (TOPPING SAUSAGE ) ) ),(ORDER i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) ) ),False
1,five medium pizzas with tomatoes and ham,(ORDER (PIZZAORDER (NUMBER 5 ) (SIZE MEDIUM ) (TOPPING HAM ) (TOPPING TOMATOES ) ) ),(ORDER (PIZZAORDER (NUMBER five ) (SIZE medium ) pizzas with (TOPPING tomatoes ) and (TOPPING ham ) ) ),False
2,i need to order one large vegetarian pizza with extra banana peppers,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (STYLE VEGETARIAN ) (COMPLEX_TOPPING (QUANTITY EXTRA ) (TOPPING BANANA_PEPPERS ) ) ) ),(ORDER i need to order (PIZZAORDER (NUMBER one ) (SIZE large ) (STYLE vegetarian ) pizza with (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING banana peppers ) ) ) ),False
3,i'd like to order a large onion and pepper pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (TOPPING ONIONS ) (TOPPING PEPPERS ) ) ),(ORDER i'd like to order (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING onion ) and (TOPPING pepper ) pizza ) ),False
4,i'll have one pie along with pesto and ham but avoid olives,(ORDER (PIZZAORDER (NOT (TOPPING OLIVES ) ) (NUMBER 1 ) (TOPPING HAM ) (TOPPING PESTO ) ) ),(ORDER i'll have (PIZZAORDER (NUMBER one ) pie along with (TOPPING pesto ) and (TOPPING ham ) but avoid (NOT (TOPPING olives ) ) ) ),False


In [9]:
df_train = pd.read_json(DATASET_PATH + '/dataset/PIZZA_train.json/PIZZA_train.json', lines=True)
df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (TOPPING BBQ_PULLED_PORK ) ) ),(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ) ),(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ) )
1,large pie with green pepper and with extra peperonni,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (TOPPING GREEN_PEPPERS ) (COMPLEX_TOPPING (QUANTITY EXTRA ) (TOPPING PEPPERONI ) ) ) ),(ORDER (PIZZAORDER (SIZE large ) pie with (TOPPING green pepper ) and with (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING peperonni ) ) ) ),(ORDER (PIZZAORDER (SIZE large ) (TOPPING green pepper ) (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING peperonni ) ) ) )
2,i'd like a large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (STYLE VEGETARIAN ) ) ),(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE large ) (STYLE vegetarian ) pizza ) ),(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (STYLE vegetarian ) ) )
3,party size stuffed crust pie with american cheese and with mushroom,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZE ) (STYLE STUFFED_CRUST ) (TOPPING AMERICAN_CHEESE ) (TOPPING MUSHROOMS ) ) ),(ORDER (PIZZAORDER (SIZE party size ) (STYLE stuffed crust ) pie with (TOPPING american cheese ) and with (TOPPING mushroom ) ) ),(ORDER (PIZZAORDER (SIZE party size ) (STYLE stuffed crust ) (TOPPING american cheese ) (TOPPING mushroom ) ) )
4,can i have one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_SIZE ) (TOPPING ARTICHOKES ) ) ),(ORDER can i have (PIZZAORDER (NUMBER one ) (SIZE personal sized ) (TOPPING artichoke ) ) ),(ORDER (PIZZAORDER (NUMBER one ) (SIZE personal sized ) (TOPPING artichoke ) ) )


# EDA - Exploratory Data Analysis

## Column names

- **SRC** 
    - The source text of the pizza order as given by the user.
- **EXR** 
    - The expected representation of the pizza order in a structured format (likely a parse tree or similar structure).
    - Or the required extraction ~ Ahmed :)
- **TOP** 
    - The top-level representation of the pizza order, possibly a normalized or tokenized version of the source text.
- **PCFG_ERR** 
    - A boolean indicating whether there was an error in parsing the pizza order using a Probabilistic Context-Free Grammar (PCFG).

In [10]:
df_dev.rename(columns={
    'dev.SRC': 'src', 
    'dev.EXR': 'exr',
    'dev.TOP': 'top',
    'dev.PCFG_ERR': 'pcfg_err',
}, inplace=True)
df_train.rename(columns={
    'train.SRC': 'src', 
    'train.EXR': 'exr',
    'train.TOP': 'top',
    'train.TOP-DECOUPLED': 'decoupled',
}, inplace=True)

## Check Duplicates & Missing Data

In [11]:
df_dev.describe()

Unnamed: 0,src,exr,top,pcfg_err
count,348,348,348,348
unique,348,270,348,2
top,i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage,(ORDER (PIZZAORDER (NOT (TOPPING PEPPERS ) ) (NUMBER 1 ) (SIZE SMALL ) (STYLE THIN_CRUST ) (TOPPING HAM ) ) ),(ORDER i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) ) ),False
freq,1,5,1,242


In [12]:
df_train.describe()

Unnamed: 0,src,exr,top,decoupled
count,2456446,2456446,2456446,2456446
unique,2456446,694346,2456446,1425035
top,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZE ) (TOPPING BANANA_PEPPERS ) (TOPPING PEPPERONI ) (TOPPING YELLOW_PEPPERS ) ) ),(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ) ),(ORDER (PIZZAORDER (NUMBER three ) (NOT (TOPPING american cheese ) ) ) (PIZZAORDER (NUMBER three ) (NOT (TOPPING american cheese ) ) ) )
freq,1,1999,1,167


In [13]:
df_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   src       348 non-null    object
 1   exr       348 non-null    object
 2   top       348 non-null    object
 3   pcfg_err  348 non-null    object
dtypes: object(4)
memory usage: 11.0+ KB


In [14]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2456446 entries, 0 to 2456445
Data columns (total 4 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   src        object
 1   exr        object
 2   top        object
 3   decoupled  object
dtypes: object(4)
memory usage: 75.0+ MB


In [15]:
df_dev.duplicated().sum()

0

In [16]:
df_train.duplicated().sum()

0

In [17]:
df_dev.isna().sum()

src         0
exr         0
top         0
pcfg_err    0
dtype: int64

In [18]:
df_train.isna().sum()

src          0
exr          0
top          0
decoupled    0
dtype: int64

## Data Cleaning

Masha2 allah: Data is clean (no punctuation | whitespace)

In [19]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [20]:
df_dev['src'].str.find(string.punctuation).describe()

count    348.0
mean      -1.0
std        0.0
min       -1.0
25%       -1.0
50%       -1.0
75%       -1.0
max       -1.0
Name: src, dtype: float64

In [21]:
df_dev['src'].str.extract("([A-Z]\.)+").describe() # No Abbreviations!

Unnamed: 0,0
count,0.0
unique,0.0
top,
freq,


In [22]:
df_train['src'].str.extract("([A-Z]\.)+").describe() # No Abbreviations!

Unnamed: 0,0
count,0.0
unique,0.0
top,
freq,


In [23]:
df_dev['src'].str.find(string.whitespace).describe()

count    348.0
mean      -1.0
std        0.0
min       -1.0
25%       -1.0
50%       -1.0
75%       -1.0
max       -1.0
Name: src, dtype: float64

In [24]:
df_dev['src'][df_dev['src'].str.find("i'd") > 0].describe()

count                                                                                                                                              6
unique                                                                                                                                             6
top       how are you tonight my order is for a medium pizza and i'd like chicken on it as well as extra cheese but please no onions i appreciate it
freq                                                                                                                                               1
Name: src, dtype: object

In [25]:
df_train['top'][df_train['top'].str.find("DRINKORDER") > 0].str.find("PIZZAORDER")

1251888    -1
1251889    -1
1251890    -1
1251891    -1
1251892    -1
           ..
2268941    17
2268942    17
2268943    17
2268944    17
2268945    17
Name: top, Length: 767058, dtype: int64

In [26]:
df_train.iloc[2268941]['top']

"(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza with (TOPPING roasted peppers ) ) and (DRINKORDER (NUMBER nine ) (DRINKTYPE seven up ) ) )"

# Preprocessing

## Targets

We remove the leading *ORDER* constructor from the target output sequences since it is a **universal top-level constructor** and **there is nothing to be learned from it**.

In [27]:
# train.SRC	train.EXR	train.TOP	train.TOP-DECOUPLED
df_train['src'] = df_train['src'].str.replace(r"^\(ORDER\s?", "", regex=True)
df_train['exr'] = df_train['exr'].str.replace(r"^\(ORDER\s?", "", regex=True)
df_train['top'] = df_train['top'].str.replace(r"^\(ORDER\s?", "", regex=True)
df_train['decoupled'] = df_train['decoupled'].str.replace(r"^\(ORDER\s?", "", regex=True)
df_train['src'] = df_train['src'].str.replace(r"\)$", "", regex=True)
df_train['exr'] = df_train['exr'].str.replace(r"\)$", "", regex=True)
df_train['top'] = df_train['top'].str.replace(r"\)$", "", regex=True)
df_train['decoupled'] = df_train['decoupled'].str.replace(r"\)$", "", regex=True)

df_dev['src'] = df_dev['src'].str.replace(r"^\(ORDER\s?", "", regex=True)
df_dev['exr'] = df_dev['exr'].str.replace(r"^\(ORDER\s?", "", regex=True)
df_dev['top'] = df_dev['top'].str.replace(r"^\(ORDER\s?", "", regex=True)
df_dev['src'] = df_dev['src'].str.replace(r"\)$", "", regex=True)
df_dev['exr'] = df_dev['exr'].str.replace(r"\)$", "", regex=True)
df_dev['top'] = df_dev['top'].str.replace(r"\)$", "", regex=True)

In [28]:
df_train.head(1)

Unnamed: 0,src,exr,top,decoupled
0,can i have a large bbq pulled pork,(PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (TOPPING BBQ_PULLED_PORK ) ),can i have (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ),(PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) )


In [29]:
df_dev.head(1)

Unnamed: 0,src,exr,top,pcfg_err
0,i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage,(PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) (COMPLEX_TOPPING (QUANTITY EXTRA ) (TOPPING CHEESE ) ) (TOPPING PEPPERONI ) ) (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) (TOPPING OLIVES ) (TOPPING SAUSAGE ) ) (PIZZAORDER (NUMBER 3 ) (SIZE LARGE ) (TOPPING PEPPERONI ) (TOPPING SAUSAGE ) ),i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) ),False


## Features - Inputs

### Lemmatize src

- We compare the accuracy of **normal text** to that of **lemmatized text**.
- This will provide insights into whether the models can use plural words to capture the meaning of quantities better or not.

In [30]:
def lemmatize_text(text):
    doc = nlp(text)
    lemma_tokenized = [token.lemma_.lower() for token in doc if not token.is_punct and not token.is_space]
    return lemma_tokenized

# Apply lemmatization
# Used for the rest of embeddings
df_dev["tokenized_lemma"] = df_dev["src"].progress_apply(lemmatize_text)
df_dev["tokenized"] = df_dev["src"].progress_apply(lambda x: regexp_tokenize(x, token_pattern))
# Used for tf-idf
df_dev["lemmatized"] = df_dev["tokenized_lemma"].progress_apply(lambda x: " ".join(x))
df_dev.head()

100%|██████████| 348/348 [00:02<00:00, 128.08it/s]
100%|██████████| 348/348 [00:00<00:00, 70414.29it/s]
100%|██████████| 348/348 [00:00<00:00, 251614.86it/s]


Unnamed: 0,src,exr,top,pcfg_err,tokenized_lemma,tokenized,lemmatized
0,i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage,(PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) (COMPLEX_TOPPING (QUANTITY EXTRA ) (TOPPING CHEESE ) ) (TOPPING PEPPERONI ) ) (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) (TOPPING OLIVES ) (TOPPING SAUSAGE ) ) (PIZZAORDER (NUMBER 3 ) (SIZE LARGE ) (TOPPING PEPPERONI ) (TOPPING SAUSAGE ) ),i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) ),False,"[i, want, to, order, two, medium, pizza, with, sausage, and, black, olive, and, two, medium, pizza, with, pepperoni, and, extra, cheese, and, three, large, pizza, with, pepperoni, and, sausage]","[i, want, to, order, two, medium, pizzas, with, sausage, and, black, olives, and, two, medium, pizzas, with, pepperoni, and, extra, cheese, and, three, large, pizzas, with, pepperoni, and, sausage]",i want to order two medium pizza with sausage and black olive and two medium pizza with pepperoni and extra cheese and three large pizza with pepperoni and sausage
1,five medium pizzas with tomatoes and ham,(PIZZAORDER (NUMBER 5 ) (SIZE MEDIUM ) (TOPPING HAM ) (TOPPING TOMATOES ) ),(PIZZAORDER (NUMBER five ) (SIZE medium ) pizzas with (TOPPING tomatoes ) and (TOPPING ham ) ),False,"[five, medium, pizza, with, tomato, and, ham]","[five, medium, pizzas, with, tomatoes, and, ham]",five medium pizza with tomato and ham
2,i need to order one large vegetarian pizza with extra banana peppers,(PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (STYLE VEGETARIAN ) (COMPLEX_TOPPING (QUANTITY EXTRA ) (TOPPING BANANA_PEPPERS ) ) ),i need to order (PIZZAORDER (NUMBER one ) (SIZE large ) (STYLE vegetarian ) pizza with (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING banana peppers ) ) ),False,"[i, need, to, order, one, large, vegetarian, pizza, with, extra, banana, pepper]","[i, need, to, order, one, large, vegetarian, pizza, with, extra, banana, peppers]",i need to order one large vegetarian pizza with extra banana pepper
3,i'd like to order a large onion and pepper pizza,(PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (TOPPING ONIONS ) (TOPPING PEPPERS ) ),i'd like to order (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING onion ) and (TOPPING pepper ) pizza ),False,"[i, would, like, to, order, a, large, onion, and, pepper, pizza]","[i'd, like, to, order, a, large, onion, and, pepper, pizza]",i would like to order a large onion and pepper pizza
4,i'll have one pie along with pesto and ham but avoid olives,(PIZZAORDER (NOT (TOPPING OLIVES ) ) (NUMBER 1 ) (TOPPING HAM ) (TOPPING PESTO ) ),i'll have (PIZZAORDER (NUMBER one ) pie along with (TOPPING pesto ) and (TOPPING ham ) but avoid (NOT (TOPPING olives ) ) ),False,"[i, will, have, one, pie, along, with, pesto, and, ham, but, avoid, olive]","[i'll, have, one, pie, along, with, pesto, and, ham, but, avoid, olives]",i will have one pie along with pesto and ham but avoid olive


### TF-IDF

In [31]:
# TF-IDF feature extraction
vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 1), token_pattern=token_pattern)

# Transform lemmatized tokens into TF-IDF features
# Converts i'd -> i would (2 tokens instead of 1!) -> Trade off between lemmatizing 'top' or use 'src'
# tfidf_features = vectorizer.fit_transform(df_dev["lemmatized"]) 
tfidf_features = vectorizer.fit_transform(df_dev["src"])
print(len(tfidf_features.toarray()[5]))
print(tfidf_features.toarray()[0])
print(vectorizer.get_feature_names_out()[0])

245
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.33419201 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.27444242 0.         0.         0.         0.         0.
 0.         0.         0.13552587 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.13966882 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.08430144 0.         0.
 0.         0.         0.         0.         0.     

In [32]:
tfidf_features.shape, df_dev.shape

((348, 245), (348, 7))

In [33]:
vocab = vectorizer.get_feature_names_out()
docterm = pd.DataFrame(tfidf_features.todense(), columns=vocab)

In [34]:
docterm.head()

Unnamed: 0,7-up,a,add,adding,additional,all,allergic,along,also,am,...,well,which,will,wish,with,without,won't,would,yellow,you
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.217732,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.159152,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.11483,0.0,0.0,0.0,0.0,0.0
3,0.0,0.172717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.317027,0.0,0.0,...,0.0,0.0,0.0,0.0,0.135995,0.0,0.0,0.0,0.0,0.0


In [35]:
# (docterm['7'].to_list() == tfidf_features.toarray().T[0]).all()

In [36]:
vectorizer.vocabulary_["i'd"]

94

In [37]:
tfidf_features[vectorizer.vocabulary_.get('i', 0)].toarray()[0]

array([0.        , 0.18706812, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.14551966, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.33806403,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.34014466,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

### Word2Vec

In [38]:
model_name = "/word2vec_model.bin"
update_model = True
if update_model or not os.path.exists(MODELS_PATH + model_name):
    print(f"Creating {model_name}...")
    # Create a Word2Vec model
    # word2vec_model = Word2Vec(sentences=df_dev["tokenized_lemma"], vector_size=200, window=5, min_count=1, workers=4)
    word2vec_model = Word2Vec(sentences=df_dev["tokenized"], vector_size=200, window=5, min_count=1, workers=4)
    print(f"Saving {model_name}...")
    # Save the trained model
    word2vec_model.save(MODELS_PATH + model_name)
else:
    print(f"Loading {model_name}...")
    # Load the trained model
    Word2Vec.load(MODELS_PATH + model_name)
    
word2vec_model

Creating /word2vec_model.bin...
Saving /word2vec_model.bin...


<gensim.models.word2vec.Word2Vec at 0x799095e60700>

In [39]:
word_embeddings = {word: word2vec_model.wv[word] for word in word2vec_model.wv.index_to_key}
# word_embeddings

In [40]:
# # Step 2: Prepare CRF features by combining tokens with embeddings
# def convert_to_crf_features(tokens, word_embeddings):
#     features = []
#     for sentence, embeddings in zip(tokens, word_embeddings):
#         sentence_features = []
#         for word, embedding in zip(sentence, embeddings):
#             word_features = {
#                 'word': word,  # Word token itself (useful for CRF)
#                 'embedding': embedding,  # Word embedding (numerical vector)
#             }
#             sentence_features.append(word_features)
#         features.append(sentence_features)
#     return features


In [41]:
# # Step 3: Get the embeddings for your tokens (train data)
# word_embeddings = get_word_embeddings(tokens, word2vec_model)

# # Step 4: Convert tokens and embeddings into CRF-friendly format
# crf_train_features = convert_to_crf_features(tokens, word_embeddings)

In [42]:
# len(word_embeddings)

In [43]:
# word_embeddings[0]

## Word Vectorization

In [44]:
def vectorize_words(row):
    tokens: list[str] = row["tokenized"]
    sentence_tfidfs = []
    sentence_word2vec = []
    for token in tokens:
        sentence_tfidfs.append(tfidf_features[vectorizer.vocabulary_.get(token, 0)].toarray()[0])
        sentence_word2vec.append(word_embeddings.get(token, [0] * 100)) # Default zero-vector
    row['tfidf_features'], row['word2vec_features'] = sentence_tfidfs, sentence_word2vec
    return row

df_dev = df_dev.progress_apply(vectorize_words, axis=1)
df_dev.head(1)

100%|██████████| 348/348 [00:00<00:00, 422.01it/s]


Unnamed: 0,src,exr,top,pcfg_err,tokenized_lemma,tokenized,lemmatized,tfidf_features,word2vec_features
0,i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage,(PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) (COMPLEX_TOPPING (QUANTITY EXTRA ) (TOPPING CHEESE ) ) (TOPPING PEPPERONI ) ) (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) (TOPPING OLIVES ) (TOPPING SAUSAGE ) ) (PIZZAORDER (NUMBER 3 ) (SIZE LARGE ) (TOPPING PEPPERONI ) (TOPPING SAUSAGE ) ),i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) ),False,"[i, want, to, order, two, medium, pizza, with, sausage, and, black, olive, and, two, medium, pizza, with, pepperoni, and, extra, cheese, and, three, large, pizza, with, pepperoni, and, sausage]","[i, want, to, order, two, medium, pizzas, with, sausage, and, black, olives, and, two, medium, pizzas, with, pepperoni, and, extra, cheese, and, three, large, pizzas, with, pepperoni, and, sausage]",i want to order two medium pizza with sausage and black olive and two medium pizza with pepperoni and extra cheese and three large pizza with pepperoni and sausage,"[[0.0, 0.18706812373139206, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.14551966070645694, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.338064030414566, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.34014465734620436, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.30724661810746495, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1835399430287813, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...], [0.0, 0.0, 0.36841804537972767, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3769445313850932, 0.0, 0.0, 0.0, 0.19351042155584847, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.19618713190325923, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.202184450...","[[0.0677469, -0.041739315, -0.08083352, 0.12261514, 0.16721858, -0.05736017, 0.09708657, 0.28409472, -0.118705854, -0.027471012, -0.057421274, -0.14376749, 0.0473368, 0.14283921, -0.030394064, -0.031858526, 0.052781112, 0.07924095, 0.0069334647, -0.2780961, 0.09244968, -0.032641098, -0.026583068, 0.036451172, 0.03341323, -0.025228117, -0.07656543, -0.118221, -0.10734222, 0.03260397, 0.15275688, 0.05852104, 0.106173486, -0.05881508, -0.082549885, 0.1174997, 0.06174134, -0.08423522, -0.055344716, -0.16970553, -0.06645013, 0.006280372, -0.11624798, -0.00490635, 0.1332938, -0.05824645, -0.00758351, -0.1255734, 0.039873965, 0.07710664, 0.045426518, -0.18234527, -0.024939088, -0.10234514, 0.08368705, -0.06852132, -0.07534842, -0.15418817, -0.14619642, -0.003937893, -0.101910375, -0.030932281, 0.1487361, -0.090863645, -0.320359, 0.09530128, -0.022416687, 0.22749043, -0.17629756, 0.21058744, -0.03136628, 0.1173821, 0.12045498, 0.03843337, 0.11156459, 0.058297314, 0.104958236, -0.11261302, ..."


## Extracting Entities

We updated the token_pattern to tokenize words with at least 1 character instead of the default of min 2 chars, which will skip important tokens such as a, 7, etc. (used for NUMBER entity label)


In [45]:
[x.group() for x in re.finditer(r"\(PIZZAORDER .+?\)", df_dev['top'][0])]

['(PIZZAORDER (NUMBER two )',
 '(PIZZAORDER (NUMBER two )',
 '(PIZZAORDER (NUMBER three )']

In [46]:
full_text = " ".join(df_train['top'].to_list())
entities = [x.group() for x in re.finditer("(?<=\()[A-Z]+(_[A-Z]+)*", full_text)]

In [47]:
entities = list(set(entities)) # Unique
entities

['TOPPING',
 'DRINKORDER',
 'COMPLEX_TOPPING',
 'CONTAINERTYPE',
 'NOT',
 'PIZZAORDER',
 'NUMBER',
 'DRINKTYPE',
 'VOLUME',
 'QUANTITY',
 'SIZE',
 'STYLE']

In [48]:
# Using BIO Tagging
bio_entities = [f"{letter}-{entity}" for entity in entities for letter in "BI"]
bio_entities.append('O')
bio_entities

['B-TOPPING',
 'I-TOPPING',
 'B-DRINKORDER',
 'I-DRINKORDER',
 'B-COMPLEX_TOPPING',
 'I-COMPLEX_TOPPING',
 'B-CONTAINERTYPE',
 'I-CONTAINERTYPE',
 'B-NOT',
 'I-NOT',
 'B-PIZZAORDER',
 'I-PIZZAORDER',
 'B-NUMBER',
 'I-NUMBER',
 'B-DRINKTYPE',
 'I-DRINKTYPE',
 'B-VOLUME',
 'I-VOLUME',
 'B-QUANTITY',
 'I-QUANTITY',
 'B-SIZE',
 'I-SIZE',
 'B-STYLE',
 'I-STYLE',
 'O']

In [49]:
label_encoder = LabelEncoder()
label_encoder.fit(bio_entities)

In [50]:
label_encoder.transform(['B-PIZZAORDER'])

array([6])

In [51]:
def encode_entities(entities):
    return [label_encoder.transform(entity) for entity in entities]
def decode_entities(encoded_entities):
    return [label_encoder.inverse_transform(entity) for entity in encoded_entities]

### Modification
#### we used regexp_tokenize to handle cases like:
- I'd 
- party - size

In [52]:
quotes_sparated = df_train['src'].str.find("i'd") > 0
df_train[['src', 'top']][quotes_sparated]

Unnamed: 0,src,top
852235,three pizzas no american cheese and i'd like five personal size pies with no american cheese,(PIZZAORDER (NUMBER three ) pizzas no (NOT (TOPPING american cheese ) ) ) and i'd like (PIZZAORDER (NUMBER five ) (SIZE personal size ) pies with no (NOT (TOPPING american cheese ) ) )
852238,three pizzas no american cheese and i'd like three large pizzas with peperonni and without any vegan pepperoni,(PIZZAORDER (NUMBER three ) pizzas no (NOT (TOPPING american cheese ) ) ) and i'd like (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING peperonni ) and without any (NOT (TOPPING vegan pepperoni ) ) )
852270,four pizzas with balsamic glaze and i'd like three party size pies with peperonni and with no roasted pepper,(PIZZAORDER (NUMBER four ) pizzas with (TOPPING balsamic glaze ) ) and i'd like (PIZZAORDER (NUMBER three ) (SIZE party size ) pies with (TOPPING peperonni ) and with no (NOT (TOPPING roasted pepper ) ) )
852280,three large pizzas with balsamic glaze and i'd like four pies with peperonni and roasted green peppers,(PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING balsamic glaze ) ) and i'd like (PIZZAORDER (NUMBER four ) pies with (TOPPING peperonni ) and (TOPPING roasted green peppers ) )
852287,four pizzas with balsamic glaze and i'd like three pies with pepper,(PIZZAORDER (NUMBER four ) pizzas with (TOPPING balsamic glaze ) ) and i'd like (PIZZAORDER (NUMBER three ) pies with (TOPPING pepper ) )
...,...,...
1251865,five pizzas no american cheese and i'd like three party sized pies with no banana peppers,(PIZZAORDER (NUMBER five ) pizzas no (NOT (TOPPING american cheese ) ) ) and i'd like (PIZZAORDER (NUMBER three ) (SIZE party sized ) pies with no (NOT (TOPPING banana peppers ) ) )
1251868,three party sized pizzas no american cheese and i'd like two regular pies no american cheese,(PIZZAORDER (NUMBER three ) (SIZE party sized ) pizzas no (NOT (TOPPING american cheese ) ) ) and i'd like (PIZZAORDER (NUMBER two ) (SIZE regular ) pies no (NOT (TOPPING american cheese ) ) )
1251870,five large pizzas with balsamic glaze and i'd like three pies with mozzarella and sauce,(PIZZAORDER (NUMBER five ) (SIZE large ) pizzas with (TOPPING balsamic glaze ) ) and i'd like (PIZZAORDER (NUMBER three ) pies with (TOPPING mozzarella ) and (TOPPING sauce ) )
1251871,three pizzas no american cheese and i'd like three party size pies with meatball and without roasted green pepper,(PIZZAORDER (NUMBER three ) pizzas no (NOT (TOPPING american cheese ) ) ) and i'd like (PIZZAORDER (NUMBER three ) (SIZE party size ) pies with (TOPPING meatball ) and without (NOT (TOPPING roasted green pepper ) ) )


In [53]:
hyphen_separated = df_train['src'].str.find("party") > 0
df_train[['src', 'top']][hyphen_separated]

Unnamed: 0,src,top
8,can i have one party sized high rise dough pizza with american cheese and a lot of peperonni,can i have (PIZZAORDER (NUMBER one ) (SIZE party sized ) (STYLE high rise dough ) pizza with (TOPPING american cheese ) and (COMPLEX_TOPPING (QUANTITY a lot of ) (TOPPING peperonni ) ) )
11,i'd like one party sized pie with american cheese and with pesto sauce,i'd like (PIZZAORDER (NUMBER one ) (SIZE party sized ) pie with (TOPPING american cheese ) and with (TOPPING pesto sauce ) )
13,can i have a party - sized pie without any bean,can i have (PIZZAORDER (NUMBER a ) (SIZE party - sized ) pie without any (NOT (TOPPING bean ) ) )
15,i'd like a party sized high rise dough pie with a lot of banana pepper and pecorino cheese,i'd like (PIZZAORDER (NUMBER a ) (SIZE party sized ) (STYLE high rise dough ) pie with (COMPLEX_TOPPING (QUANTITY a lot of ) (TOPPING banana pepper ) ) and (TOPPING pecorino cheese ) )
16,i'd like a party sized pie with balsamic glaze and black olive,i'd like (PIZZAORDER (NUMBER a ) (SIZE party sized ) pie with (TOPPING balsamic glaze ) and (TOPPING black olive ) )
...,...,...
2081430,i'd like a party - sized pizza with red peppers salami and fried onions,i'd like (PIZZAORDER (NUMBER a ) (SIZE party - sized ) pizza with (TOPPING red peppers ) (TOPPING salami ) and (TOPPING fried onions ) )
2081432,i'd like a party - size pizza with vegan pepperoni bbq chicken and dried tomato,i'd like (PIZZAORDER (NUMBER a ) (SIZE party - size ) pizza with (TOPPING vegan pepperoni ) (TOPPING bbq chicken ) and (TOPPING dried tomato ) )
2081434,i'd like a party - size pizza with barbecue sauce peppperoni and white onion,i'd like (PIZZAORDER (NUMBER a ) (SIZE party - size ) pizza with (TOPPING barbecue sauce ) (TOPPING peppperoni ) and (TOPPING white onion ) )
2081439,i'd like a party size pizza with bacon yellow peppers and sausage,i'd like (PIZZAORDER (NUMBER a ) (SIZE party size ) pizza with (TOPPING bacon ) (TOPPING yellow peppers ) and (TOPPING sausage ) )


In [54]:
def extract_labels(top: str, entities):
    # Extract words and parenthesis
    pattern = r"\b\w+(?:'\w+)?(?:-\w+)*\b|[()]"
    tokens = regexp_tokenize(top, pattern)
    
    labels = []
    count = 0
    # print(tokens)
    is_beginning = True
    order_type = "PIZZAORDER"
    for i, token in enumerate(tokens):
        # print(token, count)
        # Skip all entities except ["PIZZAORDER", "DRINKORDER"]
        if token in entities and token not in ["PIZZAORDER", "DRINKORDER"]:
            continue
        # if token == "'":
        #     i += 1
        elif token == "(":
            count += 1
        elif token == ")":
            count -= 1
        elif token == "PIZZAORDER":
            order_type = "PIZZAORDER"
        elif token == "DRINKORDER":
            order_type = "DRINKORDER"
        
        elif count == 0:
            labels.append("O")
            is_beginning = True
        else:
            if is_beginning == True:
                labels.append("B-" + order_type)
                is_beginning = False
                continue
            if is_beginning == False:
                labels.append("I-" + order_type)
    labels = label_encoder.transform(labels)
    return labels

index = 1251868
tokens = df_train['src'][index].split()
labels = extract_labels(df_train['top'][index], entities)
print(len(tokens), len(labels))
print([(x, y) for x, y in zip(tokens, labels)])
df_train.iloc[index:index+1].head()

16 16
[('three', 6), ('party', 18), ('sized', 18), ('pizzas', 18), ('no', 18), ('american', 18), ('cheese', 18), ('and', 24), ("i'd", 24), ('like', 24), ('two', 6), ('regular', 18), ('pies', 18), ('no', 18), ('american', 18), ('cheese', 18)]


Unnamed: 0,src,exr,top,decoupled
1251868,three party sized pizzas no american cheese and i'd like two regular pies no american cheese,(PIZZAORDER (NUMBER 2 ) (SIZE REGULARSIZE ) (NOT (TOPPING AMERICAN_CHEESE ) ) ) (PIZZAORDER (NUMBER 3 ) (SIZE PARTY_SIZE ) (NOT (TOPPING AMERICAN_CHEESE ) ) ),(PIZZAORDER (NUMBER three ) (SIZE party sized ) pizzas no (NOT (TOPPING american cheese ) ) ) and i'd like (PIZZAORDER (NUMBER two ) (SIZE regular ) pies no (NOT (TOPPING american cheese ) ) ),(PIZZAORDER (NUMBER three ) (SIZE party sized ) (NOT (TOPPING american cheese ) ) ) (PIZZAORDER (NUMBER two ) (SIZE regular ) (NOT (TOPPING american cheese ) ) )


In [55]:
# df_train['labels'] = df_train['top'].progress_apply(lambda x: extract_labels(x, entities))
# df_train.head(1)

In [56]:
df_dev['labels'] = df_dev['top'].progress_apply(lambda x: extract_labels(x, entities))
df_dev.head(1)

100%|██████████| 348/348 [00:00<00:00, 8682.83it/s]


Unnamed: 0,src,exr,top,pcfg_err,tokenized_lemma,tokenized,lemmatized,tfidf_features,word2vec_features,labels
0,i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage,(PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) (COMPLEX_TOPPING (QUANTITY EXTRA ) (TOPPING CHEESE ) ) (TOPPING PEPPERONI ) ) (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) (TOPPING OLIVES ) (TOPPING SAUSAGE ) ) (PIZZAORDER (NUMBER 3 ) (SIZE LARGE ) (TOPPING PEPPERONI ) (TOPPING SAUSAGE ) ),i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) ),False,"[i, want, to, order, two, medium, pizza, with, sausage, and, black, olive, and, two, medium, pizza, with, pepperoni, and, extra, cheese, and, three, large, pizza, with, pepperoni, and, sausage]","[i, want, to, order, two, medium, pizzas, with, sausage, and, black, olives, and, two, medium, pizzas, with, pepperoni, and, extra, cheese, and, three, large, pizzas, with, pepperoni, and, sausage]",i want to order two medium pizza with sausage and black olive and two medium pizza with pepperoni and extra cheese and three large pizza with pepperoni and sausage,"[[0.0, 0.18706812373139206, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.14551966070645694, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.338064030414566, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.34014465734620436, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.30724661810746495, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1835399430287813, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...], [0.0, 0.0, 0.36841804537972767, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3769445313850932, 0.0, 0.0, 0.0, 0.19351042155584847, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.19618713190325923, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.202184450...","[[0.0677469, -0.041739315, -0.08083352, 0.12261514, 0.16721858, -0.05736017, 0.09708657, 0.28409472, -0.118705854, -0.027471012, -0.057421274, -0.14376749, 0.0473368, 0.14283921, -0.030394064, -0.031858526, 0.052781112, 0.07924095, 0.0069334647, -0.2780961, 0.09244968, -0.032641098, -0.026583068, 0.036451172, 0.03341323, -0.025228117, -0.07656543, -0.118221, -0.10734222, 0.03260397, 0.15275688, 0.05852104, 0.106173486, -0.05881508, -0.082549885, 0.1174997, 0.06174134, -0.08423522, -0.055344716, -0.16970553, -0.06645013, 0.006280372, -0.11624798, -0.00490635, 0.1332938, -0.05824645, -0.00758351, -0.1255734, 0.039873965, 0.07710664, 0.045426518, -0.18234527, -0.024939088, -0.10234514, 0.08368705, -0.06852132, -0.07534842, -0.15418817, -0.14619642, -0.003937893, -0.101910375, -0.030932281, 0.1487361, -0.090863645, -0.320359, 0.09530128, -0.022416687, 0.22749043, -0.17629756, 0.21058744, -0.03136628, 0.1173821, 0.12045498, 0.03843337, 0.11156459, 0.058297314, 0.104958236, -0.11261302, ...","[24, 24, 24, 24, 6, 18, 18, 18, 18, 18, 18, 18, 24, 6, 18, 18, 18, 18, 18, 18, 18, 24, 6, 18, 18, 18, 18, 18, 18]"


In [57]:
df_dev.iloc[3:4].head()

Unnamed: 0,src,exr,top,pcfg_err,tokenized_lemma,tokenized,lemmatized,tfidf_features,word2vec_features,labels
3,i'd like to order a large onion and pepper pizza,(PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (TOPPING ONIONS ) (TOPPING PEPPERS ) ),i'd like to order (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING onion ) and (TOPPING pepper ) pizza ),False,"[i, would, like, to, order, a, large, onion, and, pepper, pizza]","[i'd, like, to, order, a, large, onion, and, pepper, pizza]",i would like to order a large onion and pepper pizza,"[[0.0, 0.25878141499079277, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25390069252267294, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.27720767756148734, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3739374411308161, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.22767219330101038, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.353498476780873, 0.2855402908978217, 0.0, 0.0, 0.0, 0.0,...","[[0.034876496, -0.018838737, -0.03094462, 0.049610227, 0.07292387, -0.028232593, 0.0426675, 0.124641865, -0.046168078, -0.010289274, -0.022492006, -0.067076765, 0.026225913, 0.0594144, -0.013272378, -0.013796637, 0.0225329, 0.031439085, 0.005429208, -0.117336124, 0.04022778, -0.011933805, -0.014957658, 0.011053996, 0.01287688, -0.011281822, -0.03759305, -0.05268785, -0.04346962, 0.011879201, 0.067503154, 0.025923895, 0.046448585, -0.020964967, -0.03751059, 0.049969103, 0.02965557, -0.039529294, -0.01972989, -0.07808588, -0.027959775, 0.0023421464, -0.049128532, -0.0011981872, 0.061333265, -0.021473194, -0.0011677572, -0.050754834, 0.018149236, 0.034283537, 0.0179269, -0.08481379, -0.012953331, -0.04719673, 0.03888765, -0.02838799, -0.027466046, -0.06957505, -0.06718644, 0.0016567827, -0.044730537, -0.014175598, 0.06563012, -0.04024822, -0.1364846, 0.04118543, -0.010231526, 0.09637261, -0.07360343, 0.09349069, -0.014939378, 0.045573164, 0.051587228, 0.019778902, 0.044990543, 0.01990...","[24, 24, 24, 24, 6, 18, 18, 18, 18, 18]"


In [58]:
# df_dev[["tfidf_features", "word2vec_features", "labels"]]
len(df_dev["tfidf_features"][3]), len(df_dev["word2vec_features"][3]), len(df_dev["labels"][3])

(10, 10, 10)

In [59]:
# def extract_labels(top: str, entities):
#     # Extract words and parenthesis
#     pattern = r"\b\w+(?:'\w+)?(?:-\w+)*\b|[()]"
#     tokens = regexp_tokenize(top, pattern)
    
#     IS_labels = []
#     NER_labels = []
#     # print(tokens)
#     beginning_of_order = True
#     order_type = "PIZZAORDER"
#     beginning_of_ner = True
#     ner_type = ""
    
#     count = 0 # Count the opened parenthesis (lower level in the hierarchy)
#     for i, token in enumerate(tokens):
#         # print(token, count)
#         # Skip all entities except ["PIZZAORDER", "DRINKORDER"]
#         if token in entities and token not in ["PIZZAORDER", "DRINKORDER"]:
#             continue
#         elif token == "(":
#             count += 1
#         elif token == ")":
#             count -= 1
#         elif token == "PIZZAORDER":
#             order_type = "PIZZAORDER"
#         elif token == "DRINKORDER":
#             order_type = "DRINKORDER"
        
#         elif count == 0:
#             IS_labels.append("O")
#             NER_labels.append("O")
#             beginning_of_order = True
#             beginning_of_ner = True
#         # 2nd top-level are PIZZAORDER & DRINKORDER
#         elif count >= 1:
#             if beginning_of_order == True:
#                 IS_labels.append("B-" + order_type)
#                 beginning_of_order = False
#                 continue
#             if beginning_of_order == False:
#                 IS_labels.append("I-" + order_type)
        
#         # Other levels: Number, Size, etc...
#         else:
#             if beginning_of_ner == True:
#                 NER_labels.append("B-" + order_type)
#                 beginning_of_ner = False
#                 continue
#             if beginning_of_ner == False:
#                 NER_labels.append("I-" + order_type)
            

#     return IS_labels

# # index = 1251868
# # tokens = df_train['src'][index].split()
# # labels = extract_labels(df_train['top'][index], entities)
# # print(len(tokens), len(labels))
# # print([(x, y) for x, y in zip(tokens, labels)])
# # df_train.iloc[index:index+1].head()

# Models

## CRF

In [60]:
# CRF Model
crf = CRF(algorithm="lbfgs", max_iterations=100)

# Format data for CRF training
# crf_train_tokens = word_embeddings
# crf_train_labels = labels
crf_train_tokens = df_dev['word2vec_features']
crf_train_tokens = df_dev['tfidf_features'].to_list()
crf_train_labels = df_dev['labels']

In [61]:
# Will work only without vectorizing words! (Dimension of the word vector of course will be )
print(len(crf_train_tokens))
print(len(crf_train_labels))
i = 0
k = 0
for token, label in zip(crf_train_tokens, crf_train_labels):
    if len(token) != len(label):
        print(f"{len(token)}, {len(label)}, {i}")
        # print(f"Mismatch found: {token}, {label}, {i}")
        k += 1
    if k > 10: break
    i += 1

348
348


In [62]:
# Train CRF
crf.fit(crf_train_tokens, crf_train_labels)
# Save the trained model
crf.save(MODELS_PATH + "/crf_model.bin")

TypeError: expected bytes, numpy.int64 found

In [None]:
# Predict using CRF
test = "I want a pizza seafood with olives and a can of coke".split()
print(test)
crf_predictions = crf.predict([test])
print(crf_predictions)

In [None]:
# Predict using CRF
# crf_predictions = crf.predict(crf_train_tokens)

In [None]:
true_dev_labels = []
tokens_dev = []
for i in range(len(df_dev['top'])):
    true_dev_labels.append(extract_labels(df_dev['top'][i], entities))
    tokens_dev.append(df_dev['src'][i].split())

In [None]:
crf_predictions = crf.predict(tokens_dev)

## IS CRF Model Evaluation
#### Calculate Exact Match (EM) accuracy, Compute Precision, Recall, F1-Score

In [None]:
# Flatten true labels and predictions
flattened_true_labels = [label for sublist in true_dev_labels for label in sublist]
flattened_pred_labels = [label for sublist in crf_predictions for label in sublist]

em_accuracy = accuracy_score(flattened_true_labels, flattened_pred_labels)
precision = precision_score(flattened_true_labels, flattened_pred_labels, average='macro')
recall = recall_score(flattened_true_labels, flattened_pred_labels, average='macro')
f1 = f1_score(flattened_true_labels, flattened_pred_labels, average='macro')

print(f"Exact Match Accuracy: {em_accuracy:.5f}")
print(f"Precision: {precision:.5f}")
print(f"Recall: {recall:.5f}")
print(f"F1-Score: {f1:.5f}")


## LSTM

In [None]:
# Define the LSTM model
class LSTMModel(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# Hyperparameters
input_size = 10  # Adjust based on your data
hidden_size = 256
output_size = 1  # Adjust based on your data
num_layers = 4
num_epochs = 50
learning_rate = 0.001

# Initialize the model, loss function, and optimizer
model = LSTMModel(input_size, hidden_size, output_size, num_layers).to(device)
criterion = torch.nn.MSELoss()  # Adjust based on your task
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Dummy data for illustration (replace with your actual data)
x_train = torch.randn(100, 10, input_size).to(device)  # (batch_size, sequence_length, input_size)
y_train = torch.randn(100, output_size).to(device)  # (batch_size, output_size)

# Training loop
for epoch in range(num_epochs):
    model.train()
    outputs = model(x_train)
    optimizer.zero_grad()
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 5 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Model Evaluation