# Setup

## Imports

In [85]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import os
import json
import tqdm
import pickle
import gc
import psutil
from typing import Literal

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, regexp_tokenize
# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

# Models
from sklearn_crfsuite import CRF
import torch

# Evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [86]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Config

In [87]:
pd.set_option('display.max_colwidth', 1000) # Show all content of the cells
# # Undo with 
# pd.reset_option('display.max_colwidth')

# config tqdm for pandas
tqdm.tqdm.pandas()

### Folder Paths

In [None]:
DATASET_PATH = "/kaggle/input/pizza-dataset/dataset" # Kaggle
DATASET_PATH = "./data/dataset" # Local

OUTPUT_ROOT_PATH = "/kaggle/working" # Kaggle
OUTPUT_ROOT_PATH = "./data/saved" # Local

FEATURES_PATH = OUTPUT_ROOT_PATH + "/features"
MODELS_PATH = OUTPUT_ROOT_PATH + "/models"


In [89]:
os.makedirs(OUTPUT_ROOT_PATH, exist_ok=True)
os.makedirs(FEATURES_PATH, exist_ok=True)
os.makedirs(MODELS_PATH, exist_ok=True)

# os.rmdir(OUTPUT_ROOT_PATH)
# os.rmdir(FEATURES_PATH)
# os.rmdir(MODELS_PATH)

### Variables

In [90]:
token_pattern=r"(?u)\b\w+(?:'\w+)?(?:-\w+)*\b"

# Utils

In [91]:
process = psutil.Process(os.getpid())
def memory_usage():
    return (process.memory_info().rss / 1024 ** 2)

print(f"Starting with Memory Usage = {memory_usage()}")

Starting with Memory Usage = 3196.10546875


In [None]:
def save_pickle(path, obj, type: Literal["model", "feature"] | None = None):
    if type is not None:
        if type == "model":
            path = MODELS_PATH + "/" + path
        elif type == "feature":
            path = FEATURES_PATH + "/" + path
    with open (path, 'wb') as f:
        pickle.dump(obj, f)

def load_pickle(path, type: Literal["model", "feature"] | None = None):
    if type is not None:
        if type == "model":
            path = MODELS_PATH + "/" + path
        elif type == "feature":
            path = FEATURES_PATH + "/" + path
    with open(path, 'rb') as f:
        return pickle.load(f)
    
def save_np(path, obj, type: Literal["model", "feature"] | None = None, allow_pickle=True):
    if type is not None:
        if type == "model":
            path = MODELS_PATH + "/" + path
        elif type == "feature":
            path = FEATURES_PATH + "/" + path
    np.save(path, obj, allow_pickle=allow_pickle)

def load_np(path, type: Literal["model", "feature"] | None = None, allow_pickle=True):
    if type is not None:
        if type == "model":
            path = MODELS_PATH + "/" + path
        elif type == "feature":
            path = FEATURES_PATH + "/" + path

    return np.load(path, allow_pickle=allow_pickle)

In [121]:
def file_exists(path):
    return os.path.exists(path)

# Load Dataset

In [93]:
def load_json(filename: str, cols: list[str] | None = None):
    """
    Load a json file into a pandas DataFrame.
    * This function is useful (for some reason) for loading the large dataset files.
    
    filename: str
        The name of the file to load.
    cols: list[str] | None
        The columns to load. If None, load all columns.
    return: pd.DataFrame
        The DataFrame containing the data from the json file.
    """
    all_cols = True if cols is None else False
    data = []

    with open(filename, encoding='latin-1') as f:
        line = f.readline()
        f.seek(0) # Go back to the beginning of the file
        doc = json.loads(line)
        if all_cols:
            cols = list(doc.keys())
        
        for line in f:
            doc = json.loads(line)
            lst = [doc[col] for col in cols]
            data.append(lst)

    df = pd.DataFrame(data=data, columns=cols)
    return df

train_filename = DATASET_PATH + '/PIZZA_train.json'
df_train = load_json(train_filename, cols=['train.SRC', 'train.TOP'])
df_train.head()

Unnamed: 0,train.SRC,train.TOP
0,can i have a large bbq pulled pork,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ) )
1,large pie with green pepper and with extra peperonni,(ORDER (PIZZAORDER (SIZE large ) pie with (TOPPING green pepper ) and with (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING peperonni ) ) ) )
2,i'd like a large vegetarian pizza,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE large ) (STYLE vegetarian ) pizza ) )
3,party size stuffed crust pie with american cheese and with mushroom,(ORDER (PIZZAORDER (SIZE party size ) (STYLE stuffed crust ) pie with (TOPPING american cheese ) and with (TOPPING mushroom ) ) )
4,can i have one personal sized artichoke,(ORDER can i have (PIZZAORDER (NUMBER one ) (SIZE personal sized ) (TOPPING artichoke ) ) )


In [94]:
df_dev = pd.read_json(DATASET_PATH + '/PIZZA_dev.json', lines=True)[['dev.SRC', 'dev.TOP']]
df_dev.head()

Unnamed: 0,dev.SRC,dev.TOP
0,i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage,(ORDER i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) ) )
1,five medium pizzas with tomatoes and ham,(ORDER (PIZZAORDER (NUMBER five ) (SIZE medium ) pizzas with (TOPPING tomatoes ) and (TOPPING ham ) ) )
2,i need to order one large vegetarian pizza with extra banana peppers,(ORDER i need to order (PIZZAORDER (NUMBER one ) (SIZE large ) (STYLE vegetarian ) pizza with (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING banana peppers ) ) ) )
3,i'd like to order a large onion and pepper pizza,(ORDER i'd like to order (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING onion ) and (TOPPING pepper ) pizza ) )
4,i'll have one pie along with pesto and ham but avoid olives,(ORDER i'll have (PIZZAORDER (NUMBER one ) pie along with (TOPPING pesto ) and (TOPPING ham ) but avoid (NOT (TOPPING olives ) ) ) )


In [95]:
memory_usage()

4239.59375

# EDA - Exploratory Data Analysis

## Column names

- **SRC** 
    - The source text of the pizza order as given by the user.
- **EXR** 
    - The expected representation of the pizza order in a structured format (likely a parse tree or similar structure).
- **TOP** 
    - The top-level representation of the pizza order, possibly a normalized or tokenized version of the source text.
- **PCFG_ERR** 
    - A boolean indicating whether there was an error in parsing the pizza order using a Probabilistic Context-Free Grammar (PCFG).

In [96]:
df_dev.rename(columns={
    'dev.SRC': 'src', 
    'dev.EXR': 'exr',
    'dev.TOP': 'top',
    'dev.PCFG_ERR': 'pcfg_err',
}, inplace=True)
df_train.rename(columns={
    'train.SRC': 'src', 
    'train.EXR': 'exr',
    'train.TOP': 'top',
    'train.TOP-DECOUPLED': 'decoupled',
}, inplace=True)

## Check Duplicates & Missing Data

In [97]:
df_train.describe()

Unnamed: 0,src,top
count,2456446,2456446
unique,2456446,2456446
top,can i have a large bbq pulled pork,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ) )
freq,1,1


In [98]:
df_dev.describe()

Unnamed: 0,src,top
count,348,348
unique,348,348
top,i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage,(ORDER i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) ) )
freq,1,1


In [99]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2456446 entries, 0 to 2456445
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   src     object
 1   top     object
dtypes: object(2)
memory usage: 37.5+ MB


In [100]:
df_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   src     348 non-null    object
 1   top     348 non-null    object
dtypes: object(2)
memory usage: 5.6+ KB


In [101]:
df_train.duplicated().sum()

0

In [102]:
df_dev.duplicated().sum()

0

In [103]:
df_train.isna().sum()

src    0
top    0
dtype: int64

In [104]:
df_dev.isna().sum()

src    0
top    0
dtype: int64

## Data Cleaning

Masha2 allah: Data is clean (no punctuation | whitespace)

In [105]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [106]:
# Escape all punctuation(special) characters
# Grouped for the pd.str.extract() function
punctuation_regex = "(\\" + "|\\".join(string.punctuation) + ")" 

In [107]:
puncs = df_train['src'].str.extract(punctuation_regex).dropna()
puncs.value_counts()

0
'    737203
-    310742
Name: count, dtype: int64

In [108]:
puncs = df_dev['src'].str.extract(punctuation_regex).dropna()
puncs.value_counts()

0
'    69
-     3
Name: count, dtype: int64

In [109]:
df_train['src'].str.extract("([A-Z]\.)+").dropna().value_counts() # No Abbreviations!

Series([], Name: count, dtype: int64)

In [110]:
df_dev['src'].str.extract("([A-Z]\.)+").dropna().value_counts() # No Abbreviations!

Series([], Name: count, dtype: int64)

In [111]:
df_dev['src'][df_dev['src'].str.find("i'd") > 0].describe()

count                                                                                                                                              6
unique                                                                                                                                             6
top       how are you tonight my order is for a medium pizza and i'd like chicken on it as well as extra cheese but please no onions i appreciate it
freq                                                                                                                                               1
Name: src, dtype: object

In [112]:
# Are there any orders containing both DRINKORDER and PIZZAORDER?
df_train['top'].str.extract(rf"(PIZZAORDER).*(DRINKORDER)").dropna().tail()

Unnamed: 0,0,1
2268941,PIZZAORDER,DRINKORDER
2268942,PIZZAORDER,DRINKORDER
2268943,PIZZAORDER,DRINKORDER
2268944,PIZZAORDER,DRINKORDER
2268945,PIZZAORDER,DRINKORDER


In [113]:
df_train.iloc[2268941]['top']

"(ORDER i'd like (PIZZAORDER (NUMBER a ) pizza with (TOPPING roasted peppers ) ) and (DRINKORDER (NUMBER nine ) (DRINKTYPE seven up ) ) )"

# Preprocessing

## Targets

We remove the leading *ORDER* constructor from the target output sequences since it is a **universal top-level constructor** and **there is nothing to be learned from it**.

In [114]:
# train.SRC	train.EXR	train.TOP	train.TOP-DECOUPLED
def remove_order(df, cols):
    """
    Remove the string '(ORDER' from the beginning of the columns in the DataFrame.
    
    df: pd.DataFrame
        The DataFrame to remove the string from.
    cols: list[str]
        The columns to remove the string from.
    """
    for col in cols:
        df[col] = df[col].str.replace(r"^\(ORDER\s?", "", regex=True)
        df[col] = df[col].str.replace(r"\)$", "", regex=True)
    
    return df

df_train = remove_order(df_train, ['top'])
df_dev = remove_order(df_dev, ['top'])


In [115]:
df_train.head(1)

Unnamed: 0,src,top
0,can i have a large bbq pulled pork,can i have (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) )


In [116]:
df_dev.head(1)

Unnamed: 0,src,top
0,i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage,i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) )


## Features - Inputs

### Lemmatize src

- We compare the accuracy of **normal text** to that of **lemmatized text**.
- This will provide insights into whether the models can use plural words to capture the meaning of quantities better or not.
- **Note**
  - lemma change the number of the words in a sentence
    - Ex.: i'd -> i would
    - Thus we won't consider the lemma

In [117]:
# # Initialize the WordNetLemmatizer
# lemmatizer = WordNetLemmatizer()
# def lemmatize_text(text):
#     tokens = word_tokenize(text)
#     lemma_tokenized = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalnum()]
#     return lemma_tokenized

# # Apply lemmatization
# # Used for the rest of embeddings
# df_dev["tokenized_lemma"] = df_dev["src"].progress_apply(lemmatize_text)
# # Used for tf-idf
# df_dev["lemmatized"] = df_dev["tokenized_lemma"].progress_apply(lambda x: " ".join(x))
# df_dev.head()

### Tokenization

In [118]:
df_train["tokenized"] = df_train["src"].progress_apply(lambda x: regexp_tokenize(x, token_pattern))
df_dev["tokenized"] = df_dev["src"].progress_apply(lambda x: regexp_tokenize(x, token_pattern))

100%|██████████| 2456446/2456446 [00:22<00:00, 110756.64it/s]
100%|██████████| 348/348 [00:00<?, ?it/s]


In [119]:
memory_usage()

6740.3515625

### TF-IDF

**Note:** (spacy) Converts i'd -> i would (2 tokens instead of 1!) -> Trade off between lemmatizing 'top' or use 'src'

In [None]:
update_feature = False
if update_feature or not file_exists(FEATURES_PATH + "/tfidf_features.npy") or not file_exists(MODELS_PATH + "/tfidf_vectorizer.pkl"):
    # TF-IDF feature extraction
    vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 1), token_pattern=token_pattern)
    tfidf_features = vectorizer.fit_transform(df_train["src"])
    save_pickle(MODELS_PATH + "/tfidf_vectorizer.pkl", vectorizer)
    save_np("tfidf_features.npy", tfidf_features, type="feature")
else:
    vectorizer = load_pickle(MODELS_PATH + "/tfidf_vectorizer.pkl")
    tfidf_features = load_np("tfidf_features.npy", type="feature").tolist()

print(len(tfidf_features.toarray()[5]))
print(tfidf_features.toarray()[0])
print(vectorizer.get_feature_names_out()[0])

304
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.14234776 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.41417928 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.35753269 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.     

In [None]:
tfidf_features.shape

(2456446, 304)

In [None]:
# vocab = vectorizer.get_feature_names_out()
# docterm = pd.DataFrame(tfidf_features.todense(), columns=vocab)

In [None]:
vectorizer.vocabulary_["i'd"]

129

In [None]:
tfidf_features[:, vectorizer.vocabulary_.get("you", 0)].toarray().reshape(-1)

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
memory_usage()

368.234375

### Word2Vec

In [None]:
model_name = "/word2vec_model.bin"
update_model = False
if update_model or not os.path.exists(MODELS_PATH + model_name):
    print(f"Creating '{model_name}'...")
    # Create a Word2Vec model
    word2vec_model = Word2Vec(sentences=df_train["tokenized"], vector_size=200, window=5, min_count=1, workers=4)
    # Save the trained model
    print(f"Saving '{model_name}'...")
    word2vec_model.save(MODELS_PATH + model_name)
else:
    print(f"Loading '{model_name}'...")
    # Load the trained model
    word2vec_model = Word2Vec.load(MODELS_PATH + model_name)
    
word2vec_model

Loading '/word2vec_model.bin'...


<gensim.models.word2vec.Word2Vec at 0x27ab2f42bf0>

In [None]:
word_embeddings = {word: word2vec_model.wv[word] for word in word2vec_model.wv.index_to_key}
# word_embeddings

In [129]:
memory_usage()

3086.80859375

## Word Vectorization

Related Resources:
- TF-IDF Matrix -> https://openclassrooms.com/en/courses/6532301-introduction-to-natural-language-processing/8081363-apply-the-tf-idf-vectorization-approach

In [None]:
def vectorize_words(row):
    tokens: list[str] = row["tokenized"]
    sentence_tfidfs = []
    sentence_word2vec = []
    for token in tokens:
        tfidf_index = vectorizer.vocabulary_.get(token, 0) # Default index zero
        sentence_tfidfs.append(tfidf_features[:, tfidf_index].toarray().reshape(-1))
        sentence_word2vec.append(word_embeddings.get(token, [0] * 100)) # Default zero-vector
    row['tfidf_features'], row['word2vec_features'] = sentence_tfidfs, sentence_word2vec
    return row

In [None]:
df_train = df_train.progress_apply(vectorize_words, axis=1)
df_train.head(1)

  0%|          | 58/2456446 [01:25<934:18:24,  1.37s/it] 

In [None]:
df_dev = df_dev.progress_apply(vectorize_words, axis=1)
df_dev.head(1)

100%|██████████| 348/348 [00:00<00:00, 499.51it/s]


Unnamed: 0,src,top,tokenized,tfidf_features,word2vec_features
0,i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage,i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) ),"[i, want, to, order, two, medium, pizzas, with, sausage, and, black, olives, and, two, medium, pizzas, with, pepperoni, and, extra, cheese, and, three, large, pizzas, with, pepperoni, and, sausage]","[[0.08430143723922251, 0.0, 0.13337969831862775, 0.0, 0.0, 0.10034713760427152, 0.17396375337864922, 0.17958770538428326, 0.14606531916873786, 0.17721922092796344, 0.19713448714781656, 0.0, 0.17288347347903418, 0.0, 0.0, 0.0, 0.1366724335010161, 0.0, 0.0, 0.1562794386914836, 0.19391225920436742, 0.0, 0.11286164475205294, 0.0, 0.12416543512769322, 0.0, 0.0, 0.19234760746279847, 0.12248119173361699, 0.0, 0.20664361011193244, 0.0, 0.18995951017233198, 0.2045777012433944, 0.1168537902697436, 0.0, 0.16890313266495663, 0.0, 0.0, 0.1335421575549344, 0.0, 0.19944260753166945, 0.0, 0.18473089869790738, 0.0, 0.19133428377261583, 0.20978968793267175, 0.14310616380645452, 0.17265659900612262, 0.0, 0.16817054006660384, 0.0, 0.14150333339868498, 0.17275472419101204, 0.22930989128613138, 0.0, 0.0, 0.20155267717081113, 0.1546082459604938, 0.15693159662581818, 0.1832160787506764, 0.0, 0.18729363892005363, 0.0, 0.20773045712657492, 0.0, 0.19346051503038109, 0.0, 0.16041910398343542, 0.09303061188182...","[[0.0677469, -0.041739315, -0.08083352, 0.12261514, 0.16721858, -0.05736017, 0.09708657, 0.28409472, -0.118705854, -0.027471012, -0.057421274, -0.14376749, 0.0473368, 0.14283921, -0.030394064, -0.031858526, 0.052781112, 0.07924095, 0.0069334647, -0.2780961, 0.09244968, -0.032641098, -0.026583068, 0.036451172, 0.03341323, -0.025228117, -0.07656543, -0.118221, -0.10734222, 0.03260397, 0.15275688, 0.05852104, 0.106173486, -0.05881508, -0.082549885, 0.1174997, 0.06174134, -0.08423522, -0.055344716, -0.16970553, -0.06645013, 0.006280372, -0.11624798, -0.00490635, 0.1332938, -0.05824645, -0.00758351, -0.1255734, 0.039873965, 0.07710664, 0.045426518, -0.18234527, -0.024939088, -0.10234514, 0.08368705, -0.06852132, -0.07534842, -0.15418817, -0.14619642, -0.003937893, -0.101910375, -0.030932281, 0.1487361, -0.090863645, -0.320359, 0.09530128, -0.022416687, 0.22749043, -0.17629756, 0.21058744, -0.03136628, 0.1173821, 0.12045498, 0.03843337, 0.11156459, 0.058297314, 0.104958236, -0.11261302, ..."


In [None]:
save_pickle(FEATURES_PATH + "/df_train.pkl", df_train)
save_pickle(FEATURES_PATH + "/df_dev.pkl", df_dev)

In [None]:
memory_usage()

## Extracting Entities

We updated the token_pattern to tokenize words with at least 1 character instead of the default of min 2 chars, which will skip important tokens such as a, 7, etc. (used for NUMBER entity label)


In [None]:
full_text = " ".join(df_train['top'].to_list())
entities = [x.group() for x in re.finditer("(?<=\()[A-Z]+(_[A-Z]+)*", full_text)]

In [None]:
del full_text
gc.collect()

2680130

In [None]:
entities = list(set(entities)) # Unique
entities

['TOPPING',
 'PIZZAORDER',
 'DRINKTYPE',
 'DRINKORDER',
 'NUMBER',
 'STYLE',
 'SIZE',
 'VOLUME',
 'CONTAINERTYPE',
 'COMPLEX_TOPPING',
 'QUANTITY',
 'NOT']

In [None]:
# Using BIO Tagging
bio_entities = [f"{letter}-{entity}" for entity in entities for letter in "BI"]
bio_entities.append('O')
bio_entities

['B-TOPPING',
 'I-TOPPING',
 'B-PIZZAORDER',
 'I-PIZZAORDER',
 'B-DRINKTYPE',
 'I-DRINKTYPE',
 'B-DRINKORDER',
 'I-DRINKORDER',
 'B-NUMBER',
 'I-NUMBER',
 'B-STYLE',
 'I-STYLE',
 'B-SIZE',
 'I-SIZE',
 'B-VOLUME',
 'I-VOLUME',
 'B-CONTAINERTYPE',
 'I-CONTAINERTYPE',
 'B-COMPLEX_TOPPING',
 'I-COMPLEX_TOPPING',
 'B-QUANTITY',
 'I-QUANTITY',
 'B-NOT',
 'I-NOT',
 'O']

In [None]:
label_encoder = LabelEncoder()
label_encoder.fit(bio_entities)

In [None]:
label_encoder.transform(['B-PIZZAORDER'])

array([6])

In [None]:
def encode_entities(entities):
    return [label_encoder.transform(entity) for entity in entities]
def decode_entities(encoded_entities):
    return [label_encoder.inverse_transform(entity) for entity in encoded_entities]

### Modification
#### we used regexp_tokenize to handle cases like:
- I'd 
- party - size

In [None]:
quotes_sparated = df_train['src'].str.find("i'd") > 0
df_train[['src', 'top']][quotes_sparated]

Unnamed: 0,src,top
852235,three pizzas no american cheese and i'd like five personal size pies with no american cheese,(PIZZAORDER (NUMBER three ) pizzas no (NOT (TOPPING american cheese ) ) ) and i'd like (PIZZAORDER (NUMBER five ) (SIZE personal size ) pies with no (NOT (TOPPING american cheese ) ) )
852238,three pizzas no american cheese and i'd like three large pizzas with peperonni and without any vegan pepperoni,(PIZZAORDER (NUMBER three ) pizzas no (NOT (TOPPING american cheese ) ) ) and i'd like (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING peperonni ) and without any (NOT (TOPPING vegan pepperoni ) ) )
852270,four pizzas with balsamic glaze and i'd like three party size pies with peperonni and with no roasted pepper,(PIZZAORDER (NUMBER four ) pizzas with (TOPPING balsamic glaze ) ) and i'd like (PIZZAORDER (NUMBER three ) (SIZE party size ) pies with (TOPPING peperonni ) and with no (NOT (TOPPING roasted pepper ) ) )
852280,three large pizzas with balsamic glaze and i'd like four pies with peperonni and roasted green peppers,(PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING balsamic glaze ) ) and i'd like (PIZZAORDER (NUMBER four ) pies with (TOPPING peperonni ) and (TOPPING roasted green peppers ) )
852287,four pizzas with balsamic glaze and i'd like three pies with pepper,(PIZZAORDER (NUMBER four ) pizzas with (TOPPING balsamic glaze ) ) and i'd like (PIZZAORDER (NUMBER three ) pies with (TOPPING pepper ) )
...,...,...
1251865,five pizzas no american cheese and i'd like three party sized pies with no banana peppers,(PIZZAORDER (NUMBER five ) pizzas no (NOT (TOPPING american cheese ) ) ) and i'd like (PIZZAORDER (NUMBER three ) (SIZE party sized ) pies with no (NOT (TOPPING banana peppers ) ) )
1251868,three party sized pizzas no american cheese and i'd like two regular pies no american cheese,(PIZZAORDER (NUMBER three ) (SIZE party sized ) pizzas no (NOT (TOPPING american cheese ) ) ) and i'd like (PIZZAORDER (NUMBER two ) (SIZE regular ) pies no (NOT (TOPPING american cheese ) ) )
1251870,five large pizzas with balsamic glaze and i'd like three pies with mozzarella and sauce,(PIZZAORDER (NUMBER five ) (SIZE large ) pizzas with (TOPPING balsamic glaze ) ) and i'd like (PIZZAORDER (NUMBER three ) pies with (TOPPING mozzarella ) and (TOPPING sauce ) )
1251871,three pizzas no american cheese and i'd like three party size pies with meatball and without roasted green pepper,(PIZZAORDER (NUMBER three ) pizzas no (NOT (TOPPING american cheese ) ) ) and i'd like (PIZZAORDER (NUMBER three ) (SIZE party size ) pies with (TOPPING meatball ) and without (NOT (TOPPING roasted green pepper ) ) )


In [None]:
hyphen_separated = df_train['src'].str.find("party") > 0
df_train[['src', 'top']][hyphen_separated]

Unnamed: 0,src,top
8,can i have one party sized high rise dough pizza with american cheese and a lot of peperonni,can i have (PIZZAORDER (NUMBER one ) (SIZE party sized ) (STYLE high rise dough ) pizza with (TOPPING american cheese ) and (COMPLEX_TOPPING (QUANTITY a lot of ) (TOPPING peperonni ) ) )
11,i'd like one party sized pie with american cheese and with pesto sauce,i'd like (PIZZAORDER (NUMBER one ) (SIZE party sized ) pie with (TOPPING american cheese ) and with (TOPPING pesto sauce ) )
13,can i have a party - sized pie without any bean,can i have (PIZZAORDER (NUMBER a ) (SIZE party - sized ) pie without any (NOT (TOPPING bean ) ) )
15,i'd like a party sized high rise dough pie with a lot of banana pepper and pecorino cheese,i'd like (PIZZAORDER (NUMBER a ) (SIZE party sized ) (STYLE high rise dough ) pie with (COMPLEX_TOPPING (QUANTITY a lot of ) (TOPPING banana pepper ) ) and (TOPPING pecorino cheese ) )
16,i'd like a party sized pie with balsamic glaze and black olive,i'd like (PIZZAORDER (NUMBER a ) (SIZE party sized ) pie with (TOPPING balsamic glaze ) and (TOPPING black olive ) )
...,...,...
2081430,i'd like a party - sized pizza with red peppers salami and fried onions,i'd like (PIZZAORDER (NUMBER a ) (SIZE party - sized ) pizza with (TOPPING red peppers ) (TOPPING salami ) and (TOPPING fried onions ) )
2081432,i'd like a party - size pizza with vegan pepperoni bbq chicken and dried tomato,i'd like (PIZZAORDER (NUMBER a ) (SIZE party - size ) pizza with (TOPPING vegan pepperoni ) (TOPPING bbq chicken ) and (TOPPING dried tomato ) )
2081434,i'd like a party - size pizza with barbecue sauce peppperoni and white onion,i'd like (PIZZAORDER (NUMBER a ) (SIZE party - size ) pizza with (TOPPING barbecue sauce ) (TOPPING peppperoni ) and (TOPPING white onion ) )
2081439,i'd like a party size pizza with bacon yellow peppers and sausage,i'd like (PIZZAORDER (NUMBER a ) (SIZE party size ) pizza with (TOPPING bacon ) (TOPPING yellow peppers ) and (TOPPING sausage ) )


In [None]:
def extract_labels(top: str, entities):
    # Extract words and parenthesis
    pattern = r"\b\w+(?:'\w+)?(?:-\w+)*\b|[()]"
    tokens = regexp_tokenize(top, pattern)
    
    labels: list[str] = []
    count: int = 0
    # print(tokens)
    is_beginning = True
    order_type = "PIZZAORDER"
    for i, token in enumerate(tokens):
        # print(token, count)
        # Skip all entities except ["PIZZAORDER", "DRINKORDER"]
        if token in entities and token not in ["PIZZAORDER", "DRINKORDER"]:
            continue
        elif token == "(":
            count += 1
        elif token == ")":
            count -= 1
        elif token == "PIZZAORDER":
            order_type = "PIZZAORDER"
        elif token == "DRINKORDER":
            order_type = "DRINKORDER"
        
        elif count == 0:
            labels.append("O")
            is_beginning = True
        else:
            if is_beginning == True:
                labels.append("B-" + order_type)
                is_beginning = False
                continue
            if is_beginning == False:
                labels.append("I-" + order_type)
    return labels

def encode_labels(labels: list[str]):
    return label_encoder.transform(labels)
def decode_labels(labels: list[str]):
    return label_encoder.inverse_transform(labels)

# index = 1251868
# tokens = df_train['src'][index].split()
# labels = extract_labels(df_train['top'][index], entities)
# encoded_labels = encode_labels(labels)
# print(len(tokens), len(labels))
# print([(x, y) for x, y in zip(tokens, labels)])
# df_train.iloc[index:index+1].head()

In [None]:
df_train['labels'] = df_train['top'].progress_apply(lambda x: extract_labels(x, entities))
df_train['encoded_labels'] = df_train['labels'].progress_apply(encode_labels)
df_dev.head(1)

In [None]:
df_dev['labels'] = df_dev['top'].progress_apply(lambda x: extract_labels(x, entities))
df_dev['encoded_labels'] = df_dev['labels'].progress_apply(encode_labels)
df_dev.head(1)

100%|██████████| 348/348 [00:00<00:00, 38664.35it/s]
100%|██████████| 348/348 [00:00<00:00, 11999.09it/s]


Unnamed: 0,src,top,tokenized,tfidf_features,word2vec_features,labels,encoded_labels
0,i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage,i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) ),"[i, want, to, order, two, medium, pizzas, with, sausage, and, black, olives, and, two, medium, pizzas, with, pepperoni, and, extra, cheese, and, three, large, pizzas, with, pepperoni, and, sausage]","[[0.08430143723922251, 0.0, 0.13337969831862775, 0.0, 0.0, 0.10034713760427152, 0.17396375337864922, 0.17958770538428326, 0.14606531916873786, 0.17721922092796344, 0.19713448714781656, 0.0, 0.17288347347903418, 0.0, 0.0, 0.0, 0.1366724335010161, 0.0, 0.0, 0.1562794386914836, 0.19391225920436742, 0.0, 0.11286164475205294, 0.0, 0.12416543512769322, 0.0, 0.0, 0.19234760746279847, 0.12248119173361699, 0.0, 0.20664361011193244, 0.0, 0.18995951017233198, 0.2045777012433944, 0.1168537902697436, 0.0, 0.16890313266495663, 0.0, 0.0, 0.1335421575549344, 0.0, 0.19944260753166945, 0.0, 0.18473089869790738, 0.0, 0.19133428377261583, 0.20978968793267175, 0.14310616380645452, 0.17265659900612262, 0.0, 0.16817054006660384, 0.0, 0.14150333339868498, 0.17275472419101204, 0.22930989128613138, 0.0, 0.0, 0.20155267717081113, 0.1546082459604938, 0.15693159662581818, 0.1832160787506764, 0.0, 0.18729363892005363, 0.0, 0.20773045712657492, 0.0, 0.19346051503038109, 0.0, 0.16041910398343542, 0.09303061188182...","[[0.0677469, -0.041739315, -0.08083352, 0.12261514, 0.16721858, -0.05736017, 0.09708657, 0.28409472, -0.118705854, -0.027471012, -0.057421274, -0.14376749, 0.0473368, 0.14283921, -0.030394064, -0.031858526, 0.052781112, 0.07924095, 0.0069334647, -0.2780961, 0.09244968, -0.032641098, -0.026583068, 0.036451172, 0.03341323, -0.025228117, -0.07656543, -0.118221, -0.10734222, 0.03260397, 0.15275688, 0.05852104, 0.106173486, -0.05881508, -0.082549885, 0.1174997, 0.06174134, -0.08423522, -0.055344716, -0.16970553, -0.06645013, 0.006280372, -0.11624798, -0.00490635, 0.1332938, -0.05824645, -0.00758351, -0.1255734, 0.039873965, 0.07710664, 0.045426518, -0.18234527, -0.024939088, -0.10234514, 0.08368705, -0.06852132, -0.07534842, -0.15418817, -0.14619642, -0.003937893, -0.101910375, -0.030932281, 0.1487361, -0.090863645, -0.320359, 0.09530128, -0.022416687, 0.22749043, -0.17629756, 0.21058744, -0.03136628, 0.1173821, 0.12045498, 0.03843337, 0.11156459, 0.058297314, 0.104958236, -0.11261302, ...","[O, O, O, O, B-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, O, B-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, O, B-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER]","[24, 24, 24, 24, 6, 18, 18, 18, 18, 18, 18, 18, 24, 6, 18, 18, 18, 18, 18, 18, 18, 24, 6, 18, 18, 18, 18, 18, 18]"


In [None]:
df_dev.iloc[3:4].head()

Unnamed: 0,src,top,tokenized,tfidf_features,word2vec_features,labels,encoded_labels
3,i'd like to order a large onion and pepper pizza,i'd like to order (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING onion ) and (TOPPING pepper ) pizza ),"[i'd, like, to, order, a, large, onion, and, pepper, pizza]","[[0.0, 0.0, 0.0, 0.36456535031837345, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3320179597028343, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3810995381997384, 0.0, 0.0, 0.3675029239347814, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3285764791768002, 0.0, 0.3478991908698433, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.45154673327106754, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...], [0.0, 0.0, 0.0, 0.2978392690850122, 0.0, 0.0, 0.3057565112264578, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3038578248228859, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.34081775488781035, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2053807562929373, 0.31134721883722927, 0.0, 0.0, 0.30023918113927156, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25152159878515074, 0.0, 0.0, 0.0, 0...","[[0.034876496, -0.018838737, -0.03094462, 0.049610227, 0.07292387, -0.028232593, 0.0426675, 0.124641865, -0.046168078, -0.010289274, -0.022492006, -0.067076765, 0.026225913, 0.0594144, -0.013272378, -0.013796637, 0.0225329, 0.031439085, 0.005429208, -0.117336124, 0.04022778, -0.011933805, -0.014957658, 0.011053996, 0.01287688, -0.011281822, -0.03759305, -0.05268785, -0.04346962, 0.011879201, 0.067503154, 0.025923895, 0.046448585, -0.020964967, -0.03751059, 0.049969103, 0.02965557, -0.039529294, -0.01972989, -0.07808588, -0.027959775, 0.0023421464, -0.049128532, -0.0011981872, 0.061333265, -0.021473194, -0.0011677572, -0.050754834, 0.018149236, 0.034283537, 0.0179269, -0.08481379, -0.012953331, -0.04719673, 0.03888765, -0.02838799, -0.027466046, -0.06957505, -0.06718644, 0.0016567827, -0.044730537, -0.014175598, 0.06563012, -0.04024822, -0.1364846, 0.04118543, -0.010231526, 0.09637261, -0.07360343, 0.09349069, -0.014939378, 0.045573164, 0.051587228, 0.019778902, 0.044990543, 0.01990...","[O, O, O, O, B-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER, I-PIZZAORDER]","[24, 24, 24, 24, 6, 18, 18, 18, 18, 18]"


In [None]:
len(df_dev["tfidf_features"][3]), len(df_dev["word2vec_features"][3]), len(df_dev["labels"][3])

(10, 10, 10)

In [None]:
# def extract_labels(top: str, entities):
#     # Extract words and parenthesis
#     pattern = r"\b\w+(?:'\w+)?(?:-\w+)*\b|[()]"
#     tokens = regexp_tokenize(top, pattern)
    
#     IS_labels = []
#     NER_labels = []
#     # print(tokens)
#     beginning_of_order = True
#     order_type = "PIZZAORDER"
#     beginning_of_ner = True
#     ner_type = ""
    
#     count = 0 # Count the opened parenthesis (lower level in the hierarchy)
#     for i, token in enumerate(tokens):
#         # print(token, count)
#         # # Skip all entities except ["PIZZAORDER", "DRINKORDER"]
#         # if token in entities and token not in ["PIZZAORDER", "DRINKORDER"]:
#         #     continue
#         if token == "(":
#             count += 1
#         elif token == ")":
#             count -= 1
#         elif token == "PIZZAORDER":
#             order_type = "PIZZAORDER"
#         elif token == "DRINKORDER":
#             order_type = "DRINKORDER"
#         elif token in entities:
#             ner_type = token
        
#         elif count == 0:
#             IS_labels.append("O")
#             NER_labels.append("O")
#             beginning_of_order = True
#             beginning_of_ner = True
#         # 2nd top-level are PIZZAORDER & DRINKORDER
#         elif count >= 1:
#             if beginning_of_order == True:
#                 IS_labels.append("B-" + order_type)
#                 beginning_of_order = False
#                 continue
#             else:
#                 IS_labels.append("I-" + order_type)
        
#         # Other levels: Number, Size, etc...
#         else:
#             if beginning_of_ner == True:
#                 NER_labels.append("B-" + order_type)
#                 beginning_of_ner = False
#                 continue
#             if beginning_of_ner == False:
#                 NER_labels.append("I-" + order_type)
            

#     return IS_labels

# # index = 1251868
# # tokens = df_train['src'][index].split()
# # labels = extract_labels(df_train['top'][index], entities)
# # print(len(tokens), len(labels))
# # print([(x, y) for x, y in zip(tokens, labels)])
# # df_train.iloc[index:index+1].head()

In [None]:
"""
(ORDER i want to order 
    (PIZZAORDER 
        (NUMBER two ) 
        (SIZE medium ) pizzas with 
        (TOPPING sausage ) and 
        (TOPPING black olives ) 
    ) and 
    (PIZZAORDER 
        (NUMBER two )
        (SIZE medium ) pizzas with 
        (TOPPING pepperoni ) and 
        (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) 
        (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) 
    ) and 
    (PIZZAORDER 
        (NUMBER three ) 
        (SIZE large ) pizzas with 
        (TOPPING pepperoni ) and 
        (TOPPING sausage ) 
    )
)
"""

'\n(ORDER i want to order \n    (PIZZAORDER \n        (NUMBER two ) \n        (SIZE medium ) pizzas with \n        (TOPPING sausage ) and \n        (TOPPING black olives ) \n    ) and \n    (PIZZAORDER \n        (NUMBER two )\n        (SIZE medium ) pizzas with \n        (TOPPING pepperoni ) and \n        (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) \n        (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) \n    ) and \n    (PIZZAORDER \n        (NUMBER three ) \n        (SIZE large ) pizzas with \n        (TOPPING pepperoni ) and \n        (TOPPING sausage ) \n    )\n)\n'

# Models

## CRF

### CRF Features

In [None]:
def word2features(row):
    sentence: list[str] = row["tokenized"]
    sentence_tfidf: list[np.ndarray] = row["tfidf_features"]
    sentence_word2vec: list[np.ndarray] = row["word2vec_features"]

    features = []
    for i, word in enumerate(sentence):
        tfidf = sentence_tfidf[i]
        word2vec = sentence_word2vec[i]
        features.append({
            "word": word,
            # "tfidf": f"{tfidf}",
            # "word2vec": word2vec,
            # "is_numeric": word.isnumeric(),
            # "is_capitalized": word[0].isupper(),
            # "prev_word": sentence[i-1] if i > 0 else "",
            # "next_word": sentence[i+1] if i < len(sentence) - 1 else "",
            # "pos_tag": nltk.pos_tag([word])[0][1],
            # "sentence_position": "start" if i == 0 else "middle" if i < len(sentence) - 1 else "end",
        })

    return features

In [None]:
crf_features = df_dev.progress_apply(word2features, axis=1)

100%|██████████| 348/348 [00:00<00:00, 43518.72it/s]


In [None]:
crf_features[0][0]

{'word': 'i'}

### CRF

In [None]:
# CRF Model
update_model = True
if update_model or not os.path.exists(MODELS_PATH + "/crf_model.pkl"):
    # Initialize the CRF model
    crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, verbose=True) # , all_possible_transitions=True
else:
    with open(MODELS_PATH + "/crf_model.pkl", "rb") as f:
        crf = pickle.load(f)

In [None]:
# Format data for CRF training
crf_train_tokens = crf_features.tolist()
crf_train_labels = df_dev['labels'].tolist()

In [None]:
# Will work only without vectorizing words! (Dimension of the word vector of course will be )
print(len(crf_train_tokens))
print(len(crf_train_labels))
i = 0
k = 0
for token, label in zip(crf_train_tokens, crf_train_labels):
    if len(token) != len(label):
        print(f"{len(token)}, {len(label)}, {i}")
        # print(f"Mismatch found: {token}, {label}, {i}")
        k += 1
    if k > 10: break
    i += 1

348
348


In [None]:
# Train CRF
crf.fit(crf_train_tokens, crf_train_labels)

# Save the trained model as a pickle file
with open(MODELS_PATH + "/crf_model.pkl", 'wb') as f:
    pickle.dump(crf, f)

In [None]:
# Predict using CRF
test = "I want a pizza seafood with olives and a can of coke".split()
crf_predictions = crf.predict([test])
[(word, tag) for word, tag in zip(test, crf_predictions[0])]

[('I', 'I-PIZZAORDER'),
 ('want', 'I-PIZZAORDER'),
 ('a', 'I-PIZZAORDER'),
 ('pizza', 'I-PIZZAORDER'),
 ('seafood', 'I-PIZZAORDER'),
 ('with', 'I-PIZZAORDER'),
 ('olives', 'I-PIZZAORDER'),
 ('and', 'I-PIZZAORDER'),
 ('a', 'I-PIZZAORDER'),
 ('can', 'I-PIZZAORDER'),
 ('of', 'I-PIZZAORDER'),
 ('coke', 'I-PIZZAORDER')]

In [None]:
true_dev_labels = []
tokens_dev = []
for i in range(len(df_dev['top'])):
    true_dev_labels.append(extract_labels(df_dev['top'][i], entities))
    tokens_dev.append(df_dev['src'][i].split())

In [None]:
crf_predictions = crf.predict(tokens_dev)

## IS CRF Model Evaluation
#### Calculate Exact Match (EM) accuracy, Compute Precision, Recall, F1-Score

In [None]:
# Flatten true labels and predictions
flattened_true_labels = [label for sublist in true_dev_labels for label in sublist]
flattened_pred_labels = [label for sublist in crf_predictions for label in sublist]

em_accuracy = accuracy_score(flattened_true_labels, flattened_pred_labels)
precision = precision_score(flattened_true_labels, flattened_pred_labels, average='macro')
recall = recall_score(flattened_true_labels, flattened_pred_labels, average='macro')
f1 = f1_score(flattened_true_labels, flattened_pred_labels, average='macro')

print(f"Exact Match Accuracy: {em_accuracy:.5f}")
print(f"Precision: {precision:.5f}")
print(f"Recall: {recall:.5f}")
print(f"F1-Score: {f1:.5f}")


Exact Match Accuracy: 0.61521
Precision: 0.12304
Recall: 0.20000
F1-Score: 0.15235


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## LSTM

In [None]:
# Define the LSTM model
class LSTMModel(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# Hyperparameters
input_size = 10  # Adjust based on your data
hidden_size = 256
output_size = 1  # Adjust based on your data
num_layers = 4
num_epochs = 50
learning_rate = 0.001

# Initialize the model, loss function, and optimizer
model = LSTMModel(input_size, hidden_size, output_size, num_layers).to(device)
criterion = torch.nn.MSELoss()  # Adjust based on your task
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Dummy data for illustration (replace with your actual data)
x_train = torch.randn(100, 10, input_size).to(device)  # (batch_size, sequence_length, input_size)
y_train = torch.randn(100, output_size).to(device)  # (batch_size, output_size)

# Training loop
for epoch in range(num_epochs):
    model.train()
    outputs = model(x_train)
    optimizer.zero_grad()
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 5 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [5/50], Loss: 0.9838
Epoch [10/50], Loss: 0.9190
Epoch [15/50], Loss: 0.8162
Epoch [20/50], Loss: 0.7695
Epoch [25/50], Loss: 0.7054
Epoch [30/50], Loss: 0.6339
Epoch [35/50], Loss: 0.5165
Epoch [40/50], Loss: 0.3671
Epoch [45/50], Loss: 0.2514
Epoch [50/50], Loss: 0.1968


# Model Evaluation