# Setup

## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import json

import torch

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Config

In [6]:
pd.set_option('display.max_colwidth', 1000) # Show all content of the cells
# # Undo with 
# pd.reset_option('display.max_colwidth')

# Load Dataset

In [None]:
def load_json(filename: str, cols: list[str] | None = None):
    """
    Load a json file into a pandas DataFrame.
    * This function is useful (for some reason) for loading the large dataset files.
    
    filename: str
        The name of the file to load.
    cols: list[str] | None
        The columns to load. If None, load all columns.
    return: pd.DataFrame
        The DataFrame containing the data from the json file.
    """
    all_cols = True if cols is None else False
    data = []

    with open(filename, encoding='latin-1') as f:
        line = f.readline()
        f.seek(0) # Go back to the beginning of the file
        doc = json.loads(line)
        if all_cols:
            cols = list(doc.keys())
        
        for line in f:
            doc = json.loads(line)
            lst = [doc[col] for col in cols]
            data.append(lst)

    df = pd.DataFrame(data=data, columns=cols)
    return df

train_filename = 'data/dataset/PIZZA_train.json'
df_train = load_json(train_filename)
df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (TOPPING BBQ_PULLED_PORK ) ) ),(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ) ),(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ) )
1,large pie with green pepper and with extra peperonni,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (TOPPING GREEN_PEPPERS ) (COMPLEX_TOPPING (QUANTITY EXTRA ) (TOPPING PEPPERONI ) ) ) ),(ORDER (PIZZAORDER (SIZE large ) pie with (TOPPING green pepper ) and with (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING peperonni ) ) ) ),(ORDER (PIZZAORDER (SIZE large ) (TOPPING green pepper ) (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING peperonni ) ) ) )
2,i'd like a large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (STYLE VEGETARIAN ) ) ),(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE large ) (STYLE vegetarian ) pizza ) ),(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (STYLE vegetarian ) ) )
3,party size stuffed crust pie with american cheese and with mushroom,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZE ) (STYLE STUFFED_CRUST ) (TOPPING AMERICAN_CHEESE ) (TOPPING MUSHROOMS ) ) ),(ORDER (PIZZAORDER (SIZE party size ) (STYLE stuffed crust ) pie with (TOPPING american cheese ) and with (TOPPING mushroom ) ) ),(ORDER (PIZZAORDER (SIZE party size ) (STYLE stuffed crust ) (TOPPING american cheese ) (TOPPING mushroom ) ) )
4,can i have one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_SIZE ) (TOPPING ARTICHOKES ) ) ),(ORDER can i have (PIZZAORDER (NUMBER one ) (SIZE personal sized ) (TOPPING artichoke ) ) ),(ORDER (PIZZAORDER (NUMBER one ) (SIZE personal sized ) (TOPPING artichoke ) ) )


In [8]:
df_dev = pd.read_json('data/dataset/PIZZA_dev.json', lines=True)
df_dev.head()

Unnamed: 0,dev.SRC,dev.EXR,dev.TOP,dev.PCFG_ERR
0,i want to order two medium pizzas with sausage and black olives and two medium pizzas with pepperoni and extra cheese and three large pizzas with pepperoni and sausage,(ORDER (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) (COMPLEX_TOPPING (QUANTITY EXTRA ) (TOPPING CHEESE ) ) (TOPPING PEPPERONI ) ) (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) (TOPPING OLIVES ) (TOPPING SAUSAGE ) ) (PIZZAORDER (NUMBER 3 ) (SIZE LARGE ) (TOPPING PEPPERONI ) (TOPPING SAUSAGE ) ) ),(ORDER i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) ) ),False
1,five medium pizzas with tomatoes and ham,(ORDER (PIZZAORDER (NUMBER 5 ) (SIZE MEDIUM ) (TOPPING HAM ) (TOPPING TOMATOES ) ) ),(ORDER (PIZZAORDER (NUMBER five ) (SIZE medium ) pizzas with (TOPPING tomatoes ) and (TOPPING ham ) ) ),False
2,i need to order one large vegetarian pizza with extra banana peppers,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (STYLE VEGETARIAN ) (COMPLEX_TOPPING (QUANTITY EXTRA ) (TOPPING BANANA_PEPPERS ) ) ) ),(ORDER i need to order (PIZZAORDER (NUMBER one ) (SIZE large ) (STYLE vegetarian ) pizza with (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING banana peppers ) ) ) ),False
3,i'd like to order a large onion and pepper pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (TOPPING ONIONS ) (TOPPING PEPPERS ) ) ),(ORDER i'd like to order (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING onion ) and (TOPPING pepper ) pizza ) ),False
4,i'll have one pie along with pesto and ham but avoid olives,(ORDER (PIZZAORDER (NOT (TOPPING OLIVES ) ) (NUMBER 1 ) (TOPPING HAM ) (TOPPING PESTO ) ) ),(ORDER i'll have (PIZZAORDER (NUMBER one ) pie along with (TOPPING pesto ) and (TOPPING ham ) but avoid (NOT (TOPPING olives ) ) ) ),False


# EDA - Exploratory Data Analysis

## Column names

- **SRC** 
    - The source text of the pizza order as given by the user.
- **EXR** 
    - The expected representation of the pizza order in a structured format (likely a parse tree or similar structure).
    - Or the required extraction ~ Ahmed :)
- **TOP** 
    - The top-level representation of the pizza order, possibly a normalized or tokenized version of the source text.
- **PCFG_ERR** 
    - A boolean indicating whether there was an error in parsing the pizza order using a Probabilistic Context-Free Grammar (PCFG).


In [6]:
df_dev.rename(columns={
    'dev.SRC': 'text', 
    'dev.EXR': 'expected',
    'dev.TOP': 'label',
    'dev.PCFG_ERR': 'pcfg_err',
}, inplace=True)

In [9]:
print(df_train.shape, df_dev.shape)

(2456446, 4) (348, 4)


## Check Duplicates & Missing Data

In [7]:
df_dev.describe()

Unnamed: 0,text,expected,label,pcfg_err
count,348,348,348,348
unique,348,270,348,2
top,hello i need three pizzas with bacon and onions but i don't want any chicken on them thanks,(ORDER (PIZZAORDER (NOT (TOPPING PEPPERS ) ) (NUMBER 1 ) (SIZE SMALL ) (STYLE THIN_CRUST ) (TOPPING HAM ) ) ),(ORDER hello i need (PIZZAORDER (NUMBER three ) pizzas with (TOPPING bacon ) and (TOPPING onions ) but i don't want any (NOT (TOPPING chicken ) ) ) on them thanks ),False
freq,1,5,1,242


In [8]:
df_dev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      348 non-null    object
 1   expected  348 non-null    object
 2   label     348 non-null    object
 3   pcfg_err  348 non-null    object
dtypes: object(4)
memory usage: 11.0+ KB


In [9]:
df_dev.duplicated().sum()

np.int64(0)

## Data Cleaning

Masha2 allah: Data is clean (no punctuation | whitespace)

In [10]:
df_dev['text'].str.find(string.punctuation).describe()

count    348.0
mean      -1.0
std        0.0
min       -1.0
25%       -1.0
50%       -1.0
75%       -1.0
max       -1.0
Name: text, dtype: float64

In [11]:
df_dev['text'].str.find(string.whitespace).describe()

count    348.0
mean      -1.0
std        0.0
min       -1.0
25%       -1.0
50%       -1.0
75%       -1.0
max       -1.0
Name: text, dtype: float64

# Preprocessing

# Models

In [12]:
# Define the LSTM model
class LSTMModel(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# Hyperparameters
input_size = 10  # Adjust based on your data
hidden_size = 256
output_size = 1  # Adjust based on your data
num_layers = 4
num_epochs = 50
learning_rate = 0.001

# Initialize the model, loss function, and optimizer
model = LSTMModel(input_size, hidden_size, output_size, num_layers).to(device)
criterion = torch.nn.MSELoss()  # Adjust based on your task
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Dummy data for illustration (replace with your actual data)
x_train = torch.randn(100, 10, input_size).to(device)  # (batch_size, sequence_length, input_size)
y_train = torch.randn(100, output_size).to(device)  # (batch_size, output_size)

# Training loop
for epoch in range(num_epochs):
    model.train()
    outputs = model(x_train)
    optimizer.zero_grad()
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 5 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [5/50], Loss: 0.9970
Epoch [10/50], Loss: 0.9159
Epoch [15/50], Loss: 0.8262
Epoch [20/50], Loss: 0.7855
Epoch [25/50], Loss: 0.7242
Epoch [30/50], Loss: 0.6577
Epoch [35/50], Loss: 0.6061
Epoch [40/50], Loss: 0.5145
Epoch [45/50], Loss: 0.4208
Epoch [50/50], Loss: 0.3274


# Model Evaluation

In [13]:
# Dummy validation data for illustration (replace with your actual data)
x_val = torch.randn(20, 10, input_size, device=device)  # (batch_size, sequence_length, input_size)
y_val = torch.randn(20, output_size, device=device)  # (batch_size, output_size)

# Evaluation
model.eval()
with torch.no_grad():
    val_outputs = model(x_val)
    val_loss = criterion(val_outputs, y_val)
    print(f'Validation Loss: {val_loss.item():.4f}')

Validation Loss: 2.2243
