# Imports

In [None]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import os
import tqdm

import spacy
# Load spacy model for lemmatization
nlp = spacy.load("en_core_web_sm")

# Preprocessing
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import regexp_tokenize

# Models

import torch

# Evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read JSON data and convert it to Parquet format

In [None]:
import pandas as pd

# Read the raw JSON dataset into memory.
df = pd.read_json('/kaggle/input/your-dataset-folder/dataset.json')

# Persist the same rows as a Parquet file; the DataFrame index is not written.
df.to_parquet(path='/kaggle/working/dataset.parquet', index=False)

print("JSON converted to Parquet!")


# Split Test and Train Data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the dataset, then shuffle all rows with a fixed seed and renumber
# the index from zero.
df = (
    pd.read_json('/kaggle/input/your-dataset-folder/dataset.json')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Input column (X) and target column (Y).
X, Y = df['train.SRC'], df['train.TOP']

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

print("Data shuffled and split into training and testing sets.")


# Save Test and Train data

In [None]:
# Pair each input series with its target series in a two-column frame.
df_train = pd.DataFrame({'src': X_train, 'top': y_train})
test_df = pd.DataFrame({'src': X_test, 'top': y_test})

# Write both frames to the working directory without their index.
for _frame, _path in (
    (df_train, '/kaggle/working/train.parquet'),
    (test_df, '/kaggle/working/test.parquet'),
):
    _frame.to_parquet(_path, index=False)

print("Train and test datasets saved as Parquet files.")


# Tokenize SRC and save it

In [None]:

# `progress_apply` only exists on pandas objects after tqdm's pandas
# integration has been registered; without this call the line below raises
# AttributeError. Registering more than once is harmless.
tqdm.tqdm.pandas()

# A token is a run of word characters, optionally followed by an apostrophe
# part ("i'd") and any number of hyphenated parts ("thin-crust").
token_pattern = r"(?u)\b\w+(?:'\w+)?(?:-\w+)*\b"
df_train["tokenized"] = df_train["src"].progress_apply(
    lambda text: regexp_tokenize(text, token_pattern)
)
df_train.to_parquet('/kaggle/working/train_tokenized.parquet', index=False)
print("Tokenized src data saved as Parquet file.")

# Transform BIO tags to numbers

In [None]:
# Collect every distinct upper-case name that directly follows an opening
# parenthesis anywhere in the TOP strings.
full_text = " ".join(df_train['top'].to_list())
# Raw string: "\(" in a normal string literal is an invalid escape sequence.
# Sorting makes the order of `entities` reproducible between runs.
entities = sorted(
    {match.group() for match in re.finditer(r"(?<=\()[A-Z]+(_[A-Z]+)*", full_text)}
)

# Using BIO Tagging: a B- and an I- tag per entity, plus the outside tag "O".
bio_entities = [f"{letter}-{entity}" for entity in entities for letter in "BI"]
bio_entities.append('O')

# Map every tag string to an integer id.
label_encoder = LabelEncoder()
label_encoder.fit(bio_entities)

# Extract TOP Target and Save it 

In [None]:
def extract_labels(top: str, entities):
    """Convert one TOP annotation string into encoded order-level BIO labels.

    The string is split into words and parentheses. Parentheses only change
    the nesting depth, entity names other than ``PIZZAORDER`` and
    ``DRINKORDER`` are skipped, and every remaining word produces exactly one
    label: ``"O"`` at depth 0, otherwise ``"B-<order>"`` for the first word
    of a span and ``"I-<order>"`` for the words that follow it.

    Args:
        top: The TOP annotation string to label.
        entities: Collection of entity names; any token found in it (other
            than the two order names) is dropped without producing a label.

    Returns:
        The output of the module-level ``label_encoder.transform`` applied to
        the label strings, one entry per labelled word.
    """
    order_types = ("PIZZAORDER", "DRINKORDER")
    # Entity names that never produce a label of their own.
    skipped = set(entities).difference(order_types)

    # Extract words and parentheses.
    pattern = r"\b\w+(?:'\w+)?(?:-\w+)*\b|[()]"

    labels = []
    depth = 0
    order_type = "PIZZAORDER"
    is_beginning = True
    for token in regexp_tokenize(top, pattern):
        if token in skipped:
            continue
        if token == "(":
            depth += 1
        elif token == ")":
            depth -= 1
        elif token in order_types:
            order_type = token
            # A new order always opens a new span, so its first word must get
            # a "B-" tag even when it directly follows the previous order
            # with no outside word in between.
            is_beginning = True
        elif depth == 0:
            labels.append("O")
            is_beginning = True
        else:
            # NOTE(review): only depth 0 maps to "O". If the TOP strings wrap
            # everything in an outer group, words outside the orders end up
            # tagged as order tokens here - confirm against the data.
            prefix = "B-" if is_beginning else "I-"
            labels.append(prefix + order_type)
            is_beginning = False
    return label_encoder.transform(labels)

# `progress_apply` requires tqdm's pandas integration to be registered;
# calling this again after an earlier registration is harmless.
tqdm.tqdm.pandas()

# Sanity-check a single row. `df_train` keeps the shuffled index labels it
# inherited from the train/test split, so the row is selected by position
# throughout; a label lookup could hit a different row or raise KeyError.
index = 1251868
sample_row = df_train.iloc[index]
tokens = sample_row['src'].split()
labels = extract_labels(sample_row['top'], entities)
print(len(tokens), len(labels))
print(list(zip(tokens, labels)))
df_train.iloc[index:index+1].head()

# Label every training row and persist the result.
df_train['labels'] = df_train['top'].progress_apply(lambda x: extract_labels(x, entities))
df_train.to_parquet('/kaggle/working/train_tokenized_labels.parquet', index=False)
print("Labels extracted and saved as Parquet file.")