In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"
import torch
import json
import pipeline_utils as utils
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by any other cell visible here — confirm it is still needed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load dataset
dataset_path = "/kaggle/input/pizza-train/PIZZA_train.json"

# Read the JSON Lines file (one JSON object per line) and keep only the
# "train.TOP" field of every record.
data = []
# The encoding is pinned so decoding does not depend on the platform's default locale.
with open(dataset_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
            continue
        entry = json.loads(line)
        top = entry["train.TOP"]
        data.append(top)


In [2]:
# Build a one-column DataFrame ('train.TOP') from the strings collected in `data`.
import pandas as pd

df = pd.DataFrame(data, columns=['train.TOP'])
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,train.TOP
0,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...
1,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...
2,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...
3,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...


In [3]:
# Classify every order once by which sub-order markers its TOP string contains.
# Both markers are plain literals, so regex=False (substring search) gives the
# same matches as the default regex mode without compiling a pattern.
has_pizza = df['train.TOP'].str.contains('PIZZAORDER', regex=False)
has_drink = df['train.TOP'].str.contains('DRINKORDER', regex=False)

# DRINK WITH PIZZA
pizza_drink_df = df[has_pizza & has_drink]
num_pizza_drink_rows = len(pizza_drink_df)
print("Num of orders with pizza and drink", num_pizza_drink_rows)
# Cap the sample size at the rows available so a smaller input does not raise in .sample().
pizza_drink_df = pizza_drink_df.sample(min(20000, num_pizza_drink_rows), random_state=42).reset_index(drop=True)

# DRINK
drink_df = df[has_drink & ~has_pizza]
num_drink_rows = len(drink_df)
print("Num of orders with only drink", num_drink_rows)
drink_df = drink_df.sample(min(20000, num_drink_rows), random_state=42).reset_index(drop=True)

# PIZZA
pizza_df = df[has_pizza & ~has_drink]
num_pizza_rows = len(pizza_df)
print("Num of orders with only pizza", num_pizza_rows)
pizza_df = pizza_df.sample(min(30000, num_pizza_rows), random_state=42).reset_index(drop=True)

# Replace the full frame with the sampled subset (pizza-only, drink-only, then mixed orders).
# NOTE(review): this overwrites `df`, so re-running the cell samples from the
# already-reduced frame rather than from the full dataset.
df = pd.concat([pizza_df, drink_df, pizza_drink_df], ignore_index=True)

Num of orders with pizza and drink 341942
Num of orders with only drink 425116
Num of orders with only pizza 1689388


In [None]:
def preprocess_dataframe(df, f_type):
    """Derive the IS and NER token/tag columns from the raw TOP strings.

    Args:
        df: DataFrame that contains a ``"{f_type}.TOP"`` column.
        f_type: Prefix of the TOP column to read (e.g. ``"train"``).

    Returns:
        A new DataFrame holding the input columns plus ``IS.tokens``
        (lower-cased), ``IS.tag_ids``, ``IS.tags``, ``grouped_tokens``,
        ``NER.tag_ids``, ``NER.tags`` and ``grouped_ids``. The frame passed in
        is left unmodified.
    """
    top_column = f"{f_type}.TOP"

    # Work on a copy so the caller's frame is not mutated as a side effect.
    df = df.copy()

    # utils.tag_orders yields an indexable triple per order: tokens, tag ids, tags.
    # It is kept in a local Series instead of a temporary "IS" column.
    tagged = df[top_column].apply(utils.tag_orders)
    df["IS.tokens"] = tagged.apply(lambda x: x[0])
    df["IS.tag_ids"] = tagged.apply(lambda x: x[1])
    df["IS.tags"] = tagged.apply(lambda x: x[2])

    df['grouped_tokens'] = df.apply(
        lambda row: utils.group_order_tokens(row['IS.tokens'], row['IS.tags']), axis=1)

    # Only the TOP column is needed here, so apply on that Series rather than row-wise.
    df['NER.tag_ids'] = df[top_column].apply(
        lambda top: utils.parse_top_string(top, utils.entity_patterns, utils.tag2id)[1])
    df["NER.tags"] = df["NER.tag_ids"].apply(lambda x: [utils.id2tag[tag_id] for tag_id in x])

    df['grouped_ids'] = df.apply(
        lambda row: utils.group_corresponding_tags(row['grouped_tokens'], row['IS.tokens'], row['NER.tag_ids']), axis=1)

    # Lower-case the tokens last: the grouping helpers above receive the original casing.
    df['IS.tokens'] = df['IS.tokens'].apply(lambda row: [word.lower() for word in row])
    return df

In [None]:
# Replace `df` with the frame returned by preprocess_dataframe, which adds the IS / NER token and tag columns.
df = preprocess_dataframe(df, "train")

In [8]:
import pandas as pd

# Frame for the IS model: tokens paired with the IS tag ids.
IS_df = df.loc[:, ['IS.tokens', 'IS.tag_ids']].copy()
IS_df.columns = ['tokens', 'ner_tags']

# Frame for the NER model: the same tokens paired with the NER tag ids.
NER_df = df.loc[:, ['IS.tokens', 'NER.tag_ids']].copy()
NER_df.columns = ['tokens', 'ner_tags']

In [None]:
import pandas as pd

def split_multiple_orders(df):
    """Expand each row's paired sequences into one output row per element.

    Every row of ``df`` holds a ``tokens`` sequence and a ``ner_tags`` sequence
    of equal length; the i-th elements of both become one row of the result.

    Args:
        df: DataFrame with ``tokens`` and ``ner_tags`` columns.

    Returns:
        A new DataFrame with ``tokens`` and ``ner_tags`` columns and a fresh
        default index.

    Raises:
        AssertionError: If a row's ``tokens`` and ``ner_tags`` differ in length.
    """
    expanded_tokens = []
    expanded_tags = []

    # Walk the two columns in lockstep; this avoids building a Series per row
    # the way iterrows() does.
    for tokens, ner_tags in zip(df['tokens'], df['ner_tags']):
        assert len(tokens) == len(ner_tags), "Tokens and tags must have same length"

        # Each element is appended as-is (no extra wrapping), one output row per pair.
        for single_order_tokens, single_order_tags in zip(tokens, ner_tags):
            expanded_tokens.append(single_order_tokens)
            expanded_tags.append(single_order_tags)

    expanded_df = pd.DataFrame({
        'tokens': expanded_tokens,
        'ner_tags': expanded_tags
    })

    return expanded_df

'\nimport pandas as pd\n\ndef split_multiple_orders(df):\n    expanded_tokens = []\n    expanded_tags = []\n    \n    # Iterate through each row\n    for _, row in df.iterrows():\n        tokens = row[\'tokens\']  # List of lists of tokens\n        ner_tags = row[\'ner_tags\']  # List of lists of tags\n        \n        assert len(tokens) == len(ner_tags), "Tokens and tags must have same length"\n        \n        # Add each order as a separate row\n        for single_order_tokens, single_order_tags in zip(tokens, ner_tags):\n            expanded_tokens.append(single_order_tokens)  # Wrap in list to maintain structure\n            expanded_tags.append(single_order_tags)  # Wrap in list to maintain structure\n    \n    expanded_df = pd.DataFrame({\n        \'tokens\': expanded_tokens,\n        \'ner_tags\': expanded_tags\n    })\n    \n    return expanded_df\n\nNER_df = split_multiple_orders(NER_df)\n'

In [None]:
# Expand NER_df so every element of a row's `tokens` / `ner_tags` pair becomes its own row.
# NOTE(review): NER_df is reassigned from itself, so running this cell twice splits the data a second time.
# NOTE(review): split_multiple_orders iterates each row's `tokens` value element by element — confirm the
# column really holds one list per sub-order rather than a flat list of words.
NER_df = split_multiple_orders(NER_df)

In [10]:
from datasets import Dataset

# Convert both pandas frames into `datasets.Dataset` objects.
IS_dataset = Dataset.from_pandas(IS_df)
NER_dataset = Dataset.from_pandas(NER_df)

In [11]:
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np

def create_ner_preprocessing_pipeline(model_name='dslim/bert-base-NER', max_length=55):
    """
    Build the tokenizer, label-aligning preprocessing function and collator
    used for token classification.

    Args:
        model_name (str): Checkpoint name handed to AutoTokenizer.from_pretrained
        max_length (int): Truncation length passed to the tokenizer

    Returns:
        tuple: (tokenizer, tokenize_and_align_labels, data_collator)
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_and_align_labels(examples):
        """
        Tokenize a batch of pre-split words and expand the word-level labels
        to one label per produced token.

        Every token that belongs to a word receives that word's label; tokens
        without a word id receive -100.

        Args:
            examples (dict): Batch with 'tokens' and 'ner_tags' sequences

        Returns:
            The tokenizer output with an added 'labels' entry

        Raises:
            ValueError: On missing keys or inconsistent batch/sequence lengths
            IndexError: If a word id has no corresponding label
        """
        if not ('tokens' in examples and 'ner_tags' in examples):
            raise ValueError("Examples must be a dictionary with 'tokens' and 'ner_tags' keys")

        word_batch = examples['tokens']
        tag_batch = examples['ner_tags']
        if len(word_batch) != len(tag_batch):
            raise ValueError("Number of token sequences doesn't match number of label sequences")

        # return_tensors=None keeps the output as plain Python lists.
        encoded = tokenizer(
            word_batch,
            truncation=True,
            is_split_into_words=True,
            padding=True,
            max_length=max_length,
            return_tensors=None,
        )

        batch_labels = []
        for i, word_tags in enumerate(tag_batch):
            token_word_ids = encoded.word_ids(batch_index=i)

            # Each word must come with exactly one label.
            if len(word_tags) != len(word_batch[i]):
                raise ValueError(f"Mismatch between tokens and labels at index {i}")

            row_labels = []
            for word_id in token_word_ids:
                if word_id is None:
                    # No source word for this position: mark it as ignored.
                    row_labels.append(-100)
                    continue
                try:
                    row_labels.append(word_tags[word_id])
                except IndexError:
                    raise IndexError(f"Label index {word_id} out of range for sequence {i}")
            batch_labels.append(row_labels)

        encoded["labels"] = batch_labels

        # Every feature column must contain one entry per input example.
        expected_rows = len(word_batch)
        for key, column in encoded.items():
            if len(column) != expected_rows:
                raise ValueError(f"Length mismatch in processed features for key: {key}")

        return encoded

    data_collator = DataCollatorForTokenClassification(
        tokenizer=tokenizer,
        pad_to_multiple_of=8,  # Helpful for hardware optimization
    )

    return tokenizer, tokenize_and_align_labels, data_collator

def process_dataset(dataset, preprocessing_pipeline):
    """
    Process a dataset using the preprocessing pipeline.

    Args:
        dataset: The input dataset (must have 'tokens' and 'ner_tags' columns)
        preprocessing_pipeline: The preprocessing function from create_ner_preprocessing_pipeline

    Returns:
        processed_dataset: The result of ``dataset.map`` with the original
        columns removed

    Raises:
        RuntimeError: If mapping fails for any reason; the original exception
        is attached as ``__cause__``.
    """
    try:
        # batched=True hands the function whole batches; the input columns are
        # dropped so only the features it returns remain.
        return dataset.map(
            preprocessing_pipeline,
            batched=True,
            remove_columns=dataset.column_names,
            desc="Processing dataset"
        )
    except Exception as e:
        # Chain explicitly so the traceback reports the real failure as the direct cause.
        raise RuntimeError(f"Error processing dataset: {str(e)}") from e


# Create pipeline
# Both calls rely on the function defaults, so each task gets its own tokenizer / collator
# built from the same checkpoint name and max_length.
IS_tokenizer, IS_preprocess_fn, IS_data_collator = create_ner_preprocessing_pipeline()
NER_tokenizer, NERpreprocess_fn, NER_data_collator = create_ner_preprocessing_pipeline()

# Tokenize each dataset and align its word-level tags to the produced tokens.
processed_IS_dataset = process_dataset(IS_dataset, IS_preprocess_fn)
processed_NER_dataset = process_dataset(NER_dataset, NERpreprocess_fn)

# Collate every processed example of each dataset into one batch.
# NOTE(review): this feeds the entire dataset to the collator at once, and neither IS_batch nor
# NER_batch is used by any later cell visible here — confirm they are still needed.
IS_batch = IS_data_collator([processed_IS_dataset[i] for i in range(len(processed_IS_dataset))])
NER_batch = NER_data_collator([processed_NER_dataset[i] for i in range(len(processed_NER_dataset))])

tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



Processing dataset:   0%|          | 0/70000 [00:00<?, ? examples/s]

Processing dataset:   0%|          | 0/70000 [00:00<?, ? examples/s]

In [12]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments

In [None]:
def _load_token_classifier(label2id, id2label):
    """Load the 'dslim/bert-base-NER' checkpoint with a head sized for the given tag set."""
    return AutoModelForTokenClassification.from_pretrained(
        'dslim/bert-base-NER',
        num_labels=len(label2id),
        label2id=label2id,
        id2label=id2label,
        # Allow loading even though the checkpoint's classifier shape differs from ours.
        ignore_mismatched_sizes=True,
    )


def _build_training_args(output_dir):
    """Return the TrainingArguments shared by both models; only the output directory differs."""
    return TrainingArguments(
        output_dir=output_dir,            # Directory to save the model
        learning_rate=2e-5,               # Standard BERT fine-tuning LR
        per_device_train_batch_size=128,  # Batch size
        per_device_eval_batch_size=128,
        num_train_epochs=5,               # Number of epochs
        evaluation_strategy="epoch",      # Evaluate after each epoch
        save_strategy="epoch",            # Save model at the end of each epoch
        load_best_model_at_end=True,      # Load the best model at the end
        metric_for_best_model="f1",       # Use F1 as the evaluation metric
    )


# Same construction order as before: both models first, then both argument sets.
IS_model = _load_token_classifier(utils.IS_tag2id, utils.IS_id2tag)
NER_model = _load_token_classifier(utils.tag2id, utils.id2tag)

IS_training_args = _build_training_args("./pizza_is_model")
NER_training_args = _build_training_args("./pizza_ner_model")

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) 

In [14]:
from sklearn.metrics import precision_recall_fscore_support

# 3. Define metrics calculation
def compute_metrics(pred):
    """Compute weighted precision, recall and F1 over the labelled positions.

    Args:
        pred: Object exposing ``predictions`` (class scores whose class axis is
            index 2) and ``label_ids`` (integer labels in which -100 marks
            positions to ignore).

    Returns:
        dict: ``precision``, ``recall`` and ``f1`` as returned by
        ``precision_recall_fscore_support`` with ``average='weighted'``.
    """
    label_ids = np.asarray(pred.label_ids)
    predicted_ids = np.argmax(pred.predictions, axis=2)

    # Remove ignored index (-100) with a single boolean mask. Indexing with the
    # mask flattens in row-major order, i.e. the same order as concatenating the
    # filtered rows one after another, and keeps the integer dtype even when a
    # whole row is ignored.
    keep = label_ids != -100

    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        label_ids[keep],
        predicted_ids[keep],
        average='weighted'
    )

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [15]:
# NOTE(review): this sklearn import is not used in this cell — the split below calls the
# train_test_split method of the processed dataset objects.
from sklearn.model_selection import train_test_split
# Hold out 20% of each processed dataset as the "test" split, using a fixed seed.
split_IS_dataset = processed_IS_dataset.train_test_split(test_size=0.2, seed=42)
split_NER_dataset = processed_NER_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Trainer for the IS model: trains on the "train" split and evaluates on the held-out "test" split.
IS_trainer = Trainer(
    model=IS_model,
    args=IS_training_args,
    train_dataset=split_IS_dataset["train"],
    eval_dataset=split_IS_dataset["test"],
    data_collator=IS_data_collator,
    compute_metrics=compute_metrics
)

# Trainer for the NER model, configured the same way with the NER model, arguments, data and collator.
NER_trainer = Trainer(
    model=NER_model,
    args=NER_training_args,
    train_dataset=split_NER_dataset["train"],
    eval_dataset=split_NER_dataset["test"],
    data_collator=NER_data_collator,
    compute_metrics=compute_metrics
)

In [17]:
# Fine-tune the NER model; the returned TrainOutput is shown as the cell output.
NER_trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.002138,0.999763,0.999762,0.999762
2,0.112900,0.001223,0.999813,0.999813,0.999813
3,0.001900,0.000726,0.99986,0.99986,0.99986
4,0.001100,0.000437,0.999903,0.999903,0.999903
5,0.000800,0.000411,0.99991,0.99991,0.99991


TrainOutput(global_step=2190, training_loss=0.026670368056591242, metrics={'train_runtime': 1136.404, 'train_samples_per_second': 246.391, 'train_steps_per_second': 1.927, 'total_flos': 5716848412800000.0, 'train_loss': 0.026670368056591242, 'epoch': 5.0})

In [18]:
# Fine-tune the IS model; the returned TrainOutput is shown as the cell output.
IS_trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,9.5e-05,0.999988,0.999988,0.999988
2,0.035100,6.8e-05,0.999984,0.999984,0.999984
3,0.000200,2.2e-05,1.0,1.0,1.0
4,0.000100,1.8e-05,1.0,1.0,1.0
5,0.000100,2.1e-05,0.999996,0.999996,0.999996


TrainOutput(global_step=2190, training_loss=0.008094513797749803, metrics={'train_runtime': 1139.797, 'train_samples_per_second': 245.658, 'train_steps_per_second': 1.921, 'total_flos': 5716021584000000.0, 'train_loss': 0.008094513797749803, 'epoch': 5.0})

In [None]:
# NOTE(review): this cell is disabled — the code is wrapped in a string literal, so the directory
# is not created on a fresh run and the recorded output comes from an earlier execution.
"""
ner_model_dir = '/kaggle/working/fine_tuned_pizza_ner_10_10_17K'
os.makedirs(ner_model_dir, exist_ok=True)
print(f"Directory {ner_model_dir} created!")
"""

Directory /kaggle/working/fine_tuned_pizza_ner_10_10_17K created!


In [None]:
# NOTE(review): this cell is disabled — the save calls sit inside a string literal, so no model or
# tokenizer is written by this cell on a fresh run; the recorded output comes from an earlier execution.
"""
NER_model.save_pretrained(ner_model_dir)
NER_tokenizer.save_pretrained(ner_model_dir)
"""

('/kaggle/working/fine_tuned_pizza_ner_10_10_17K/tokenizer_config.json',
 '/kaggle/working/fine_tuned_pizza_ner_10_10_17K/special_tokens_map.json',
 '/kaggle/working/fine_tuned_pizza_ner_10_10_17K/vocab.txt',
 '/kaggle/working/fine_tuned_pizza_ner_10_10_17K/added_tokens.json',
 '/kaggle/working/fine_tuned_pizza_ner_10_10_17K/tokenizer.json')