# Imports

In [99]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import os
import tqdm
import gc
import spacy
# Load spacy model for lemmatization
nlp = spacy.load("en_core_web_sm")

# Preprocessing
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import regexp_tokenize

# Models

import torch
from nltk.tokenize import regexp_tokenize
# Evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# General
import numpy as np
import pandas as pd

# Preprocessing
from gensim.models import FastText
import os
import json
import pickle
import psutil
import numpy as np
import pandas as pd
import tqdm
from typing import Literal

#* Configurations
#* Folder Paths
# Each location is listed twice: the local value first, then the Kaggle value
# that overrides it. Comment out the Kaggle line to run locally.
DATASET_PATH = "../../data/dataset"  # Local
DATASET_PATH = "/kaggle/input/dataset-with-is-final-labels/train_95_with_NER_and_IS_labels.parquet"  # Kaggle

OUTPUT_ROOT_PATH = "../../data/saved"  # Local
OUTPUT_ROOT_PATH = "/kaggle/working"  # Kaggle

# Sub-folders of the output root, one per artifact kind.
PROCESSED_DATA_PATH = f"{OUTPUT_ROOT_PATH}/data"
FEATURES_PATH = f"{OUTPUT_ROOT_PATH}/features"
MODELS_PATH = f"{OUTPUT_ROOT_PATH}/models"

#* Common Variables
# Word tokenizer pattern: a word, an optional apostrophe suffix, and any
# number of hyphen-joined continuations.
token_pattern = r"(?u)\b\w+(?:'\w+)?(?:-\w+)*\b"

def run_config():
    """Apply notebook-wide settings and create the output folders."""
    #* Pandas: show long cell contents in full.
    # Undo with pd.reset_option('display.max_colwidth').
    pd.set_option('display.max_colwidth', 1000)

    #* Register tqdm's progress_apply on pandas objects.
    tqdm.tqdm.pandas()

    #* Output folders: create each one if it does not exist yet.
    for folder in (OUTPUT_ROOT_PATH, FEATURES_PATH, MODELS_PATH, PROCESSED_DATA_PATH):
        os.makedirs(folder, exist_ok=True)


run_config()

import os
import json
import pickle
import pyarrow as pa
import pyarrow.dataset as pda
import pyarrow.parquet as pq
import glob
import psutil
import numpy as np
import pandas as pd
from typing import Literal


# Allowed values for the `type` argument of the save/load helpers below;
# add_to_path maps each value to one of the output folders.
types = Literal["model", "feature", "processed"]

#* General
def file_exists(path):
    """Return True when ``path`` exists on disk (file or directory)."""
    exists = os.path.exists(path)
    return exists

def add_to_path(path: str, type: types | None = None):
    if type is not None:
        if type == "model":
            path = MODELS_PATH + "/" + path
        elif type == "feature":
            path = FEATURES_PATH + "/" + path
        elif type == "processed":
            path = PROCESSED_DATA_PATH + "/" + path
    return path

#* Memory Management & Performance
def memory_usage():
    """Return the resident set size (RSS) of the current process in MiB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / 1024 ** 2


#* Save & Load functions
def save_pickle(path: str, obj, type: types | None = None):
    """Pickle ``obj`` to ``path``, resolved through ``add_to_path(path, type)``."""
    target = add_to_path(path, type)
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle)

def load_pickle(path: str, type: types | None = None):
    """Load and return the pickled object stored at ``add_to_path(path, type)``.

    NOTE: unpickling executes arbitrary code; only load files you trust.
    """
    source = add_to_path(path, type)
    with open(source, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
    
def save_parquet(path: str, obj, type: types | None = None):
    """Write ``obj`` (anything with ``to_parquet``) as a snappy-compressed Parquet file."""
    target = add_to_path(path, type)
    obj.to_parquet(target, compression='snappy', engine='pyarrow')

def load_parquet(path: str, type: types | None = None):
    """Read the Parquet file at ``add_to_path(path, type)`` into a DataFrame."""
    source = add_to_path(path, type)
    frame = pd.read_parquet(source, engine='pyarrow')
    return frame
    
def save_np(path: str, obj, type: types | None = None, allow_pickle=True):
    """Save ``obj`` with ``np.save`` at ``add_to_path(path, type)``."""
    target = add_to_path(path, type)
    np.save(target, obj, allow_pickle=allow_pickle)

def load_np(path: str, type: types | None = None, allow_pickle=True):
    """Load and return the array stored with ``np.save`` at ``add_to_path(path, type)``."""
    source = add_to_path(path, type)
    array = np.load(source, allow_pickle=allow_pickle)
    return array

def save_dict_to_json(path: str, obj, type: types | None = None):
    """
    Serialize a dictionary to a JSON file.

    Top-level values that are numpy arrays are converted to lists so that
    ``json`` can encode them. The conversion is done on a copy: the caller's
    dictionary is left untouched (previously its arrays were replaced by
    lists as a side effect of saving).

    path: str
        Destination file name, resolved through ``add_to_path(path, type)``.
    obj: dict
        The dictionary to save.
    type: "model" | "feature" | "processed" | None
        Output folder selector passed to ``add_to_path``.
    """
    path = add_to_path(path, type)
    # Build a JSON-friendly copy instead of mutating the caller's dict.
    serializable = {
        key: value.tolist() if isinstance(value, np.ndarray) else value
        for key, value in obj.items()
    }

    with open(path, 'w') as f:
        json.dump(serializable, f)

def load_json_to_dict(path: str, type: types | None = None):
    """Parse the JSON file at ``add_to_path(path, type)`` and return the result."""
    source = add_to_path(path, type)
    with open(source, 'r') as handle:
        parsed = json.load(handle)
    return parsed

def load_json(filename: str, cols: list[str] | None = None):
    """
    Load a json file into a pandas DataFrame.
    * This function is useful (for some reason) for loading the large dataset files.
    
    filename: str
        The name of the file to load.
    cols: list[str] | None
        The columns to load. If None, load all columns.
    return: pd.DataFrame
        The DataFrame containing the data from the json file.
    """
    all_cols = True if cols is None else False
    data = []

    with open(filename, encoding='latin-1') as f:
        line = f.readline()
        f.seek(0) # Go back to the beginning of the file
        doc = json.loads(line)
        if all_cols:
            cols = list(doc.keys())
        
        for line in f:
            doc = json.loads(line)
            lst = [doc[col] for col in cols]
            data.append(lst)

    df = pd.DataFrame(data=data, columns=cols)
    return df


def process_parquet_in_chunks(input_file: str, output_file: str, chunk_size: int, preprocess_function: callable, args: tuple = (), merge_chunks: bool=True):
    """
    Process a large Parquet file in chunks, applying a preprocessing function to each row, 
    and save the processed chunks as new Parquet files. Optionally merge the processed chunks.
    Source: https://blog.clairvoyantsoft.com/efficient-processing-of-parquet-files-in-chunks-using-pyarrow-b315cc0c62f9

    Parameters:
    - input_file (str): Path to the input Parquet file.
    - output_file (str): Path to save the processed Parquet file. Chunk files are
      written next to it as "<output_file>_<i>.parquet".
    - chunk_size (int): Number of rows to process per chunk.
    - preprocess_function (function): Function to apply to each row (called with
      the row followed by *args).
    - args (tuple): Extra positional arguments forwarded to preprocess_function.
    - merge_chunks (bool): Whether to merge the processed chunks into a single Parquet file (default: True).

    Returns:
    - None

    Notes:
    - Uses DataFrame.progress_apply, so tqdm must have been registered with
      pandas beforehand (tqdm.pandas()).
    - The merge reads back exactly the chunk files written by this call, in
      the order they were written, so the row order of the input is kept and
      stale files matching the chunk naming pattern are ignored.
    """

    parquet_file = pq.ParquetFile(input_file) # Dataframe which does not fit into system memory

    # Chunk files written by this call, in processing order
    chunk_files = []

    for i, batch in enumerate(parquet_file.iter_batches(batch_size=chunk_size)):
        df = batch.to_pandas()
        # Process the chunk (batch)
        processed_chunk = df.progress_apply(preprocess_function, args=args, axis=1)

        # Save the processed chunk to a new Parquet file
        output_chunk = f"{output_file}_{i}.parquet"
        processed_chunk.to_parquet(output_chunk, engine='pyarrow', compression='snappy')
        chunk_files.append(output_chunk)
        print(f"Chunk {i} processed and saved to {output_chunk}")

    # Optionally merge processed chunks
    if merge_chunks:
        if not chunk_files:
            # pd.concat raises on an empty list; nothing was produced, so nothing to merge
            print("No chunks were produced; nothing to merge.")
            return

        print("Merging processed chunks into a single Parquet file...")

        # Read and concatenate the chunks into a single DataFrame, in order
        final_df = pd.concat([pd.read_parquet(file) for file in chunk_files], ignore_index=True)
        # Save the final DataFrame as a single Parquet file
        final_df.to_parquet(output_file, engine='pyarrow', compression='snappy')
        # Remove the processed chunk files
        for file in chunk_files:
            os.remove(file)

        print(f"Merged file saved to {output_file}")


def process_pickles_in_chunks(input_file: str, output_file: str, chunk_size: int, preprocess_function: callable, args: tuple = (), merge_chunks: bool=True):
    """
    Process a large pickle file in chunks, applying a preprocessing function to each row, 
    and save the processed chunks as new pickle files. Optionally merge the processed chunks.

    Parameters:
    - input_file (str): Path to the input pickle file (a sliceable sequence of rows).
    - output_file (str): Path to save the processed pickle file. Chunk files are
      written next to it as "<output_file>_<i>.pkl".
    - chunk_size (int): Number of rows to process per chunk (must be positive).
    - preprocess_function (function): Function to apply to each row (called with
      the row followed by *args).
    - args (tuple): Extra positional arguments forwarded to preprocess_function.
    - merge_chunks (bool): Whether to merge the processed chunks into a single pickle file (default: True).

    Returns:
    - None

    Raises:
    - ValueError: If chunk_size is not positive.

    Notes:
    - The merge reads back exactly the chunk files written by this call, in
      the order they were written. Discovering them with glob gave no ordering
      guarantee, so merged rows could come out shuffled.
    - Unpickling executes arbitrary code; only use trusted input files.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Load the pickle file
    with open(input_file, 'rb') as f:
        data = pickle.load(f)

    # Chunk files written by this call, in processing order
    chunk_files = []

    for i, start in enumerate(range(0, len(data), chunk_size)):
        # Process the chunk
        processed_chunk = [preprocess_function(row, *args) for row in data[start:start + chunk_size]]

        # Save the processed chunk to a new pickle file
        output_chunk = f"{output_file}_{i}.pkl"
        with open(output_chunk, 'wb') as f:
            pickle.dump(processed_chunk, f)
        chunk_files.append(output_chunk)

        print(f"Chunk {i} processed and saved to {output_chunk}")

    # Optionally merge processed chunks
    if merge_chunks:
        print("Merging processed chunks into a single pickle file...")

        # Read and concatenate the chunks into a single list, in order
        final_data = []
        for file in chunk_files:
            with open(file, 'rb') as f:
                final_data.extend(pickle.load(f))
        # Save the final list as a single pickle file
        with open(output_file, 'wb') as f:
            pickle.dump(final_data, f)
        # Remove the processed chunk files
        for file in chunk_files:
            os.remove(file)

        print(f"Merged file saved to {output_file}")

# Load the training data and preview its first five rows.
X_train = pd.read_parquet(DATASET_PATH)
X_train.head(5)

from gensim.models import FastText
model_name = "/fast_text_model.bin"
# When True the model is retrained even if a saved copy already exists.
update_model = True
if update_model or not os.path.exists(MODELS_PATH + model_name):
    print(f"Creating '{model_name}'...")
    # Create a FastText model
    EMBED_SIZE = 300
    # gensim expects an iterable of token lists. Iterating the DataFrame itself
    # yields its column labels, so the model has to be fed the 'tokenized'
    # column. Each entry is converted to a plain list because Parquet hands
    # list columns back as numpy arrays.
    sentences = [list(tokens) for tokens in X_train["tokenized"]]
    fast_text_model = FastText(sentences=sentences, vector_size=EMBED_SIZE, window=5, min_count=1, workers=4)
    # fast_text_model.wv.add_vector("<UNK>", np.zeros(EMBED_SIZE))
    # fast_text_model.wv["<PAD>"] = np.zeros(EMBED_SIZE)

    # Save the trained model
    print(f"Saving '{model_name}'...")
    fast_text_model.save(MODELS_PATH + model_name)
else:
    print(f"Loading '{model_name}'...")
    # Load the trained model
    fast_text_model = FastText.load(MODELS_PATH + model_name)
    
fast_text_model

# Config

In [None]:
# Which embedding model feeds the LSTM: 'fasttext' selects FastText, anything else Word2Vec.
FEATURE_EXTRACTOR = 'fasttext'
# Which labelling task to train: 'IS' selects the IS dataset, anything else the NER dataset.
PIPELINE = 'IS'
# Number of output classes: 5 for the IS tag set, otherwise 25.
OUTPUT_SIZE = {'IS': 5}.get(PIPELINE, 25)

In [None]:
# Load the labelled training set and preview its first rows.
df_train = pd.read_parquet(
    '/kaggle/input/dataset-with-is-final-labels/train_95_with_NER_and_IS_labels.parquet'
)
print(df_train.head(5))

# Read JSON data and convert it to Parquet format

In [None]:
# import pandas as pd

# # Load JSON dataset
# df = pd.read_json('/kaggle/input/nlp-pizzaa-ner-dataset/PIZZA_train.json')

# # Save as Parquet
# df.to_parquet('/kaggle/working/PIZZA_train_all.parquet', index=False)

# print("JSON converted to Parquet!")


In [None]:
# Register tqdm with pandas so that progress_apply is available on DataFrame/Series.
tqdm.tqdm.pandas()

# remove ORDER(

In [None]:
# df_train = pd.read_parquet('/kaggle/input/dataset-95-with-ner-and-is-labels/train_95_with_NER_and_IS_labels.parquet')
# print(df_train['src'].head())



# df_train['top'] = df_train['top'].str.replace(r"^\(ORDER\s?", "", regex=True)
# df_train['top'] = df_train['top'].str.replace(r"\)$", "", regex=True)



# Split Test and Train Data

In [None]:
# import pandas as pd
# from sklearn.model_selection import train_test_split

# # Load your dataset
# df = pd.read_parquet('/kaggle/input/train-parquet/train.parquet')

# # Shuffle the dataset
# df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# # Define features and target
# X = df['src']  
# Y = df['top']  

# # Split into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=42)

# print("Data shuffled and split into training and testing sets.")


# Save Test and Train data

In [None]:
# # Combine X_train and y_train into a single DataFrame
# df_train = pd.DataFrame({'src': X_train, 'top': y_train})
# df_test = pd.DataFrame({'src': X_test, 'top': y_test})

# # Save to the dataset folder
# df_train.to_parquet('/kaggle/working/train_95.parquet', index=False)
# df_test.to_parquet('/kaggle/working/test_5.parquet', index=False)

# print("Train and test datasets saved as Parquet files.")


# Tokenize SRC and save it

In [None]:

# # df_train = pd.read_parquet("train.parquet")
# token_pattern=r"(?u)\b\w+(?:'\w+)?(?:\s*-\s*\w+)*\b"
# df_train["tokenized"] = df_train["src"].progress_apply(lambda x: regexp_tokenize(x, token_pattern)) 
# # print(df_train["tokenized"])
# df_train.to_parquet('/kaggle/working/train_95_with_NER_and_IS_labels.parquet', index=False)
# print("Tokenized src data saved to parquet file.")
# print(df_train['tokenized'].head())

# Transform BIO tags to numbers

In [None]:
# # # from sklearn.preprocessing import LabelEncoder
# full_text = " ".join(df_train['top'].to_list())
# entities = [x.group() for x in re.finditer("(?<=\()[A-Z]+(_[A-Z]+)*", full_text)]
# entities = list(set(entities)) # Unique

#  # Using BIO Tagging
# bio_entities = [f"{letter}-{entity}" for entity in entities for letter in "BI"]
# bio_entities.append('O')
# bio_entities



In [None]:
# BIO tag set for the IS task: a B-/I- pair per order type plus the outside tag 'O'.
IS_BIO_entities = [
    f"{prefix}-{order}"
    for order in ("PIZZAORDER", "DRINKORDER")
    for prefix in "BI"
] + ['O']

# Encoder between tag strings and integer ids.
# LabelEncoder assigns ids in sorted order of the tag strings.
IS_label_encoder = LabelEncoder().fit(IS_BIO_entities)


# Extract TOP Target IS and Save it 

In [None]:
def extract_labels_IS(top: str, entities):
    """
    Derive encoded BIO labels (PIZZAORDER / DRINKORDER / O) from a TOP string.

    The string is split into words and parentheses. Parentheses only update the
    nesting depth, entity names only update state, and every remaining word
    gets one label: "O" at depth 0, otherwise "B-<order>" for the first word
    of an order and "I-<order>" for the following ones.

    top: str
        The TOP annotation, e.g. "(PIZZAORDER a pizza ) and (DRINKORDER a coke )".
    entities:
        Collection of entity names to skip (they produce no label).
        NOTE(review): any upper-case entity name that is NOT listed here and is
        not an order type is treated as a regular word and receives a label.
        Confirm that callers pass every entity name occurring in `top`.
    return:
        The labels encoded with IS_label_encoder.transform.
    """
    # Extract words and parenthesis
    token_pattern = r"\b\w+(?:'\w+)?(?:-\w+)*\b|[()]"
    tokens = regexp_tokenize(top, token_pattern)

    order_types = ("PIZZAORDER", "DRINKORDER")
    labels = []
    depth = 0
    is_beginning = True
    order_type = "PIZZAORDER"
    for token in tokens:
        if token in order_types:
            order_type = token
            # A new order always starts with a B- tag, even when it directly
            # follows another order with no "O" word in between.
            is_beginning = True
        elif token in entities:
            continue
        elif token == "(":
            depth += 1
        elif token == ")":
            depth -= 1
        elif depth == 0:
            labels.append("O")
            is_beginning = True
        elif is_beginning:
            labels.append("B-" + order_type)
            is_beginning = False
        else:
            labels.append("I-" + order_type)
    return IS_label_encoder.transform(labels)


In [None]:
# # Function to apply extract_labels to each row
# df_train = pd.read_parquet('/kaggle/input/final-95-train-set/train_95_with_NER_and_IS_labels (1).parquet')
# def apply_extract_labels(row, entities):
#     labels = extract_labels_IS(row['top'], entities)
#     return labels.tolist()

# # # Apply the function to each row in the 'top' column and store the result in a new column 'IS_labels'
# df_train['IS_labels'] = df_train.progress_apply(lambda row: apply_extract_labels(row,['PIZZAORDER','DRINKORDER'] ), axis=1)


# print(df_train[['top', 'IS_labels']].head())
# labels = df_train['IS_labels']
# print(IS_label_encoder.inverse_transform(labels[0]))

# # # Save the modified DataFrame to a new Parquet file
# df_train.to_parquet('/kaggle/working/train_95_with_NER_and_IS_labels.parquet', index=False)
# print("Data with IS_labels saved to train_95")


In [None]:
# df_train = pd.read_parquet('/kaggle/input/final-95-train-set/train_95_with_NER_and_IS_labels (1).parquet')
# # Ensure the values are integers and remove NaN
# unique_values = df_train['IS_labels'].explode().dropna().unique()

# # # Convert unique_values to integers (if necessary)
# unique_values = unique_values.astype(int)

# # # Use inverse_transform
# # print("Decoded labels:", IS_label_encoder.inverse_transform(unique_values))

# # Print unique encoded values
# print("Unique values:", unique_values) 
# df_train.to_parquet('/kaggle/working/train_95_with_NER_and_IS_labels.parquet')

# convert parquet to csv

In [None]:
# import pandas as pd

# # Read the Parquet file


# # Save it as a CSV file
# df.to_csv('/kaggle/working/train_95.csv', index=False)


# word2vec

In [None]:
# Kaggle locations for the word2vec / training cells.
DATASET_PATH = "/kaggle/input/pizza-dataset"
OUTPUT_ROOT_PATH = "/kaggle/working"
MODELS_PATH = f"{OUTPUT_ROOT_PATH}/models"
PYTORCH_MODELS_PATH = f"{MODELS_PATH}/checkpoints"

In [None]:
# os.makedirs(MODELS_PATH, exist_ok=True)

In [None]:
# df_train = pd.read_parquet("train.parquet")
# print(df_train['tokenized'][0])
# print(df_train['tokenized'].apply(type).value_counts())
# print(df_train.columns)


In [None]:
# df_train = pd.read_parquet('/kaggle/input/final-95-train-set/train_95_with_NER_and_IS_labels (1).parquet',columns=['tokenized'])
# sentences=df_train['tokenized'].tolist()

# sentences = [sentence.tolist() if isinstance(sentence, np.ndarray) else sentence for sentence in sentences]
# '''  
# <class 'list'>
# <class 'list'>
# <class 'str'>
# ['party', 'size', 'dried', 'peppers', 'pizza', 'and', 'a', 'sprite']
# '''
# # Verify the format
# print(type(sentences))         # Should be list
# print(type(sentences[0]))      # Should be list
# print(type(sentences[0][0]))   # Should be str
# print(sentences[0])            # Check the first sentence



# # Train Word2Vec model
# word2vec_model = Word2Vec(sentences=sentences, vector_size=200, window=5, min_count=1, workers=4) 


In [None]:
# max_length = max(len(sentence) for sentence in df_train['IS_labels']) 
# print(max_length) 

In [None]:
# import os

# # Define the new directory path
# new_dir = "/kaggle/working/feature_extractors"

# # Create the directory if it doesn't exist
# if not os.path.exists(new_dir):
#     os.makedirs(new_dir)
#     print(f"Directory created: {new_dir}")
# else:
#     print("Directory already exists")

# word2vec_model.save("/kaggle/working/feature_extractors/word2vec_model.model")
# print("Word2Vec model saved successfully!")
# print(word2vec_model)

In [None]:
# import os

# # Specify the file path you want to delete
# file_path = '/kaggle/working/word2vec_features.npy'  # Replace with the path of the file you want to delete

# # Check if the file exists before deleting it
# if os.path.exists(file_path):
#     os.remove(file_path)
#     print(f"File {file_path} has been deleted.")
# else:
#     print(f"File {file_path} not found.")


In [None]:


# Kaggle locations, redefined so this cell can run on its own.
DATASET_PATH = "/kaggle/input/pizza-dataset"
OUTPUT_ROOT_PATH = "/kaggle/working"
MODELS_PATH = f"{OUTPUT_ROOT_PATH}/models"
PYTORCH_MODELS_PATH = f"{MODELS_PATH}/checkpoints"

# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import os
import tqdm

import spacy
# Load spacy model for lemmatization
nlp = spacy.load("en_core_web_sm")

# Preprocessing
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import regexp_tokenize

# Models

import torch

# Evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



In [None]:
# # File paths
# parquet_file_path = '/kaggle/working/train_95.parquet'
# model_file_path = '/kaggle/input/word2vecmodel/word2vec_model.bin'

# # Load the Parquet file
# train_data = pd.read_parquet(parquet_file_path)
# print("Parquet file loaded successfully.")

# # Load the Word2Vec model
# word2vec_model = Word2Vec.load(model_file_path)
# print("Word2Vec model loaded successfully.")

In [None]:

# Load tokenized data (assuming you have 'tokenized' column in the train.parquet)
# train_data = pd.read_parquet("train.parquet", columns=["tokenized", "IS_labels"])
# train_data['tokenized'] = train_data['tokenized'].apply(lambda x: x.tolist() if isinstance(x, np.ndarray) else x)
# Apply the get_word_vectors function to get word vectors for each sentence
# train_data['word_vectors'] = train_data['tokenized'].apply(lambda x: get_word_vectors(x, word2vec_model)) 



# LSTM

## Define LSTM in pytorch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

class LSTMModel(nn.Module):
    """
    Sequence tagger: a (stacked) LSTM followed by a linear layer applied to
    every time step.

    input_size: size of each input vector (the embedding dimension).
    hidden_size: size of the LSTM hidden state.
    output_size: number of classes predicted per time step.
    num_layers: number of stacked LSTM layers.
    dropout: dropout applied between stacked LSTM layers. It is ignored when
        num_layers == 1, because there is no "between layers" in that case.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # nn.LSTM only applies dropout between stacked layers; passing a
        # non-zero value with a single layer does nothing but emit a warning.
        lstm_dropout = dropout if num_layers > 1 else 0.0

        # LSTM layer (inputs are batch-first: (batch, seq_len, input_size))
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=lstm_dropout)
        # Fully connected layer producing the per-token class scores
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """
        x: tensor of shape (batch_size, seq_len, input_size).
        return: class scores of shape (batch_size, seq_len, output_size).
        """
        # Hidden states for every time step: (batch_size, seq_len, hidden_size)
        lstm_out, _ = self.lstm(x)
        # The linear layer is applied to every time step, not only the last one
        return self.fc(lstm_out)


# IS DataSet

In [None]:
import pandas as pd
from torch.utils.data import Dataset
import numpy as np
import torch
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

class ISDataSet(Dataset):
    """
    Dataset of (word-vector sequence, IS label sequence) pairs.

    Reads the 'tokenized' and 'IS_labels' columns of a Parquet file and, per
    item, returns both sequences padded/truncated to `max_len`.
    """

    def __init__(self, file, feature_extractor_model, vector_size, max_len=50):
        """
        :param file: Path of the Parquet file to read.
        :param feature_extractor_model: Word2Vec or FastText model (accessed through `.wv`).
        :param vector_size: Size of the word embeddings.
        :param max_len: Fixed sequence length of every returned item (default 50).
        """
        # Load the data
        df_train = pd.read_parquet(file, columns=['tokenized', 'IS_labels'])
        self.sentences = df_train['tokenized']
        self.labels = df_train['IS_labels']
        self.feature_extractor_model = feature_extractor_model  # Word2Vec or FastText model
        self.vector_size = vector_size  # Size of the word embeddings
        self.max_len = max_len  # Length every sequence is padded/truncated to

    def __len__(self):
        return len(self.labels)

    def get_sentence_vectors(self, sentence):
        """
        Convert a tokenized sentence into a list of word vectors.

        :param sentence: List of tokens.
        :return: List of word vectors (numpy arrays), one per token.
        """
        # NOTE(review): only in-vocabulary words are looked up, so for a
        # FastText model any subword-based vector for unseen words is never
        # used - confirm this is intended.
        word_vectors = self.feature_extractor_model.wv
        sentence_vectors = []
        for word in sentence:
            if word in word_vectors.key_to_index:
                sentence_vectors.append(word_vectors[word])  # Word2Vec or Fasttext
            else:
                sentence_vectors.append(np.zeros(self.vector_size))  # Zero vector for unknown words
        return sentence_vectors

    def __getitem__(self, idx):
        """
        :return: (features, labels) where features is a float32 array of shape
                 (max_len, vector_size) padded with zeros, and labels is an
                 int32 array of shape (max_len,) padded with -1.
        """
        # Sequences longer than max_len keep their LAST max_len entries and
        # shorter ones are padded at the end. This matches the previous
        # pad_sequences(padding='post') call with its default 'pre' truncation.
        vectors = self.get_sentence_vectors(self.sentences.iloc[idx])[-self.max_len:]
        padded_feature = np.zeros((self.max_len, self.vector_size), dtype=np.float32)
        if len(vectors) > 0:
            padded_feature[:len(vectors)] = np.asarray(vectors, dtype=np.float32)

        # For sequence labels, pad them as well (-1 marks padding)
        label = np.asarray(self.labels.iloc[idx])[-self.max_len:]
        padded_label = np.full(self.max_len, -1, dtype=np.int32)
        padded_label[:len(label)] = label
        return padded_feature, padded_label


# NER Dataset

In [None]:
import pandas as pd
from torch.utils.data import Dataset
import numpy as np
import torch
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

class NERDataSet(Dataset):
    """
    Dataset of (word-vector sequence, NER label sequence) pairs.

    Reads the 'tokenized' and 'NER_labels' columns of a Parquet file and, per
    item, returns both sequences padded/truncated to `max_len`.
    """

    def __init__(self, file, feature_extractor_model, vector_size, max_len=50):
        """
        :param file: Path of the Parquet file to read.
        :param feature_extractor_model: Word2Vec or FastText model (accessed through `.wv`).
        :param vector_size: Size of the word embeddings.
        :param max_len: Fixed sequence length of every returned item (default 50).
        """
        df_train = pd.read_parquet(file, columns=['tokenized', 'NER_labels'])
        self.sentences = df_train['tokenized']
        self.labels = df_train['NER_labels']
        self.feature_extractor_model = feature_extractor_model
        # Misspelled name kept as an alias so existing code that reads it keeps working
        self.feature_exctractor_model = feature_extractor_model
        self.vector_size = vector_size
        self.max_len = max_len  # Length every sequence is padded/truncated to

    def __len__(self):
        return len(self.labels)

    def get_sentence_vectors(self, sentence):
        """
        Convert a tokenized sentence into a list of word vectors.

        :param sentence: List of tokens.
        :return: List of word vectors (numpy arrays), one per token.
        """
        # NOTE(review): only in-vocabulary words are looked up, so for a
        # FastText model any subword-based vector for unseen words is never
        # used - confirm this is intended.
        word_vectors = self.feature_extractor_model.wv
        sentence_vectors = []
        for word in sentence:
            if word in word_vectors.key_to_index:
                sentence_vectors.append(word_vectors[word])  # Word2Vec or Fasttext
            else:
                sentence_vectors.append(np.zeros(self.vector_size))  # Zero vector for unknown words
        return sentence_vectors

    def __getitem__(self, idx):
        """
        :return: (features, labels) where features is a float32 array of shape
                 (max_len, vector_size) padded with zeros, and labels is an
                 int32 array of shape (max_len,) padded with -1.
        """
        # Sequences longer than max_len keep their LAST max_len entries and
        # shorter ones are padded at the end. This matches the previous
        # pad_sequences(padding='post') call with its default 'pre' truncation.
        vectors = self.get_sentence_vectors(self.sentences.iloc[idx])[-self.max_len:]
        padded_feature = np.zeros((self.max_len, self.vector_size), dtype=np.float32)
        if len(vectors) > 0:
            padded_feature[:len(vectors)] = np.asarray(vectors, dtype=np.float32)

        # For sequence labels, pad them as well (-1 marks padding)
        label = np.asarray(self.labels.iloc[idx])[-self.max_len:]
        padded_label = np.full(self.max_len, -1, dtype=np.int32)
        padded_label[:len(label)] = label
        return padded_feature, padded_label


# Load Feature Extractor

In [None]:
from gensim.models import Word2Vec
from gensim.models import FastText

# Load the embedding model selected by FEATURE_EXTRACTOR.
if FEATURE_EXTRACTOR != 'fasttext':
    feature_extractor_model = Word2Vec.load("/kaggle/input/word2vec-model/word2vec_model.model")
    print("word2vec Model loaded successfully!")
else:
    feature_extractor_model = FastText.load("/kaggle/working/models/fast_text_model.bin")
    print("fasttext Model loaded successfully!") 

In [None]:
# df_train = pd.read_parquet('/kaggle/input/final-95-train-set/train_95_with_NER_and_IS_labels (1).parquet')
# # Ensure the values are integers and remove NaN
# unique_values = df_train['IS_labels'].explode().dropna().unique()

# # Convert unique_values to integers (if necessary)
# unique_values = unique_values.astype(int)

# # Use inverse_transform
# print("Decoded labels:", IS_label_encoder.inverse_transform(unique_values))

# # Print unique encoded values
# print("Unique values:", unique_values)

# Train Loop

In [None]:
import torch
import gc
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
import torch.nn as nn
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

# Assumes ISDataSet, NERDataSet and LSTMModel are defined in the cells above

def save_checkpoint(model, optimizer, epoch, loss, file_path):
    """
    Save a training checkpoint with torch.save.

    The checkpoint is a dict holding the epoch, the model and optimizer
    state dicts, and the loss.
    """
    state = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        loss=loss,
    )
    torch.save(state, file_path)
    print(f"Checkpoint saved at {file_path}")

# Training loop
def train_model(file, feature_extractor, model, num_epochs=10, batch_size=32, learning_rate=0.001, chunk_size=10000, pipeline="IS", feature_extr="word2vec"):
    """
    Train a sequence-tagging model and save a checkpoint after every epoch.

    :param file: Parquet file holding the tokenized sentences and labels.
    :param feature_extractor: Word2Vec/FastText model used to embed tokens;
        its `vector_size` is forwarded to the dataset.
    :param model: The torch module to train (moved to GPU when available).
    :param num_epochs: Number of passes over the data.
    :param batch_size: Mini-batch size.
    :param learning_rate: Adam learning rate.
    :param chunk_size: Unused; kept for backward compatibility.
    :param pipeline: "IS" trains on ISDataSet, any other value on NERDataSet.
        Also used as the prefix of the checkpoint file names.
    :param feature_extr: Name of the feature extractor, shown in the progress bar.
    :return: The trained model.
    """
    # Load the dataset
    dataset_cls = ISDataSet if pipeline == "IS" else NERDataSet
    dataset = dataset_cls(file, feature_extractor_model=feature_extractor, vector_size=feature_extractor.vector_size)

    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Model, loss function, and optimizer initialization
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=-1)  # -1 is the padding index
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(num_epochs):
        epoch_loss = 0
        total_correct_train = 0
        total_samples_train = 0
        print(f"Epoch {epoch + 1}/{num_epochs}")

        model.train()

        progress = tqdm(train_loader, desc=f"Training Epoch {epoch+1}_{pipeline}_{feature_extr}", unit="batch")
        for sentences_batch, labels_batch in progress:
            # Move batch data to the appropriate device
            sentences_batch = sentences_batch.to(device)
            labels_batch = labels_batch.to(device)

            # Forward pass
            outputs = model(sentences_batch)  # Shape: (batch_size, seq_len, num_classes)

            # Flatten the output and labels for CrossEntropyLoss
            outputs_flat = torch.flatten(outputs, start_dim=0, end_dim=1)  # (batch_size * seq_len, num_classes)
            targets_flat = labels_batch.view(-1).long()  # (batch_size * seq_len)

            # Compute loss and update the weights
            batch_loss = criterion(outputs_flat, targets_flat)
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            # Accumulate loss
            epoch_loss += batch_loss.item()

            # Accuracy calculation
            predictions = outputs.argmax(-1)  # (batch_size, seq_len)
            mask = labels_batch != -1  # Exclude padding from accuracy calculation
            total_correct_train += (predictions[mask] == labels_batch[mask]).sum().item()
            total_samples_train += mask.sum().item()  # Count valid labels (non-padding)

        # Calculate average loss and accuracy (guarded against empty data)
        epoch_loss /= max(len(train_loader), 1)  # Average loss per batch
        epoch_acc = total_correct_train / total_samples_train if total_samples_train else 0.0

        print(f'Epoch [{epoch + 1}/{num_epochs}] | Loss: {epoch_loss:.4f} | Accuracy: {epoch_acc:.4f}')

        # Save the model after every epoch. The file name carries the pipeline
        # so NER checkpoints are no longer written under the "IS_" prefix
        # (for pipeline="IS" the name is unchanged).
        save_checkpoint(model, optimizer, epoch, epoch_loss, f"{pipeline}_model_epoch_{epoch+1}.pth")

        # Clear GPU cache and collect garbage
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print("Training complete!")
    return model
    


# NER

In [None]:

# full_entities = np.load('/kaggle/input/ner-entities/full_entities.npy')
# # Print the data or its shape
# print("full entities :")
# print(full_entities)


# # Using BIO Tagging
# full_bio_entities = [f"{letter}-{entity}" for entity in full_entities for letter in "BI"]
# full_bio_entities.append('O')
# full_bio_entities.append('B-COMPLEX_QUANTITY')
# full_bio_entities.append('I-COMPLEX_QUANTITY')
# full_bio_entities.remove('I-PIZZAORDER')
# full_bio_entities.remove('B-PIZZAORDER')
# full_bio_entities.remove('B-DRINKORDER')
# full_bio_entities.remove('I-DRINKORDER')
# label_encoder = LabelEncoder()
# label_encoder.fit(full_bio_entities)

# print(full_bio_entities)
# def extract_NER_labels(top: str, entities):
#     # Extract words and parenthesis
#     token_pattern=r"(?u)\b\w+(?:'\w+)?(?:\s*-\s*\w+)*\b|[()]"
#     tokens = regexp_tokenize(top, token_pattern)
    
#     labels = []
#     count = 0
#     not_str =""
#     complex_topping_begin = False 
#     is_beginning = True
#     order_type = ""
#     for token in tokens:
#         if token in ["PIZZAORDER", "DRINKORDER",'ORDER']:
#             count -= 1
#             continue
#         elif token == "(":
#             count += 1
#         elif token == ")":
#             count -= 1
#             if count == 0:
#                 is_beginning = True
#                 complex_topping_begin = False
#                 not_str = ""
#                 order_type = ""
#             if count < 0:
#                 count = 0
#         elif token == "COMPLEX_TOPPING":
#             order_type = "COMPLEX_TOPPING"
#             complex_topping_begin = True
#         elif token == "NOT":
#             not_str = "NOT_"
#         elif token in entities:
#             order_type = token
#         elif count == 0:
#             labels.append("O")
           
#         else:
#             if complex_topping_begin:
#                 if is_beginning:
#                     labels.append("B-COMPLEX_" + order_type)
#                     is_beginning = False
#                     continue
#                 else:
#                     labels.append("I-COMPLEX_" + order_type)
#                     continue
#             if is_beginning:
#                 labels.append("B-" + not_str + order_type)
#                 is_beginning = False
#                 continue
#             else:
#                 labels.append("I-" + not_str + order_type) 
#     # print(labels)
#     labels = label_encoder.transform(labels)
#     return labels


In [None]:
# # Function to apply extract_labels to each row
# def apply_extract_ner_labels(row, entities):
#     # Extract labels for each row's 'top' column
#     labels = extract_NER_labels(row['top'], entities)
    
#     # Return the labels (make sure it's in a proper format for storing in DataFrame)
#     return labels

# # # Apply the function to each row in the 'top' column and store the result in a new column 'IS_labels'
# df_train = pd.read_parquet('/kaggle/input/data-95/train_95.parquet')
# df_train['NER_labels'] = df_train.apply(lambda row: apply_extract_ner_labels(row, full_entities), axis=1)
# print(df_train.head())
# # Check the result



# # Save the modified DataFrame to a new Parquet file
# #df_train.to_parquet('train_95.parquet', index=False)

# #print("Data with IS_labels saved to train_95")

In [None]:
# print(len(full_bio_entities))
# df_train.to_parquet('/kaggle/working/train_95_with_ner.parquet', index=False)
# # 

In [None]:
# df_train = pd.read_parquet('/kaggle/working/train_95_with_ner.parquet') 
# print(df_train.columns)

In [None]:
# row_values = df_train[['top', 'NER_labels','src','tokenized']].iloc[0]
# print(row_values['top'])         # Value from 'top'
# print(row_values['NER_labels'])  # Value from 'NER_labels'
# print(row_values['tokenized'])  # Value from 'NER_labels'
# print(df_train[['top', 'NER_labels']].head())

# Training Call

In [None]:
 # Train an LSTM tagger for the configured pipeline and feature extractor.
 train_model(
     file="/kaggle/input/dataset-with-is-final-labels/train_95_with_NER_and_IS_labels.parquet",
     feature_extractor=feature_extractor_model,
     model=LSTMModel(
         input_size=feature_extractor_model.vector_size,
         hidden_size=256,
         output_size=OUTPUT_SIZE,
         num_layers=5,
     ),
     num_epochs=10,
     pipeline=PIPELINE,
     feature_extr=FEATURE_EXTRACTOR,
 )

# Testing IS

In [None]:
import pandas as pd

# Load the dev split (JSON Lines: one record per line) and preview it.
df_dev = pd.read_json('/kaggle/input/dataset-dev/PIZZA_dev.json', lines=True)
print(df_dev.head())

# Remember the size before filtering so the report below is truthful.
original_rows = len(df_dev)

# Keep only rows whose 'dev.PCFG_ERR' flag is False.
# The flag is compared as text so the filter behaves the same whether the
# column was parsed as strings ("False") or as booleans (False).
# .copy() detaches the result from the original frame so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df_dev = df_dev[df_dev['dev.PCFG_ERR'].astype(str) == "False"].copy()

# Verify the filtering
print(f"Original rows: {original_rows}")
print(f"Rows after filtering: {len(df_dev)}")


In [None]:



# Unwrap the outer "(ORDER ... )" shell of every 'dev.TOP' string:
# drop the leading "(ORDER" (and at most one whitespace character after it),
# then drop the closing parenthesis at the end of the string.
df_dev['dev.TOP'] = (
    df_dev['dev.TOP']
    .str.replace(r"^\(ORDER\s?", "", regex=True)
    .str.replace(r"\)$", "", regex=True)
)



In [None]:
# Row-wise adapter around extract_labels_IS for use with DataFrame.apply
def apply_extract_labels(row, entities):
    """Return the IS labels for one dev row as a plain Python list.

    Args:
        row: A DataFrame row; only its 'dev.TOP' field is read.
        entities: Entity names forwarded unchanged to ``extract_labels_IS``.

    Returns:
        The result of ``extract_labels_IS`` converted with ``.tolist()``.
    """
    return extract_labels_IS(row['dev.TOP'], entities).tolist()

# Apply the function to each row's 'dev.TOP' value and store the result in a new column 'IS_labels'
df_dev['IS_labels'] = df_dev.progress_apply(lambda row: apply_extract_labels(row,['PIZZAORDER','DRINKORDER'] ), axis=1)


print(df_dev[['dev.TOP', 'IS_labels']].head())
labels = df_dev['IS_labels']
# Spot-check one example by POSITION. df_dev was filtered earlier without
# resetting its index, so a label-based lookup (labels[16]) raises KeyError
# whenever the row labelled 16 was among the rows removed.
print(IS_label_encoder.inverse_transform(labels.iloc[16]))

# Save the modified DataFrame to a new Parquet file
df_dev.to_parquet('/kaggle/working/dev_95_with_NER_and_IS_labels.parquet', index=False)
print("Data with IS_labels saved to dev")


In [None]:

# Path of the saved Word2Vec model file to load.
model_file_path = '/kaggle/input/word2vec-model/word2vec_model.model'

# Load the Word2Vec model
# (bound to the notebook-level name word2vec_model; the evaluation cell at the
# end of the notebook loads the same path again under the same name)
word2vec_model = Word2Vec.load(model_file_path)
print("Word2Vec model loaded successfully.")

In [None]:
# Row-wise adapter around extract_NER_labels for use with DataFrame.apply
def apply_extract_ner_labels(row, entities):
    """Return the NER labels for one dev row.

    Args:
        row: A DataFrame row; only its 'dev.TOP' field is read.
        entities: Entity names forwarded unchanged to ``extract_NER_labels``.

    Returns:
        Whatever ``extract_NER_labels`` returns, passed through untouched.
    """
    return extract_NER_labels(row['dev.TOP'], entities)

# Compute the NER labels row by row (axis=1 hands each full row to the helper)
# and keep them in a new 'NER_labels' column, then preview the frame.
df_dev['NER_labels'] = df_dev.apply(apply_extract_ner_labels, axis=1, args=(full_entities,))
print(df_dev.head())

In [None]:
# NOTE: this rebinds the notebook-level token_pattern. Unlike the pattern set in
# the configuration cell, this one also accepts whitespace around a hyphen
# inside a token.
token_pattern = r"(?u)\b\w+(?:'\w+)?(?:\s*-\s*\w+)*\b"

# Tokenize every 'dev.SRC' sentence with the pattern above.
df_dev["tokenized"] = df_dev["dev.SRC"].progress_apply(
    lambda sentence: regexp_tokenize(sentence, token_pattern)
)

# Persist the frame (without its index) and preview the new column.
df_dev.to_parquet('/kaggle/working/dev_with_labels.parquet', index=False)
print("Tokenized src data saved to parquet file.")
print(df_dev['tokenized'].head())

In [None]:

# Function to Load Trained Model
def load_trained_model(checkpoint_path, model, optimizer=None):
    """Restore model (and optionally optimizer) state from a checkpoint file.

    Args:
        checkpoint_path: Path to a file readable by ``torch.load`` that holds a
            dict with a ``'model_state_dict'`` entry and, when ``optimizer`` is
            given, an ``'optimizer_state_dict'`` entry.
        model: Object whose ``load_state_dict`` receives the stored model state.
        optimizer: Optional object whose ``load_state_dict`` receives the stored
            optimizer state. Skipped only when it is ``None``.

    Returns:
        The same ``model`` object that was passed in, after loading its state.

    Raises:
        KeyError: If an expected entry is missing from the checkpoint dict.
    """
    # Map tensors onto the GPU when one is available, otherwise onto the CPU,
    # so a checkpoint saved on either kind of machine can be loaded here.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # that come from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=device)

    model.load_state_dict(checkpoint['model_state_dict'])
    # Explicit None check: the decision must not depend on the truthiness of
    # whatever optimizer object the caller passes.
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    print(f"Loaded model from {checkpoint_path}")
    return model


# Evaluate on Dev Dataset
def evaluate_model(dev_file, model, word2vec_model, batch_size=32):
    """Run ``model`` over the dev set and print classification metrics.

    Args:
        dev_file: Passed to ``CustomDataSet`` as the data source.
        model: Network to evaluate; it is moved to the chosen device and put
            in eval mode.
        word2vec_model: Passed to ``CustomDataSet`` alongside ``dev_file``.
        batch_size: Batch size for the ``DataLoader`` (order is not shuffled).

    Returns:
        None. A classification report and the accuracy score are printed.
    """
    # Build the loader directly on top of the dataset; no shuffling for evaluation.
    dev_loader = DataLoader(
        CustomDataSet(dev_file, word2vec_model),
        batch_size=batch_size,
        shuffle=False,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()  # inference mode for the module

    gold, predicted = [], []

    with torch.no_grad():
        for batch_inputs, batch_targets in dev_loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            # Highest-scoring class along the last dimension of the model output.
            batch_preds = torch.argmax(model(batch_inputs), dim=-1)

            # Drop label positions equal to -1 (padding) and keep the same
            # number of leading predictions for that row.
            # NOTE(review): this pairing presumes the -1 entries sit at the end
            # of each row; confirm against CustomDataSet.
            for row, row_targets in enumerate(batch_targets):
                kept_targets = row_targets[row_targets != -1]
                kept_preds = batch_preds[row][:len(kept_targets)]
                gold.extend(kept_targets.cpu().numpy())
                predicted.extend(kept_preds.cpu().numpy())

    # Report metrics over all non-padding positions.
    print("Classification Report:")
    print(classification_report(gold, predicted))
    print("Accuracy Score:", accuracy_score(gold, predicted))


# Main Code to Load Model and Evaluate
if __name__ == "__main__":
    # File paths
    train_word2vec_file = "/kaggle/input/word2vec-model/word2vec_model.model"  # Path to Word2Vec model
    dev_file = "/kaggle/input/dataset-dev-parquet/dev_with_labels.parquet"  # Path to dev dataset
    checkpoint_path = "/kaggle/input/is-model/IS_model_epoch_5.pth"  # Trained model checkpoint

    # Load Word2Vec model
    word2vec_model = Word2Vec.load(train_word2vec_file)

    # Define model architecture.
    # The LSTM input width is read from the loaded embeddings instead of being
    # hard-coded, mirroring how the training call sizes the model
    # (input_size=feature_extractor_model.vector_size).
    input_size = word2vec_model.vector_size
    hidden_size = 256
    output_size = 5  # Number of output classes
    num_layers = 5
    lstm_model = LSTMModel(input_size, hidden_size, output_size, num_layers)

    # Load trained model
    lstm_model = load_trained_model(checkpoint_path, lstm_model)

    # Evaluate on dev dataset
    evaluate_model(dev_file, lstm_model, word2vec_model)