In [1]:
# Install the required libraries.
# Only transformers is pinned (4.40.2); pytorch-lightning and torchmetrics are
# upgraded to whatever -U resolves at run time, so reruns may pick up newer releases.
!pip install -U transformers==4.40.2 pytorch-lightning torchmetrics

# Import necessary libraries
import torch
from transformers import MegaConfig, MegaForSequenceClassification, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import os
import pytorch_lightning as pl
from torchmetrics import Accuracy


Collecting transformers==4.40.2
  Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.2)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.40.2-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m100.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m89.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3

In [2]:
# Define file paths for train and test data
train_file = "/kaggle/input/lra-listops-reduced/train_d20s.tsv"
test_file = "/kaggle/input/lra-listops-reduced/test_d20s.tsv"

# Rows whose Source string has this many characters or more are dropped.
MAX_SOURCE_CHARS = 1024


def load_listops_split(path, max_chars=MAX_SOURCE_CHARS):
    """Read one tab-separated split and return a cleaned DataFrame.

    Args:
        path: Location of a TSV file with "Target" and "Source" columns.
        max_chars: Exclusive upper bound on the character length of "Source".

    Returns:
        A DataFrame with integer "Target" values, without repeated header
        rows and without rows whose "Source" is `max_chars` characters or longer.
    """
    frame = pd.read_csv(path, sep="\t", header=0)

    # Remove any possible header rows included as data. The explicit copy
    # makes the column assignment below write to an independent frame
    # instead of a filtered view (avoids SettingWithCopyWarning).
    frame = frame[frame["Target"] != "Target"].copy()

    # Convert labels to integers
    frame["Target"] = frame["Target"].astype(int)

    # Filter sequences by character length
    return frame[frame["Source"].str.len() < max_chars]


train_df = load_listops_split(train_file)
test_df = load_listops_split(test_file)

# Shuffle the training data
train_df = train_df.sample(frac=1, random_state=42)

print("Training data example:")
print(train_df.head())


Training data example:
       Target                                             Source
33140       2                 ( ( ( ( ( [MED 1 ) 7 ) 3 ) 2 ) ] )
16844       2  ( ( ( ( [SM 4 ) 8 ) ( ( ( ( [MIN 0 ) ( ( ( ( [...
50823       6  ( ( ( ( [MAX 6 ) ( ( ( ( [MED 2 ) 2 ) 7 ) ] ) ...
73017       3  ( ( ( ( [MED 0 ) ( ( ( ( ( [MED ( ( ( ( ( ( [S...
5116        6  ( ( ( [MIN 6 ) ( ( ( ( ( ( [MAX 0 ) 3 ) 9 ) ( ...


In [3]:
class LRADataset(Dataset):
    """Map-style dataset that tokenizes one text sequence per item.

    Args:
        texts: Indexable collection of input sequences.
        labels: Array of labels; must support ``.astype(int)``.
        tokenizer: Object exposing ``encode_plus`` that returns
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels.astype(int)  # Ensure labels are integers
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of sequences."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize the sequence at position `item`.

        Returns:
            A dict with ``"input_ids"`` and ``"attention_mask"`` (the encoder
            output with its leading batch dimension removed) and ``"labels"``
            (a scalar long tensor).
        """
        text = str(self.texts[item])
        label = self.labels[item]

        # Tokenize and encode the text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse the sequence dimension when max_len == 1 and hand the
        # collate function 0-d tensors.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }


In [4]:
class LRADataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train and test DataFrames.

    Args:
        train_df: DataFrame with "Source" and "Target" columns used for training.
        test_df: DataFrame with the same columns; served by `val_dataloader`.
        tokenizer: Tokenizer handed to every `LRADataset`.
        max_len: Padding/truncation length handed to every `LRADataset`.
        batch_size: Batch size for both loaders.
        num_workers: Worker processes per DataLoader. Defaults to 0
            (load in the main process), matching the previous behaviour.
    """

    def __init__(self, train_df, test_df, tokenizer, max_len, batch_size, num_workers=0):
        super().__init__()
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.batch_size = batch_size
        self.num_workers = num_workers

    def _build_dataset(self, frame):
        """Wrap the "Source"/"Target" columns of `frame` in an `LRADataset`."""
        return LRADataset(
            texts=frame["Source"].to_numpy(),
            labels=frame["Target"].to_numpy(),
            tokenizer=self.tokenizer,
            max_len=self.max_len,
        )

    def setup(self, stage=None):
        """Create the train and test datasets."""
        self.train_dataset = self._build_dataset(self.train_df)
        self.test_dataset = self._build_dataset(self.test_df)

    def train_dataloader(self):
        """Return a shuffled loader over the training dataset."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
        )

    def val_dataloader(self):
        """Return an unshuffled loader; note it serves the *test* dataset."""
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )


In [5]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

# Word-level tokenizer with [UNK] as the unknown token and the Whitespace pre-tokenizer
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Special tokens handed to the trainer
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
trainer = WordLevelTrainer(special_tokens=special_tokens)

# Write the training sequences one per line, then train from that file
corpus_path = "listops_sequences.txt"
with open(corpus_path, "w") as corpus_file:
    corpus_file.writelines(sequence + "\n" for sequence in train_df["Source"])

tokenizer.train([corpus_path], trainer)

# Save the tokenizer
tokenizer.save("custom_tokenizer.json")
print("Custom tokenizer saved!")


Custom tokenizer saved!


In [6]:
from torchmetrics.classification import Accuracy

class MEGAClassifier(pl.LightningModule):
    """Lightning wrapper around `MegaForSequenceClassification`.

    Args:
        config: Configuration passed to `MegaForSequenceClassification`.
        num_classes: Number of classes used by the accuracy metrics.
        lr: Learning rate for the AdamW optimizer.
    """

    def __init__(self, config, num_classes=10, lr=1e-3):
        super().__init__()
        self.model = MegaForSequenceClassification(config)
        self.criterion = torch.nn.CrossEntropyLoss()
        # Metrics keep internal state, so training and validation each get
        # their own instance. `accuracy` remains the training metric.
        self.accuracy = Accuracy(task="multiclass", num_classes=num_classes)
        self.val_accuracy = Accuracy(task="multiclass", num_classes=num_classes)
        self.lr = lr

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits

    def _shared_step(self, batch):
        """Run the model on `batch` and return (loss, predicted classes, labels)."""
        labels = batch["labels"]
        logits = self(batch["input_ids"], batch["attention_mask"])
        loss = self.criterion(logits, labels)
        preds = torch.argmax(logits, dim=1)
        return loss, preds, labels

    def training_step(self, batch, batch_idx):
        """Log "train_loss" and "train_acc" for the batch and return the loss."""
        loss, preds, labels = self._shared_step(batch)
        self.log("train_loss", loss)
        self.log("train_acc", self.accuracy(preds, labels))
        return loss

    def validation_step(self, batch, batch_idx):
        """Log "val_loss" and "val_acc" for the batch and return the loss."""
        loss, preds, labels = self._shared_step(batch)
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", self.val_accuracy(preds, labels), prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return AdamW plus a StepLR scheduler.

        The scheduler multiplies the learning rate by 0.95 every time it is stepped.
        """
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)
        return [optimizer], [scheduler]


In [7]:
from transformers import PreTrainedTokenizerFast

# Hyperparameters for this run
MAX_LEN = 1024
BATCH_SIZE = 128
EPOCHS = 10
LEARNING_RATE = 1e-3
NUM_CLASSES = 10
SEED = 42

# Seed the random number generators so weight initialisation, shuffling and
# dropout are repeatable across "Restart & Run All".
pl.seed_everything(SEED, workers=True)

# Load the custom tokenizer
tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
print("Custom tokenizer loaded!")


# Define data module
data_module = LRADataModule(train_df, test_df, tokenizer, MAX_LEN, BATCH_SIZE)

# Define model
config = MegaConfig(
    # len(tokenizer) also counts tokens added after the vocabulary was trained,
    # so the embedding table always covers every id the tokenizer can emit.
    vocab_size=len(tokenizer),
    hidden_size=16,
    num_hidden_layers=4,
    num_labels=NUM_CLASSES,
    max_positions=MAX_LEN,
    bidirectional=True,
    is_decoder=False,
    use_cache=False,
    activation_function="silu",
    attention_activation_function="softmax",
    norm_type="layernorm",
    dropout=0.1,
    attention_dropout=0.0,
    # weight_decay is intentionally not passed here: the optimizer is built in
    # MEGAClassifier.configure_optimizers, which never reads it from the config.
)
model = MEGAClassifier(config=config, num_classes=NUM_CLASSES, lr=LEARNING_RATE)

# Initialize trainer
trainer = pl.Trainer(
    max_epochs=EPOCHS,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=torch.cuda.device_count() if torch.cuda.is_available() else 1,
    log_every_n_steps=10,
)

# Train the model
trainer.fit(model, datamodule=data_module)


Custom tokenizer loaded!


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [10]:
# Write the wrapped Hugging Face model and the tokenizer into one directory
output_dir = "./mega_model_listops_reduced_best_pl/"
os.makedirs(output_dir, exist_ok=True)
for artifact in (model.model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to ./mega_model_listops_reduced_best_pl/


In [17]:
# List the contents of the version_0 checkpoint directory under lightning_logs
!ls -la /kaggle/working/lightning_logs/version_0/checkpoints

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


total 1320
drwxr-xr-x 2 root root    4096 Dec  6 20:42  .
drwxr-xr-x 3 root root    4096 Dec  6 20:00  ..
-rw------- 1 root root 1341216 Dec  6 20:42 'epoch=9-step=6580.ckpt'


In [45]:
# Pick the device first so the checkpoint tensors can be mapped straight onto it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
CHECKPOINT_PATH = "lightning_logs/version_0/checkpoints/epoch=9-step=6580.ckpt"
# map_location lets a checkpoint written on GPU load in a CPU-only session.
model = MEGAClassifier.load_from_checkpoint(CHECKPOINT_PATH, map_location=device, config=config)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Move the model to the appropriate device and switch off dropout
model = model.to(device)
model.eval()

# Prepare a sample sequence
# NOTE(review): the training rows printed earlier look like
# "( ( ( ( ( [MED 1 ) 7 ) 3 ) 2 ) ] )", with "[" / "]" around the operator list.
# These hand-written samples use a different layout - confirm that is intended.
sequences = ["( MIN 1 2 9)", "( SM 1 5 1 1 )", "( MAX 1 3 9 5 4 1 9 5 1)"]
encoding = tokenizer(
    sequences,
    add_special_tokens=True,
    max_length=16,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)

print(encoding)

# Move input tensors to the same device as the model
input_ids = encoding["input_ids"].to(device)
attention_mask = encoding["attention_mask"].to(device)

# Perform inference
with torch.no_grad():
    logits = model(input_ids, attention_mask)
    probabilities = torch.softmax(logits, dim=-1)
    predictions = torch.argmax(probabilities, dim=-1)

print(f"Predictions: {predictions.tolist()}")


{'input_ids': tensor([[ 5, 19, 10, 11, 12,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 5, 22, 10, 14, 10, 10,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 5, 21, 10,  9, 12, 14, 17, 10, 12, 14, 10,  6,  0,  0,  0,  0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}
Predictions: [0, 7, 8]
