## Imports

In [97]:
# Install the required version of transformers
!pip install -U transformers==4.40.2

# Import necessary libraries
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import os
from transformers import ReformerConfig, ReformerForSequenceClassification, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn as nn


  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




## Importing Data

In [98]:
# Define the file paths
train_file = '/kaggle/input/depth-20/train_d20s.tsv'
test_file = '/kaggle/input/depth-20/test_d20s.tsv'

# Load the data
train_df = pd.read_csv(train_file, sep='\t', header=0)
test_df = pd.read_csv(test_file, sep='\t', header=0)
# Preview the data
print(train_df.head())

# Load the data
train_df = pd.read_csv(train_file, sep='\t', header=0)
test_df = pd.read_csv(test_file, sep='\t', header=0)
# Preview the data
print(train_df.head())
print(train_df.columns)


   Target                                             Source
0       6  ( ( ( ( ( [MAX ( ( ( ( ( ( [MED 4 ) 6 ) 6 ) 0 ...
1       7   ( ( ( ( [SM ( ( ( [MED 6 ) 5 ) ] ) ) 1 ) 1 ) ] )
2       4                 ( ( ( ( ( [MAX 3 ) 4 ) 3 ) 3 ) ] )
3       0  ( ( ( ( ( [MIN 0 ) 0 ) ( ( ( [MAX 4 ) ( ( ( ( ...
4       9  ( ( ( ( ( ( [SM ( ( ( ( [MIN 5 ) ( ( ( [MAX ( ...
   Target                                             Source
0       6  ( ( ( ( ( [MAX ( ( ( ( ( ( [MED 4 ) 6 ) 6 ) 0 ...
1       7   ( ( ( ( [SM ( ( ( [MED 6 ) 5 ) ] ) ) 1 ) 1 ) ] )
2       4                 ( ( ( ( ( [MAX 3 ) 4 ) 3 ) 3 ) ] )
3       0  ( ( ( ( ( [MIN 0 ) 0 ) ( ( ( [MAX 4 ) ( ( ( ( ...
4       9  ( ( ( ( ( ( [SM ( ( ( ( [MIN 5 ) ( ( ( [MAX ( ...
Index(['Target', 'Source'], dtype='object')


## Basic data cleaning && exploration

In [99]:
def clean_text(text):
    return ''.join(str(text).split())

# Apply the cleaning function to the 'Source' column
#train_df['Source'] = train_df['Source'].apply(clean_text)
#test_df['Source'] = test_df['Source'].apply(clean_text)


In [100]:

# Remove any possible header rows included as data
train_df = train_df[train_df['Target'] != 'Target']
test_df = test_df[test_df['Target'] != 'Target']

# Convert labels to integers
train_df['Target'] = train_df['Target'].astype(int)
test_df['Target'] = test_df['Target'].astype(int)
print(train_df.head())
print("---------------")
for i in range(8) : 
    print(len(train_df['Source'][i]))

   Target                                             Source
0       6  ( ( ( ( ( [MAX ( ( ( ( ( ( [MED 4 ) 6 ) 6 ) 0 ...
1       7   ( ( ( ( [SM ( ( ( [MED 6 ) 5 ) ] ) ) 1 ) 1 ) ] )
2       4                 ( ( ( ( ( [MAX 3 ) 4 ) 3 ) 3 ) ] )
3       0  ( ( ( ( ( [MIN 0 ) 0 ) ( ( ( [MAX 4 ) ( ( ( ( ...
4       9  ( ( ( ( ( ( [SM ( ( ( ( [MIN 5 ) ( ( ( [MAX ( ...
---------------
73
48
34
271
501
577
72
172


In [101]:
# Compute the lengths of the original sequences
seqLengths= train_df['Source'].apply(lambda x: len(x))

# Describe the sequence lengths
print("Training data sequence lengths:")
print(seqLengths.describe())

print(seqLengths.head())





Training data sequence lengths:
count    90000.000000
mean       277.778200
std        491.629979
min          1.000000
25%         49.000000
50%         99.000000
75%        259.000000
max       7593.000000
Name: Source, dtype: float64
0     73
1     48
2     34
3    271
4    501
Name: Source, dtype: int64


In [102]:
import pandas as pd

# Filter the training and test datasets based on the condition
train_df = train_df[train_df['Source'].apply(lambda x: len(x)) < 512]
test_df = test_df[test_df['Source'].apply(lambda x: len(x)) < 512]

# # Randomly sample a percentage of the filtered train data
# train_df = train_df.sample(frac=1, random_state=42)  # Set random_state for reproducibility

# Check the filtered dataframe
seqLengths = train_df['Source'].apply(lambda x: len(x))

# Describe the sequence lengths
print("Training data sequence lengths:")
print(seqLengths.describe())

print(seqLengths.head())


Training data sequence lengths:
count    77650.000000
mean       124.938416
std        110.377955
min          1.000000
25%         43.000000
50%         81.000000
75%        165.000000
max        511.000000
Name: Source, dtype: float64
0     73
1     48
2     34
3    271
4    501
Name: Source, dtype: int64


In [103]:
# Save the sequences to a text file for tokenizer training
with open("listops_sequences.txt", "w") as f:
    for sequence in train_df["Source"]:
        f.write(sequence + "\n")


In [104]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

# Initialize a WordLevel tokenizer
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))

# Set the pre-tokenization strategy
tokenizer.pre_tokenizer = Whitespace()

# Prepare a trainer with special tokens
trainer = WordLevelTrainer(special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])

# Train the tokenizer on your text file
tokenizer.train(["listops_sequences.txt"], trainer)

# Save the tokenizer
tokenizer.save("custom_tokenizer.json")


In [105]:
from transformers import PreTrainedTokenizerFast

# Load the custom tokenizer
tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
print(tokenizer)

PreTrainedTokenizerFast(name_or_path='', vocab_size=23, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [106]:
class LRADataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels.astype(int)  # Ensure labels are integers
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        # Tokenize and encode the text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [107]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    dataset = LRADataset(
        texts=df['Source'].to_numpy(),
        labels=df['Target'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=10
    )

# Parameters
MAX_LEN = 512
BATCH_SIZE = 8

# Create data loaders
train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)




In [108]:
num_labels = train_df["Source"].nunique()
print(num_labels)
print(tokenizer.vocab_size)


77650
23


In [109]:
from transformers import GPT2Config, GPT2ForSequenceClassification

# Custom GPT-2 configuration for ListOps task
config = GPT2Config(
    vocab_size=23,  # Adjust for the ListOps vocabulary
    n_positions=512,  # Max sequence length (optimized for ListOps)
    n_embd=32,  # Reduced hidden state dimensionality for efficiency
    n_layer=4,  # Fewer Transformer layers for smaller model size
    n_head=4,  # Fewer attention heads for smaller memory footprint
    n_inner=512,  # Size of inner feed-forward layer
    activation_function="gelu_new",  # GELU activation function
    resid_pdrop=0.1,  # Dropout probability for residual connections
    embd_pdrop=0.1,  # Dropout probability for embeddings
    attn_pdrop=0.1,  # Dropout probability for attention probabilities
    layer_norm_epsilon=1e-5,  # Epsilon for layer normalization
    initializer_range=0.02,  # Standard deviation for weight initialization
    summary_type="cls_index",  # Summarization strategy for classification
    summary_use_proj=True,  # Use a projection layer for summarization
    summary_activation=None,  # Activation function for summarization
    summary_proj_to_labels=True,  # Project summary to labels
    summary_first_dropout=0.1,  # Dropout probability for the summarization layer
    scale_attn_weights=True,  # Scale attention weights
    use_cache=True,  # Enable caching
    bos_token_id=0,  # Adjusted for ListOps vocabulary
    eos_token_id=1,  # Adjusted for ListOps vocabulary
    scale_attn_by_inverse_layer_idx=False,  # No scaling by inverse layer index
    reorder_and_upcast_attn=False,  # Standard attention reordering
    num_labels=10, 
    pad_token_id=0# Number of classes in the ListOps task
)

# Initialize the GPT-2 model for sequence classification
model = GPT2ForSequenceClassification(config)

# Print the model configuration for verification
print(model.config)

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 1,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 32,
  "n_head": 4,
  "n_inner": 512,
  "n_layer": 4,
  "n_positions": 512,
  "pad_token_id": 0,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_ind

In [110]:
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = nn.DataParallel(model)  # Wrap the model for multiple GPUs

model = model.to(device)

# Optimizer and scheduler
EPOCHS = 5
optimizer = AdamW(model.parameters(), lr=1e-3)
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=total_steps // 10,
    num_training_steps=total_steps
)

# Loss function
loss_fn = torch.nn.CrossEntropyLoss().to(device)




In [111]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    model.train()
    losses = []
    correct_predictions = 0

    for batch in data_loader:
        input_ids = batch["input_ids"].to(device, non_blocking=True)
        attention_mask = batch["attention_mask"].to(device, non_blocking=True)
        labels = batch["labels"].to(device, non_blocking=True)

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        logits = outputs.logits
        _, preds = torch.max(logits, dim=1)

        # Update metrics
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)


def eval_model(model, data_loader, loss_fn, device):
    model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device, non_blocking=True)
            attention_mask = batch["attention_mask"].to(device, non_blocking=True)
            labels = batch["labels"].to(device, non_blocking=True)

            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)

            # Update metrics
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)


In [None]:
history = {
    'train_acc': [],
    'train_loss': [],
    'val_acc': [],
    'val_loss': []
}

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        test_data_loader,
        loss_fn,
        device
    )

    print(f'Val   loss {val_loss} accuracy {val_acc}')

    history['train_acc'].append(train_acc.cpu().numpy())
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc.cpu().numpy())
    history['val_loss'].append(val_loss)


Epoch 1/5
----------


  self.pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to a

## Saving the trained model

In [None]:
# Specify the directory to save the model
output_dir = './my_model_listops_reduced_best_scheduled/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the trained model and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")
