## Imports

In [None]:
# Install the required version of transformers
!pip install -U transformers==4.40.2

# Import necessary libraries
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import os
from transformers import ReformerConfig, ReformerForSequenceClassification, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn as nn


## Importing Data

In [None]:
# Define the file paths
train_file = '/kaggle/input/depth-20/train_d20s.tsv'
test_file = '/kaggle/input/depth-20/test_d20s.tsv'

# Load each split exactly once (tab-separated, first row is the header).
train_df = pd.read_csv(train_file, sep='\t', header=0)
test_df = pd.read_csv(test_file, sep='\t', header=0)

# Preview the first rows and the column names
print(train_df.head())
print(train_df.columns)


## Basic data cleaning & exploration

In [None]:
def clean_text(text):
    """Return ``text`` as a string with every whitespace character removed.

    The value is converted with ``str()`` first, so non-string inputs are
    accepted; the result is the whitespace-separated pieces joined together.
    """
    pieces = str(text).split()
    return ''.join(pieces)

# Apply the cleaning function to the 'Source' column
#train_df['Source'] = train_df['Source'].apply(clean_text)
#test_df['Source'] = test_df['Source'].apply(clean_text)


In [None]:

# Remove any possible header rows included as data.
# .copy() makes the filtered frames independent, so the column assignment
# below does not write into a slice of the unfiltered frame
# (avoids pandas' SettingWithCopyWarning).
train_df = train_df[train_df['Target'] != 'Target'].copy()
test_df = test_df[test_df['Target'] != 'Target'].copy()

# Convert labels to integers
train_df['Target'] = train_df['Target'].astype(int)
test_df['Target'] = test_df['Target'].astype(int)
print(train_df.head())
print("---------------")
# Print the character length of the first (up to) 8 sequences by position.
# Positional access is required: after the filter above, index labels 0..7
# are not guaranteed to exist, so label-based lookup could raise KeyError.
for source in train_df['Source'].iloc[:8]:
    print(len(source))

In [None]:
# Character length of every training sequence
seqLengths = train_df['Source'].map(len)

# Summary statistics, then the first few individual lengths
print("Training data sequence lengths:")
print(seqLengths.describe())

print(seqLengths.head())





In [None]:
import pandas as pd

# Keep only the rows whose raw sequence is shorter than 1024 characters
train_keep = train_df['Source'].map(len) < 1024
test_keep = test_df['Source'].map(len) < 1024
train_df = train_df[train_keep]
test_df = test_df[test_keep]

# # Randomly sample a percentage of the filtered train data
# train_df = train_df.sample(frac=1, random_state=42)  # Set random_state for reproducibility

# Recompute the lengths on the filtered training frame
seqLengths = train_df['Source'].map(len)

# Summary statistics, then the first few individual lengths
print("Training data sequence lengths:")
print(seqLengths.describe())

print(seqLengths.head())


In [None]:
# Write one training sequence per line; this file is the corpus the
# tokenizer is trained on.
with open("listops_sequences.txt", "w") as corpus_file:
    corpus_file.writelines(sequence + "\n" for sequence in train_df["Source"])


In [None]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

# Special tokens reserved in the vocabulary; "[UNK]" doubles as the
# model's out-of-vocabulary token.
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
corpus_files = ["listops_sequences.txt"]

# Word-level model with the Whitespace pre-tokenizer
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Build the vocabulary from the corpus file(s)
trainer = WordLevelTrainer(special_tokens=special_tokens)
tokenizer.train(corpus_files, trainer)

# Persist the trained tokenizer as a single JSON file
tokenizer.save("custom_tokenizer.json")


In [None]:
from transformers import PreTrainedTokenizerFast

# Load the custom tokenizer and declare all of its special tokens.
# These five strings are the special tokens the tokenizer was trained with;
# declaring them here makes the wrapper aware of them (e.g. pad_token /
# unk_token are no longer None) and lets save_pretrained() persist them.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="custom_tokenizer.json",
    pad_token="[PAD]",
    unk_token="[UNK]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
print(tokenizer)

In [None]:
class LRADataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Every item is a dict with three entries:
      * ``input_ids``      - flattened tensor taken from the tokenizer output
      * ``attention_mask`` - flattened tensor taken from the tokenizer output
      * ``labels``         - scalar ``torch.long`` tensor
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        """Store the inputs; ``labels`` must provide ``.astype`` (e.g. a NumPy array)."""
        self.texts = texts
        self.labels = labels.astype(int)  # cast once so items are integer labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of samples, i.e. the number of texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize sample ``item`` (padded/truncated to ``max_len``) and return it as tensors."""
        raw_text = str(self.texts[item])
        target = self.labels[item]

        encoded = self.tokenizer.encode_plus(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch axis; flatten() drops it
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample


In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=10):
    """Wrap a dataframe in an ``LRADataset`` and return a ``DataLoader`` over it.

    Args:
        df: dataframe with a ``Source`` (text) and a ``Target`` (label) column.
        tokenizer: tokenizer handed to ``LRADataset``.
        max_len: length every sample is padded/truncated to.
        batch_size: number of samples per batch.
        shuffle: reshuffle the samples every epoch; defaults to ``False`` so
            existing callers keep their ordering.
        num_workers: number of loader worker processes (default unchanged: 10).

    Returns:
        A ``torch.utils.data.DataLoader`` yielding the dict batches produced
        by ``LRADataset``.
    """
    dataset = LRADataset(
        texts=df['Source'].to_numpy(),
        labels=df['Target'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers
    )

# Parameters
MAX_LEN = 1024
BATCH_SIZE = 64

# Create data loaders.
# Only the training split is shuffled, so batches differ between epochs;
# the evaluation split keeps its file order.
train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=True)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)


In [None]:
# The number of classes is the number of distinct labels (Target column),
# not the number of distinct input sequences.
num_labels = train_df["Target"].nunique()
print(num_labels)
print(tokenizer.vocab_size)


In [None]:
config = ReformerConfig(
    vocab_size=len(tokenizer),        # Derived from the tokenizer so the embedding covers every token id
    hidden_size=64,                   # Hidden state dimensionality
    num_hidden_layers=6,              # Number of layers
    num_attention_heads=8,            # 8 attention heads
    attention_head_size=32,           # Size of each attention head
    axial_pos_embds=True,             # Use axial positional embeddings
    axial_pos_embds_dim=[32, 32],     # Axial dimensions (sum = hidden_size)
    axial_pos_shape=[32, 32],         # Axial shape (product = sequence length, 1024)
    attn_layers=["local", "lsh", "local", "lsh", "local", "lsh"],  # Match num_hidden_layers
    feed_forward_size=512,            # Feed-forward layer size
    lsh_num_chunks_before=2,          # Chunked look-backs for LSH attention
    lsh_num_chunks_after=1,           # Chunked look-ahead for LSH attention
    lsh_attention_probs_dropout_prob=0.1,  # Dropout for LSH attention
    # The ReformerConfig parameter is `local_attn_chunk_length`; an unknown
    # keyword such as `local_chunk_length` is not read by the model.
    local_attn_chunk_length=64,       # Local attention chunk size
    local_attention_probs_dropout_prob=0.1,  # Dropout for local attention
    hidden_dropout_prob=0.1,          # Dropout for fully connected layers
    max_position_embeddings=1024,     # Maximum sequence length
    num_labels=10,                    # Number of output labels (hard-coded; must cover every Target value)
)

# Initialize the model
model = ReformerForSequenceClassification(config)

# Display the configuration to verify
print(model.config)


In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn as nn

# Prefer a GPU when one is visible, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Replicate the model across GPUs when more than one is available
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print(f"Using {gpu_count} GPUs")
    model = nn.DataParallel(model)

model = model.to(device)

# Optimizer and linear warmup/decay schedule; the scheduler is stepped once
# per batch, so the step budget is batches-per-epoch times epochs and the
# first tenth of it is warmup.
EPOCHS = 5
optimizer = AdamW(model.parameters(), lr=1e-3)
total_steps = EPOCHS * len(train_data_loader)
warmup_steps = total_steps // 10

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

# Loss function
loss_fn = nn.CrossEntropyLoss().to(device)


In [None]:
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler
):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels``; must expose ``.dataset`` (its
            length is the accuracy denominator).
        loss_fn: not used here (the loss is read from the model output); kept
            so the signature stays unchanged for callers.
        optimizer: stepped and zeroed once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler, stepped once per batch.

    Returns:
        ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-d double tensor
        (correct predictions / dataset size) and ``mean_loss`` is the NumPy
        mean of the per-batch losses.
    """
    model = model.train()
    losses = []
    # Tensor accumulator so the return value is a tensor even for an empty loader
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for batch in data_loader:
        # Move the batch to `device` only; no hard-coded .cuda(), so this also runs on CPU
        input_ids = batch["input_ids"].to(device, non_blocking=True)
        attention_mask = batch["attention_mask"].to(device, non_blocking=True)
        labels = batch["labels"].to(device, non_blocking=True)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        # Under nn.DataParallel the loss comes back with one value per replica;
        # mean() reduces it to a scalar and is a no-op for an already-scalar loss.
        loss = outputs.loss.mean()
        logits = outputs.logits
        _, preds = torch.max(logits, dim=1)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

def eval_model(
    model,
    data_loader,
    loss_fn,
    device
):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels``; must expose ``.dataset`` (its
            length is the accuracy denominator).
        loss_fn: not used here (the loss is read from the model output); kept
            so the signature stays unchanged for callers.
        device: device the batch tensors are moved to.

    Returns:
        ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-d double tensor
        (correct predictions / dataset size) and ``mean_loss`` is the NumPy
        mean of the per-batch losses.
    """
    model = model.eval()
    losses = []
    # Tensor accumulator so the return value is a tensor even for an empty loader
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for batch in data_loader:
            # Move the batch to `device` only; no hard-coded .cuda(), so this also runs on CPU
            input_ids = batch["input_ids"].to(device, non_blocking=True)
            attention_mask = batch["attention_mask"].to(device, non_blocking=True)
            labels = batch["labels"].to(device, non_blocking=True)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            # Under nn.DataParallel the loss comes back with one value per replica;
            # mean() reduces it to a scalar and is a no-op for an already-scalar loss.
            loss = outputs.loss.mean()
            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)


In [None]:
# Per-epoch metrics: one list per tracked quantity
history = {name: [] for name in ('train_acc', 'train_loss', 'val_acc', 'val_loss')}

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # One optimisation pass over the training loader
    train_acc, train_loss = train_epoch(
        model, train_data_loader, loss_fn, optimizer, device, scheduler
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # The "val" metrics are computed on test_data_loader
    val_acc, val_loss = eval_model(model, test_data_loader, loss_fn, device)
    print(f'Val   loss {val_loss} accuracy {val_acc}')

    # Accuracies are tensors: move them to the CPU and store them as NumPy values
    epoch_metrics = (
        ('train_acc', train_acc.cpu().numpy()),
        ('train_loss', train_loss),
        ('val_acc', val_acc.cpu().numpy()),
        ('val_loss', val_loss),
    )
    for name, value in epoch_metrics:
        history[name].append(value)


## Saving the trained model

In [None]:
# Directory that receives the model and tokenizer files
output_dir = './my_model_listops_reduced_best_scheduled/'
directory_missing = not os.path.exists(output_dir)
if directory_missing:
    os.makedirs(output_dir)

# If the model is wrapped (it exposes `.module`, e.g. after nn.DataParallel),
# save the wrapped model itself rather than the wrapper.
model_to_save = getattr(model, 'module', model)
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")
