In [None]:
!pip install --upgrade datasets>=2.18.0 huggingface-hub>=0.21.2 fsspec>=2023.12.0 transformers

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-nvrtc-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-runtime-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-runtime-cu12 12.5

In [None]:
# Install the third-party libraries imported by the cells below (versions are not pinned).
!pip install datasets transformers tokenizers tqdm



In [None]:
# NOTE(review): identical to the install command in the preceding cell; running both is redundant.
!pip install datasets transformers tokenizers tqdm



In [None]:
# Mount Google Drive at /content/drive; later cells read the custom CSV and
# look for the checkpoint directory under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from datasets import load_dataset, DatasetDict, concatenate_datasets
from datasets import ClassLabel

# Load the custom review CSV from Drive; load_dataset returns a DatasetDict
# whose single split is named 'train'.
custom_ds = load_dataset("csv", data_files="/content/drive/MyDrive/model_checkpoints2/sarcastic_reviews.csv")  # DatasetDict with a 'train' split
# Label schema intended to match the one used by yelp_review_full
# (NOTE(review): names must match that dataset exactly for concatenation — confirm).
class_label = ClassLabel(names=['1 star', '2 star', '3 stars', '4 stars', '5 stars'])

# Map integer labels to their ClassLabel string names
def convert_label(example):
    """Replace the integer in ``example['label']`` with its ``class_label`` name.

    Assumes the CSV stores labels as integers 0-4 (per the original comment) —
    TODO confirm against the file.
    """
    example['label'] = class_label.int2str(example['label'])
    return example

# Convert labels to names, then cast the column to the ClassLabel feature.
# The result, custom_train, is read as a module-level global by load_and_process_data.
custom_train = custom_ds['train'].map(convert_label)
custom_train = custom_train.cast_column('label', class_label)



Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/436 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/436 [00:00<?, ? examples/s]

In [None]:
## Transformer Text Classifier with Hugging Face BPE Subtokenizer
# Import required libraries

# Cell 1: Import the libraries used throughout this notebook

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pickle
import math
from datasets import concatenate_datasets
from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import os
import gc
import numpy as np
from tqdm.notebook import tqdm
import time

# Cell 2: Initialize and train BPE tokenizer
# Default target vocabulary size handed to the BPE trainer.
VOCAB_SIZE = 30000
# Default minimum frequency handed to the BPE trainer.
MIN_FREQUENCY = 2
# Special tokens registered with the tokenizer; [PAD], [CLS] and [SEP] are
# looked up by name when padding and post-processing are configured.
SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"]

# Train a Byte-Level BPE tokenizer on the training texts
def train_bpe_tokenizer(text_iterable, vocab_size=VOCAB_SIZE, min_frequency=MIN_FREQUENCY,
                        sample_size=30000, max_length=256, seed=42):
    """Train a byte-level BPE tokenizer on a bounded sample of ``text_iterable``.

    Args:
        text_iterable: Texts to learn the vocabulary from. A ``list`` is
            sampled randomly; any other iterable contributes only its first
            ``sample_size`` items, so data appended at the end of a long
            iterable is not seen by the trainer.
        vocab_size: Target vocabulary size passed to the trainer.
        min_frequency: Minimum frequency passed to the trainer.
        sample_size: Maximum number of texts used for training.
        max_length: Length every encoding is truncated and padded to.
        seed: Seed of the private random generator used to sample a list.

    Returns:
        A ``ByteLevelBPETokenizer`` (lowercasing, prefix space) whose
        post-processor is ``BertProcessing`` built from the [SEP] and [CLS]
        tokens, with truncation and fixed-length [PAD] padding enabled at
        ``max_length``.
    """
    import itertools
    import random

    if isinstance(text_iterable, list):
        # A private generator draws exactly the sample that
        # ``random.seed(seed); random.sample(...)`` would, without reseeding
        # the process-wide generator as a side effect.
        rng = random.Random(seed)
        sample_texts = rng.sample(text_iterable, min(sample_size, len(text_iterable)))
    else:
        # Generic iterables cannot be sampled without consuming them; take a prefix.
        sample_texts = list(itertools.islice(text_iterable, sample_size))

    tokenizer = ByteLevelBPETokenizer(
        lowercase=True,
        add_prefix_space=True
    )
    tokenizer.train_from_iterator(
        sample_texts,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=SPECIAL_TOKENS
    )
    # BertProcessing takes the (token, id) pairs in (sep, cls) order.
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ("[CLS]", tokenizer.token_to_id("[CLS]"))
    )
    # Every encoding comes out exactly max_length ids long.
    tokenizer.enable_truncation(max_length=max_length)
    tokenizer.enable_padding(
        length=max_length,
        pad_id=tokenizer.token_to_id("[PAD]"),
        pad_token="[PAD]"
    )
    return tokenizer

# Cell 3: Transformer model architecture (unchanged)
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position table to a batch of embeddings.

    The table has shape ``(1, max_len, d_model)`` and is registered as the
    buffer ``pe`` (so it is part of the module's ``state_dict``). Even
    feature columns hold sines, odd columns hold cosines.

    Args:
        d_model: Feature width of the embeddings; may be even or odd.
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the cosine half is trimmed to fit instead of raising a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence position and whose
                last dimension is ``d_model``.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds positional table size {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len]

class TransformerInputLayer(nn.Module):
    """Token embedding followed by positional encoding and dropout.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding width.
        max_len: Size of the positional-encoding table.
        dropout: Dropout probability applied to the summed embeddings.
    """

    def __init__(self, vocab_size, d_model, max_len=256, dropout=0.1):
        super().__init__()
        # Attribute names double as state_dict keys; keep them stable.
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids):
        """Embed ``input_ids``, add position information, then apply dropout."""
        embedded = self.token_embedding(input_ids)
        return self.dropout(self.positional_encoding(embedded))

class MultiHeadSelfAttention(nn.Module):
    """Scaled dot-product self-attention split across ``num_heads`` heads.

    Each token vector is projected to queries, keys and values; every head
    attends over the sequence, and the concatenated head outputs pass through
    a final linear projection.

    Args:
        d_model: Feature width of the input; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(B, T, D)``.
            mask: Optional tensor broadcastable to the score tensor of shape
                ``(B, num_heads, T, T)``; entries equal to 0 are excluded
                from attention.

        Returns:
            Tensor of shape ``(B, T, D)``.
        """
        B, T, D = x.size()
        Q = self.q_linear(x).view(B, T, self.num_heads, self.d_k).transpose(1, 2)
        K = self.k_linear(x).view(B, T, self.num_heads, self.d_k).transpose(1, 2)
        V = self.v_linear(x).view(B, T, self.num_heads, self.d_k).transpose(1, 2)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            # Use the dtype's own minimum rather than a literal like -1e9,
            # which cannot be represented when scores are half precision.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
        attn = torch.softmax(scores, dim=-1)
        out = torch.matmul(self.dropout(attn), V)
        # Merge the heads back into a single feature dimension.
        out = out.transpose(1, 2).contiguous().view(B, T, D)
        return self.out_linear(out)

class FeedForward(nn.Module):
    """Three-layer position-wise MLP: d_model -> d_ff -> d_ff // 2 -> d_model.

    ReLU follows the first two linear layers; dropout (p=0.1) is applied just
    before the final projection.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        half_ff = d_ff // 2
        # Creation order and attribute names are kept stable: they determine
        # parameter initialisation order and the state_dict keys.
        self.linear1 = nn.Linear(d_model, d_ff)
        self.relu1 = nn.ReLU()
        self.linear2 = nn.Linear(d_ff, half_ff)
        self.relu2 = nn.ReLU()
        self.linear3 = nn.Linear(half_ff, d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Run ``x`` through the MLP and return a tensor with the same last dimension."""
        hidden = self.relu1(self.linear1(x))
        hidden = self.relu2(self.linear2(hidden))
        return self.linear3(self.dropout(hidden))
class TransformerEncoderBlock(nn.Module):
    """One post-norm encoder block: self-attention then feed-forward.

    Each sub-layer output passes through dropout (p=0.1), is added to its
    input as a residual, and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.attn = MultiHeadSelfAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, d_ff)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, mask=None):
        """Apply the attention and feed-forward sub-layers to ``x``.

        ``mask`` is forwarded unchanged to the attention module.
        """
        x = self.norm1(x + self.dropout(self.attn(x, mask)))
        return self.norm2(x + self.dropout(self.ff(x)))

class TransformerEncoder(nn.Module):
    """Input layer followed by a stack of ``num_layers`` encoder blocks.

    ``dropout`` is passed to the input layer only; the encoder blocks are
    constructed without it.
    """

    def __init__(self, vocab_size, d_model, num_heads, d_ff, num_layers, max_len, dropout):
        super().__init__()
        self.input_layer = TransformerInputLayer(vocab_size, d_model, max_len, dropout)
        self.layers = nn.ModuleList(
            TransformerEncoderBlock(d_model, num_heads, d_ff) for _ in range(num_layers)
        )

    def forward(self, input_ids, mask=None):
        """Embed ``input_ids`` and pass the result through every block in order."""
        hidden = self.input_layer(input_ids)
        for block in self.layers:
            hidden = block(hidden, mask)
        return hidden

class TransformerClassifier(nn.Module):
    """Transformer encoder with a linear classification head.

    The head reads the encoder output at sequence position 0 and maps it to
    ``num_classes`` logits.
    """

    def __init__(self, vocab_size, num_classes, d_model, num_heads, d_ff, num_layers, max_len, dropout):
        super().__init__()
        self.encoder = TransformerEncoder(vocab_size, d_model, num_heads, d_ff, num_layers, max_len, dropout)
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, input_ids, mask=None):
        """Return one row of class logits per input sequence."""
        encoded = self.encoder(input_ids, mask)
        first_token = encoded[:, 0]
        return self.classifier(first_token)

# Cell 4: Memory-efficient dataset class
class TokenizedDataset(Dataset):
    """Map-style dataset that tokenizes each example when it is requested.

    Args:
        dataset: Indexable collection of mapping-like rows.
        tokenizer: Object whose ``encode(text)`` result exposes ``.ids``.
        max_len: Stored on the instance; not consulted by ``__getitem__`` —
            sequence length is whatever the tokenizer produces.
        text_field: Row key holding the text.
        label_field: Row key holding the label.
    """

    def __init__(self, dataset, tokenizer, max_len=256, text_field='text', label_field='label'):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.text_field = text_field
        self.label_field = label_field

    def __len__(self):
        """Number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, label)`` as two ``torch.long`` tensors for row ``idx``."""
        row = self.dataset[idx]
        token_ids = self.tokenizer.encode(row[self.text_field]).ids
        label = row[self.label_field]
        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(label, dtype=torch.long),
        )

# Cell 5: Hyperparameters and data loading
D_MODEL = 128  # embedding / hidden width of the model (author's noted alternative: 512)
NUM_HEADS = 4  # attention heads per block; must divide D_MODEL (author's noted alternative: 8)
D_FF = 512     # width of the first feed-forward layer (author's noted alternative: 2048)
NUM_LAYERS = 4  # number of stacked encoder blocks
MAX_LEN = 256  # passed to the model as max_len (size of the positional-encoding table)
DROPOUT = 0.1  # dropout probability passed to the model
BATCH_SIZE = 64  # samples per DataLoader batch
LR = 3e-4  # learning rate given to AdamW
EPOCHS = 30  # default number of training epochs
DATASET_NAME = 'yelp_review_full'  # default dataset name passed to load_dataset

# Cell 6: Load and process data
def load_and_process_data(dataset_name=DATASET_NAME):
    """Load the dataset, append the custom reviews, and build tokenized datasets.

    The training split of ``dataset_name`` is concatenated with the
    module-level ``custom_train`` dataset, which must already be defined by
    an earlier cell. The tokenizer is trained on the combined training texts.
    The ``test`` split is wrapped unchanged.

    Args:
        dataset_name: Name passed to ``load_dataset``.

    Returns:
        Tuple ``(tokenizer, train_dataset, test_dataset, num_classes)`` where
        the datasets are ``TokenizedDataset`` instances and ``num_classes`` is
        the number of distinct labels in the combined training data.
    """
    print(f"Loading dataset '{dataset_name}'...")
    ds = load_dataset(dataset_name)
    train = concatenate_datasets([ds['train'], custom_train])
    test = ds['test']

    # Dataset.unique returns the distinct values of the column directly,
    # avoiding a Python-level pass over every label just to count classes.
    num_classes = len(train.unique('label'))
    print(f"Detected {num_classes} classes in dataset '{dataset_name}'")

    print("Training BPE tokenizer...")
    tokenizer = train_bpe_tokenizer(train['text'], vocab_size=VOCAB_SIZE)
    print(f"Tokenizer vocabulary size: {tokenizer.get_vocab_size()}")

    # Rows are tokenized lazily, one at a time, when the DataLoader asks for them.
    print("Creating memory-efficient datasets...")
    train_dataset = TokenizedDataset(train, tokenizer)
    test_dataset = TokenizedDataset(test, tokenizer)

    print(f"Training samples: {len(train_dataset)}")
    print(f"Test samples: {len(test_dataset)}")

    return tokenizer, train_dataset, test_dataset, num_classes

# Initialize model, loss, and optimizer
def init_model(vocab_size, num_classes):
    """Build the classifier, move it to the available device, and create loss/optimizer.

    Architecture settings come from the module-level hyperparameter constants.

    Returns:
        Tuple ``(model, criterion, optimizer, device)`` where ``criterion`` is
        cross-entropy loss and ``optimizer`` is AdamW with learning rate ``LR``.
    """
    architecture = dict(
        vocab_size=vocab_size,
        num_classes=num_classes,
        d_model=D_MODEL,
        num_heads=NUM_HEADS,
        d_ff=D_FF,
        num_layers=NUM_LAYERS,
        max_len=MAX_LEN,
        dropout=DROPOUT,
    )
    model = TransformerClassifier(**architecture)

    # Use the GPU when one is visible, otherwise fall back to the CPU.
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device_name)
    print(f"Using device: {device}")
    model.to(device)

    optimizer = optim.AdamW(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()

    return model, criterion, optimizer, device

# Cell 7: Checkpoint functions
def setup_checkpointing():
    """Choose the checkpoint directory, make sure it exists, and return the checkpoint file path.

    The Google Drive directory is used when it already exists; otherwise a
    local ``./model_checkpoints`` directory is created and used.
    """
    drive_dir = '/content/drive/MyDrive/model_checkpoints2'
    checkpoint_dir = drive_dir if os.path.exists(drive_dir) else './model_checkpoints'
    os.makedirs(checkpoint_dir, exist_ok=True)
    return os.path.join(checkpoint_dir, 'transformer_classifier_checkpoint_best_best.pth')

def save_checkpoint(model, optimizer, epoch, loss, path):
    """Write a training checkpoint to ``path`` and report it.

    The saved object is a dict with the keys ``epoch``, ``model_state_dict``,
    ``optimizer_state_dict`` and ``loss``. ``epoch`` is stored as given; the
    printed message shows ``epoch + 1``.
    """
    state = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }
    torch.save(state, path)
    print(f"Checkpoint saved at epoch {epoch+1}")

def load_checkpoint(model, optimizer, path, device):
    """Restore training state from ``path`` and return the epoch to resume at.

    Two on-disk formats are accepted:

    * the dict written by ``save_checkpoint`` (keys ``model_state_dict``,
      ``optimizer_state_dict``, ``epoch``): model and optimizer are restored
      and the function returns the stored epoch plus one, because the stored
      value is the zero-based index of the last completed epoch;
    * a bare model ``state_dict``: only the weights are restored and the
      function returns 0.

    Args:
        model: Module whose weights are loaded in place.
        optimizer: Optimizer whose state is loaded in place when present in
            the checkpoint; may be ``None`` to skip.
        path: Checkpoint file path.
        device: Passed to ``torch.load`` as ``map_location``.

    Returns:
        The zero-based epoch index training should start from; 0 when the
        file does not exist.
    """
    if not os.path.exists(path):
        return 0

    checkpoint = torch.load(path, map_location=device)

    if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer_state = checkpoint.get('optimizer_state_dict')
        if optimizer is not None and optimizer_state is not None:
            optimizer.load_state_dict(optimizer_state)
        # A missing 'epoch' falls back to -1 so the resume point is 0.
        return checkpoint.get('epoch', -1) + 1

    # Bare state_dict: no epoch or optimizer information is available.
    model.load_state_dict(checkpoint)
    return 0


# Cell 8: Training and evaluation functions
def train_epoch(model, train_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``train_loader``.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the mean of the per-batch losses and
        the fraction of correctly classified samples.
    """
    model.train()
    running_loss, n_correct, n_seen = 0.0, 0, 0

    progress_bar = tqdm(train_loader, desc="Training")
    for batch_inputs, batch_labels in progress_bar:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        # Standard step: clear gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(batch_inputs)
        loss = criterion(logits, batch_labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss

        # Running accuracy over all samples seen so far in this epoch.
        predictions = logits.argmax(dim=1)
        n_correct += (predictions == batch_labels).sum().item()
        n_seen += batch_labels.size(0)

        progress_bar.set_postfix({
            "loss": f"{batch_loss:.4f}",
            "acc": f"{n_correct/n_seen:.4f}"
        })

    return running_loss / len(train_loader), n_correct / n_seen

def evaluate(model, test_loader, criterion, device):
    """Score ``model`` on ``test_loader`` without computing gradients.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the mean of the per-batch losses and
        the fraction of correctly classified samples.
    """
    model.eval()
    running_loss, n_correct, n_seen = 0.0, 0, 0

    with torch.no_grad():
        for batch_inputs, batch_labels in tqdm(test_loader, desc="Evaluating"):
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_inputs)
            running_loss += criterion(logits, batch_labels).item()

            predictions = logits.argmax(dim=1)
            n_correct += (predictions == batch_labels).sum().item()
            n_seen += batch_labels.size(0)

    return running_loss / len(test_loader), n_correct / n_seen

# Cell 9: Main training loop
def train_model(model, train_dataset, test_dataset, criterion, optimizer, device, checkpoint_path,
                epochs=EPOCHS, batch_size=BATCH_SIZE, num_workers=12):
    """Train ``model``, evaluating and checkpointing after every epoch.

    Two files are maintained:

    * ``checkpoint_path`` — the resumable checkpoint written by
      ``save_checkpoint`` after every epoch;
    * a sibling file with ``.best_model`` inserted before the extension —
      the bare ``state_dict`` of the epoch with the highest validation
      accuracy seen in this call. Keeping it separate prevents the best
      weights and the resumable checkpoint from overwriting each other.

    Args:
        model: Module to train (already on ``device``).
        train_dataset: Dataset used for optimisation (shuffled each epoch).
        test_dataset: Dataset used for the per-epoch evaluation.
        criterion: Loss function.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device batches are moved to.
        checkpoint_path: Path of the resumable checkpoint file.
        epochs: Total number of epochs; training starts at the epoch index
            returned by ``load_checkpoint``.
        batch_size: Batch size for both data loaders.
        num_workers: Worker processes for both data loaders.

    Returns:
        The trained model (weights of the last epoch, not necessarily the best).
    """
    loader_options = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        # Pinned host memory only helps host-to-GPU copies.
        pin_memory=torch.device(device).type == 'cuda',
    )
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
    test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

    checkpoint_root, checkpoint_ext = os.path.splitext(checkpoint_path)
    best_model_path = f"{checkpoint_root}.best_model{checkpoint_ext}"

    # Try to load checkpoint
    start_epoch = load_checkpoint(model, optimizer, checkpoint_path, device)

    best_accuracy = 0.0

    for epoch in range(start_epoch, epochs):
        start_time = time.time()

        # Train for one epoch
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)

        # Evaluate
        val_loss, val_acc = evaluate(model, test_loader, criterion, device)

        # Report metrics
        epoch_time = time.time() - start_time
        print(f"Epoch {epoch+1}/{epochs} | Time: {epoch_time:.2f}s")
        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

        # Save the resumable checkpoint
        save_checkpoint(model, optimizer, epoch, val_loss, checkpoint_path)

        # Save best weights to their own file
        if val_acc > best_accuracy:
            best_accuracy = val_acc
            torch.save(model.state_dict(), best_model_path)
            print(f"New best model saved with accuracy: {best_accuracy:.4f}")

        # Force garbage collection to free memory
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return model

# Cell 10: Example inference
def predict_sentiment(text, model, tokenizer, device, num_classes):
    """Classify a single text and return the prediction with its probability.

    The model is switched to eval mode as a side effect.

    Returns:
        ``(label, confidence)``. When ``num_classes`` is 5 the label is a
        human-readable star-rating string; otherwise it is the predicted
        class index. ``confidence`` is the softmax probability of the
        predicted class.
    """
    token_ids = tokenizer.encode(text).ids
    batch = torch.tensor([token_ids]).to(device)

    model.eval()
    with torch.no_grad():
        probs = torch.softmax(model(batch), dim=1)

    pred = torch.argmax(probs, dim=1).item()
    confidence = probs[0][pred].item()

    if num_classes != 5:
        return pred, confidence

    # Five classes are interpreted as star ratings (Yelp dataset).
    star_labels = {
        0: "Very Negative (1 star)",
        1: "Negative (2 stars)",
        2: "Neutral (3 stars)",
        3: "Positive (4 stars)",
        4: "Very Positive (5 stars)",
    }
    return star_labels.get(pred, f"Class {pred}"), confidence

# Cell 11: Save model artifacts
def save_artifacts(model, tokenizer, device):
    """Write the model weights and the serialized tokenizer to the working directory.

    Produces ``transformer_classifier.pth`` (the model's ``state_dict``) and
    ``tokenizer.json`` (the tokenizer's ``to_str()`` output). ``device`` is
    accepted but not used.
    """
    weights_file = "transformer_classifier.pth"
    tokenizer_file = "tokenizer.json"

    torch.save(model.state_dict(), weights_file)

    with open(tokenizer_file, "w") as handle:
        handle.write(tokenizer.to_str())

    print("Model and tokenizer saved successfully.")

# Cell 12: Main execution
def main():
    """Run the full pipeline: data, model, training, artifact export, sample prediction."""
    tokenizer, train_dataset, test_dataset, num_classes = load_and_process_data()

    vocab_size = tokenizer.get_vocab_size()
    model, criterion, optimizer, device = init_model(vocab_size, num_classes)

    checkpoint_path = setup_checkpointing()

    # Write the tokenizer before training starts (presumably so it is on disk
    # even if the training run is interrupted); save_artifacts writes it again.
    with open("tokenizer.json", "w") as tokenizer_file:
        tokenizer_file.write(tokenizer.to_str())

    model = train_model(model, train_dataset, test_dataset, criterion, optimizer, device, checkpoint_path)

    save_artifacts(model, tokenizer, device)

    # Smoke-test inference on a single hand-written review.
    sample_text = "The food was delicious and the service was excellent!"
    sentiment, confidence = predict_sentiment(sample_text, model, tokenizer, device, num_classes)
    print(f"Sample text: '{sample_text}'")
    print(f"Predicted sentiment: {sentiment} (confidence: {confidence:.4f})")

main()

Loading dataset 'yelp_review_full'...
Detected 5 classes in dataset 'yelp_review_full'
Training BPE tokenizer...
Tokenizer vocabulary size: 30000
Creating memory-efficient datasets...
Training samples: 650436
Test samples: 50000
Using device: cuda


Training:   0%|          | 0/10164 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 1/30 | Time: 743.26s
Train Loss: 0.7662 | Train Acc: 0.6710
Val Loss: 0.8769 | Val Acc: 0.6202
Checkpoint saved at epoch 1
New best model saved with accuracy: 0.6202


Training:   0%|          | 0/10164 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 2/30 | Time: 861.25s
Train Loss: 0.7583 | Train Acc: 0.6745
Val Loss: 0.8873 | Val Acc: 0.6196
Checkpoint saved at epoch 2


Training:   0%|          | 0/10164 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 3/30 | Time: 867.66s
Train Loss: 0.7512 | Train Acc: 0.6782
Val Loss: 0.9086 | Val Acc: 0.6202
Checkpoint saved at epoch 3


Training:   0%|          | 0/10164 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 4/30 | Time: 886.97s
Train Loss: 0.7439 | Train Acc: 0.6813
Val Loss: 0.8885 | Val Acc: 0.6202
Checkpoint saved at epoch 4
New best model saved with accuracy: 0.6202


Training:   0%|          | 0/10164 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 5/30 | Time: 875.97s
Train Loss: 0.7381 | Train Acc: 0.6846
Val Loss: 0.9069 | Val Acc: 0.6205
Checkpoint saved at epoch 5
New best model saved with accuracy: 0.6205


Training:   0%|          | 0/10164 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 6/30 | Time: 915.43s
Train Loss: 0.7301 | Train Acc: 0.6886
Val Loss: 0.9081 | Val Acc: 0.6206
Checkpoint saved at epoch 6
New best model saved with accuracy: 0.6206


Training:   0%|          | 0/10164 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 7/30 | Time: 875.10s
Train Loss: 0.7242 | Train Acc: 0.6911
Val Loss: 0.9152 | Val Acc: 0.6208
Checkpoint saved at epoch 7
New best model saved with accuracy: 0.6208


Training:   0%|          | 0/10164 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 8/30 | Time: 890.47s
Train Loss: 0.7162 | Train Acc: 0.6945
Val Loss: 0.9229 | Val Acc: 0.6176
Checkpoint saved at epoch 8


Training:   0%|          | 0/10164 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 9/30 | Time: 870.69s
Train Loss: 0.7107 | Train Acc: 0.6978
Val Loss: 0.9193 | Val Acc: 0.6200
Checkpoint saved at epoch 9


Training:   0%|          | 0/10164 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 10/30 | Time: 901.02s
Train Loss: 0.7033 | Train Acc: 0.7011
Val Loss: 0.9415 | Val Acc: 0.6199
Checkpoint saved at epoch 10


Training:   0%|          | 0/10164 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 11/30 | Time: 837.74s
Train Loss: 0.6970 | Train Acc: 0.7038
Val Loss: 0.9396 | Val Acc: 0.6172
Checkpoint saved at epoch 11


Training:   0%|          | 0/10164 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 12/30 | Time: 894.07s
Train Loss: 0.6912 | Train Acc: 0.7062
Val Loss: 0.9762 | Val Acc: 0.6188
Checkpoint saved at epoch 12


Training:   0%|          | 0/10164 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 13/30 | Time: 836.24s
Train Loss: 0.6825 | Train Acc: 0.7104
Val Loss: 0.9728 | Val Acc: 0.6165
Checkpoint saved at epoch 13


Training:   0%|          | 0/10164 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 14/30 | Time: 837.58s
Train Loss: 0.6776 | Train Acc: 0.7129
Val Loss: 0.9823 | Val Acc: 0.6144
Checkpoint saved at epoch 14


Training:   0%|          | 0/10164 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 15/30 | Time: 853.83s
Train Loss: 0.6712 | Train Acc: 0.7154
Val Loss: 0.9826 | Val Acc: 0.6164
Checkpoint saved at epoch 15


Training:   0%|          | 0/10164 [00:00<?, ?it/s]

In [None]:
# ## Transformer Text Classifier with Hugging Face BPE Subtokenizer
# # Import required libraries

# # Cell 1: Install required packages if not already installed

# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
# import pickle
# import math
# from datasets import concatenate_datasets
# from datasets import load_dataset
# from tokenizers import ByteLevelBPETokenizer
# from tokenizers.processors import BertProcessing
# import os
# import gc
# import numpy as np
# from tqdm.notebook import tqdm
# import time

# # Cell 2: Initialize and train BPE tokenizer
# VOCAB_SIZE = 30000
# MIN_FREQUENCY = 2
# SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"]

# # Train a Byte-Level BPE tokenizer on the training texts
# def train_bpe_tokenizer(text_iterable, vocab_size=VOCAB_SIZE, min_frequency=MIN_FREQUENCY):
#     # Use a small sample for training the tokenizer to save memory
#     if isinstance(text_iterable, list):
#         sample_size = min(30000, len(text_iterable))
#         import random
#         random.seed(42)
#         sample_texts = random.sample(text_iterable, sample_size)
#     else:
#         # For dataset iterables
#         sample_texts = []
#         for i, text in enumerate(text_iterable):
#             if i >= 30000:
#                 break
#             sample_texts.append(text)

#     tokenizer = ByteLevelBPETokenizer(
#         lowercase=True,
#         add_prefix_space=True
#     )
#     tokenizer.train_from_iterator(
#         sample_texts,
#         vocab_size=vocab_size,
#         min_frequency=min_frequency,
#         special_tokens=SPECIAL_TOKENS
#     )
#     tokenizer._tokenizer.post_processor = BertProcessing(
#         ("[SEP]", tokenizer.token_to_id("[SEP]")),
#         ("[CLS]", tokenizer.token_to_id("[CLS]"))
#     )
#     tokenizer.enable_truncation(max_length=256)
#     tokenizer.enable_padding(
#         length=256,
#         pad_id=tokenizer.token_to_id("[PAD]"),
#         pad_token="[PAD]"
#     )
#     return tokenizer

# # Cell 3: Transformer model architecture (unchanged)
# class PositionalEncoding(nn.Module):
#     def __init__(self, d_model, max_len=512):
#         super().__init__()
#         pe = torch.zeros(max_len, d_model)
#         position = torch.arange(0, max_len).unsqueeze(1)
#         div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term)
#         pe = pe.unsqueeze(0)
#         self.register_buffer('pe', pe)

#     def forward(self, x):
#         return x + self.pe[:, :x.size(1)]

# class TransformerInputLayer(nn.Module):
#     def __init__(self, vocab_size, d_model, max_len=256, dropout=0.1):
#         super().__init__()
#         self.token_embedding = nn.Embedding(vocab_size, d_model)
#         self.positional_encoding = PositionalEncoding(d_model, max_len)
#         self.dropout = nn.Dropout(dropout)

#     def forward(self, input_ids):
#         x = self.token_embedding(input_ids)
#         x = self.positional_encoding(x)
#         return self.dropout(x)

# class MultiHeadSelfAttention(nn.Module):
#     def __init__(self, d_model, num_heads):     # apple [128] ---> Wq , Wk , Wv  --> Q,K,V  = Q.K = attention(0.78) --> [128] -->king
#         super().__init__()
#         assert d_model % num_heads == 0
#         self.d_k = d_model // num_heads
#         self.num_heads = num_heads
#         self.q_linear = nn.Linear(d_model, d_model)
#         self.k_linear = nn.Linear(d_model, d_model)
#         self.v_linear = nn.Linear(d_model, d_model)
#         self.out_linear = nn.Linear(d_model, d_model)
#         self.dropout = nn.Dropout(0.1)

#     def forward(self, x, mask=None):
#         B, T, D = x.size()
#         Q = self.q_linear(x).view(B, T, self.num_heads, self.d_k).transpose(1, 2)
#         K = self.k_linear(x).view(B, T, self.num_heads, self.d_k).transpose(1, 2)
#         V = self.v_linear(x).view(B, T, self.num_heads, self.d_k).transpose(1, 2)
#         scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
#         if mask is not None:
#             scores = scores.masked_fill(mask == 0, -1e9)
#         attn = torch.softmax(scores, dim=-1)
#         out = torch.matmul(self.dropout(attn), V)
#         out = out.transpose(1, 2).contiguous().view(B, T, D)
#         return self.out_linear(out)

# class FeedForward(nn.Module):
#     def __init__(self, d_model, d_ff):
#         super().__init__()
#         self.linear1 = nn.Linear(d_model, d_ff)
#         self.relu1 = nn.ReLU()
#         self.linear2 = nn.Linear(d_ff, d_ff//2)
#         self.relu2 = nn.ReLU()
#         self.linear3 = nn.Linear(d_ff//2, d_model)
#         self.dropout = nn.Dropout(0.1)

#     def forward(self, x):
#         return self.linear3(self.dropout(self.relu2(self.linear2(self.relu1(self.linear1(x))))))  #--2 layer
# class TransformerEncoderBlock(nn.Module):
#     def __init__(self, d_model, num_heads, d_ff):
#         super().__init__()
#         self.attn = MultiHeadSelfAttention(d_model, num_heads)
#         self.norm1 = nn.LayerNorm(d_model)
#         self.ff = FeedForward(d_model, d_ff)
#         self.norm2 = nn.LayerNorm(d_model)
#         self.dropout = nn.Dropout(0.1)

#     def forward(self, x, mask=None):
#         attn_out = self.attn(x, mask)
#         x = self.norm1(x + self.dropout(attn_out))
#         ff_out = self.ff(x)
#         return self.norm2(x + self.dropout(ff_out))

# class TransformerEncoder(nn.Module):
#     def __init__(self, vocab_size, d_model, num_heads, d_ff, num_layers, max_len, dropout):
#         super().__init__()
#         self.input_layer = TransformerInputLayer(vocab_size, d_model, max_len, dropout)
#         self.layers = nn.ModuleList([
#             TransformerEncoderBlock(d_model, num_heads, d_ff)
#             for _ in range(num_layers)
#         ])

#     def forward(self, input_ids, mask=None):
#         x = self.input_layer(input_ids)
#         for layer in self.layers:
#             x = layer(x, mask)
#         return x

# class TransformerClassifier(nn.Module):
#     def __init__(self, vocab_size, num_classes, d_model, num_heads, d_ff, num_layers, max_len, dropout):
#         super().__init__()
#         self.encoder = TransformerEncoder(vocab_size, d_model, num_heads, d_ff, num_layers, max_len, dropout)
#         self.classifier = nn.Linear(d_model, num_classes)

#     def forward(self, input_ids, mask=None):
#         x = self.encoder(input_ids, mask)
#         cls_out = x[:, 0, :]
#         return self.classifier(cls_out)

# # Cell 4: Memory-efficient dataset class
# class TokenizedDataset(Dataset):
#     def __init__(self, dataset, tokenizer, max_len=256, text_field='text', label_field='label'):
#         self.dataset = dataset
#         self.tokenizer = tokenizer
#         self.max_len = max_len
#         self.text_field = text_field
#         self.label_field = label_field

#     def __len__(self):
#         return len(self.dataset)

#     def __getitem__(self, idx):
#         item = self.dataset[idx]
#         text = item[self.text_field]
#         label = item[self.label_field]

#         encoding = self.tokenizer.encode(text)
#         input_ids = torch.tensor(encoding.ids, dtype=torch.long)

#         return input_ids, torch.tensor(label, dtype=torch.long)

# # Cell 5: Hyperparameters and data loading
# D_MODEL = 128  # 512
# NUM_HEADS = 4  # 8
# D_FF = 512     # 2048
# NUM_LAYERS = 2 # 4
# MAX_LEN = 256
# DROPOUT = 0.1
# BATCH_SIZE = 64
# LR = 3e-4
# EPOCHS = 30
# DATASET_NAME = 'yelp_review_full'

# # Cell 6: Load and process data
# def load_and_process_data(dataset_name=DATASET_NAME):
#     print(f"Loading dataset '{dataset_name}'...")
#     ds = load_dataset(dataset_name)
#     train = concatenate_datasets([ds['train'],custom_train])
#     test = ds['test']

#     unique_labels = set(train['label'])
#     num_classes = len(unique_labels)
#     print(f"Detected {num_classes} classes in dataset '{dataset_name}'")

#     print("Training BPE tokenizer...")
#     tokenizer = train_bpe_tokenizer(train['text'], vocab_size=VOCAB_SIZE)
#     print(f"Tokenizer vocabulary size: {tokenizer.get_vocab_size()}")

#     # Create memory-efficient datasets
#     print("Creating memory-efficient datasets...")
#     train_dataset = TokenizedDataset(train, tokenizer)
#     test_dataset = TokenizedDataset(test, tokenizer)

#     print(f"Training samples: {len(train_dataset)}")
#     print(f"Test samples: {len(test_dataset)}")

#     return tokenizer, train_dataset, test_dataset, num_classes

# # Initialize model, loss, and optimizer
# def init_model(vocab_size, num_classes):
#     model = TransformerClassifier(
#         vocab_size=vocab_size,
#         num_classes=num_classes,
#         d_model=D_MODEL,
#         num_heads=NUM_HEADS,
#         d_ff=D_FF,
#         num_layers=NUM_LAYERS,
#         max_len=MAX_LEN,
#         dropout=DROPOUT
#     )

#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     print(f"Using device: {device}")
#     model.to(device)

#     criterion = nn.CrossEntropyLoss()
#     optimizer = optim.AdamW(model.parameters(), lr=LR)

#     return model, criterion, optimizer, device

# # Cell 7: Checkpoint functions
# def setup_checkpointing():
#     if os.path.exists('/content/drive/MyDrive/model_checkpoints2'):
#         CHECKPOINT_DIR = '/content/drive/MyDrive/model_checkpoints2'
#     else:
#         CHECKPOINT_DIR = './model_checkpoints'

#     os.makedirs(CHECKPOINT_DIR, exist_ok=True)
#     CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, 'transformer_classifier_checkpoint_best_best.pth')
#     return CHECKPOINT_PATH

# def save_checkpoint(model, optimizer, epoch, loss, path):
#     torch.save({
#         'epoch': epoch,
#         'model_state_dict': model.state_dict(),
#         'optimizer_state_dict': optimizer.state_dict(),
#         'loss': loss,
#     }, path)
#     print(f"Checkpoint saved at epoch {epoch+1}")

# def load_checkpoint(model, optimizer, path, device):
#     if os.path.exists(path):
#         checkpoint = torch.load(path, map_location=device)
#         model.load_state_dict(checkpoint)
#     return 0


# # Cell 8: Training and evaluation functions
# # NOTE(review): disabled code. Runs one optimisation pass over train_loader
# # and returns (avg_loss, accuracy). avg_loss is the mean of the per-batch
# # losses (sum divided by the number of batches, not by the number of samples);
# # accuracy is correct predictions divided by samples seen. An empty loader
# # would raise ZeroDivisionError at the two final divisions.
# def train_epoch(model, train_loader, optimizer, criterion, device):
#     model.train()
#     total_loss = 0.0
#     correct = 0
#     total = 0

#     progress_bar = tqdm(train_loader, desc="Training")
#     for inputs, labels in progress_bar:
#         inputs, labels = inputs.to(device), labels.to(device)

#         optimizer.zero_grad()
#         logits = model(inputs)
#         loss = criterion(logits, labels)
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()

#         # Calculate accuracy
#         preds = torch.argmax(logits, dim=1)
#         correct += (preds == labels).sum().item()
#         total += labels.size(0)

#         # Update progress bar
#         # NOTE(review): "loss" is the current batch's loss, while "acc" is the
#         # running accuracy over all batches seen so far in this epoch.
#         progress_bar.set_postfix({
#             "loss": f"{loss.item():.4f}",
#             "acc": f"{correct/total:.4f}"
#         })

#     avg_loss = total_loss / len(train_loader)
#     accuracy = correct / total
#     return avg_loss, accuracy

# # NOTE(review): disabled code. Computes the same two metrics as train_epoch
# # (mean per-batch loss, overall accuracy) with the model in eval mode and
# # gradients disabled; no optimizer step is taken. Returns (avg_loss, accuracy).
# # An empty loader would raise ZeroDivisionError at the final divisions.
# def evaluate(model, test_loader, criterion, device):
#     model.eval()
#     total_loss = 0.0
#     correct = 0
#     total = 0

#     with torch.no_grad():
#         for inputs, labels in tqdm(test_loader, desc="Evaluating"):
#             inputs, labels = inputs.to(device), labels.to(device)
#             logits = model(inputs)
#             loss = criterion(logits, labels)

#             total_loss += loss.item()
#             preds = torch.argmax(logits, dim=1)
#             correct += (preds == labels).sum().item()
#             total += labels.size(0)

#     avg_loss = total_loss / len(test_loader)
#     accuracy = correct / total
#     return avg_loss, accuracy

# # Cell 9: Main training loop
# # NOTE(review): disabled code. Builds train/test DataLoaders, optionally loads
# # weights via load_checkpoint, then for each epoch trains, evaluates on
# # test_dataset, prints metrics, saves a checkpoint and (on improvement) the
# # best weights. Returns the model as it stands after the final epoch, which
# # is not necessarily the best-scoring one.
# # The test split doubles as the validation set used for model selection here.
# def train_model(model, train_dataset, test_dataset, criterion, optimizer, device, checkpoint_path, epochs=EPOCHS):
#     # Set up data loaders with appropriate batch size and num_workers
#     # NOTE(review): worker count is hard-coded rather than derived from the host.
#     NWORKERS = 12
#     train_loader = DataLoader(
#         train_dataset,
#         batch_size=BATCH_SIZE,
#         shuffle=True,
#         num_workers=NWORKERS,
#         pin_memory=True
#     )

#     test_loader = DataLoader(
#         test_dataset,
#         batch_size=BATCH_SIZE,
#         shuffle=False,
#         num_workers=NWORKERS,
#         pin_memory=True
#     )

#     # Try to load checkpoint
#     # NOTE(review): load_checkpoint always returns 0, so start_epoch is always 0.
#     start_epoch = load_checkpoint(model, optimizer, checkpoint_path, device)

#     # NOTE(review): reset on every call, so the first epoch of any run is
#     # always treated as a "new best" even if weights were loaded above.
#     best_accuracy = 0.0

#     for epoch in range(start_epoch, epochs):
#         start_time = time.time()

#         # Train for one epoch
#         train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)

#         # Evaluate
#         val_loss, val_acc = evaluate(model, test_loader, criterion, device)

#         # Report metrics
#         epoch_time = time.time() - start_time
#         print(f"Epoch {epoch+1}/{epochs} | Time: {epoch_time:.2f}s")
#         print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
#         print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

#         # Save checkpoint
#         save_checkpoint(model, optimizer, epoch, val_loss, checkpoint_path)

#         # Save best model
#         # NOTE(review): best_model_path is the SAME file as checkpoint_path, so
#         # this bare state_dict overwrites the full checkpoint dict written just
#         # above. The file ends up holding a full checkpoint only after epochs
#         # that did not improve val_acc; use a separate path before re-enabling.
#         if val_acc > best_accuracy:
#             best_accuracy = val_acc
#             best_model_path = checkpoint_path
#             torch.save(model.state_dict(), best_model_path)
#             print(f"New best model saved with accuracy: {best_accuracy:.4f}")

#         # Force garbage collection to free memory
#         gc.collect()
#         if torch.cuda.is_available():
#             torch.cuda.empty_cache()

#     return model

# # Cell 10: Example inference
# # NOTE(review): disabled code. Encodes `text`, runs the model on a batch of
# # one, and returns (label, probability of the predicted class). When
# # num_classes == 5 the label is a human-readable string from sentiment_map;
# # otherwise it is the raw predicted class index.
# # The encoded ids are passed to the model as-is: this function applies no
# # truncation or padding to MAX_LEN (TODO confirm whether the tokenizer is
# # configured to do so).
# def predict_sentiment(text, model, tokenizer, device, num_classes):
#     enc = tokenizer.encode(text)
#     input_ids = torch.tensor([enc.ids]).to(device)
#     model.eval()
#     with torch.no_grad():
#         logits = model(input_ids)
#         probs = torch.softmax(logits, dim=1)
#         pred = torch.argmax(probs, dim=1).item()
#         # Map score to sentiment (for Yelp dataset)
#         if num_classes == 5:  # Yelp dataset
#             sentiment_map = {
#                 0: "Very Negative (1 star)",
#                 1: "Negative (2 stars)",
#                 2: "Neutral (3 stars)",
#                 3: "Positive (4 stars)",
#                 4: "Very Positive (5 stars)"
#             }
#             return sentiment_map.get(pred, f"Class {pred}"), probs[0][pred].item()
#         else:
#             return pred, probs[0][pred].item()

# # Cell 11: Save model artifacts
# # NOTE(review): disabled code. Writes the model's state_dict to
# # "transformer_classifier.pth" and the tokenizer's string form to
# # "tokenizer.json"; both paths are relative to the current working directory.
# # The `device` parameter is unused.
# def save_artifacts(model, tokenizer, device):
#     # Save model state
#     torch.save(model.state_dict(), "transformer_classifier.pth")

#     # Save tokenizer
#     with open("tokenizer.json", "w") as f:
#         f.write(tokenizer.to_str())

#     print("Model and tokenizer saved successfully.")

# # Cell 12: Main execution
# # NOTE(review): disabled code, including the main() call at the bottom.
# # Pipeline: load data -> build model -> resolve checkpoint path -> train ->
# # save artifacts -> run one sample prediction and print it.
# def main():
#     # Load and process data
#     tokenizer, train_dataset, test_dataset, num_classes = load_and_process_data()

#     # Initialize model and related components
#     model, criterion, optimizer, device = init_model(tokenizer.get_vocab_size(), num_classes)

#     # Setup checkpointing
#     checkpoint_path = setup_checkpointing()

#     # NOTE(review): tokenizer.json is written here before training and again
#     # inside save_artifacts() after training; the second write is redundant.
#     with open("tokenizer.json", "w") as f:
#         f.write(tokenizer.to_str())
#     # Train model
#     model = train_model(model, train_dataset, test_dataset, criterion, optimizer, device, checkpoint_path)

#     # Save artifacts
#     save_artifacts(model, tokenizer, device)

#     # Example inference
#     sample_text = "The food was delicious and the service was excellent!"
#     sentiment, confidence = predict_sentiment(sample_text, model, tokenizer, device, num_classes)
#     print(f"Sample text: '{sample_text}'")
#     print(f"Predicted sentiment: {sentiment} (confidence: {confidence:.4f})")

# main()

Loading dataset 'yelp_review_full'...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/299M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Detected 5 classes in dataset 'yelp_review_full'
Training BPE tokenizer...
Tokenizer vocabulary size: 30000
Creating memory-efficient datasets...
Training samples: 650436
Test samples: 50000
Using device: cuda




Training:   0%|          | 0/10164 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Epoch 1/30 | Time: 662.36s
Train Loss: 0.7732 | Train Acc: 0.6677
Val Loss: 0.8870 | Val Acc: 0.6196
Checkpoint saved at epoch 1
New best model saved with accuracy: 0.6196


Training:   0%|          | 0/10164 [00:00<?, ?it/s]