## Imports

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
from tqdm import tqdm
import pickle
import os

# Import the Encoder class from transformers_scratch.py
from transformers_scratch import Encoder

## Transformer Architecture

In [None]:
class SentimentTransformer(nn.Module):
    """Two-class sentiment classifier: ``Encoder`` + pooled feed-forward head.

    Token ids are passed through ``Encoder``; the per-token outputs are
    averaged over the non-padding positions of each sequence and the pooled
    vector is mapped to 2 logits by a small MLP.

    Args:
        vocab_size: Vocabulary size, forwarded to ``Encoder``.
        pad_idx: Token id that marks padding. Positions holding this id are
            masked in ``make_src_mask`` and excluded from the pooling.
        embed_size: Forwarded to ``Encoder``; also the classifier input width.
        num_layers: Forwarded to ``Encoder``.
        forward_expansion: Forwarded to ``Encoder``.
        heads: Forwarded to ``Encoder``.
        dropout: Forwarded to ``Encoder`` and used in the classifier head.
        device: Forwarded to ``Encoder``; the source mask is moved to it.
        max_length: Forwarded to ``Encoder``.
    """

    def __init__(
        self,
        vocab_size,
        pad_idx,
        embed_size=512,
        num_layers=6,
        forward_expansion=4,
        heads=8,
        dropout=0.1,
        device="cuda",
        max_length=512
    ):
        super().__init__()

        # NOTE: arguments are positional, in the order Encoder is called with
        # in the original code; keep this order in sync with Encoder.__init__.
        self.encoder = Encoder(
            vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length
        )

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(embed_size, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 2)  # 2 classes for sentiment
        )

        self.pad_idx = pad_idx
        self.device = device

    def make_src_mask(self, src):
        """Return a boolean mask that is True wherever ``src`` is not padding.

        Two singleton dimensions are inserted after the first one, so a
        ``(batch, seq_len)`` input yields a ``(batch, 1, 1, seq_len)`` mask.
        """
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask.to(self.device)

    @staticmethod
    def _masked_mean(encoder_out, token_mask):
        """Average ``encoder_out`` over dim 1, counting only masked-in tokens.

        Args:
            encoder_out: Float tensor; assumed ``(batch, seq_len, embed)`` as
                implied by the pooling over ``dim=1`` feeding the classifier.
            token_mask: Boolean ``(batch, seq_len)`` tensor, True for real
                (non-padding) tokens.

        Returns:
            Tensor of shape ``(batch, embed)``. A row with no real tokens
            yields zeros instead of dividing by zero.
        """
        weights = token_mask.unsqueeze(-1).to(
            dtype=encoder_out.dtype, device=encoder_out.device
        )
        # clamp keeps an all-padding row from producing NaN (0 / 0).
        token_counts = weights.sum(dim=1).clamp(min=1.0)
        return (encoder_out * weights).sum(dim=1) / token_counts

    def forward(self, src):
        """Return class logits of shape ``(batch, 2)`` for token ids ``src``."""
        src_mask = self.make_src_mask(src)
        encoder_out = self.encoder(src, src_mask)
        # Pool over real tokens only: a plain mean would also average the
        # padding positions, making the result depend on the batch's padding.
        pooled = self._masked_mean(encoder_out, src != self.pad_idx)
        return self.classifier(pooled)

## IMDBDataset Class

In [None]:
class IMDBDataset(Dataset):
    """Map-style dataset turning review records into (token ids, label) pairs.

    Args:
        data: Indexable collection of records with ``'review'`` and
            ``'sentiment'`` keys.
        tokenizer: Callable that splits a review into a sliceable sequence
            of tokens.
        vocab: Mapping from token to integer id; must contain ``'<unk>'``.
        max_length: Maximum number of tokens kept per review.
    """

    def __init__(self, data, tokenizer, vocab, max_length=512):
        # Sentiment strings -> integer class ids.
        self.label_map = {'positive': 1, 'negative': 0}
        self.max_length = max_length
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        """Number of records in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` as two ``torch.long`` tensors."""
        record = self.data[idx]
        review, sentiment = record['review'], record['sentiment']

        # Truncate to max_length tokens, then look each one up; tokens not in
        # the vocabulary fall back to the '<unk>' id.
        token_ids = []
        for word in self.tokenizer(review)[:self.max_length]:
            token_ids.append(self.vocab.get(word, self.vocab['<unk>']))

        class_id = self.label_map[sentiment]
        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(class_id, dtype=torch.long),
        )

## Utility Functions

In [None]:
def collate_batch(batch, pad_idx=0):
    """Collate ``(token_ids, label)`` pairs into padded batch tensors.

    Args:
        batch: Iterable of ``(text_tensor, label)`` pairs, where each
            ``text_tensor`` is a 1-D tensor of token ids.
        pad_idx: Id used to right-pad shorter sequences. Defaults to 0, the id
            ``build_vocab`` assigns to ``'<pad>'`` and the one the model masks.
            (Padding with 1 would insert ``'<unk>'`` tokens that are never
            masked.)

    Returns:
        Tuple ``(texts, labels)``: ``texts`` has shape
        ``(batch, longest_sequence)`` and ``labels`` has shape ``(batch,)``.
    """
    sequences = [text for text, _ in batch]
    labels = [label for _, label in batch]
    padded = pad_sequence(sequences, batch_first=True, padding_value=pad_idx)
    return padded, torch.tensor(labels)

def build_vocab(data, tokenizer, max_vocab_size=25000):
    """Build a token -> id mapping from the most frequent tokens in ``data``.

    Args:
        data: Iterable of records with a ``'review'`` key.
        tokenizer: Callable that splits a review into tokens.
        max_vocab_size: Maximum number of corpus tokens to keep (the two
            special tokens are not counted against this limit).

    Returns:
        Dict with ``'<pad>'`` -> 0, ``'<unk>'`` -> 1 and corpus tokens numbered
        from 2 in descending frequency order. Ids are always contiguous, so
        ``len(vocab)`` is a valid embedding-table size.
    """
    counter = Counter()
    for example in data:
        counter.update(tokenizer(example['review']))

    vocab = {'<pad>': 0, '<unk>': 1}
    # A literal '<pad>'/'<unk>' in the corpus must not claim a ranked id:
    # overwriting it afterwards would leave a hole in the id range and make
    # the largest id exceed len(vocab) - 1.
    for special in vocab:
        counter.pop(special, None)

    for idx, (word, _) in enumerate(counter.most_common(max_vocab_size), start=2):
        vocab[word] = idx
    return vocab

def load_data_from_csv(df):
    """Convert a DataFrame into a list of ``{'review', 'sentiment'}`` dicts.

    Args:
        df: DataFrame with ``'review'`` and ``'sentiment'`` columns; any other
            columns are ignored.

    Returns:
        One dict per row, in row order.
    """
    # Iterate the two columns directly: iterrows() builds a Series per row,
    # which is far slower on a dataset of this size.
    return [
        {'review': review, 'sentiment': sentiment}
        for review, sentiment in zip(df['review'], df['sentiment'])
    ]

## Save and Load the Model

In [None]:
def save_model(model, vocab, tokenizer, save_dir='saved_model'):
    """Write the model weights, vocabulary and tokenizer into ``save_dir``.

    Creates ``save_dir`` if needed and writes ``model.pth`` (the model's
    ``state_dict``), ``vocab.pkl`` and ``tokenizer.pkl``.

    Args:
        model: Module whose ``state_dict()`` is saved.
        vocab: Picklable vocabulary object.
        tokenizer: Picklable tokenizer. Lambdas and locally defined functions
            cannot be pickled and make this function raise.
        save_dir: Target directory.

    Raises:
        Whatever ``pickle.dumps`` raises for an unpicklable object. The
        weights are written first, so they are not lost in that case.
    """
    os.makedirs(save_dir, exist_ok=True)
    # Save model state first: it is the most expensive artifact to recreate.
    torch.save(model.state_dict(), os.path.join(save_dir, 'model.pth'))

    # Serialise in memory before opening the file, so an unpicklable object
    # does not leave a truncated .pkl behind.
    for filename, artifact in (('vocab.pkl', vocab), ('tokenizer.pkl', tokenizer)):
        payload = pickle.dumps(artifact)
        with open(os.path.join(save_dir, filename), 'wb') as f:
            f.write(payload)
    print(f"Model and artifacts saved to {save_dir}")

def load_model(save_dir='saved_model', device='cpu', num_layers=2, heads=8):
    """Load the artifacts written by ``save_model`` and rebuild the model.

    Args:
        save_dir: Directory containing ``model.pth``, ``vocab.pkl`` and
            ``tokenizer.pkl``.
        device: Device the weights are mapped to and the model is moved to.
        num_layers: Encoder depth the checkpoint was trained with.
        heads: Number of attention heads the checkpoint was trained with.

    Returns:
        Tuple ``(model, vocab, tokenizer)`` with the model in eval mode.
    """
    # SECURITY: pickle.load can execute arbitrary code. Only load artifact
    # directories that you created yourself or otherwise trust.
    with open(os.path.join(save_dir, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)
    with open(os.path.join(save_dir, 'tokenizer.pkl'), 'rb') as f:
        tokenizer = pickle.load(f)

    state_dict = torch.load(os.path.join(save_dir, 'model.pth'), map_location=device)
    # Read the embedding width from the checkpoint rather than hard-coding it:
    # the first classifier layer is nn.Linear(embed_size, 256), so its weight
    # is (256, embed_size). A hard-coded width that differs from the training
    # configuration makes load_state_dict fail with a shape mismatch.
    embed_size = state_dict['classifier.0.weight'].shape[1]

    model = SentimentTransformer(
        vocab_size=len(vocab),
        pad_idx=vocab['<pad>'],
        embed_size=embed_size,
        num_layers=num_layers,
        heads=heads,
        device=device
    ).to(device)
    model.load_state_dict(state_dict)
    model.eval()
    print(f"Model and artifacts loaded from {save_dir}")
    return model, vocab, tokenizer

## Training Function

In [None]:
def train_model():
    """Train, evaluate and save the IMDB sentiment classifier.

    Reads ``IMDB Dataset.csv`` from the working directory, splits the rows
    80/20 into train/test sets, builds the vocabulary from the training rows,
    trains a ``SentimentTransformer`` for ``EPOCHS`` epochs (printing test-set
    accuracy after each epoch) and finally stores the weights, vocabulary and
    tokenizer with ``save_model``.
    """
    # Hyperparameters
    BATCH_SIZE = 32
    EPOCHS = 10
    LEARNING_RATE = 0.001
    MAX_LENGTH = 32
    EMBED_SIZE = 32
    NUM_HEADS = 8
    NUM_LAYERS = 2
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load dataset from CSV
    df = pd.read_csv('IMDB Dataset.csv', on_bad_lines='skip', quotechar='"', engine='python')
    df_train = df.sample(frac=0.8, random_state=42)
    df_test = df.drop(df_train.index)
    train_data = load_data_from_csv(df_train)
    test_data = load_data_from_csv(df_test)

    # Simple whitespace tokenizer. str.split is used instead of a lambda
    # because save_model pickles the tokenizer and lambdas cannot be pickled,
    # which would crash only after training has finished.
    tokenizer = str.split

    # Build vocabulary (training rows only, so test tokens stay unseen)
    vocab = build_vocab(train_data, tokenizer)

    # Create datasets
    train_dataset = IMDBDataset(train_data, tokenizer, vocab, MAX_LENGTH)
    test_dataset = IMDBDataset(test_data, tokenizer, vocab, MAX_LENGTH)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=False, collate_fn=collate_batch)

    # Initialize model
    model = SentimentTransformer(
        vocab_size=len(vocab),
        pad_idx=vocab['<pad>'],
        embed_size=EMBED_SIZE,
        num_layers=NUM_LAYERS,
        heads=NUM_HEADS,
        device=DEVICE
    ).to(DEVICE)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # Training loop
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{EPOCHS}')

        for batch_idx, (text, labels) in enumerate(progress_bar):
            text, labels = text.to(DEVICE), labels.to(DEVICE)

            optimizer.zero_grad()
            output = model(text)
            loss = criterion(output, labels)

            loss.backward()
            # Cap the gradient norm before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
            # Show the running mean loss over the batches seen this epoch.
            progress_bar.set_postfix({'loss': total_loss/(batch_idx+1)})

        # Validation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for text, labels in test_loader:
                text, labels = text.to(DEVICE), labels.to(DEVICE)
                outputs = model(text)
                # Index of the larger logit is the predicted class.
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # Guard against an empty test split (e.g. a very small CSV).
        accuracy = 100 * correct / total if total else 0.0
        print(f'Epoch {epoch+1}, Accuracy: {accuracy:.2f}%')

    # Save the model and artifacts
    save_model(model, vocab, tokenizer)

## Main Execution

In [None]:
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    train_model()