In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os
import requests
import zipfile
from collections import defaultdict
import string
import numpy as np

# Select the compute device: the GPU when CUDA is usable, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# 1. Download and load GloVe embeddings manually
def load_glove(embedding_dim=100):
    """Load GloVe 6B word vectors, downloading the archive first if needed.

    Args:
        embedding_dim: Dimensionality of the vectors to load. Selects the file
            ``glove.6B.{embedding_dim}d.txt`` in the current working directory.

    Returns:
        A ``defaultdict`` mapping each word to its vector (float32 arrays for
        words read from the file). ``'<pad>'`` and ``'<unk>'`` are
        pre-populated with zero vectors. Looking up a word that is absent
        inserts and returns a random normal vector (scale 0.6).

    Raises:
        requests.HTTPError: If the download responds with an error status.
    """
    glove_path = f'glove.6B.{embedding_dim}d.txt'

    # Download GloVe embeddings only if the file for the requested dimension
    # is missing (the check must match the file that is opened below).
    if not os.path.exists(glove_path):
        url = 'https://nlp.stanford.edu/data/glove.6B.zip'
        archive_path = 'glove.6B.zip'
        # Stream to disk in chunks so the whole archive is never held in memory.
        with requests.get(url, allow_redirects=True, stream=True) as r:
            # Fail loudly instead of saving an HTML error page as the zip.
            r.raise_for_status()
            with open(archive_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        with zipfile.ZipFile(archive_path, 'r') as zip_ref:
            zip_ref.extractall()

    # Load embeddings
    embeddings = defaultdict(lambda: np.random.normal(scale=0.6, size=(embedding_dim,)))
    embeddings['<pad>'] = np.zeros(embedding_dim)
    embeddings['<unk>'] = np.zeros(embedding_dim)

    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector

    return embeddings

# 2. Create vocabulary and embedding matrix
class Vocabulary:
    """Word/index mappings together with the matching embedding matrix.

    Index 0 is ``'<pad>'`` and index 1 is ``'<unk>'``; both are backed by zero
    vectors. Every other word of the embedding table follows from index 2 on,
    in the table's iteration order.

    Attributes are plain fields:
        embeddings: the word -> vector mapping the vocabulary was built from.
        stoi / itos: word -> index and index -> word dictionaries.
        vectors: float32 tensor with one row per index.
    """

    def __init__(self, embedding_dim=100, embeddings=None):
        """Build the vocabulary.

        Args:
            embedding_dim: Length of each embedding vector.
            embeddings: Optional pre-loaded word -> vector mapping. When
                omitted, ``load_glove(embedding_dim)`` is called.
        """
        if embeddings is None:
            embeddings = load_glove(embedding_dim)
        self.embeddings = embeddings
        self.stoi = {'<pad>': 0, '<unk>': 1}
        self.itos = {0: '<pad>', 1: '<unk>'}
        # Size the special-token rows from embedding_dim, not a fixed 100.
        vectors = [
            np.zeros(embedding_dim, dtype=np.float32),
            np.zeros(embedding_dim, dtype=np.float32),
        ]

        # Build vocabulary
        for word, vector in self.embeddings.items():
            if word in self.stoi:
                # The table may itself contain '<pad>'/'<unk>'; re-adding them
                # would overwrite their reserved indices 0 and 1.
                continue
            idx = len(vectors)
            self.stoi[word] = idx
            self.itos[idx] = word
            vectors.append(vector)

        self.vectors = torch.tensor(np.array(vectors), dtype=torch.float32)

    def __len__(self):
        """Return the number of rows in the embedding matrix."""
        return len(self.vectors)

# 3. Dataset class
class TextDataset(Dataset):
    """Dataset that turns raw strings into fixed-length index tensors.

    Each item is a ``(token_indices, label)`` pair of tensors, where the
    indices are padded with 0 or truncated to exactly ``max_len`` entries.
    """

    def __init__(self, texts, labels, vocab, max_len=100):
        """Store the texts, their labels, the vocabulary and the target length."""
        self.texts, self.labels = texts, labels
        self.vocab, self.max_len = vocab, max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded text and its label for position ``idx``."""
        raw_text = self.texts[idx]
        target = self.labels[idx]

        lookup = self.vocab.stoi
        # Words missing from the vocabulary map to index 1 (<unk>).
        ids = [lookup.get(word, 1) for word in self.tokenize(raw_text)]

        # Truncate first, then right-pad with 0 (<pad>) up to max_len.
        ids = ids[:self.max_len]
        ids.extend([0] * (self.max_len - len(ids)))

        return torch.tensor(ids), torch.tensor(target)

    def tokenize(self, text):
        """Lower-case ``text``, drop ASCII punctuation and split on whitespace."""
        strip_punct = str.maketrans('', '', string.punctuation)
        lowered = text.lower()
        return lowered.translate(strip_punct).split()

# 4. Load the IMDB dataset and hold out 20% of the training split for validation
def load_imdb():
    """Return IMDB texts and labels for the train, validation and test splits.

    The validation split is 20% of the original training split, taken with a
    fixed seed (1234). The result is a dict with ``*_texts`` and ``*_labels``
    keys for ``train``, ``valid`` and ``test``.
    """
    # NOTE(review): `load_dataset` is not imported in the visible code;
    # presumably an earlier cell provides it - confirm.
    corpus = load_dataset('imdb')
    partition = corpus['train'].train_test_split(test_size=0.2, seed=1234)
    fit_part = partition['train']
    held_out_part = partition['test']  # used as the validation split
    test_part = corpus['test']

    return {
        'train_texts': fit_part['text'],
        'train_labels': fit_part['label'],
        'valid_texts': held_out_part['text'],
        'valid_labels': held_out_part['label'],
        'test_texts': test_part['text'],
        'test_labels': test_part['label']
    }

# Raw text/label splits and the vocabulary built from the GloVe table
data = load_imdb()
vocab = Vocabulary()

# 5. Create datasets and dataloaders
MAX_LEN = 100
BATCH_SIZE = 64

train_dataset = TextDataset(
    texts=data['train_texts'], labels=data['train_labels'], vocab=vocab, max_len=MAX_LEN
)
valid_dataset = TextDataset(
    texts=data['valid_texts'], labels=data['valid_labels'], vocab=vocab, max_len=MAX_LEN
)
test_dataset = TextDataset(
    texts=data['test_texts'], labels=data['test_labels'], vocab=vocab, max_len=MAX_LEN
)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# 6. Model definition
class TextClassifier(nn.Module):
    """Bidirectional LSTM classifier on top of pretrained word embeddings.

    The embedding layer is initialised from a pretrained matrix and is
    fine-tuned during training (``freeze=False``); index 0 is the padding
    index.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes,
                 pretrained_vectors=None):
        """Build the layers.

        Args:
            vocab_size: Number of rows expected in the embedding matrix.
            embedding_dim: Length of each embedding vector (LSTM input size).
            hidden_dim: Hidden size of the LSTM, per direction.
            num_classes: Number of output logits.
            pretrained_vectors: Optional 2-D tensor of pretrained embeddings.
                When omitted, the module-level ``vocab.vectors`` is used,
                which is what this class always did before the parameter
                existed.

        Raises:
            ValueError: If the embedding matrix does not have shape
                ``(vocab_size, embedding_dim)``.
        """
        super().__init__()
        if pretrained_vectors is None:
            # Backward-compatible fallback to the global vocabulary.
            pretrained_vectors = vocab.vectors
        if tuple(pretrained_vectors.shape) != (vocab_size, embedding_dim):
            raise ValueError(
                f'expected embeddings of shape {(vocab_size, embedding_dim)}, '
                f'got {tuple(pretrained_vectors.shape)}'
            )
        self.embedding = nn.Embedding.from_pretrained(pretrained_vectors, freeze=False, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return one row of ``num_classes`` logits per input sequence.

        Assumes ``x`` is a (batch, seq_len) tensor of token indices, as
        implied by ``batch_first=True`` - TODO confirm against callers.
        """
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        # The last two entries of `hidden` are the final states of the two
        # directions; concatenate them into one feature vector per sequence.
        features = torch.cat((hidden[-2], hidden[-1]), dim=1)
        return self.fc(self.dropout(features))

# Build the classifier, then move its parameters to the selected device
model = TextClassifier(vocab_size=len(vocab), embedding_dim=100, hidden_dim=256, num_classes=1)
model = model.to(device)

# 7. Training setup
# The model emits a single raw logit, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

# 8. Training loop
def train_epoch(model, loader):
    """Run one optimisation pass over ``loader``.

    Relies on the module-level ``optimizer``, ``criterion`` and ``device``.

    Args:
        model: The network to train; it is switched to training mode.
        loader: Iterable of ``(inputs, labels)`` batches that also exposes
            ``len(loader)`` and ``loader.dataset``.

    Returns:
        A ``(loss, accuracy)`` tuple: the mean of the per-batch losses and the
        fraction of correct predictions over ``len(loader.dataset)``.
    """
    model.train()
    total_loss = 0
    correct = 0

    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.float().to(device)
        optimizer.zero_grad()
        # Remove only the trailing logit dimension. A bare squeeze() would
        # also drop the batch dimension when a batch holds a single sample,
        # leaving a 0-d output that no longer matches the 1-d labels.
        outputs = model(inputs).squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Threshold the sigmoid probability at 0.5 to get hard predictions.
        preds = torch.round(torch.sigmoid(outputs))
        correct += (preds == labels).sum().item()

    return total_loss / len(loader), correct / len(loader.dataset)

def evaluate(model, loader):
    """Measure loss and accuracy of ``model`` on ``loader`` without training.

    Relies on the module-level ``criterion`` and ``device``.

    Args:
        model: The network to evaluate; it is switched to evaluation mode.
        loader: Iterable of ``(inputs, labels)`` batches that also exposes
            ``len(loader)`` and ``loader.dataset``.

    Returns:
        A ``(loss, accuracy)`` tuple: the mean of the per-batch losses and the
        fraction of correct predictions over ``len(loader.dataset)``.
    """
    model.eval()
    total_loss = 0
    correct = 0

    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.float().to(device)
            # Remove only the trailing logit dimension. A bare squeeze() would
            # also drop the batch dimension when a batch holds a single
            # sample, leaving a 0-d output that no longer matches the labels.
            outputs = model(inputs).squeeze(-1)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            # Threshold the sigmoid probability at 0.5 to get hard predictions.
            preds = torch.round(torch.sigmoid(outputs))
            correct += (preds == labels).sum().item()

    return total_loss / len(loader), correct / len(loader.dataset)

# Run training: train for N_EPOCHS epochs, validating after each one and
# checkpointing the weights whenever the validation loss improves.
N_EPOCHS = 2
best_valid_loss = float('inf')  # any first validation loss counts as an improvement

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_epoch(model, train_loader)
    valid_loss, valid_acc = evaluate(model, valid_loader)
    
    # Keep only the weights with the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_model.pt')
    
    # Accuracies are fractions in [0, 1]; scale to percentages for display.
    print(f'Epoch {epoch+1}')
    print(f'Train Loss: {train_loss:.4f} | Acc: {train_acc*100:.2f}%')
    print(f'Valid Loss: {valid_loss:.4f} | Acc: {valid_acc*100:.2f}%')

# Final evaluation: restore the best checkpoint (not necessarily the weights
# from the last epoch) before scoring the test split.
model.load_state_dict(torch.load('best_model.pt'))
test_loss, test_acc = evaluate(model, test_loader)
print(f'Test Loss: {test_loss:.4f} | Acc: {test_acc*100:.2f}%')

Epoch 1
Train Loss: 0.6787 | Acc: 57.90%
Valid Loss: 0.6001 | Acc: 69.40%
Epoch 2
Train Loss: 0.5984 | Acc: 67.84%
Valid Loss: 0.4883 | Acc: 78.68%
Test Loss: 0.5106 | Acc: 76.81%
