# Dataset 2: Linux Kernel

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pickle
import json
import re
from collections import Counter
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import requests
from pathlib import Path

# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")


Using device: cuda
GPU: NVIDIA H100 80GB HBM3
Memory: 85.03 GB


## 1. DATA LOADING AND PREPROCESSING

In [2]:
def load_data(url="https://cs.stanford.edu/people/karpathy/char-rnn/linux_input.txt"):
    """Download and load the Linux kernel code dataset"""
    print("Downloading dataset...")
    response = requests.get(url)
    text = response.text
    print(f"Dataset loaded: {len(text)} characters")
    return text

def preprocess_text(text, vocab_size=100000):
    """
    Preprocess text for code (preserve special characters)
    Split by newlines to treat each line as a statement
    """
    print("\nPreprocessing text...")
    
    # Split into lines and filter empty lines
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    
    # Tokenize into words (keeping code-relevant symbols)
    all_words = []
    for line in lines:
        # Split on whitespace but keep the structure
        words = line.split()
        all_words.extend(words)
    
    print(f"Total words: {len(all_words)}")
    
    # Build vocabulary from most frequent words
    word_counts = Counter(all_words)
    most_common = word_counts.most_common(vocab_size - 1)  # -1 for <UNK> token
    
    vocab = ['<UNK>'] + [word for word, _ in most_common]
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}
    
    print(f"\nVocabulary size: {len(vocab)}")
    print(f"\n10 most frequent words:")
    for word, count in most_common[:10]:
        print(f"  {word}: {count}")
    
    print(f"\n10 least frequent words in vocabulary:")
    for word, count in most_common[-10:]:
        print(f"  {word}: {count}")
    
    return all_words, vocab, word_to_idx, idx_to_word

def create_sequences(words, word_to_idx, context_length):
    """Create input-output sequences for training"""
    X, y = [], []
    
    for i in range(len(words) - context_length):
        context = words[i:i + context_length]
        target = words[i + context_length]
        
        # Convert to indices (use <UNK> for unknown words)
        context_indices = [word_to_idx.get(w, 0) for w in context]
        target_idx = word_to_idx.get(target, 0)
        
        X.append(context_indices)
        y.append(target_idx)
    
    return np.array(X), np.array(y)


## 2. DATASET CLASS

In [3]:
class WordPredictionDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.LongTensor(X)
        self.y = torch.LongTensor(y)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

## 3. MODEL DEFINITION

In [4]:
class NextWordMLP(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_length, 
                 hidden_size=1024, activation='relu'):
        super(NextWordMLP, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.flatten_size = context_length * embedding_dim
        
        # Choose activation function
        if activation == 'relu':
            self.activation = nn.ReLU()
        elif activation == 'tanh':
            self.activation = nn.Tanh()
        else:
            raise ValueError(f"Unknown activation: {activation}")
        
        # MLP layers
        self.fc1 = nn.Linear(self.flatten_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, vocab_size)
        
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.context_length = context_length
        self.activation_name = activation
    
    def forward(self, x):
        # x shape: (batch_size, context_length)
        embedded = self.embedding(x)  # (batch_size, context_length, embedding_dim)
        flattened = embedded.view(embedded.size(0), -1)  # (batch_size, context_length * embedding_dim)
        
        hidden = self.activation(self.fc1(flattened))
        output = self.fc2(hidden)
        
        return output


## 4. TRAINING FUNCTION

In [5]:
def train_model(model, train_loader, val_loader, epochs, lr, device, model_name):
    """Train a single model"""
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    train_losses = []
    val_losses = []
    val_accuracies = []
    
    print(f"\n{'='*60}")
    print(f"Training: {model_name}")
    print(f"{'='*60}")
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0
        train_batches = 0
        
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            train_batches += 1
        
        avg_train_loss = train_loss / train_batches
        train_losses.append(avg_train_loss)
        
        # Validation phase
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        val_batches = 0
        
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()
                val_batches += 1
                
                _, predicted = torch.max(outputs.data, 1)
                total += y_batch.size(0)
                correct += (predicted == y_batch).sum().item()
        
        avg_val_loss = val_loss / val_batches
        val_accuracy = 100 * correct / total
        
        val_losses.append(avg_val_loss)
        val_accuracies.append(val_accuracy)
        
        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f"Epoch [{epoch+1}/{epochs}] - "
                  f"Train Loss: {avg_train_loss:.4f}, "
                  f"Val Loss: {avg_val_loss:.4f}, "
                  f"Val Acc: {val_accuracy:.2f}%")
    
    print(f"\nFinal Results - Val Loss: {val_losses[-1]:.4f}, Val Acc: {val_accuracies[-1]:.2f}%")
    
    return {
        'train_losses': train_losses,
        'val_losses': val_losses,
        'val_accuracies': val_accuracies
    }


## 5. SAVE MODEL AND METADATA

In [6]:
def save_model_artifacts(model, history, vocab, word_to_idx, idx_to_word, 
                        config, model_name, save_dir='models'):
    """Save model, weights, vocabulary, and training history"""
    Path(save_dir).mkdir(exist_ok=True)
    
    # Save model weights
    model_path = f"{save_dir}/{model_name}_weights.pth"
    torch.save(model.state_dict(), model_path)
    
    # Save vocabulary and mappings
    vocab_data = {
        'vocab': vocab,
        'word_to_idx': word_to_idx,
        'idx_to_word': idx_to_word
    }
    with open(f"{save_dir}/{model_name}_vocab.pkl", 'wb') as f:
        pickle.dump(vocab_data, f)
    
    # Save config
    with open(f"{save_dir}/{model_name}_config.json", 'w') as f:
        json.dump(config, f, indent=2)
    
    # Save training history
    with open(f"{save_dir}/{model_name}_history.pkl", 'wb') as f:
        pickle.dump(history, f)
    
    print(f"Saved artifacts for {model_name}")

def plot_training_history(history, model_name, save_dir='models'):
    """Plot and save training curves"""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    
    # Loss curves
    ax1.plot(history['train_losses'], label='Train Loss')
    ax1.plot(history['val_losses'], label='Val Loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.set_title(f'{model_name} - Loss Curves')
    ax1.legend()
    ax1.grid(True)
    
    # Accuracy curve
    ax2.plot(history['val_accuracies'], label='Val Accuracy', color='green')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy (%)')
    ax2.set_title(f'{model_name} - Validation Accuracy')
    ax2.legend()
    ax2.grid(True)
    
    plt.tight_layout()
    plt.savefig(f"{save_dir}/{model_name}_curves.png", dpi=150)
    plt.close()

## 6. MAIN TRAINING PIPELINE

In [7]:
def main():
    # Hyperparameter configurations
    configs = [
        # context_length, embedding_dim, activation
        (3, 32, 'relu'),
        (3, 32, 'tanh'),
        (3, 64, 'relu'),
        (3, 64, 'tanh'),
        (5, 32, 'relu'),
        (5, 32, 'tanh'),
        (5, 64, 'relu'),
        (5, 64, 'tanh'),
    ]
    
    # Training parameters
    VOCAB_SIZE = 100000
    HIDDEN_SIZE = 1024
    EPOCHS = 200
    BATCH_SIZE = 512  # Large batch for H100
    LEARNING_RATE = 0.001
    VAL_SPLIT = 0.1
    
    # Load and preprocess data
    text = load_data()
    all_words, vocab, word_to_idx, idx_to_word = preprocess_text(text, VOCAB_SIZE)
    
    # Train each model configuration
    for idx, (context_length, embedding_dim, activation) in enumerate(configs, 1):
        model_name = f"model_{idx}_ctx{context_length}_emb{embedding_dim}_{activation}"
        
        print(f"\n{'#'*60}")
        print(f"# MODEL {idx}/8: Context={context_length}, Embedding={embedding_dim}, Activation={activation}")
        print(f"{'#'*60}")
        
        # Create sequences for this context length
        print(f"\nCreating sequences with context length {context_length}...")
        X, y = create_sequences(all_words, word_to_idx, context_length)
        print(f"Total sequences: {len(X)}")
        
        # Train-validation split
        split_idx = int(len(X) * (1 - VAL_SPLIT))
        X_train, y_train = X[:split_idx], y[:split_idx]
        X_val, y_val = X[split_idx:], y[split_idx:]
        
        print(f"Train samples: {len(X_train)}, Val samples: {len(X_val)}")
        
        # Create datasets and dataloaders
        train_dataset = WordPredictionDataset(X_train, y_train)
        val_dataset = WordPredictionDataset(X_val, y_val)
        
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, 
                                 shuffle=True, num_workers=4, pin_memory=True)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, 
                               shuffle=False, num_workers=4, pin_memory=True)
        
        # Create model
        model = NextWordMLP(
            vocab_size=len(vocab),
            embedding_dim=embedding_dim,
            context_length=context_length,
            hidden_size=HIDDEN_SIZE,
            activation=activation
        )
        
        print(f"\nModel architecture:")
        print(f"  Vocab size: {len(vocab)}")
        print(f"  Embedding dim: {embedding_dim}")
        print(f"  Context length: {context_length}")
        print(f"  Hidden size: {HIDDEN_SIZE}")
        print(f"  Activation: {activation}")
        print(f"  Total parameters: {sum(p.numel() for p in model.parameters()):,}")
        
        # Train model
        history = train_model(
            model, train_loader, val_loader, 
            EPOCHS, LEARNING_RATE, device, model_name
        )
        
        # Save model and artifacts
        config = {
            'context_length': context_length,
            'embedding_dim': embedding_dim,
            'activation': activation,
            'vocab_size': len(vocab),
            'hidden_size': HIDDEN_SIZE,
            'epochs': EPOCHS,
            'batch_size': BATCH_SIZE,
            'learning_rate': LEARNING_RATE,
            'final_val_loss': history['val_losses'][-1],
            'final_val_accuracy': history['val_accuracies'][-1]
        }
        
        save_model_artifacts(
            model, history, vocab, word_to_idx, idx_to_word,
            config, model_name
        )
        
        # Plot training curves
        plot_training_history(history, model_name)
        
        # Generate sample predictions
        print(f"\nSample predictions for {model_name}:")
        model.eval()
        with torch.no_grad():
            sample_idx = np.random.randint(0, len(X_val), 3)
            for i in sample_idx:
                context = X_val[i]
                true_next = y_val[i]
                
                context_words = [idx_to_word[idx] for idx in context]
                true_word = idx_to_word[true_next]
                
                # Predict
                context_tensor = torch.LongTensor([context]).to(device)
                output = model(context_tensor)
                _, predicted = torch.max(output, 1)
                predicted_word = idx_to_word[predicted.item()]
                
                print(f"  Context: {' '.join(context_words)}")
                print(f"  True: {true_word} | Predicted: {predicted_word}")
                print()
        
        # Clear GPU memory
        del model, train_loader, val_loader, train_dataset, val_dataset
        torch.cuda.empty_cache()
    
    print("\n" + "="*60)
    print("ALL 8 MODELS TRAINED SUCCESSFULLY!")
    print("="*60)
    print("\nSaved files in 'models/' directory:")
    print("  - model_*_weights.pth (model weights)")
    print("  - model_*_vocab.pkl (vocabulary)")
    print("  - model_*_config.json (configuration)")
    print("  - model_*_history.pkl (training history)")
    print("  - model_*_curves.png (loss/accuracy plots)")


## RUN

In [None]:
if __name__ == "__main__":
    main()

Downloading dataset...
Dataset loaded: 6206996 characters

Preprocessing text...
Total words: 759639

Vocabulary size: 100000

10 most frequent words:
  *: 33504
  =: 28003
  {: 18915
  if: 17702
  }: 16965
  the: 16080
  */: 13445
  /*: 12190
  struct: 10997
  return: 10130

10 least frequent words in vocabulary:
  AUDIT_MMAP:: 1
  "fd=%d: 1
  flags=0x%x",: 1
  context->mmap.fd,: 1
  context->mmap.flags);: 1
  AUDIT_EXECVE:: 1
  audit_log_execve_info(context,: 1
  &ab);: 1
  audit_proctitle_rtrim(char: 1
  *proctitle,: 1

############################################################
# MODEL 1/8: Context=3, Embedding=32, Activation=relu
############################################################

Creating sequences with context length 3...
Total sequences: 759636
Train samples: 683672, Val samples: 75964

Model architecture:
  Vocab size: 100000
  Embedding dim: 32
  Context length: 3
  Hidden size: 1024
  Activation: relu
  Total parameters: 105,799,328

Training: model_1_ctx3_emb32_r

  context_tensor = torch.LongTensor([context]).to(device)


Total sequences: 759636
Train samples: 683672, Val samples: 75964

Model architecture:
  Vocab size: 100000
  Embedding dim: 32
  Context length: 3
  Hidden size: 1024
  Activation: tanh
  Total parameters: 105,799,328

Training: model_2_ctx3_emb32_tanh
Epoch [1/200] - Train Loss: 6.9371, Val Loss: 6.5403, Val Acc: 17.30%
Epoch [10/200] - Train Loss: 2.7366, Val Loss: 7.9863, Val Acc: 20.79%
Epoch [20/200] - Train Loss: 1.9241, Val Loss: 9.0893, Val Acc: 20.75%
Epoch [30/200] - Train Loss: 1.5757, Val Loss: 9.9596, Val Acc: 19.98%
Epoch [40/200] - Train Loss: 1.4122, Val Loss: 10.4973, Val Acc: 19.01%
Epoch [50/200] - Train Loss: 1.3199, Val Loss: 10.9529, Val Acc: 19.04%
Epoch [60/200] - Train Loss: 1.2578, Val Loss: 11.3230, Val Acc: 18.14%
Epoch [70/200] - Train Loss: 1.2140, Val Loss: 11.6303, Val Acc: 18.00%
Epoch [80/200] - Train Loss: 1.1807, Val Loss: 11.8812, Val Acc: 17.30%
Epoch [90/200] - Train Loss: 1.1519, Val Loss: 12.1529, Val Acc: 17.83%
Epoch [100/200] - Train Loss: 1