# Training

### Below you will find the code for training the Char-RNN model. **Make sure to enable GPU acceleration (T4 x2) in kaggle before proceeding with this assignment**


You are encouraged to make adjustments to the below hyperparameters as you wish.
Note that 50 epochs require about 1 hour of training on Kaggle GPU with the provided params.
When trained for 50 epochs these params give Train Loss: 1.0388 and Val Loss: 1.0015. The code automatically saves checkpoints every 10 epochs which can be adjust based on your preference. The code also does a basic level of logging of the loss values which is saved as a csv file

**Since training the model might take several hours if you're new to Kaggle we recommend using the Save Version button on the top right along with Save & Run All(Commit). This will run all the cell in this notebook in the background since running the cells in this notebook editor will be automatically stopped after 30 minutes of inactivity. Once you use the save option you can view the output logs and files once all the code blocks finish running**

In [1]:
# ============================================================================
# Character-Level LSTM Training Script for Mahatma Gandhi Dataset 
# ============================================================================

import json
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.amp
from tqdm import tqdm
import pandas as pd
import os
import re

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

if device.type == 'cuda':
    torch.backends.cudnn.benchmark = True

# ============================================================================
# HYPERPARAMETERS - Recommended Starting Values
# ============================================================================

# Model architecture
HIDDEN_SIZE = 512           # Size of LSTM hidden state (256, 512, or 1024)
NUM_LAYERS = 2              # Number of LSTM layers (2-3 recommended)
EMBEDDING_DIM = 128         # Character embedding dimension

# Training parameters
BATCH_SIZE = 512             # Batch size (256 also works)
SEQ_LENGTH = 100            # Sequence length for training (50-200)
LEARNING_RATE = 0.002       # Learning rate (0.001-0.003 for Adam)
NUM_EPOCHS = 50             # Number of training epochs
GRADIENT_CLIP = 5.0         # Gradient clipping threshold

# Regularization
DROPOUT = 0.3               # Dropout rate (0.2-0.5)

# Paths
DATA_PATH = '/kaggle/input/collected-works-mahatma-gandhi-a-json-dataset/The-Collected-Works-Mahatma-Gandhi.json'
OUTPUT_DIR = '/kaggle/working/'

# Define blacklist for unsuitable document types to exclude
BLACKLIST_DOCUMENT_TYPE = ['TELEGRAM', 'CABLE']

# ============================================================================
# DATA LOADING AND PREPROCESSING
# ============================================================================

def load_and_prepare_data(data_path, blacklist_types):
    """Load JSON dataset and extract training text from contents field."""
    print("Loading dataset...")
    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    print(f"Total documents in dataset: {len(data)}")
    
    # Filter out blacklisted document types
    filtered_data = [doc for doc in data if doc.get('document_type') not in blacklist_types]
    print(f"Documents after filtering: {len(filtered_data)}")
    print(f"Filtered out {len(data) - len(filtered_data)} documents")
    
    # Extract all text from 'contents' field
    all_text = []
    for doc in filtered_data:
        content = doc.get('contents', '')
        if content:
            # Remove footnote markers like {1}, {2}, etc.
            content = re.sub(r'\{\d+\}', '', content)
            all_text.append(content)
    
    # Join all text with newlines
    text = '\n\n'.join(all_text)
    print(f"Total characters in corpus: {len(text)}")
    
    return text

def create_char_mappings(text):
    #Create character to index and index to character mappings.
    chars = sorted(list(set(text)))
    vocab_size = len(chars)
    
    char_to_idx = {ch: i for i, ch in enumerate(chars)}
    idx_to_char = {i: ch for i, ch in enumerate(chars)}
    
    print(f"Vocabulary size: {vocab_size}")
    print(f"First 50 characters in vocab: {''.join(chars[:50])}")
    
    return char_to_idx, idx_to_char, vocab_size

# ============================================================================
# DATASET CLASS
# ============================================================================
class CharDataset(Dataset):
    #Character-level dataset for sequence modeling.
    def __init__(self, text, char_to_idx, seq_length):
        self.text = text
        self.char_to_idx = char_to_idx
        self.seq_length = seq_length
        
        # Encode the entire text
        self.encoded = [char_to_idx[ch] for ch in text]
        
        # Calculate number of sequences
        self.num_sequences = len(self.encoded) // seq_length
        
    def __len__(self):
        return self.num_sequences
    
    def __getitem__(self, idx):
        start_idx = idx * self.seq_length
        end_idx = start_idx + self.seq_length + 1
        
        # Get sequence and target (shifted by 1)
        sequence = self.encoded[start_idx:end_idx]
        
        # Input and target
        x = torch.tensor(sequence[:-1], dtype=torch.long)
        y = torch.tensor(sequence[1:], dtype=torch.long)
        
        return x, y

# ============================================================================
# MODEL DEFINITION
# ============================================================================
class CharLSTM(nn.Module):
    #Character-level LSTM language model.
    
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, dropout):
        super(CharLSTM, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layers
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
        )
        
        # Output layer
        self.fc = nn.Linear(hidden_size, vocab_size)
        
        # Dropout (applied after LSTM)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, hidden=None):
        # Embedding
        embedded = self.embedding(x)  # (batch_size, seq_length, embedding_dim)
        
        # LSTM
        lstm_out, hidden = self.lstm(embedded, hidden)  # (batch_size, seq_length, hidden_size)
        
        # Apply dropout
        lstm_out = self.dropout(lstm_out)
        
        # Output layer
        output = self.fc(lstm_out)  # (batch_size, seq_length, vocab_size)
        
        # Return only logits to avoid DataParallel gather shape issues on `hidden`
        return output
    
    def forward_hidden(self, x, hidden=None):
        embedded = self.embedding(x)
        lstm_out, hidden = self.lstm(embedded, hidden)
        lstm_out = self.dropout(lstm_out)
        output = self.fc(lstm_out)
        return output, hidden
    
    def init_hidden(self, batch_size):
        #Initialize hidden state.
        weight = next(self.parameters())
        hidden = (
            weight.new_zeros(self.num_layers, batch_size, self.hidden_size),
            weight.new_zeros(self.num_layers, batch_size, self.hidden_size)
        )
        return hidden

class CharLSTM(nn.Module):
    """
    Character-level LSTM language model with Additive (Bahdanau) self-attention.
    - Embedding -> LSTM -> Additive attention over LSTM outputs with causal mask -> final FC logits
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, dropout, attn_dim=None):
        super(CharLSTM, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout_prob = dropout

        # Embedding layer (chars -> vectors)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.emb_ln = nn.LayerNorm(embedding_dim)    # normalize token embeddings

        # normalize LSTM outputs before attention projections
        self.pre_attn_ln = nn.LayerNorm(hidden_size)
        
        # post-attention LayerNorm (after residual combine)
        self.post_attn_ln = nn.LayerNorm(hidden_size)
        
        # LSTM layers (batch_first=True -> input/outputs are (B, T, D))
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True
        )

        # Attention dimensionality (internal additive attention dim)
        # default: half the hidden size (but at least 1)
        if attn_dim is None:
            self.attn_dim = max(1, hidden_size // 2)
        else:
            self.attn_dim = attn_dim

        # Additive attention parameters.
        # We follow Bahdanau-style score: score(q_i, k_j) = v^T tanh(W_q q_i + W_k k_j)
        # Here, q_i and k_j are both the LSTM outputs at different positions.
        self.W_q = nn.Linear(hidden_size, self.attn_dim, bias=False)  # project query (per-position)
        self.W_k = nn.Linear(hidden_size, self.attn_dim, bias=False)  # project key (per-position)
        # v maps the tanh(...) -> scalar score
        self.v = nn.Linear(self.attn_dim, 1, bias=False)

        # Value projection: project LSTM output into "value" space (we keep same hidden_size for residuals)
        self.W_v = nn.Linear(hidden_size, hidden_size, bias=False)

        # Optional output projection / residual combine (after attention + residual)
        self.output_proj = nn.Linear(hidden_size, hidden_size)

        # Final classifier: map from hidden-size per timestep -> vocab logits
        self.fc = nn.Linear(hidden_size, vocab_size)

        # Dropout after LSTM / attention
        self.dropout = nn.Dropout(dropout)

        # Initialize weights (standard, keep final layers small)
        self._init_weights()

    def _init_weights(self):
        # Kaiming init for linear layers except final small initial for v/fc output
        for m in [self.W_q, self.W_k, self.W_v, self.output_proj]:
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0.0)

        # Initialize v (score vector) with small values to avoid large initial scores
        nn.init.uniform_(self.v.weight, -1e-3, 1e-3)
        if self.v.bias is not None:
            nn.init.constant_(self.v.bias, 0.0)

        # Final fc tiny init => helps stabilize initial Q/softmax behavior
        nn.init.uniform_(self.fc.weight, -1e-3, 1e-3)
        if self.fc.bias is not None:
            nn.init.constant_(self.fc.bias, 0.0)

    def forward(self, x, hidden=None):
        """
        Forward pass returning logits for each position.
        x: (batch_size, seq_len) long tensor of token indices
        hidden: optional LSTM hidden (h0, c0)
        returns: logits (batch_size, seq_len, vocab_size)
        """
        # 1) Embedding
        embedded = self.embedding(x)                        # (B, T, E)
        embedded = self.emb_ln(embedded)
        # 2) LSTM -> outputs for all timesteps
        lstm_out, hidden = self.lstm(embedded, hidden)     # (B, T, H), hidden=(h_n, c_n)
        lstm_out = self.dropout(lstm_out)                  # (B, T, H)
        lstm_norm = self.pre_attn_ln(lstm_out)
        # 3) Additive (Bahdanau) attention (causal)
        # Project to attention space
        # Q = W_q(lstm_out)  -> (B, T, A)
        # K = W_k(lstm_out)  -> (B, T, A)
        Q = self.W_q(lstm_norm)
        K = self.W_k(lstm_norm)

        # We will compute additive scores for every pair (i query, j key):
        # score_{i,j} = v^T tanh(Q_i + K_j)
        # Efficiently compute via broadcasting:
        # Q.unsqueeze(2): (B, T_q, 1, A)
        # K.unsqueeze(1): (B, 1, T_k, A)
        # Q_plus_K -> (B, T, T, A)
        # v(tanh(...)) -> (B, T, T, 1) -> squeeze -> (B, T, T)
        # Note: here T_q == T_k == seq_len (self-attention)

        B, T, _ = Q.size()
        device = Q.device

        # Expand for pairwise additive combination (broadcast sum)
        # Use `unsqueeze` rather than `repeat` to keep memory friendly
        Q_exp = Q.unsqueeze(2)         # (B, T, 1, A)
        K_exp = K.unsqueeze(1)         # (B, 1, T, A)
        # additive combination and non-linearity
        additive = torch.tanh(Q_exp + K_exp)   # (B, T, T, A)

        # linear map v: produce scalar scores for each (i, j)
        scores = self.v(additive).squeeze(-1)  # (B, T, T)

        # 4) Causal mask: positions j > i are masked out (lower-triangular allowed)
        # Build a (T, T) lower-triangular boolean mask: True for allowed entries (j <= i)
        # We keep this on the same device
        with torch.no_grad():
            causal_mask = torch.tril(torch.ones((T, T), dtype=torch.bool, device=device))  # (T, T)

        # broadcast mask to (B, T, T) and set disallowed positions to a large negative value
        # so softmax ~ 0 there.
        # determine an appropriate fill scalar for the scores' dtype
        fill_value = torch.tensor(torch.finfo(scores.dtype).min, device=scores.device, dtype=scores.dtype)
        
        # mask out illegal future positions with the dtype-appropriate minimum
        scores = scores.masked_fill(~causal_mask.unsqueeze(0), fill_value)

        # 5) Softmax over keys (last dimension j), producing attention weights for each query i
        attn_weights = torch.softmax(scores, dim=-1)   # (B, T, T)

        # 6) Values projection
        V = self.W_v(lstm_out)                         # (B, T, H)   -> projected values

        # 7) Compute attention output: for each query position i, weighted sum over keys j
        # attn_weights (B, T, T) @ V (B, T, H) -> (B, T, H)
        attn_out = torch.bmm(attn_weights, V)          # (B, T, H)

        # 8) Residual connection: combine attention output with LSTM output
        # and optionally apply a small output projection & non-linearity
        combined = attn_out + lstm_out                # (B, T, H)  residual
        combined = torch.tanh(self.output_proj(combined))  # (B, T, H)

        combined = self.dropout(combined)

        # 9) Final classifier -> per-timestep logits
        logits = self.fc(combined)                     # (B, T, vocab_size)

        # Return logits (and keep hidden as internal LSTM hidden if caller provided/needs it)
        return logits

    def forward_hidden(self, x, hidden=None):
        """
        Same as forward but returns logits and hidden (useful for generation pipeline).
        """
        # Embedding
        embedded = self.embedding(x)                        # (B, T, E)

        # LSTM
        lstm_out, hidden = self.lstm(embedded, hidden)     # (B, T, H)
        lstm_out = self.dropout(lstm_out)

        # Attention (same ops as forward)
        Q = self.W_q(lstm_out)
        K = self.W_k(lstm_out)
        B, T, _ = Q.size()
        device = Q.device

        Q_exp = Q.unsqueeze(2)         # (B, T, 1, A)
        K_exp = K.unsqueeze(1)         # (B, 1, T, A)
        additive = torch.tanh(Q_exp + K_exp)   # (B, T, T, A)
        scores = self.v(additive).squeeze(-1)  # (B, T, T)

        with torch.no_grad():
            causal_mask = torch.tril(torch.ones((T, T), dtype=torch.bool, device=device))
        # determine an appropriate fill scalar for the scores' dtype
        fill_value = torch.tensor(torch.finfo(scores.dtype).min, device=scores.device, dtype=scores.dtype)
        
        # mask out illegal future positions with the dtype-appropriate minimum
        scores = scores.masked_fill(~causal_mask.unsqueeze(0), fill_value)
        attn_weights = torch.softmax(scores, dim=-1)
        V = self.W_v(lstm_out)                         # (B, T, H)
        attn_out = torch.bmm(attn_weights, V)          # (B, T, H)
        combined = attn_out + lstm_out
        combined = torch.tanh(self.output_proj(combined))
        combined = self.dropout(combined)
        logits = self.fc(combined)                     # (B, T, vocab_size)

        return logits, hidden

    def init_hidden(self, batch_size):
        # Initialize LSTM hidden/cell state (num_layers, batch_size, hidden_size)
        weight = next(self.parameters())
        return (
            weight.new_zeros(self.num_layers, batch_size, self.hidden_size),
            weight.new_zeros(self.num_layers, batch_size, self.hidden_size)
        )


# ============================================================================
# TRAINING FUNCTIONS (AMP-enabled)
# ============================================================================
def train_epoch(model, dataloader, criterion, optimizer, device, gradient_clip, scaler):
    """Train for one epoch"""
    model.train()
    total_loss = 0
    
    progress_bar = tqdm(dataloader, desc='Training')
    
    for batch_idx, (x, y) in enumerate(progress_bar):
        x, y = x.to(device), y.to(device)
        
        optimizer.zero_grad()
        
        with torch.amp.autocast(device_type='cuda' if device.type == 'cuda' else 'cpu'):
            output = model(x)                 # model(x) now returns only logits
            loss = criterion(output.view(-1, output.size(-1)), y.view(-1))
        
        # scale and backward
        scaler.scale(loss).backward()
        # unscale before clipping
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)
        scaler.step(optimizer)
        scaler.update()
        
        total_loss += loss.item()
        progress_bar.set_postfix({'loss': loss.item()})
    
    return total_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    #Evaluate the model.
    model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            
            output = model(x)   # model(x) returns only logits
            loss = criterion(output.view(-1, output.size(-1)), y.view(-1))
            
            total_loss += loss.item()
    
    return total_loss / len(dataloader)

def generate_text(model, start_str, char_to_idx, idx_to_char, length=500, temperature=0.8):
    #Generate text from the model
    model.eval()

    base_model = model.module if hasattr(model, 'module') else model

    with torch.no_grad():
        # Convert start string to indices
        chars = [char_to_idx.get(ch, 0) for ch in start_str]
        input_seq = torch.tensor([chars], dtype=torch.long).to(device)
        
        hidden = base_model.init_hidden(1)
        hidden = tuple([h.to(device) for h in hidden])
        
        generated = start_str
        
        for _ in range(length):
            output, hidden = base_model.forward_hidden(input_seq, hidden)
            
            # Get last output
            output = output[0, -1, :] / temperature
            
            # Sample from distribution
            probs = torch.softmax(output, dim=0)
            next_char_idx = torch.multinomial(probs, 1).item()
            
            # Append to generated text
            next_char = idx_to_char[next_char_idx]
            generated += next_char
            
            # Update input (append last token)
            input_seq = torch.tensor([[next_char_idx]], dtype=torch.long).to(device)
        
        return generated

# ============================================================================
# MAIN TRAINING SCRIPT
# ============================================================================
def main():
    # Load and prepare data
    text = load_and_prepare_data(DATA_PATH, BLACKLIST_DOCUMENT_TYPE)
    char_to_idx, idx_to_char, vocab_size = create_char_mappings(text)
    
    # Create datasets
    # Use 90% for training, 10% for validation
    split_idx = int(len(text) * 0.9)
    train_text = text[:split_idx]
    val_text = text[split_idx:]
    
    print(f"\nTraining set size: {len(train_text)} characters")
    print(f"Validation set size: {len(val_text)} characters")
    
    train_dataset = CharDataset(train_text, char_to_idx, SEQ_LENGTH)
    val_dataset = CharDataset(val_text, char_to_idx, SEQ_LENGTH)
    
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=4,
        pin_memory=(device.type == 'cuda'),
        persistent_workers=True
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=2,
        pin_memory=(device.type == 'cuda'),
        persistent_workers=True
    )
    
    print(f"\nNumber of training batches: {len(train_loader)}")
    print(f"Number of validation batches: {len(val_loader)}")
    
    # Create model
    model = CharLSTM(vocab_size, EMBEDDING_DIM, HIDDEN_SIZE, NUM_LAYERS, DROPOUT).to(device)
    
    # Multi-GPU support
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
        model = nn.DataParallel(model)
    
    # Count parameters
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"\nModel has {num_params:,} trainable parameters")
    
    # Loss and optimizer
    scaler = torch.amp.GradScaler()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    
    # Learning rate scheduler
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=3, verbose=True
    )
    
    # Training history
    history = {
        'epoch': [],
        'train_loss': [],
        'val_loss': [],
        'learning_rate': []
    }
    
    best_val_loss = float('inf')
    
    # Training loop
    print(f"\n{'='*60}")
    print("Starting training...")
    print(f"{'='*60}\n")
    
    for epoch in range(1, NUM_EPOCHS + 1):
        print(f"\nEpoch {epoch}/{NUM_EPOCHS}")
        print("-" * 60)
        
        # Train
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device, GRADIENT_CLIP, scaler)
        
        # Validate
        val_loss = evaluate(model, val_loader, criterion, device)
        
        # Update scheduler
        scheduler.step(val_loss)
        
        # Get current learning rate
        current_lr = optimizer.param_groups[0]['lr']
        
        # Log metrics
        history['epoch'].append(epoch)
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['learning_rate'].append(current_lr)
        
        print(f"\nTrain Loss: {train_loss:.4f}")
        print(f"Val Loss: {val_loss:.4f}")
        print(f"Learning Rate: {current_lr:.6f}")
        
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': train_loss,
                'val_loss': val_loss,
                'char_to_idx': char_to_idx,
                'idx_to_char': idx_to_char,
                'vocab_size': vocab_size,
                'hyperparameters': {
                    'hidden_size': HIDDEN_SIZE,
                    'num_layers': NUM_LAYERS,
                    'embedding_dim': EMBEDDING_DIM,
                    'seq_length': SEQ_LENGTH,
                    'dropout': DROPOUT
                }
            }, os.path.join(OUTPUT_DIR, 'best_model.pt'))
            print(f"✓ Saved best model (val_loss: {val_loss:.4f})")
        
        # Generate sample text every 5 epochs
        if (epoch-1) % 7 == 0:
            print("\n" + "="*60)
            print("Sample generation:")
            print("="*60)
            # generation uses forward_hidden on base model; works whether model is DataParallel or not
            sample = generate_text(model, "I believe that", char_to_idx, idx_to_char, length=300)
            print(sample)
            print("="*60)
        
        # Save checkpoint
        if epoch % 10 == 0:
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, os.path.join(OUTPUT_DIR, f'checkpoint_epoch_{epoch}.pt'))
    
    # Save training history
    history_df = pd.DataFrame(history)
    history_df.to_csv(os.path.join(OUTPUT_DIR, 'training_metrics.csv'), index=False)
    print(f"\nTraining history saved to {os.path.join(OUTPUT_DIR, 'training_metrics.csv')}")
    
    # Final model save
    torch.save({
        'epoch': NUM_EPOCHS,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'char_to_idx': char_to_idx,
        'idx_to_char': idx_to_char,
        'vocab_size': vocab_size,
    }, os.path.join(OUTPUT_DIR, 'final_model.pt'))
    
    print(f"\n{'='*60}")
    print("Training completed!")
    print(f"{'='*60}")
    print(f"Best validation loss: {best_val_loss:.4f}")
    print(f"Models saved in: {OUTPUT_DIR}")

if __name__ == '__main__':
    main()

Using device: cuda
Loading dataset...
Total documents in dataset: 45458
Documents after filtering: 43168
Filtered out 2290 documents
Total characters in corpus: 79917160
Vocabulary size: 192
First 50 characters in vocab: 
 !"$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQ

Training set size: 71925444 characters
Validation set size: 7991716 characters

Number of training batches: 1405
Number of validation batches: 157
Using 2 GPUs with DataParallel

Model has 4,328,640 trainable parameters





Starting training...


Epoch 1/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:12<00:00,  7.29it/s, loss=1.64]



Train Loss: 2.9693
Val Loss: 1.5281
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.5281)

Sample generation:
I believe that you should have a ost of when it I cannot pamint under agimations, as will in the nebligation of even in Ramnagab, replied ecerpibed to conseitate at nation to a send to be in the one. I better was a resolution. The represences to says, such not so more any satyaglaty and a nighted saja your condac

Epoch 2/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:27<00:00,  6.77it/s, loss=1.17]



Train Loss: 1.2528
Val Loss: 1.1015
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.1015)

Epoch 3/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:27<00:00,  6.76it/s, loss=1.15]



Train Loss: 1.1467
Val Loss: 1.0686
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0686)

Epoch 4/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:28<00:00,  6.75it/s, loss=1.11]



Train Loss: 1.1211
Val Loss: 1.0544
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0544)

Epoch 5/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:30<00:00,  6.69it/s, loss=1.1]



Train Loss: 1.1066
Val Loss: 1.0437
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0437)

Epoch 6/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:28<00:00,  6.74it/s, loss=1.11]



Train Loss: 1.0965
Val Loss: 1.0367
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0367)

Epoch 7/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:30<00:00,  6.67it/s, loss=1.08]



Train Loss: 1.0889
Val Loss: 1.0319
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0319)

Epoch 8/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:30<00:00,  6.69it/s, loss=1.08]



Train Loss: 1.0829
Val Loss: 1.0268
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0268)

Sample generation:
I believe thatters today. And has it been such a committerictings to the Colour ited laterics. I dongred the Gandi Shri Vanik Mahommin Sud tod earntended by objectings oftended. The Muslim Confectings it is said to doned to a large goverty. The notematurn permffectings only will the some in it afters the that. Th

Epoch 9/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:30<00:00,  6.68it/s, loss=1.06]



Train Loss: 1.0778
Val Loss: 1.0230
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0230)

Epoch 10/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:31<00:00,  6.64it/s, loss=1.08]



Train Loss: 1.0735
Val Loss: 1.0193
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0193)

Epoch 11/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:30<00:00,  6.67it/s, loss=1.06]



Train Loss: 1.0701
Val Loss: 1.0172
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0172)

Epoch 12/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:31<00:00,  6.65it/s, loss=1.06]



Train Loss: 1.0668
Val Loss: 1.0143
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0143)

Epoch 13/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:32<00:00,  6.62it/s, loss=1.07]



Train Loss: 1.0639
Val Loss: 1.0122
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0122)

Epoch 14/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:32<00:00,  6.62it/s, loss=1.05]



Train Loss: 1.0614
Val Loss: 1.0116
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0116)

Epoch 15/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:32<00:00,  6.60it/s, loss=1.07]



Train Loss: 1.0589
Val Loss: 1.0091
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0091)

Sample generation:
I believe thatand bected by trendly redully attly to. And I have new to only graced to dond thememe. I hadd to all worry tome behart therrity on and hend to me tor mand the in yours tongurn has betward the in Previdum. I will learty and ably gorty to wherebe ton torteding the torrey ton the Kalthway The Botha, Ma

Epoch 16/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.59it/s, loss=1.06]



Train Loss: 1.0567
Val Loss: 1.0081
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0081)

Epoch 17/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.58it/s, loss=1.07]



Train Loss: 1.0545
Val Loss: 1.0066
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0066)

Epoch 18/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.57it/s, loss=1.05]



Train Loss: 1.0527
Val Loss: 1.0042
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0042)

Epoch 19/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.59it/s, loss=1.06]



Train Loss: 1.0509
Val Loss: 1.0039
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0039)

Epoch 20/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.59it/s, loss=1.04]



Train Loss: 1.0492
Val Loss: 1.0021
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0021)

Epoch 21/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:34<00:00,  6.54it/s, loss=1.04]



Train Loss: 1.0478
Val Loss: 1.0011
Learning Rate: 0.002000
✓ Saved best model (val_loss: 1.0011)

Epoch 22/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.57it/s, loss=1.05]



Train Loss: 1.0463
Val Loss: 0.9998
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9998)

Sample generation:
I believe thatty tonst test to the mattend no meass on on tol ongly thes to an bect to hespeast tivest give prouctiff and as tols he tead in ally again ites to dees to forty. I hand wark at and and fourty afte tors. We wand ther weakes forty forty forty. The was a men there tare harge tatked any to hergly in past

Epoch 23/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:34<00:00,  6.55it/s, loss=1.05]



Train Loss: 1.0451
Val Loss: 0.9982
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9982)

Epoch 24/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.58it/s, loss=1.05]



Train Loss: 1.0438
Val Loss: 0.9978
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9978)

Epoch 25/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:34<00:00,  6.54it/s, loss=1.05]



Train Loss: 1.0427
Val Loss: 0.9976
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9976)

Epoch 26/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.58it/s, loss=1.05]



Train Loss: 1.0416
Val Loss: 0.9967
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9967)

Epoch 27/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:34<00:00,  6.56it/s, loss=1.04]



Train Loss: 1.0406
Val Loss: 0.9953
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9953)

Epoch 28/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.57it/s, loss=1.04]



Train Loss: 1.0396
Val Loss: 0.9953
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9953)

Epoch 29/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.58it/s, loss=1.04]



Train Loss: 1.0387
Val Loss: 0.9950
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9950)

Sample generation:
I believe that beff evendignst I dark wand leasssinttity tork forty witimattly but as gregry on uppress to me. Q. I have my learty towad forty on ongeridno mit assons but torkett forty to to to thery to thes ally as to nonst and any forty ity to agingly wort toest. There furty mente himbess on to stand gon in hea

Epoch 30/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:34<00:00,  6.55it/s, loss=1.03]



Train Loss: 1.0378
Val Loss: 0.9939
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9939)

Epoch 31/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.58it/s, loss=1.05]



Train Loss: 1.0369
Val Loss: 0.9931
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9931)

Epoch 32/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:34<00:00,  6.54it/s, loss=1.06]



Train Loss: 1.0362
Val Loss: 0.9920
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9920)

Epoch 33/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.59it/s, loss=1.02]



Train Loss: 1.0354
Val Loss: 0.9922
Learning Rate: 0.002000

Epoch 34/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:34<00:00,  6.56it/s, loss=1.03]



Train Loss: 1.0347
Val Loss: 0.9917
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9917)

Epoch 35/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:34<00:00,  6.54it/s, loss=1.03]



Train Loss: 1.0340
Val Loss: 0.9909
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9909)

Epoch 36/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:34<00:00,  6.55it/s, loss=1.03]



Train Loss: 1.0335
Val Loss: 0.9905
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9905)

Sample generation:
I believe thaty tos to thard int to bey on beft tingly. Our concects haves tout on an out a 1934 13 and I aght on tindd tes to beft to thes vegottly befect. This will best onced to mittinded ton thes. The Apriss wenters froutind to work and to beff wellds geand to faid it sucty is timpst conttceas. But to cause t

Epoch 37/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:35<00:00,  6.53it/s, loss=1.05]



Train Loss: 1.0327
Val Loss: 0.9897
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9897)

Epoch 38/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:34<00:00,  6.56it/s, loss=1.04]



Train Loss: 1.0321
Val Loss: 0.9887
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9887)

Epoch 39/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:34<00:00,  6.56it/s, loss=1.05]



Train Loss: 1.0316
Val Loss: 0.9898
Learning Rate: 0.002000

Epoch 40/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:35<00:00,  6.52it/s, loss=1.04]



Train Loss: 1.0311
Val Loss: 0.9885
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9885)

Epoch 41/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.58it/s, loss=1.02]



Train Loss: 1.0304
Val Loss: 0.9866
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9866)

Epoch 42/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.57it/s, loss=1.03]



Train Loss: 1.0300
Val Loss: 0.9879
Learning Rate: 0.002000

Epoch 43/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.57it/s, loss=1.04]



Train Loss: 1.0294
Val Loss: 0.9863
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9863)

Sample generation:
I believe thatdly way an wity as my fast ton wight tot inces ons and durrably witry formitty for tand tombly to beces oncedd to beft missit and as tend to tempt ton ity tom ton ity tity ton timbly as torty on any tist on arady tindity to man. Fort I mempty ton on timbass: “Thermpece oncequst tist alty arity witen

Epoch 44/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:34<00:00,  6.56it/s, loss=1.02]



Train Loss: 1.0290
Val Loss: 0.9861
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9861)

Epoch 45/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:34<00:00,  6.56it/s, loss=1.03]



Train Loss: 1.0284
Val Loss: 0.9861
Learning Rate: 0.002000

Epoch 46/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:34<00:00,  6.55it/s, loss=1.01]



Train Loss: 1.0281
Val Loss: 0.9856
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9856)

Epoch 47/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.57it/s, loss=1.03]



Train Loss: 1.0275
Val Loss: 0.9844
Learning Rate: 0.002000
✓ Saved best model (val_loss: 0.9844)

Epoch 48/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:35<00:00,  6.53it/s, loss=1.02]



Train Loss: 1.0271
Val Loss: 0.9849
Learning Rate: 0.002000

Epoch 49/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:34<00:00,  6.54it/s, loss=1.04]



Train Loss: 1.0269
Val Loss: 0.9851
Learning Rate: 0.002000

Epoch 50/50
------------------------------------------------------------


Training: 100%|██████████| 1405/1405 [03:33<00:00,  6.57it/s, loss=1.02]



Train Loss: 1.0264
Val Loss: 0.9852
Learning Rate: 0.002000

Sample generation:
I believe that on on yolk ond whake ton yectly 's. ist tind thes to hearculty torty on tatmon suconcest tight onsind I flictly onsidgind mist itty to tainty to way. Themmes tombly tort want oncevect torty tors wherond to mattly forty no mist tress to yolks on taity lesst ind timps to. T2- Vand tindisticumps Raisf

Training history saved to /kaggle/working/training_metrics.csv

Training completed!
Best validation loss: 0.9844
Models saved in: /kaggle/working/


# Inference

You can run inference on the trained model using the below provided code. Feel free to adjust the starting prompts and parameters as per your requirements.

In [2]:
# ===============================================
# Inference Script for Character-Level LSTM Model 
# ===============================================

from collections import OrderedDict
import torch
import torch.nn as nn
import os

MODEL_PATH = '/kaggle/working/best_model.pt'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def _strip_module_prefix(state_dict):
    new_state = OrderedDict()
    for k, v in state_dict.items():
        new_key = k
        if k.startswith('module.'):
            new_key = k[len('module.'):]
        new_state[new_key] = v
    return new_state

def load_model(model_path):
    # Load a trained model from checkpoint
    checkpoint = torch.load(model_path, map_location=device)
    
    hyperparams = checkpoint.get('hyperparameters', {})
    vocab_size = checkpoint['vocab_size']
    
    # Create model instance (same architecture as training)
    model = CharLSTM(
        vocab_size=vocab_size,
        embedding_dim=hyperparams.get('embedding_dim', 128),
        hidden_size=hyperparams.get('hidden_size', 512),
        num_layers=hyperparams.get('num_layers', 2),
        dropout=hyperparams.get('dropout', 0.3)
    )
    
    # Prepare state dict
    sd = checkpoint['model_state_dict']
    sd = _strip_module_prefix(sd)
    
    model.load_state_dict(sd)
    
    if torch.cuda.device_count() > 1:
          pass
    
    model.to(device)
    model.eval()
    
    char_to_idx = checkpoint['char_to_idx']
    idx_to_char = checkpoint['idx_to_char']
    
    return model, char_to_idx, idx_to_char

def generate_text_inference(model, start_str, char_to_idx, idx_to_char, length=500, temperature=0.8):
    
    model.eval()
    base_model = model.module if hasattr(model, 'module') else model

    with torch.no_grad():
        # Convert start string to indices
        chars = [char_to_idx.get(ch, 0) for ch in start_str]
        input_seq = torch.tensor([chars], dtype=torch.long).to(device)
        
        # Initialize hidden state
        hidden = base_model.init_hidden(1)
        hidden = tuple([h.to(device) for h in hidden])
        
        generated = start_str
        
        for _ in range(length):
            # Use the base model's forward_hidden to obtain (output, hidden)
            output, hidden = base_model.forward_hidden(input_seq, hidden)
            last_logits = output[0, -1, :] / temperature
            probs = torch.softmax(last_logits, dim=0)
            next_char_idx = torch.multinomial(probs, 1).item()
            next_char = idx_to_char[next_char_idx]
            generated += next_char
            input_seq = torch.tensor([[next_char_idx]], dtype=torch.long).to(device)
        
        return generated

def main_inference():
    model_path = MODEL_PATH
    print(f"Loading model from: {model_path} ...")
    model, char_to_idx, idx_to_char = load_model(model_path)
    print("Model loaded successfully!")
    print(f"Vocabulary size: {len(char_to_idx)}")
    
    prompts = [
        "I believe that",
        "Harijan",# non english based memory testing of something extensively present
        "ngr", # word completion from between (congress, anger)
        "I want to be remembered as",
        "Dear Friend,"
    ]
    
    print("\n" + "="*80)
    print("GENERATING TEXT SAMPLES")
    print("="*80)
    
    for prompt in prompts:
        print(f"\n{'─'*80}")
        print(f"Prompt: '{prompt}'")
        print(f"{'─'*80}")
        generated = generate_text_inference(model, prompt, char_to_idx, idx_to_char, length=400, temperature=0.8)
        print(generated)
    
    # Example temperature comparisons
    prompts = ["I want to be remembered as", "Dear Friend,"]
    temperatures = [0.1, 0.5, 1.5, 2.0]
    for prompt in prompts:
        for temp in temperatures:
            print(f"\n{'─'*80}")
            print(f"Temperature: {temp}")
            print(f"{'─'*80}")
            generated = generate_text_inference(model, prompt, char_to_idx, idx_to_char, length=300, temperature=temp)
            print(generated)

if __name__ == '__main__':
    main_inference()

Loading model from: /kaggle/working/best_model.pt ...
Model loaded successfully!
Vocabulary size: 192

GENERATING TEXT SAMPLES

────────────────────────────────────────────────────────────────────────────────
Prompt: 'I believe that'
────────────────────────────────────────────────────────────────────────────────
I believe thatlly to thery forty to onstity ontitas tind tindly tort offectindly froums fority tom- solist re ourgand tombest attly fy tombest beity tons. Thew tand mentict ond afft tatker on actind ourse- best te ty omisstabyly wentind us forlikly. 7. A. & C. S. R. SUMAR, I begast froum froughtly ondedd wary ont tons torty to sors form ably tony thevest hes tindly to thest: “We secize ours trectind tonces tort

────────────────────────────────────────────────────────────────────────────────
Prompt: 'Harijan'
────────────────────────────────────────────────────────────────────────────────
Harijant on and to Vaid AMRENCELAR COLOPRA ASHRAMATI I haved way ary, tomista : 1. Oncess