### Imports + Generator (PixelShuffle)

In [1]:
import math
import torch
import torch.nn as nn

In [2]:
class ResidualBlock(nn.Module):
    """Plain residual unit computing ``x + Conv(ReLU(Conv(x)))``.

    Args:
        n_feats: Channel count of both the input and the output feature map.
    """

    def __init__(self, n_feats: int):
        super().__init__()
        # Both 3x3 convolutions keep the channel count and (via padding=1) the
        # spatial size, so the skip connection is a plain element-wise sum.
        conv_in = nn.Conv2d(n_feats, n_feats, kernel_size=3, padding=1)
        relu = nn.ReLU(inplace=True)
        conv_out = nn.Conv2d(n_feats, n_feats, kernel_size=3, padding=1)
        self.block = nn.Sequential(conv_in, relu, conv_out)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the input plus the residual produced by ``self.block``."""
        residual = self.block(x)
        return x + residual

In [3]:
class UpsampleBlock(nn.Module):
    """2x spatial upsampling via sub-pixel convolution: Conv -> PixelShuffle -> ReLU.

    Chain ``log2(scale)`` of these blocks to reach a power-of-two scale factor.
    """

    def __init__(self, n_feats: int):
        super().__init__()
        upscale = 2
        # The conv emits upscale**2 times the channels; PixelShuffle folds them
        # back into a feature map that is `upscale` times larger per side.
        self.conv = nn.Conv2d(n_feats, n_feats * upscale ** 2, kernel_size=3, padding=1)
        self.ps = nn.PixelShuffle(upscale)
        self.act = nn.ReLU(inplace=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        expanded = self.conv(x)
        shuffled = self.ps(expanded)
        return self.act(shuffled)

In [4]:
class UpscaleGeneratorV0(nn.Module):
    """
    Baseline upscaler: head conv -> residual body -> PixelShuffle upsampler -> RGB tail.

    Args:
        scale: Upscaling factor; must be 2, 4, or 8.
        n_feats: Number of feature channels used throughout the network.
        n_res: Number of residual blocks in the body.

    ``forward`` accepts ``alpha`` and ``preserve_graphics`` only to pin down the
    API for later versions; neither argument has any effect here.
    """
    def __init__(self, scale: int = 4, n_feats: int = 64, n_res: int = 16):
        super().__init__()
        assert scale in {2, 4, 8}, "Scale must be 2, 4, or 8."
        self.scale = scale

        # RGB -> feature space
        self.head = nn.Conv2d(3, n_feats, 3, padding=1)

        # Residual trunk, operating at the input (low) resolution
        res_blocks = []
        for _ in range(n_res):
            res_blocks.append(ResidualBlock(n_feats))
        self.body = nn.Sequential(*res_blocks)

        # One x2 stage per factor of two in `scale`
        n_stages = int(math.log2(scale))
        self.upsampler = nn.Sequential(*[UpsampleBlock(n_feats) for _ in range(n_stages)])

        # Feature space -> RGB at the upscaled resolution
        self.tail = nn.Conv2d(n_feats, 3, 3, padding=1)

    @torch.no_grad()
    def count_params(self):
        """Return the total number of elements in parameters that require grad."""
        total = 0
        for param in self.parameters():
            if param.requires_grad:
                total += param.numel()
        return total

    def forward(self, x: torch.Tensor, alpha: float = 0.0, preserve_graphics: bool = False) -> torch.Tensor:
        # alpha and preserve_graphics are not used yet; they are only part of the API.
        feat = self.head(x)
        feat = self.body(feat)
        feat = self.upsampler(feat)
        return self.tail(feat)

Smoke test

In [5]:
# Seed the global RNG so the dummy input and the weight init are reproducible
torch.manual_seed(0)

# Random 24x24 RGB input; a x4 generator is expected to return 96x96
lr = torch.randn(1, 3, 24, 24)

gen = UpscaleGeneratorV0(scale=4, n_feats=64, n_res=4)
with torch.no_grad():
    sr = gen(lr, alpha=0.0, preserve_graphics=False)

# Report input/output shapes and the model size in millions of parameters
for label, value in (
    ("LR shape:", tuple(lr.shape)),
    ("SR shape:", tuple(sr.shape)),
    ("Trainable params (M):", round(gen.count_params() / 1e6, 3)),
):
    print(label, value)

LR shape: (1, 3, 24, 24)
SR shape: (1, 3, 96, 96)
Trainable params (M): 0.594


### α‑conditioned residual blocks (FiLM) + new generator class

In [6]:
class AlphaMLP(nn.Module):
    """Two-layer MLP turning a scalar alpha into per-channel FiLM (gamma, beta).

    Args:
        n_feats: Number of channels C to modulate; the MLP emits 2*C values.
        hidden: Width of the hidden layer.
        gain: Scale applied after tanh, so every output lies in [-gain, +gain].
    """
    def __init__(self, n_feats: int, hidden: int = 32, gain: float = 0.1):
        super().__init__()
        # 1 -> hidden -> 2*n_feats; the two halves of the output become gamma and beta.
        to_hidden = nn.Linear(1, hidden)
        relu = nn.ReLU(inplace=True)
        to_film = nn.Linear(hidden, 2 * n_feats)
        self.net = nn.Sequential(to_hidden, relu, to_film)
        # A small gain keeps the modulation gentle.
        self.gain = gain

    def forward(self, alpha: torch.Tensor):
        """
        alpha: tensor of shape (N,), or a 0-dim scalar tensor (treated as N=1)
        returns (gamma, beta), each of shape (N, C)
        """
        # Promote a 0-dim tensor to shape (1,) so the rest can assume a batch axis
        alpha = alpha.view(1) if alpha.dim() == 0 else alpha

        # (N,) -> (N, 1) -> (N, 2*C), squashed by tanh and scaled by gain
        film = torch.tanh(self.net(alpha.unsqueeze(-1))) * self.gain

        # First half is the multiplicative term, second half the additive one
        gamma, beta = torch.chunk(film, 2, dim=-1)
        return gamma, beta

In [7]:
class ResidualBlockCond(nn.Module):
    """
    Residual block whose residual branch is modulated with FiLM.

    The branch is Conv -> ReLU -> Conv; its output is scaled by (1 + gamma) and
    shifted by beta per channel before being added back to the input.
    """
    def __init__(self, n_feats: int):
        super().__init__()
        # Two 3x3, padding=1 convolutions: channel count and spatial size are preserved.
        self.conv1 = nn.Conv2d(n_feats, n_feats, kernel_size=3, padding=1)
        # In-place ReLU between the two convolutions
        self.act = nn.ReLU(inplace=True)
        # The output of this second convolution is what FiLM modulates
        self.conv2 = nn.Conv2d(n_feats, n_feats, kernel_size=3, padding=1)

    def forward(self, x, gamma: torch.Tensor, beta: torch.Tensor):
        """
        Compute ``x + (1 + gamma) * F(x) + beta`` with ``F = conv2(relu(conv1(.)))``.

        Args:
            x: Feature tensor of shape (N, C, H, W).
            gamma: Per-channel multiplicative term, viewable as (N, C, 1, 1).
            beta: Per-channel additive term, viewable as (N, C, 1, 1).

        Returns:
            Tensor with the same shape as ``x``.
        """
        residual = self.conv2(self.act(self.conv1(x)))

        # Unpacking four dims also rejects anything that is not a 4-D batch
        batch, channels, _, _ = residual.shape

        # Trailing singleton axes let gamma/beta broadcast over H and W;
        # the "1 +" centres the multiplicative term on identity.
        scale = 1.0 + gamma.view(batch, channels, 1, 1)
        shift = beta.view(batch, channels, 1, 1)

        return x + (residual * scale + shift)

class UpsampleBlock(nn.Module):
    """
    Sub-pixel convolution block that doubles height and width.

    NOTE: this re-declares the ``UpsampleBlock`` class defined earlier in the
    notebook with the same layers and forward pass; the later definition is the
    one in effect for code that runs after this cell.
    """
    def __init__(self, n_feats: int):
        super().__init__()
        factor = 2
        # factor*factor = 4 output channels are needed per kept channel, because
        # PixelShuffle turns (N, 4*C, H, W) into (N, C, 2*H, 2*W).
        self.conv = nn.Conv2d(n_feats, (factor * factor) * n_feats, kernel_size=3, padding=1)
        self.ps = nn.PixelShuffle(factor)
        # In-place ReLU applied to the upsampled features
        self.act = nn.ReLU(inplace=True)

    def forward(self, x):
        """Conv -> PixelShuffle -> ReLU; channel count is kept, H and W double."""
        return self.act(self.ps(self.conv(x)))

In [8]:
class UpscaleGeneratorV1(nn.Module):
    """
    V1: like V0, but every residual block is modulated by alpha via FiLM.

    Pipeline: head conv -> FiLM-conditioned residual blocks -> PixelShuffle
    upsampler -> RGB tail. A single ``AlphaMLP`` maps alpha to one (gamma, beta)
    pair that is shared by all residual blocks.

    Args:
        scale: Upscaling factor; must be 2, 4, or 8.
        n_feats: Number of feature channels used throughout the network.
        n_res: Number of conditional residual blocks in the body.

    API stays: ``forward(x, alpha, preserve_graphics)``.
    """
    def __init__(self, scale: int = 4, n_feats: int = 64, n_res: int = 4):
        super().__init__()
        # The number of x2 upsampling stages is log2(scale), so only powers of
        # two up to 8 are accepted.
        assert scale in (2, 4, 8), "Scale must be 2, 4, or 8."
        self.scale = scale
        self.n_feats = n_feats

        # RGB (3 channels) -> feature space (n_feats channels)
        self.head = nn.Conv2d(3, n_feats, 3, padding=1)

        # ModuleList (not Sequential) because each block takes extra FiLM
        # arguments and is therefore called explicitly in forward().
        self.body = nn.ModuleList([ResidualBlockCond(n_feats) for _ in range(n_res)])

        # scale=2 -> 1 stage, scale=4 -> 2 stages, scale=8 -> 3 stages
        steps = int(math.log2(scale))
        self.upsampler = nn.Sequential(*[UpsampleBlock(n_feats) for _ in range(steps)])

        # Feature space -> RGB at the upscaled resolution
        self.tail = nn.Conv2d(n_feats, 3, 3, padding=1)

        # One MLP shared by all residual blocks: alpha -> (gamma, beta)
        self.alpha_mlp = AlphaMLP(n_feats, hidden=32, gain=0.1)

    @torch.no_grad()
    def count_params(self):
        """Return the total number of elements in parameters that require grad."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    def forward(self, x, alpha: float = 0.0, preserve_graphics: bool = False):
        """
        Upscale ``x`` by ``self.scale`` under the conditioning value ``alpha``.

        Args:
            x: Input batch of shape (N, 3, H, W).
            alpha: Python number or tensor. Accepted tensor shapes are 0-dim,
                (1,), (N,) or (N, 1). Values are clamped to [0, 1].
            preserve_graphics: Accepted for API compatibility; not used here.

        Returns:
            Tensor of shape (N, 3, scale*H, scale*W).
        """
        # Bring alpha onto x's dtype/device whether it arrived as a Python
        # number or as a tensor; otherwise a float64 or CPU tensor alpha would
        # fail inside the MLP's Linear layers.
        alpha = torch.as_tensor(alpha, dtype=x.dtype, device=x.device)

        # Defensive clamp to the supported [0, 1] range
        alpha = torch.clamp(alpha, 0.0, 1.0)

        # (N, 3, H, W) -> (N, n_feats, H, W)
        feat = self.head(x)

        # Flatten alpha to 1-D, then broadcast a single value across the batch;
        # a per-sample alpha of length N passes through unchanged.
        N = x.shape[0]
        a = alpha.reshape(-1).expand(N)  # (N,)

        # One (gamma, beta) pair per sample, each of shape (N, C)
        gamma, beta = self.alpha_mlp(a)

        # Every block receives the same gamma/beta
        for block in self.body:
            feat = block(feat, gamma, beta)

        # Each upsampling stage doubles H and W
        feat = self.upsampler(feat)

        # (N, n_feats, scale*H, scale*W) -> (N, 3, scale*H, scale*W)
        out = self.tail(feat)
        return out

Check: α actually changes the output

In [9]:
# Seeded dummy LR input
torch.manual_seed(0)
lr = torch.randn(1, 3, 24, 24)

# Alpha-conditioned V1 generator
gen = UpscaleGeneratorV1(scale=4, n_feats=64, n_res=4)
with torch.no_grad():
    # Same input at three settings of the alpha knob: 0.0, 0.5 and 1.0
    sr_a0, sr_a5, sr_a1 = (gen(lr, alpha=a) for a in (0.0, 0.5, 1.0))

# Mean absolute difference between two tensors, returned as a Python float
def mad(a, b):
    return torch.mean(torch.abs(a - b)).item()

# Output shape, how far each alpha step moves the output, and the model size
alpha_steps = (
    ("Δ(0.0, 0.5) MAD:", sr_a0, sr_a5),
    ("Δ(0.5, 1.0) MAD:", sr_a5, sr_a1),
)
print("SR shape @alpha=0.0:", tuple(sr_a0.shape))
for label, before, after in alpha_steps:
    print(label, round(mad(before, after), 6))
print("Params (M):", round(gen.count_params() / 1e6, 3))

SR shape @alpha=0.0: (1, 3, 96, 96)
Δ(0.0, 0.5) MAD: 0.003001
Δ(0.5, 1.0) MAD: 0.002831
Params (M): 0.599


### Add `preserve_graphics` hook (mask plumbing)

Mask stub + integrate in forward (no effect yet)

In [10]:
import torch.nn.functional as F

def structure_mask_v0(x: torch.Tensor) -> torch.Tensor:
    """
    Stub structure mask: an all-ones, single-channel map matching ``x``.

    Args:
        x: 4-D tensor of shape (N, C, H, W).

    Returns:
        Tensor of ones with shape (N, 1, H, W) and the dtype/device of ``x``.
        Edge/text detection is meant to replace this placeholder later.
    """
    # Unpacking four dims also rejects any input that is not 4-D
    batch, _channels, height, width = x.shape
    return torch.ones(batch, 1, height, width, device=x.device, dtype=x.dtype)

# Patch UpscaleGeneratorV1.forward to accept preserve_graphics and compute mask (no-op blend)
def _forward_with_mask(self, x, alpha: float = 0.0, preserve_graphics: bool = False):
    """
    Replacement for ``UpscaleGeneratorV1.forward`` that adds mask plumbing.

    Args:
        x: Input batch of shape (N, 3, H, W).
        alpha: Python number or tensor. Accepted tensor shapes are 0-dim,
            (1,), (N,) or (N, 1). Values are clamped to [0, 1].
        preserve_graphics: When True, a structure mask is computed from the
            output. The mask is not applied yet, so the result is the same
            either way.

    Returns:
        Upscaled tensor of shape (N, 3, Hs, Ws).
    """
    # ===== upscaling path =====
    # Bring alpha onto x's dtype/device whether it arrived as a Python number
    # or as a tensor; otherwise a float64 or CPU tensor alpha would fail
    # inside the MLP's Linear layers.
    alpha = torch.as_tensor(alpha, dtype=x.dtype, device=x.device)
    # Clamp alpha to the supported [0, 1] range
    alpha = torch.clamp(alpha, 0.0, 1.0)

    # Extract initial features from the RGB input
    feat = self.head(x)
    # Flatten alpha to 1-D, then broadcast a single value across the batch;
    # a per-sample alpha of length N passes through unchanged.
    N = x.shape[0]
    a = alpha.reshape(-1).expand(N)
    # FiLM parameters, each of shape (N, C)
    gamma, beta = self.alpha_mlp(a)

    # Conditional residual blocks, all sharing the same gamma/beta
    for block in self.body:
        feat = block(feat, gamma, beta)

    # Upsample to the target resolution, then project back to RGB
    feat = self.upsampler(feat)
    out = self.tail(feat)  # (N,3,Hs,Ws)

    # ===== graphics-preservation plumbing (neutral for now) =====
    if preserve_graphics:
        # The mask is computed to exercise the plumbing but intentionally not
        # used yet; a later step is expected to blend with it.
        M = structure_mask_v0(out)  # (N,1,Hs,Ws)
        y = out
        return y
    # Standard path: upscaled output without graphics preservation
    return out

# Monkey-patch method (keeps notebook simple):
# replace UpscaleGeneratorV1.forward with the mask-aware version above
UpscaleGeneratorV1.forward = _forward_with_mask


##### Testing Graphics Preservation Feature

We'll test the newly added `preserve_graphics` parameter to verify that:
1. The mask generation function works correctly
2. The graphics preservation pipeline is properly integrated
3. Currently it should behave as a no-op (neutral effect), since the mask is all ones and is not yet blended into the output

The test will compare outputs with and without `preserve_graphics=True` to confirm they are identical, validating that our plumbing is correct before implementing actual graphics detection.

In [11]:
torch.manual_seed(0)  # seed the global RNG so input and weight init are reproducible
lr = torch.randn(1, 3, 24, 24)  # dummy low-resolution batch (N=1, C=3, H=24, W=24)

gen = UpscaleGeneratorV1(scale=4, n_feats=64, n_res=4)  # alpha-conditioned upscaler
with torch.no_grad():  # inference only
    sr_off = gen(lr, alpha=0.3, preserve_graphics=False)  # without graphics preservation
    sr_on  = gen(lr, alpha=0.3, preserve_graphics=True)   # with graphics preservation

# Stored under its own name: `mad` is the helper function defined in the alpha
# check cell, and rebinding it to a float would break later calls to mad(...).
mad_on_off = (sr_off - sr_on).abs().mean().item()
print("SR shape:", tuple(sr_on.shape))
print("MAD (graphics off vs on):", mad_on_off)

# The mask is not applied yet, so both paths must give identical outputs
assert mad_on_off == 0.0, "preserve_graphics should be a no-op at this stage"

SR shape: (1, 3, 96, 96)
MAD (graphics off vs on): 0.0


### Notebook micro-train (GPU availability sanity check)

In [12]:
import os, sys

print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Small, fast settings so we can iterate
SCALE = 4            # {2,4,8}
BATCH_SIZE = 4
NUM_STEPS = 50       # tiny smoke run
LR = 2e-4            # Adam learning rate
ALPHA = 0.0          # start fully faithful; we'll use the knob later
IMG_SIZE_HR = 96     # small patches -> faster

# Windows-friendly DataLoader defaults
NUM_WORKERS = 0      # Avoid issues on Windows
# Pinned host memory only helps host-to-GPU copies, so request it only when the
# selected device is CUDA instead of unconditionally.
PIN_MEMORY = device.type == "cuda"


PyTorch: 2.5.1
CUDA available: True
Device: cuda


### Dataset Configuration and Data Loading Pipeline

This section sets up the training dataset and data loader infrastructure:
- Configures paths for high-resolution training images
- Creates on-the-fly paired dataset (HR → LR via bicubic downsampling)
- Establishes data loading pipeline with batching and GPU optimization
- Handles fallback directories and validates data availability for training

In [13]:
from pathlib import Path
from PIL import Image
import random
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms.functional as TF

IMG_EXTS = {".png", ".jpg", ".jpeg", ".bmp", ".webp"}

class PairedOnTheFlyDataset(Dataset):
    """
    Loads HR images, creates LR via bicubic downscale by SCALE.
    Returns: {'lr': tensor, 'hr': tensor, 'path': str}
    """
    def __init__(self, root_hr: Path, fallback: Path | None, scale: int = 4, patch_hr: int | None = None):
        self.scale = scale
        self.patch_hr = patch_hr
        self.root_hr = Path(root_hr)
        self.paths = []
        if self.root_hr.exists():
            self.paths = [p for p in self.root_hr.rglob("*") if p.suffix.lower() in IMG_EXTS]
        if not self.paths and fallback and Path(fallback).exists():
            self.paths = [p for p in Path(fallback).rglob("*") if p.suffix.lower() in IMG_EXTS]
        if not self.paths:
            raise FileNotFoundError("No images found in data/HR/ or data/samples/. Add a few images to proceed.")

    def _center_crop_multiple(self, img: Image.Image, multiple: int) -> Image.Image:
        w, h = img.size
        W = (w // multiple) * multiple
        H = (h // multiple) * multiple
        left = (w - W) // 2
        top  = (h - H) // 2
        return img.crop((left, top, left + W, top + H))

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        hr = Image.open(path).convert("RGB")
        hr = self._center_crop_multiple(hr, self.scale)

        # Optional HR patch for speed
        if self.patch_hr is not None:
            w, h = hr.size
            if w >= self.patch_hr and h >= self.patch_hr:
                left = (w - self.patch_hr) // 2
                top  = (h - self.patch_hr) // 2
                hr = hr.crop((left, top, left + self.patch_hr, top + self.patch_hr))

        # Make LR via bicubic
        w, h = hr.size
        lr = hr.resize((w // self.scale, h // self.scale), Image.BICUBIC)

        # To tensors in [0,1]
        hr_t = TF.to_tensor(hr)
        lr_t = TF.to_tensor(lr)
        return {"lr": lr_t, "hr": hr_t, "path": str(path)}


In [14]:
from pathlib import Path
from PIL import Image
from tqdm import tqdm

# Build the path from components rather than a "..\\data" literal: the
# backslash is only a separator on Windows, elsewhere it is part of the name.
DATA_ROOT = Path("..") / "data"
HR_VALID = DATA_ROOT / "HR" / "valid"
HR_TRAIN = DATA_ROOT / "HR" / "train"

#### DataLoader + model to GPU
We'll reuse `UpscaleGeneratorV1` from earlier in the notebook

In [15]:
# Build dataset & loader.
# Paths are composed from components rather than a "..\\data" literal: the
# backslash is only a separator on Windows, elsewhere it is part of the name.
root_hr = Path("..") / "data" / "HR"
fallback = Path("..") / "data" / "samples"
train_ds = PairedOnTheFlyDataset(root_hr, fallback, scale=SCALE, patch_hr=IMG_SIZE_HR)
# drop_last=True discards a final batch that has fewer than BATCH_SIZE items
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                      num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY, drop_last=True)

# Fresh generator instance (class defined earlier in the notebook), moved to the device
gen = UpscaleGeneratorV1(scale=SCALE, n_feats=64, n_res=4).to(device)
print("Params (M):", round(gen.count_params()/1e6, 3))


Params (M): 0.599


##### Tiny AMP training loop (L1 only, α=0 for now)
We’ll do a single-network (generator) optimization against an L1 reconstruction loss. This is not GAN training yet; it’s just to validate the pipeline, GPU, dataloader, and that loss decreases.

In [16]:
import torch.nn.functional as F
from torch.amp import autocast, GradScaler  # Updated import path for newer PyTorch versions
from time import perf_counter

# Put the generator in training mode. The layers it is built from (Conv2d, ReLU,
# PixelShuffle, Linear) behave the same in train and eval mode, so this is
# conventional rather than required here.
gen.train()

# Adam with a lower second-moment decay (0.99) than the library default (0.999)
opt = torch.optim.Adam(gen.parameters(), lr=LR, betas=(0.9, 0.99))

# Gradient scaler for automatic mixed precision (AMP).
# Explicitly enabled only on CUDA; when disabled, scale()/step()/update()
# pass through to the plain loss/optimizer behavior.
scaler = GradScaler(device=device.type, enabled=(device.type == "cuda"))

# Training loop state
step = 0             # number of optimizer steps taken so far
t0 = perf_counter()  # wall-clock start
loss_hist = []       # per-step L1 values

# One pass over the loader, capped at NUM_STEPS optimizer steps
for batch in train_dl:
    if step >= NUM_STEPS:
        break

    # Move the batch to the device; non_blocking lets the copy be asynchronous
    # when the source memory is pinned.
    lr = batch["lr"].to(device, non_blocking=True)  # low-resolution inputs
    hr = batch["hr"].to(device, non_blocking=True)  # high-resolution targets

    # Clear gradients from the previous iteration
    opt.zero_grad(set_to_none=True)

    # Forward pass and loss under autocast (mixed precision)
    with autocast(device_type=device.type):
        sr = gen(lr, alpha=ALPHA, preserve_graphics=False)
        # L1 (mean absolute error) between the generated and the target image
        loss = F.l1_loss(sr, hr)

    # Backward on the scaled loss, then let the scaler unscale the gradients,
    # run the optimizer step, and update its scale factor for the next iteration.
    scaler.scale(loss).backward()
    scaler.step(opt)
    scaler.update()

    # Read the loss once per step and reuse the value for history and logging
    loss_val = loss.item()
    loss_hist.append(loss_val)

    # Print progress every 10 steps
    if step % 10 == 0:
        print(f"step {step:03d}  L1: {loss_val:.4f}")

    step += 1

# Training completion summary
t1 = perf_counter()
if not loss_hist:
    # With drop_last=True a dataset smaller than BATCH_SIZE yields no batch at
    # all; fail with a clear message instead of an IndexError/ZeroDivisionError.
    raise RuntimeError(
        "Training loop ran 0 steps: the DataLoader produced no batches. "
        "Add more images or lower BATCH_SIZE."
    )
print(f"done {step} steps in {t1 - t0:.2f}s ; last L1={loss_hist[-1]:.4f}")
print("mean L1:", sum(loss_hist)/len(loss_hist))


step 000  L1: 0.5374
step 010  L1: 0.1424
step 020  L1: 0.1761
step 030  L1: 0.1041
step 040  L1: 0.0865
done 50 steps in 12.64s ; last L1=0.1017
mean L1: 0.16555019304156304


Save checkpoint for continuity

In [17]:
# Compose the path from components rather than a "..\\experiments" literal: the
# backslash is only a separator on Windows, elsewhere it is part of the name.
ckpt_dir = Path("..") / "experiments" / "checkpoints"
ckpt_dir.mkdir(parents=True, exist_ok=True)
ckpt_path = ckpt_dir / "gen_v1_smoke.pt"

# Store the weights together with the settings of this run
torch.save({
    "model": gen.state_dict(),
    "scale": SCALE,
    "alpha_hint": ALPHA,
    "meta": {"steps": step, "img_size_hr": IMG_SIZE_HR}
}, ckpt_path)

print("Saved:", ckpt_path.resolve())


Saved: \PixelForge\experiments\checkpoints\gen_v1_smoke.pt
