# Lab 3
## Exercise 01

In [None]:
from typing import Any
import torch
from torch.autograd import Function
from torch import Tensor

# ---------- Part A: Custom autograd function for f(x1,x2) ----------
# f(x) = (x1 - 6)^2 + x2^2 - x1*x2
class MyFunction(Function):
    """Custom autograd function for f(x) = (x1 - 6)^2 + x2^2 - x1*x2.

    Only ``x[0]`` and ``x[1]`` are read, and the gradient returned by
    ``backward`` always has two entries, so ``x`` is expected to be a tensor
    holding exactly the two coordinates ``(x1, x2)``.
    """

    @staticmethod
    def forward(ctx: Any, x: Tensor) -> Tensor:
        """
        Evaluate f at ``x``.
        :param ctx: Autograd context; ``x`` is saved on it for ``backward``.
        :param x: Tensor whose first two entries are ``x1`` and ``x2``.
        :return: Scalar (0-dim) tensor with the function value.
        """
        # Save x for backward
        ctx.save_for_backward(x)
        x1, x2 = x[0], x[1]
        return (x1 - 6) ** 2 + x2 ** 2 - x1 * x2

    @staticmethod
    def backward(ctx: Any, grad_output: Tensor) -> Tensor:
        """
        Propagate the upstream gradient through f.
        :param ctx: Autograd context holding the tensor saved in ``forward``.
        :param grad_output: Gradient of the downstream result w.r.t. f(x).
        :return: Gradient w.r.t. ``x``: ``grad_output * ∇f(x)``.
        """
        (x,) = ctx.saved_tensors
        x1, x2 = x[0], x[1]
        # ∇f = [ 2(x1-6) - x2,  2x2 - x1 ]
        # torch.stack keeps dtype/device and stays on the autograd graph;
        # rebuilding with torch.tensor([...]) would convert each entry to a
        # Python scalar and break backward-of-backward (create_graph=True).
        grad_x = torch.stack((2 * (x1 - 6) - x2, 2 * x2 - x1))
        # Chain rule: grad_output is the scalar multiplier from subsequent ops
        return grad_output * grad_x

# Autograd Functions are used through the class itself: `apply` is a static
# entry point, and instantiating a Function subclass is deprecated in PyTorch.
func = MyFunction

# Check: forward and backward at x = (6,6)
x = torch.tensor([6., 6.], requires_grad=True)
y = func.apply(x)
print('Function output f(6,6):', y.item())
y.backward()
print('Gradients at (6,6):', x.grad.tolist())    # Expect [-6, 6]


Function output f(6,6): 0.0
Gradients at (6,6): [-6.0, 6.0]


  func = MyFunction()


tensor([0., 0.])

In [4]:
class GradientDescentOptimizer:
    """Plain gradient descent driven by a custom autograd ``Function``."""

    def __init__(self, func: Function, max_steps: int, alpha: float):
        """
        Init an Optimizer for performing GD.
        :param func: Autograd Function to minimise. Only ``func.apply`` is
            used, so either the Function class or an instance works.
        :param max_steps: Maximum number of GD steps.
        :param alpha: Learning Rate.
        """
        self.func = func
        self.max_steps = max_steps
        self.alpha = alpha

    def __call__(self, x: Tensor, tol: float = 1e-6, verbose: bool = True) -> Tensor:
        """
        Apply GD on a tensor.
        :param x: Input tensor (will not be modified).
        :param tol: Early-stopping threshold on gradient norm.
        :param verbose: If True, print progress.
        :return: The optimised copy of ``x``. It has ``requires_grad=True``
            and its ``.grad`` holds the gradient from the last evaluated step.
        """
        # Work on a detached copy so the caller's tensor is left untouched
        x_cp = x.detach().clone()
        x_cp.requires_grad_(True)

        for step in range(1, self.max_steps + 1):
            # Forward
            y = self.func.apply(x_cp)

            # Backward: compute ∇f(x_cp). Gradients accumulate across
            # backward() calls, so reset the previous one first.
            if x_cp.grad is not None:
                x_cp.grad.zero_()
            y.backward()

            # Early stopping: small gradient norm
            grad_norm = x_cp.grad.norm().item()
            if verbose:
                print(f"step {step:2d}: x = {x_cp.detach().tolist()}, f(x) = {y.item():.6f}, ||grad|| = {grad_norm:.3e}")

            if grad_norm < tol:
                if verbose:
                    print(f"Early stop: gradient norm {grad_norm:.3e} < tol {tol:.1e}")
                break

            # Gradient step: x <- x - alpha * grad.
            # The in-place update runs under no_grad so it is not recorded;
            # this avoids going through `.data`, which bypasses autograd's
            # in-place modification tracking.
            with torch.no_grad():
                x_cp -= self.alpha * x_cp.grad

        return x_cp

# Run the optimizer from x = (6,6)
x0 = torch.tensor([6., 6.], requires_grad=True)
# Pass the Function class itself (instantiating an autograd Function is
# deprecated); the optimizer only calls `func.apply`.
gd_optimizer = GradientDescentOptimizer(func=MyFunction, max_steps=10, alpha=0.1)  # alpha < 2/3 ensures convergence
x_new = gd_optimizer(x0, tol=1e-10, verbose=True)
print("Final x:", x_new.detach().tolist())

step  1: x = [6.0, 6.0], f(x) = 0.000000, ||grad|| = 8.485e+00
step  2: x = [6.599999904632568, 5.400000095367432], f(x) = -6.119997, ||grad|| = 5.940e+00
step  3: x = [7.019999980926514, 4.980000019073486], f(x) = -9.118799, ||grad|| = 4.158e+00
step  4: x = [7.314000129699707, 4.685999870300293], f(x) = -10.588211, ||grad|| = 2.910e+00
step  5: x = [7.519800186157227, 4.480199813842773], f(x) = -11.308224, ||grad|| = 2.037e+00
step  6: x = [7.663860321044922, 4.336139678955078], f(x) = -11.661030, ||grad|| = 1.426e+00
step  7: x = [7.764702320098877, 4.235297679901123], f(x) = -11.833906, ||grad|| = 9.983e-01
step  8: x = [7.835291862487793, 4.164708137512207], f(x) = -11.918613, ||grad|| = 6.988e-01
step  9: x = [7.884704113006592, 4.115295886993408], f(x) = -11.960123, ||grad|| = 4.892e-01
step 10: x = [7.91929292678833, 4.08070707321167], f(x) = -11.980459, ||grad|| = 3.424e-01
Final x: [7.94350528717041, 4.05649471282959]


  gd_optimizer = GradientDescentOptimizer(func=MyFunction(), max_steps=10, alpha=0.1)  # alpha < 2/3 ensures convergence
