## Recap

1. Task
2. Model
3. Loss
4. Training step



## Linear regression example

In [None]:
import torch
# Fix the RNG so the initial guess and the noise below are reproducible.
torch.manual_seed(0)

# Ground-truth parameters used to generate the data: y = 1 + 3x.
real_params = torch.tensor([1.0, 3.0], requires_grad=False)

# Random starting point for the learnable parameters (gradients are tracked).
params = torch.randn_like(real_params, requires_grad=True)
print('params:', params)

# Inputs on a regular grid from -10 (included) to 10 (excluded), step 0.1.
xaxis = torch.arange(-10, 10, 0.1)
print(xaxis.shape)

# Targets: the true line plus one random draw per input.
noise = torch.randn_like(xaxis)
yaxis = real_params[0] + real_params[1] * xaxis + noise
print(yaxis.shape)

# One row per sample: column 0 holds x, column 1 holds y.
dataset = torch.stack((xaxis, yaxis), dim=1)
print(dataset.shape)

In [None]:
import matplotlib.pyplot as plt
# Scatter the samples: column 0 of the dataset against column 1.
x_values = dataset[:, 0]
y_values = dataset[:, 1]
plt.scatter(x_values, y_values)
plt.show()

In [None]:
def regression(x_input: torch.Tensor, params: torch.Tensor) -> torch.Tensor:
    """
    Evaluate the linear model y_hat = params[0] + params[1] * x_input.

    Input:
    x_input: torch.Tensor
        - single data point: torch.Size([])
        - batch of data points: torch.Size([batch_size])
    params: torch.Tensor of shape torch.Size([2])
        params[0] is the intercept and params[1] is the slope.

    Output:
    y_hat: torch.Tensor with the same shape as x_input
        - single data point: torch.Size([])
        - batch of data points: torch.Size([batch_size])
    """
    intercept = params[0]
    slope = params[1]
    return intercept + slope * x_input

def loss(y: torch.Tensor, y_hat: torch.Tensor) -> torch.Tensor:
    """
    Mean squared error between the targets y and the predictions y_hat.

    Input:
    y: torch.Tensor
        - single data point: torch.Size([])
        - batch of data points: torch.Size([batch_size])
    y_hat: torch.Tensor (y_hat = params[0] + params[1]*x_input)
        - single data point: torch.Size([])
        - batch of data points: torch.Size([batch_size])

    Output:
    loss: torch.Tensor of shape torch.Size([]) -- ALWAYS A SCALAR, because
    the squared errors are averaged over every element.
    """
    residual = y - y_hat
    return torch.mean(residual ** 2)

In [None]:
import numpy as np
# Per-sample SGD: the parameters are updated once for every data point.
torch.manual_seed(0)  # re-seed so the initial guess is reproducible
params = torch.randn_like(real_params, requires_grad=True)
lr = 0.001
opt = torch.optim.SGD([params], lr=lr)

params_history_single = []
loss_history_single = []

n_updates = 0
for epoch in range(30):
    for sample in dataset:
        # Each row of the dataset is one (x, y) pair.
        x_i, y_i = sample[0], sample[1]

        opt.zero_grad()
        y_hat = regression(x_i, params)
        l = loss(y_i, y_hat)
        l.backward()
        opt.step()
        n_updates += 1

    # One snapshot per epoch. Note that `l` is the loss of the LAST sample
    # seen in the epoch, not an average over the epoch.
    params_history_single.append(params.detach().clone().numpy())
    loss_history_single.append(l.detach().clone().numpy())

# Stack the per-epoch snapshots into arrays.
params_history_single = np.array(params_history_single)
loss_history_single = np.array(loss_history_single)

print('params:', params)
print('n_updates:', n_updates)

In [None]:
import numpy as np
# Mini-batch SGD: the parameters are updated once per batch of samples.
torch.manual_seed(0)  # re-seed so the initial guess is reproducible
params = torch.randn_like(real_params, requires_grad=True)
lr = 0.001
opt = torch.optim.SGD([params], lr=lr)

params_history_batch = []
loss_history_batch = []

N = len(dataset) # 200
num_batches = 20
# Floor division: if N were not a multiple of num_batches, the trailing
# N % num_batches samples would never be visited by the loop below.
batch_size = N // num_batches
print('batch_size:', batch_size)

# The dataset is consumed as consecutive, non-overlapping slices:
# [[data1, ..., data10], [data11, ..., data20], ...]

n_updates = 0
for epoch in range(30):
    for start in range(0, num_batches * batch_size, batch_size):
        end = start + batch_size
        batch = dataset[start:end]

        opt.zero_grad()
        y_hat = regression(batch[:, 0], params)
        l = loss(batch[:, 1], y_hat)
        l.backward()
        opt.step()
        n_updates += 1

    # One snapshot per epoch. Note that `l` is the loss of the LAST batch
    # of the epoch, not an average over all batches.
    params_history_batch.append(params.detach().clone().numpy())
    loss_history_batch.append(l.detach().clone().numpy())

print('params:', params)
print('n_updates:', n_updates)


In [None]:
# Full-batch gradient descent: one update per epoch, using every sample.
torch.manual_seed(0)  # re-seed so the initial guess is reproducible
params = torch.randn_like(real_params, requires_grad=True)
lr = 0.001
opt = torch.optim.SGD([params], lr=lr)

params_history_total = []
loss_history_total = []

# The whole dataset is used at every step, so slice the columns once.
all_x = dataset[:, 0]
all_y = dataset[:, 1]

n_updates = 0
for epoch in range(30):
    opt.zero_grad()
    y_hat = regression(all_x, params)
    l = loss(all_y, y_hat)
    l.backward()
    opt.step()
    n_updates += 1

    # Snapshot after the update; `l` is the loss measured before it.
    params_history_total.append(params.detach().clone().numpy())
    loss_history_total.append(l.detach().numpy())

print('params:', params)
print('n_updates:', n_updates)

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt

# Ground-truth parameters of the line y = p0 + p1*x.
true_p0 = 1.0
true_p1 = 3.0

# Tiny noise-free dataset lying exactly on that line.
x = torch.tensor([-1.0, 1.0])
y = true_p0 + true_p1 * x  # y = [-2, 4]

def loss_p(p0, p1):
    """Mean squared error of the line p0 + p1*x on the two points (x, y)."""
    prediction = p0 + p1 * x
    return (y - prediction).pow(2).mean()

# Grid of candidate parameters around the optimum (1, 3).
p0_vals = np.linspace(0, 2, 100)
p1_vals = np.linspace(2, 4, 100)
p0_mesh, p1_mesh = np.meshgrid(p0_vals, p1_vals)

# Evaluate the loss at every node of the grid.
loss_vals = np.zeros_like(p0_mesh)
for node in np.ndindex(*p0_mesh.shape):
    loss_vals[node] = loss_p(p0_mesh[node], p1_mesh[node])

# 3D plot of the loss surface
# NOTE(review): Axes3D is never referenced directly; presumably it is imported
# to register the '3d' projection on older matplotlib -- confirm before removing.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(p0_mesh, p1_mesh, loss_vals, cmap='viridis')
ax.set_xlabel('p0')
ax.set_ylabel('p1')
plt.show()

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt

def plot_history(params_history_batch, loss_history_batch):
    """
    Draw an optimisation trajectory on top of a reference 3D loss surface.

    The surface is the mean squared error of the line y = p0 + p1*x on the
    two noise-free points x = [-1, 1] generated with p0 = 1 and p1 = 3. It is
    evaluated on a 100x100 grid with p0 in [0, 2] and p1 in [2, 4].

    The trajectory is plotted exactly as recorded: the heights come from
    loss_history_batch and are NOT recomputed on the two-point dataset above.

    Input:
    params_history_batch: sequence or array of shape (n_steps, 2)
        Recorded parameters, one row [p0, p1] per step.
    loss_history_batch: sequence or array of shape (n_steps,)
        Recorded loss value for each step.

    Output:
    None. The figure is shown with plt.show().
    """
    # Ground truth parameters
    true_p0, true_p1 = 1.0, 3.0

    # Generate the simple dataset
    x = np.array([-1.0, 1.0])
    y = true_p0 + true_p1 * x

    # Generate mesh grid around optimal parameters
    p0_vals = np.linspace(0, 2, 100)
    p1_vals = np.linspace(2, 4, 100)
    p0_mesh, p1_mesh = np.meshgrid(p0_vals, p1_vals)

    # Compute the loss for the whole grid at once: the trailing axis added to
    # the meshes broadcasts against the data points, giving predictions of
    # shape (100, 100, 2); the mean over that last axis is the MSE per node.
    predictions = p0_mesh[..., np.newaxis] + p1_mesh[..., np.newaxis] * x
    loss_vals = ((y - predictions) ** 2).mean(axis=-1)

    # Convert the histories to arrays so they can be indexed by column
    params_history_batch = np.array(params_history_batch)
    loss_history_batch = np.array(loss_history_batch)

    # Plotting
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')

    # Plot the loss surface with transparency so the path stays visible
    ax.plot_surface(p0_mesh, p1_mesh, loss_vals, cmap='viridis', alpha=0.7, edgecolor='none')

    # Overlay the optimization trajectory
    ax.plot(
        params_history_batch[:, 0],
        params_history_batch[:, 1],
        loss_history_batch,
        color='red', linewidth=3, marker='o', markersize=5, label='Optimization Path'
    )

    # Highlight the initial and final points
    ax.scatter(params_history_batch[0, 0], params_history_batch[0, 1], loss_history_batch[0], color='blue', s=80, label='Start', edgecolor='black')
    ax.scatter(params_history_batch[-1, 0], params_history_batch[-1, 1], loss_history_batch[-1], color='yellow', s=80, label='End', edgecolor='black')

    # Axes labels and legend
    ax.set_xlabel('p0', fontsize=12)
    ax.set_ylabel('p1', fontsize=12)
    ax.set_zlabel('loss', fontsize=12)

    ax.legend()

    plt.title("Clear 3D Loss Surface with Optimization Path", fontsize=14)
    plt.tight_layout()
    plt.show()



In [None]:
# Trajectory of per-sample SGD (one recorded point per epoch).
plot_history(params_history_single, loss_history_single)

In [None]:
# Trajectory of mini-batch SGD; the first two recorded epochs are dropped.
# NOTE(review): presumably because those early points fall outside the
# p0/p1 range covered by the plotted surface -- confirm.
plot_history(params_history_batch[2:], loss_history_batch[2:])

In [None]:
# Trajectory of full-batch gradient descent; the first two recorded epochs are dropped.
# NOTE(review): presumably because those early points fall outside the
# p0/p1 range covered by the plotted surface -- confirm.
plot_history(params_history_total[2:], loss_history_total[2:])

## `torch.utils.data.Dataset`, `DataLoader` and `torchvision.datasets`

`Dataset` provides an interface for accessing individual data points and for defining your own dataset class.

`DataLoader` is an iterable that provides features such as batching, shuffling, multi-process data loading, and more. It is built from a `Dataset` object.

`torchvision.datasets` provides some common datasets like MNIST, CIFAR10, etc.

The `Dataset` is an abstraction that lets you load and process each sample of your dataset lazily, while the `DataLoader` takes care of shuffling/sampling/weighted sampling, batching, using multiprocessing to load the data, using pinned memory, etc.

In [None]:
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, Dataset


# MNIST training split stored under ~/datasets/; download=True asks
# torchvision to fetch the files, and ToTensor is the per-sample transform.
to_tensor = transforms.ToTensor()
train_dataset = datasets.MNIST(
    "~/datasets/",
    train=True,
    download=True,
    transform=to_tensor,
)

# Dataset summary, shapes of the stored images and labels, and the first
# stored image.
for summary in (
    train_dataset,
    train_dataset.data.shape,
    train_dataset.targets.shape,
    train_dataset.data[0],
):
    print(summary)

In [None]:
# Wrap the dataset in a loader configured for shuffled batches of 64 samples.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
)
print(train_dataloader)

In [None]:
# Pull a single batch from a fresh iterator over the loader.
iteration = next(iter(train_dataloader))

# Element 0 holds the inputs and element 1 the labels of the batch.
batch_images, batch_labels = iteration[0], iteration[1]
print(batch_images.shape)
print(batch_labels.shape)
print(batch_images[0])

In [None]:
# plot the image
import matplotlib.pyplot as plt

# Fetch one batch and unpack it into inputs and labels.
first_batch = next(iter(train_dataloader))
images, labels = first_batch

print(images.shape)
print(labels.shape)

# squeeze() removes size-1 dimensions (presumably the channel axis -- confirm)
# so that imshow receives a 2D array.
first_image = images[0].squeeze()
plt.imshow(first_image, cmap='gray')
plt.show()