In [1]:
%matplotlib inline
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

## **Dataset**

In [2]:
num_inputs = 2

w = torch.tensor([2, -3.4])
b = 4.2

batch_size = 32

num_train = 1000
X_train = torch.normal(0, 1, (num_train, num_inputs))
y_train = torch.matmul(X_train, w.reshape(-1, 1)) + b + torch.normal(0, 0.01, (num_train, 1))

num_val = 1000
X_val = torch.normal(0, 1, (num_val, num_inputs))
y_val = torch.matmul(X_val, w.reshape(-1, 1)) + b + torch.normal(0, 0.01, (num_val, 1))

train_ds = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

val_ds = TensorDataset(X_val, y_val)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

## **Functions**

In [3]:
class LinearRegression(nn.Module):
    def __init__(self, num_inputs, lr, sigma=0.01):
        super().__init__()
        self.num_inputs = num_inputs
        self.lr = lr
        self.w = torch.normal(0, sigma, (num_inputs, 1), requires_grad=True)
        self.b = torch.zeros(1, requires_grad=True)

    def forward(self, X):
        return torch.matmul(X, self.w) + self.b

In [4]:
class SGD():
    def __init__(self, params, lr):
        self.params = params
        self.lr = lr

    def step(self):
        with torch.no_grad():  # gradient 계산 없이 주어진 연산만 진행해라
            for param in self.params:
                param -= self.lr * param.grad

    def zero_grad(self):
        for param in self.params:
            if param.grad is not None:
                param.grad.zero_()

## **Training**

In [5]:
model = LinearRegression(num_inputs=num_inputs, lr=0.03)
optimizer = SGD([model.w, model.b], model.lr)
criterion = nn.MSELoss()

num_epochs = 10
for i in range(num_epochs):
    model.train()

    num_train_batches = 0
    train_loss = 0
    for X, y in train_loader:
        optimizer.zero_grad()
        y_hat = model(X)
        loss = criterion(y_hat, y)
        loss.backward()   # loss를 parameter로 미분해라 → w.grad, b.grad 값 생김
        optimizer.step()  # 계산한 gradient에 따라 parameter를 업데이트해라

        num_train_batches += 1
        train_loss += loss.item()  # 값만 가져와라
        
    model.eval()

    num_val_batches = 0
    val_loss = 0
    with torch.no_grad():
        for X, y in val_loader:
            y_hat = model(X)
            loss = criterion(y_hat, y)

            num_val_batches += 1
            val_loss += loss.item()

    print(f'epoch {i:02d} train_loss={train_loss/num_train_batches:.4f} val_loss={val_loss/num_val_batches:.4f}')

epoch 00 train_loss=8.6772 val_loss=0.8581
epoch 01 train_loss=0.2194 val_loss=0.0223
epoch 02 train_loss=0.0058 val_loss=0.0007
epoch 03 train_loss=0.0002 val_loss=0.0001
epoch 04 train_loss=0.0001 val_loss=0.0001
epoch 05 train_loss=0.0001 val_loss=0.0001
epoch 06 train_loss=0.0001 val_loss=0.0001
epoch 07 train_loss=0.0001 val_loss=0.0001
epoch 08 train_loss=0.0001 val_loss=0.0001
epoch 09 train_loss=0.0001 val_loss=0.0001


In [6]:
model.w

tensor([[ 1.9994],
        [-3.4004]], requires_grad=True)

In [7]:
w

tensor([ 2.0000, -3.4000])

In [8]:
model.b

tensor([4.2003], requires_grad=True)

In [9]:
b

4.2

**정리하면**  
- model: require_grad=True인 paramters 필수 + forward() 메소드 필수
- optimizer: gradient와 learning rate에 따라 paramter를 업데이트 시키는 step() 메소드 필수 + gradient 초기화시키는 zero_grad() 메소드 필수
- loss: y_hat과 y로 계산하는 loss
- training 할때는 optimizer.zero_grad() → forward() → loss(y_hat, y) → → loss.backward() → optimizer.step()