In [1]:
import torch
from torch import nn
import torchvision
from torchvision import transforms
from torch.utils.data import TensorDataset, DataLoader

## **Data**
http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz  
http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz  
http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz  
http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz  
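Download the four files above and place them in `{root}\FashionMNIST\raw` (the datasets below are created with `download=False`, so the files must already be there).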

In [2]:
# Preprocessing pipeline: upscale every image to 32x32, then convert it to a tensor.
resize_to_32 = transforms.Resize((32, 32))
trans = transforms.Compose([resize_to_32, transforms.ToTensor()])

# Both splits are read from ./data with download=False, so the raw files
# must already be present on disk.
dataset_kwargs = dict(root='./data', transform=trans, download=False)
data_train = torchvision.datasets.FashionMNIST(train=True, **dataset_kwargs)
data_val = torchvision.datasets.FashionMNIST(train=False, **dataset_kwargs)

In [3]:
# Look at the first training example; each dataset item is an (image, label) pair.
first_sample = data_train[0]
image, label = first_sample
print(image.shape)  # (channel, height, width)
print(label)

torch.Size([1, 32, 32])
9


In [4]:
# Mini-batch loaders: only the training split is shuffled; validation order stays fixed.
batch_size = 64
loader_kwargs = {'batch_size': batch_size}
train_loader = DataLoader(data_train, shuffle=True, **loader_kwargs)
val_loader = DataLoader(data_val, shuffle=False, **loader_kwargs)

## **From Scratch**

#### Model

In [5]:
def relu(X):
    """Rectified linear unit: element-wise ``max(X, 0)``.

    Returns a new tensor with the same shape and dtype as ``X``.
    """
    zeros = torch.zeros_like(X)
    return torch.maximum(X, zeros)

def softmax(X):  # X.shape = (n, d)
    """Row-wise softmax of a 2-D tensor.

    Args:
        X: tensor of shape (n, d) holding one row of scores per sample.

    Returns:
        Tensor of shape (n, d) whose rows are non-negative and sum to 1.

    The row maximum is subtracted before exponentiating. Softmax is invariant
    to adding a constant to every entry of a row, so the result is unchanged
    mathematically, but ``exp`` can no longer overflow to ``inf`` (which would
    turn the whole row into NaN via ``inf / inf``).
    """
    # The shift is a constant as far as the gradient is concerned, so detach it.
    row_max = X.max(dim=1, keepdim=True).values.detach()  # shape: (n, 1)
    X_exp = torch.exp(X - row_max)  # elementwise, every entry is in [0, 1]
    partition = X_exp.sum(dim=1, keepdim=True)  # shape: (n, 1)
    return X_exp / partition  # shape: (n, d), broadcast over columns

class MLPScratch(nn.Module):
    """One-hidden-layer perceptron with explicitly managed weight tensors.

    Args:
        num_inputs: number of features per sample after flattening.
        num_outputs: number of output classes.
        num_hiddens: width of the hidden layer.
        sigma: scale applied to the standard-normal draws used to initialise
            the weight matrices; biases start at zero.
    """

    def __init__(self, num_inputs, num_outputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.num_inputs = num_inputs
        # Input -> hidden layer.
        self.W1 = nn.Parameter(sigma * torch.randn(num_inputs, num_hiddens))
        self.b1 = nn.Parameter(torch.zeros(num_hiddens))
        # Hidden -> output layer.
        self.W2 = nn.Parameter(sigma * torch.randn(num_hiddens, num_outputs))
        self.b2 = nn.Parameter(torch.zeros(num_outputs))

    def forward(self, X):
        """Flatten ``X`` to (-1, num_inputs) and return softmax class probabilities."""
        flat = X.reshape(-1, self.num_inputs)
        hidden = relu(flat @ self.W1 + self.b1)
        scores = hidden @ self.W2 + self.b2
        return softmax(scores)

#### Loss

In [6]:
def cross_entropy(y_hat, y):
    """Mean negative log-probability assigned to the true class.

    Args:
        y_hat: (B, q) tensor of predicted class probabilities.
        y: (B,) tensor of integer class labels.

    Returns:
        Scalar tensor. The textbook definition sums ``-y_i * log(y_hat_i)``;
        the mean is used here so the value is already divided by the batch size.
    """
    sample_idx = torch.arange(y_hat.shape[0], device=y_hat.device)
    # Pick, for every row, the probability of that row's true label.
    true_class_prob = y_hat[sample_idx, y]
    log_likelihood = torch.log(true_class_prob)
    return -log_likelihood.mean()

def accuracy(y_hat, y):
    """Fraction of samples whose highest-scoring class equals the label.

    Args:
        y_hat: (B, q) tensor of per-class scores.
        y: (B,) tensor of integer class labels.

    Returns:
        Scalar float32 tensor in [0, 1].
    """
    predicted = y_hat.argmax(dim=1).to(y.dtype)  # (B)
    hits = predicted.eq(y).to(torch.float32)  # (B), 1.0 where correct
    return hits.mean()

#### Training

In [7]:
# Learning rate and model for the from-scratch run.
# Inputs are 1x32x32 images flattened to 1024 features; there are 10 classes.
lr = 0.1
model = MLPScratch(num_inputs=1 * 32 * 32, num_hiddens=256, num_outputs=10)

In [8]:
# Plain mini-batch SGD over every parameter registered on the model.
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

In [9]:
%%time
max_epochs = 10

for i in range(max_epochs):
    # ---- training pass ----
    # train()/eval() are harmless for a plain MLP, but they keep the loop correct
    # once layers such as dropout or batch norm are added.
    model.train()
    train_loss = 0.0  # running sum of per-sample losses
    num_train_samples = 0

    for X, y in train_loader:
        optimizer.zero_grad()  # clear the gradients stored on the parameters by the previous step
        y_hat = model(X)
        loss = cross_entropy(y_hat, y)
        loss.backward()
        optimizer.step()
        # cross_entropy returns a batch mean. Weight it by the batch size so the
        # smaller final batch is not over-represented in the epoch average.
        train_loss += loss.item() * len(y)
        num_train_samples += len(y)

    # ---- validation pass ----
    model.eval()
    val_loss = 0.0  # running sum of per-sample losses
    val_acc = 0.0  # running count of correct predictions
    num_val_samples = 0
    with torch.no_grad():
        for X, y in val_loader:
            y_hat = model(X)
            loss = cross_entropy(y_hat, y)
            val_loss += loss.item() * len(y)
            val_acc += accuracy(y_hat, y).item() * len(y)
            num_val_samples += len(y)

    print(f'epoch={i:02d} | train_loss={train_loss/num_train_samples:.4f} | val_loss={val_loss/num_val_samples:.4f} | val_acc={val_acc/num_val_samples:.4f}')

epoch=00 | train_loss=0.6806 | val_loss=0.6302 | val_acc=0.7959
epoch=01 | train_loss=0.4514 | val_loss=0.4564 | val_acc=0.8349
epoch=02 | train_loss=0.4035 | val_loss=0.5585 | val_acc=0.7727
epoch=03 | train_loss=0.3772 | val_loss=0.4152 | val_acc=0.8489
epoch=04 | train_loss=0.3583 | val_loss=0.3967 | val_acc=0.8560
epoch=05 | train_loss=0.3421 | val_loss=0.4904 | val_acc=0.8115
epoch=06 | train_loss=0.3286 | val_loss=0.4114 | val_acc=0.8486
epoch=07 | train_loss=0.3176 | val_loss=0.3660 | val_acc=0.8678
epoch=08 | train_loss=0.3101 | val_loss=0.3547 | val_acc=0.8732
epoch=09 | train_loss=0.2994 | val_loss=0.3454 | val_acc=0.8745
CPU times: total: 12min 56s
Wall time: 2min 11s


## **Concise Implementation**

In [44]:
class MLP(nn.Module):
    """One-hidden-layer perceptron built from ``nn`` layers.

    Args:
        num_inputs: number of features per sample after flattening.
        num_hiddens: width of the hidden layer.
        num_outputs: number of output classes.

    ``forward`` returns the raw output of the last linear layer; no softmax
    is applied.
    """

    def __init__(self, num_inputs, num_hiddens, num_outputs):
        super().__init__()
        self.num_inputs = num_inputs
        hidden_layer = nn.Linear(num_inputs, num_hiddens)
        output_layer = nn.Linear(num_hiddens, num_outputs)
        self.net = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)

    def forward(self, X):
        """Flatten ``X`` to (-1, num_inputs) and run it through the network."""
        flat = X.reshape(-1, self.num_inputs)
        return self.net(flat)

In [45]:
def cross_entropy(y_hat, y):
    """Mean cross-entropy loss computed directly from unnormalised scores.

    Args:
        y_hat: (B, q) tensor of raw class scores (logits); the softmax is
            applied inside the loss, so do not pass probabilities.
        y: (B,) tensor of integer class labels.

    Returns:
        Scalar tensor: the loss averaged over the batch.
    """
    return nn.functional.cross_entropy(y_hat, y)

def accuracy(y_hat, y):
    """Share of samples whose arg-max class matches the label.

    Args:
        y_hat: (B, q) tensor of per-class scores.
        y: (B,) tensor of integer class labels.

    Returns:
        Scalar float32 tensor in [0, 1].
    """
    predicted_labels = torch.argmax(y_hat, dim=1).type(y.dtype)  # (B)
    is_correct = torch.eq(predicted_labels, y)  # (B), boolean
    return is_correct.type(torch.float32).mean()

In [46]:
# Learning rate and model for the concise (nn-layer based) run.
# Inputs are 1x32x32 images flattened to 1024 features; there are 10 classes.
lr = 0.1
model = MLP(num_inputs=1 * 32 * 32, num_outputs=10, num_hiddens=256)

In [47]:
# Plain mini-batch SGD over every parameter of the freshly created model.
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

In [48]:
%%time
max_epochs = 10

for i in range(max_epochs):
    # ---- training pass ----
    # train()/eval() are harmless for a plain MLP, but they keep the loop correct
    # once layers such as dropout or batch norm are added.
    model.train()
    train_loss = 0.0  # running sum of per-sample losses
    num_train_samples = 0

    for X, y in train_loader:
        optimizer.zero_grad()  # clear the gradients stored on the parameters by the previous step
        y_hat = model(X)
        loss = cross_entropy(y_hat, y)
        loss.backward()
        optimizer.step()
        # cross_entropy returns a batch mean. Weight it by the batch size so the
        # smaller final batch is not over-represented in the epoch average.
        train_loss += loss.item() * len(y)
        num_train_samples += len(y)

    # ---- validation pass ----
    model.eval()
    val_loss = 0.0  # running sum of per-sample losses
    val_acc = 0.0  # running count of correct predictions
    num_val_samples = 0
    with torch.no_grad():
        for X, y in val_loader:
            y_hat = model(X)
            loss = cross_entropy(y_hat, y)
            val_loss += loss.item() * len(y)
            val_acc += accuracy(y_hat, y).item() * len(y)
            num_val_samples += len(y)

    print(f'epoch={i:02d} | train_loss={train_loss/num_train_samples:.4f} | val_loss={val_loss/num_val_samples:.4f} | val_acc={val_acc/num_val_samples:.4f}')

epoch=00 | train_loss=0.6254 | val_loss=0.5637 | val_acc=0.8024
epoch=01 | train_loss=0.4408 | val_loss=0.4310 | val_acc=0.8487
epoch=02 | train_loss=0.3976 | val_loss=0.4477 | val_acc=0.8359
epoch=03 | train_loss=0.3724 | val_loss=0.4297 | val_acc=0.8419
epoch=04 | train_loss=0.3510 | val_loss=0.3722 | val_acc=0.8651
epoch=05 | train_loss=0.3384 | val_loss=0.3619 | val_acc=0.8686
epoch=06 | train_loss=0.3237 | val_loss=0.3704 | val_acc=0.8637
epoch=07 | train_loss=0.3137 | val_loss=0.3483 | val_acc=0.8776
epoch=08 | train_loss=0.3059 | val_loss=0.3483 | val_acc=0.8748
epoch=09 | train_loss=0.2974 | val_loss=0.3666 | val_acc=0.8648
CPU times: total: 12min 7s
Wall time: 2min 2s
