# Introduction

Real-world datasets have surprises in them, such as missing data, outliers, and data entry errors. The code you write to analyze that data has its own set of surprises, such as logical errors, syntax errors, and conceptual errors. To make progress, it helps to limit the unknowns to just your code before introducing the additional complexity of real data. In this example we will generate a synthetic dataset and fit a baseline regression model to it.

This notebook covers a from-scratch implementation followed by a more concise implementation.

Based on the approach described in https://d2l.ai/chapter_linear-regression/synthetic-regression-data.html

# Generating Data

We'll generate data from a pure linear function and then pollute the data with noise.

In [1]:
import torch
import random

# y = Xw + b + noise
class SyntheticRegressionData():
    """Synthetic linear-regression dataset following ``y = Xw + b + noise``.

    Features are sampled from a standard normal distribution. Labels are the
    exact linear response plus Gaussian noise scaled by ``noise``. The first
    ``training_count`` rows form the training split; every row after that
    belongs to the validation split.
    """

    def __init__(self, w, b, noise=0.1, training_count = 1000, validation_count = 1000, batch_size = 32):
        total_rows = training_count + validation_count
        self.w, self.b = w, b
        self.batch_size = batch_size
        self.training_count = training_count
        self.validation_count = validation_count
        self.observation_count = total_rows
        # Sampling order is kept fixed (features first, then noise) so a
        # seeded run always produces the same dataset.
        self.X = torch.randn(total_rows, len(w))
        perturbation = torch.randn(total_rows, 1) * noise
        # w is viewed as a column vector so the product has shape (rows, 1).
        column_w = w.reshape((-1, 1))
        self.y = torch.matmul(self.X, column_w) + b + perturbation

    def get_tensorloader(self, tensors, train, indices=slice(0, None)):
        """Wrap the ``indices`` rows of each tensor in a DataLoader.

        Batches are shuffled only when ``train`` is true.
        """
        selected = tuple(tensor[indices] for tensor in tensors)
        return torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(*selected),
            self.batch_size,
            shuffle=train,
        )

    def get_dataloader(self, train):
        """Return the training loader if ``train`` is true, else the validation loader."""
        boundary = self.training_count
        # Training rows sit at the front of the dataset, validation rows at the end.
        rows = slice(0, boundary) if train else slice(boundary, None)
        return self.get_tensorloader((self.X, self.y), train, rows)

    def training_dataloader(self):
        """Return a shuffled DataLoader over the training split."""
        return self.get_dataloader(train=True)

    def validation_dataloader(self):
        """Return an unshuffled DataLoader over the validation split."""
        return self.get_dataloader(train=False)
        

In [2]:
# Smoke-test the data generator: build a dataset from known parameters
# (w = [5, -2.1], b = 3.1) and show the first feature row with its label.
data = SyntheticRegressionData(w=torch.tensor([5, -2.1]), b=3.1)
print(f"Features: {data.X[0]}")
print(f"Label: {data.y[0]}")

Features: tensor([-0.6643,  1.7131])
Label: tensor([-3.8266])


In [3]:
# Pull a single mini-batch from the training dataloader and inspect its shapes
training_batches = iter(data.training_dataloader())
X, y = next(training_batches)
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: torch.Size([32, 2])
y shape: torch.Size([32, 1])


# Define the Optimizer (From Scratch)

As described in the d2l.ai article, we'll use mini-batch stochastic gradient descent as our optimizer.

In [4]:
class SGD():
    """Minimal mini-batch stochastic gradient descent optimizer.

    Args:
        params: Iterable of tensors to optimize. Each tensor is updated in
            place from its ``.grad`` attribute.
        learning_rate: Step size applied to every gradient.
    """

    def __init__(self, params, learning_rate):
        # Materialize the iterable so a one-shot source (e.g. a generator)
        # is still available on every later step()/zero_grad() call.
        self.params = list(params)
        self.learning_rate = learning_rate

    def step(self):
        """Apply one update ``param -= learning_rate * param.grad`` to each parameter.

        Parameters that have no gradient yet are left untouched.
        """
        # The update itself must not be recorded by autograd: an in-place
        # change to a leaf tensor that requires grad raises a RuntimeError
        # outside of no_grad.
        with torch.no_grad():
            for param in self.params:
                if param.grad is None:
                    continue
                param.sub_(self.learning_rate * param.grad)

    def zero_grad(self):
        """Reset every existing gradient to zero in place."""
        for param in self.params:
            if param.grad is not None:
                param.grad.zero_()

# Define the Model (From Scratch)

We'll use only primitives for the first model and then we'll reimplement using more features in the Torch framework

In [5]:
from torch import nn

class LinearRegressionV1(torch.nn.Module):
    """Linear regression assembled from raw tensors.

    The weight is a column vector of shape ``(input_count, 1)`` sampled from a
    normal distribution with mean 0 and standard deviation ``sigma``; the bias
    is a single zero. Both track gradients.
    """

    def __init__(self, input_count, learning_rate, sigma=0.01):
        super().__init__()
        self.learning_rate = learning_rate
        weight_shape = (input_count, 1)
        self.w = torch.normal(0, sigma, weight_shape, requires_grad=True)
        self.b = torch.zeros(1, requires_grad=True)

    def forward(self, X):
        """Return the prediction ``Xw + b``."""
        return X @ self.w + self.b

    def loss(self, y_predicted, y):
        """Return the mean over all elements of the halved squared error."""
        residual = y_predicted - y
        halved_squares = (residual ** 2) / 2
        return halved_squares.mean()

    def _batch_loss(self, batch):
        # The last entry of a batch is the label; everything before it is
        # passed to the model as input.
        return self.loss(self(*batch[:-1]), batch[-1])

    def training_step(self, batch):
        """Return the loss of the model on one training batch."""
        return self._batch_loss(batch)

    def validation_step(self, batch):
        """Return the loss of the model on one validation batch."""
        return self._batch_loss(batch)

    def configure_optimizers(self):
        """Return an SGD optimizer over the weight and the bias."""
        trainable = [self.w, self.b]
        return SGD(trainable, self.learning_rate)
        

# Training

In each epoch we work through the training data in randomly shuffled mini-batches, one batch per iteration, and keep iterating until every training sample has been seen once.

For each iteration we do the following
 1. Grab a batch of training samples
 2. Compute the loss
 3. Compute the gradient on where to move next
 4. Update the model parameters

In [6]:
class MiniBatchTrainer():
    """Runs mini-batch training and validation loops for a model.

    The model must provide ``train()``, ``eval()``, ``training_step(batch)``,
    ``validation_step(batch)`` (both returning a scalar loss tensor) and
    ``configure_optimizers()``. The data object must provide
    ``training_dataloader()`` and ``validation_dataloader()``.

    Args:
        max_epochs: Number of full passes over the training data run by ``fit``.
    """

    def __init__(self, max_epochs):
        self.max_epochs = max_epochs

    def prepare_data(self, data):
        """Create the dataloaders from ``data`` and record their batch counts."""
        self.training_dataloader = data.training_dataloader()
        self.validation_dataloader = data.validation_dataloader()
        self.training_batch_count = len(self.training_dataloader)
        self.validation_batch_count = len(self.validation_dataloader)

    def prepare_model(self, model):
        """Attach this trainer to ``model`` and remember the model for fitting."""
        # Use the model that was passed in, never a module-level global, so
        # the trainer works for whichever model the caller hands to fit().
        model.trainer = self
        self.model = model

    def fit_epoch(self, _callback = None):
        """Run one training pass followed by one validation pass.

        Args:
            _callback: Optional ``callable(topic, value)``. It is invoked with
                ``"training loss"`` or ``"validation loss"`` and the loss
                tensor of each batch.
        """
        self.model.train()
        for batch in self.training_dataloader:
            loss = self.model.training_step(batch)
            if _callback:
                _callback("training loss", loss)
            # Clear gradients left over from the previous batch before
            # accumulating the new ones.
            self.optimizer.zero_grad()
            loss.backward()
            # Only the parameter update must stay out of the autograd graph.
            with torch.no_grad():
                self.optimizer.step()

            self.training_batch_index += 1

        self.model.eval()
        for batch in self.validation_dataloader:
            # Validation only measures the loss; no gradients are needed.
            with torch.no_grad():
                loss = self.model.validation_step(batch)
                if _callback:
                    _callback("validation loss", loss)
            self.validation_batch_index += 1

    def fit(self, model, data, _callback = None):
        """Train ``model`` on ``data`` for ``max_epochs`` epochs.

        Args:
            model: Model to train (see the class docstring for its interface).
            data: Object providing the training and validation dataloaders.
            _callback: Optional ``callable(topic, value)``. It receives
                ``("epoch", epoch_index)`` at the start of each epoch and the
                per-batch losses reported by ``fit_epoch``.
        """
        self.prepare_data(data)
        self.prepare_model(model)
        self.optimizer = model.configure_optimizers()
        self.epoch = 0
        self.training_batch_index = 0
        self.validation_batch_index = 0
        for self.epoch in range(self.max_epochs):
            # The callback is optional, so guard it like fit_epoch does.
            if _callback:
                _callback("epoch", self.epoch)
            self.fit_epoch(_callback)

# All Together

We'll bring it all together to declare the model, generate synthetic data and train the model.

In [7]:
def status_callback(topic, status):
    """Print one progress report as ``<topic>: <status>``."""
    message = f"{topic}: {status}"
    print(message)

# From-scratch model with 2 input features, matching the 2 true weights below
model = LinearRegressionV1(2, learning_rate=0.03)
# Ground truth the model should recover: w = [5, -2.1], b = 3.1
# (256 training rows and 64 validation rows)
data = SyntheticRegressionData(w=torch.tensor([5, -2.1]), b=3.1, training_count=256, validation_count=64)
# Train for 10 epochs; status_callback prints each epoch index and batch loss
trainer = MiniBatchTrainer(max_epochs=10)
trainer.fit(model, data, status_callback)

epoch: 0
training loss: 14.55424690246582
training loss: 20.985332489013672
training loss: 16.794666290283203
training loss: 22.17304229736328
training loss: 10.992134094238281
training loss: 9.377906799316406
training loss: 17.01113510131836
training loss: 12.273624420166016
validation loss: 6.824306964874268
validation loss: 15.008211135864258
epoch: 1
training loss: 8.786096572875977
training loss: 9.284029006958008
training loss: 11.024335861206055
training loss: 8.116580963134766
training loss: 8.700980186462402
training loss: 12.679927825927734
training loss: 9.02916431427002
training loss: 9.526836395263672
validation loss: 4.251908302307129
validation loss: 9.358114242553711
epoch: 2
training loss: 11.314159393310547
training loss: 6.041918754577637
training loss: 5.531486988067627
training loss: 5.653379917144775
training loss: 4.798514366149902
training loss: 4.812900543212891
training loss: 4.136332035064697
training loss: 5.866523265838623
validation loss: 2.664906024932861

In [8]:
# Compare the learned parameters with the ground truth that generated the data.
# model.w and model.b track gradients; this is inspection only, so autograd
# recording is switched off.
with torch.no_grad():
    # model.w is a column vector; reshape it to data.w's shape before subtracting
    print(f"Error in estimating w: {data.w - model.w.reshape(data.w.shape)}")
    print(f"Error is estimating b: {data.b - model.b}")

Error in estimating w: tensor([ 0.6079, -0.1147])
Error is estimating b: tensor([0.1834])


# Define the Model (Concise)

Use the built-in PyTorch classes to simplify our model definition.

In [9]:
from torch import nn

class LinearRegressionV2(nn.Module):
    """Linear regression built from PyTorch's own layers, loss and optimizer.

    The model is a single ``nn.LazyLinear(1)`` layer, so the number of input
    features is inferred from the first batch it sees. At that point the
    weight is drawn from a normal distribution (mean 0, standard deviation
    0.01) and the bias is set to 0.

    Args:
        learning_rate: Step size handed to the SGD optimizer.
    """

    def __init__(self, learning_rate):
        super().__init__()
        self.learning_rate = learning_rate
        # The layer's parameters have no shape yet, so they cannot be
        # initialized here; any in-place init on them would act on empty
        # tensors and be discarded when the layer materializes.
        self.net = nn.LazyLinear(1)

    def _initialize_net(self, X):
        """Materialize the lazy layer for input ``X`` and apply the custom init."""
        # Materializing runs the layer's default parameter reset, so the
        # custom values must be written afterwards.
        self.net.initialize_parameters(X)
        with torch.no_grad():
            self.net.weight.normal_(0, 0.01)
            self.net.bias.fill_(0)

    def forward(self, X):
        """Return the layer's prediction for ``X``."""
        if isinstance(self.net.weight, nn.parameter.UninitializedParameter):
            # First batch: the input width is now known.
            self._initialize_net(X)
        return self.net(X)

    def loss(self, y_predicted, y):
        """Return the mean squared error between ``y_predicted`` and ``y``."""
        fn = nn.MSELoss()
        return fn(y_predicted, y)

    def training_step(self, batch):
        """Return the loss on one training batch (last entry is the label)."""
        l = self.loss(self(*batch[:-1]), batch[-1])
        return l

    def validation_step(self, batch):
        """Return the loss on one validation batch (last entry is the label)."""
        l = self.loss(self(*batch[:-1]), batch[-1])
        return l

    def configure_optimizers(self):
        """Return a ``torch.optim.SGD`` optimizer over this model's parameters."""
        return torch.optim.SGD(self.parameters(), self.learning_rate)

    def get_w_b(self):
        """Return the layer's ``(weight, bias)`` tensors, detached from autograd."""
        return (self.net.weight.data, self.net.bias.data)
        

In [10]:
# Concise model; its input width is inferred from the first batch
model = LinearRegressionV2(learning_rate=0.03)
# Same ground truth as the from-scratch run: w = [5, -2.1], b = 3.1
data = SyntheticRegressionData(w=torch.tensor([5, -2.1]), b=3.1, training_count=256, validation_count=64)
# Train for 10 epochs; status_callback prints each epoch index and batch loss
trainer = MiniBatchTrainer(max_epochs=10)
trainer.fit(model, data, status_callback)



epoch: 0
training loss: 60.92463684082031
training loss: 31.12190055847168
training loss: 27.985639572143555
training loss: 24.437040328979492
training loss: 24.39147186279297
training loss: 24.701032638549805
training loss: 11.940518379211426
training loss: 21.80307388305664
validation loss: 22.4771785736084
validation loss: 16.777027130126953
epoch: 1
training loss: 14.044628143310547
training loss: 16.353137969970703
training loss: 14.008424758911133
training loss: 10.475452423095703
training loss: 11.166021347045898
training loss: 9.426080703735352
training loss: 5.273672103881836
training loss: 7.150327682495117
validation loss: 8.76409912109375
validation loss: 6.6105780601501465
epoch: 2
training loss: 4.891706943511963
training loss: 7.176791191101074
training loss: 5.13761568069458
training loss: 3.826969861984253
training loss: 5.051006317138672
training loss: 2.134897232055664
training loss: 3.538323402404785
training loss: 2.7365832328796387
validation loss: 3.4302535057067

In [11]:
# Fetch the learned weight and bias from the concise model
w, b = model.get_w_b()

# Compare them with the ground truth; the weight is reshaped to data.w's
# shape so the subtraction lines up element by element
print(f'error in estimating w: {data.w - w.reshape(data.w.shape)}')
print(f'error in estimating b: {data.b - b}')

error in estimating w: tensor([ 0.0389, -0.0282])
error in estimating b: tensor([0.0421])
