# Introduction

Real-world datasets contain surprises such as missing data, outliers, and data entry errors. The code you write to analyze that data has its own set of surprises, such as logical errors, syntax errors, and conceptual errors. To make progress, it helps to limit the unknowns to just your code before introducing the additional complexity of real data. In this example we will generate a synthetic dataset and build a baseline regression model for the task.

Based on the approach described in https://d2l.ai/chapter_linear-regression/synthetic-regression-data.html

# Generating Data

We'll generate data from a pure linear function and then pollute the data with noise.

In [1]:
import torch
import random

# y = Xw + b + noise
class SyntheticRegressionData():
    """Synthetic dataset drawn from ``y = Xw + b + noise``.

    Features are sampled from a standard normal distribution; labels are the
    linear function of those features plus Gaussian noise scaled by ``noise``.
    The first ``training_count`` rows are the training split and the
    remaining rows are the validation split.

    Args:
        w: 1-D tensor of true weights; its length sets the feature count.
        b: True bias added to every label.
        noise: Scale applied to the standard-normal noise term.
        training_count: Number of rows in the training split.
        validation_count: Number of rows in the validation split.
        batch_size: Mini-batch size used by the data loaders.
    """

    def __init__(self, w, b, noise=0.1, training_count=1000, validation_count=1000, batch_size=64):
        self.training_count = training_count
        self.validation_count = validation_count
        self.batch_size = batch_size

        total = training_count + validation_count
        self.observation_count = total

        # Sample the features before the noise so the random stream is
        # consumed in the same order on every construction.
        self.X = torch.randn(total, len(w))
        epsilon = torch.randn(total, 1) * noise

        # Reshape w into a column vector so the product has shape (total, 1).
        weights_column = w.reshape((-1, 1))
        self.y = self.X @ weights_column + b + epsilon

    def get_tensorloader(self, tensors, train, indices=slice(0, None)):
        """Wrap the ``indices`` rows of each tensor in a ``DataLoader``.

        Batches are shuffled only when ``train`` is truthy.
        """
        selected = [tensor[indices] for tensor in tensors]
        return torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(*selected),
            self.batch_size,
            shuffle=train,
        )

    def get_dataloader(self, train):
        """Return the loader for the training split, or the validation split."""
        # Training rows sit at the front of the dataset, validation rows at
        # the end.
        rows = slice(0, self.training_count) if train else slice(self.training_count, None)
        return self.get_tensorloader((self.X, self.y), train, rows)

    def training_dataloader(self):
        """Return a shuffled loader over the training rows."""
        return self.get_dataloader(train=True)

    def validation_dataloader(self):
        """Return an unshuffled loader over the validation rows."""
        return self.get_dataloader(train=False)
        

In [2]:
# Try our new class: build a dataset with true weights w = [5, -2.1] and
# bias b = 3.1 (noise, split sizes and batch size are left at their defaults),
# then print the first observation's features and label.
data = SyntheticRegressionData(w=torch.tensor([5, -2.1]), b=3.1)
print(f"Features: {data.X[0]}")
print(f"Label: {data.y[0]}")

Features: tensor([ 0.2400, -2.2767])
Label: tensor([8.9570])


In [3]:
# Try out the training dataloader: pull a single mini-batch from a fresh
# iterator and print the shapes of its feature and label tensors.
X, y = next(iter(data.training_dataloader()))
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: torch.Size([64, 2])
y shape: torch.Size([64, 1])


# Define the Optimizer (From Scratch)

As described in the d2l.ai article, we'll use mini-batch stochastic gradient descent for our optimizer.

In [4]:
class SGD():
    """Minimal mini-batch stochastic gradient descent optimizer.

    Args:
        params: Iterable of tensors to optimize. Each tensor is updated in
            place using the gradient stored in its ``.grad`` attribute.
        learning_rate: Step size multiplied by each gradient.
    """

    def __init__(self, params, learning_rate):
        # Materialize the iterable so a generator is not exhausted after the
        # first call to step() or zero_grad().
        self.params = list(params)
        self.learning_rate = learning_rate

    def step(self):
        """Apply one update, ``param -= learning_rate * param.grad``, in place.

        Parameters whose gradient has not been populated yet are skipped.
        """
        # In-place updates of leaf tensors that require grad are only allowed
        # while autograd recording is disabled.
        with torch.no_grad():
            for param in self.params:
                if param.grad is not None:
                    param -= self.learning_rate * param.grad

    def zero_grad(self):
        """Reset every populated gradient to zero, in place."""
        for param in self.params:
            if param.grad is not None:
                param.grad.zero_()

# Define the Model (From Scratch)

We'll use only primitives for the first model, and then we'll reimplement it using more features of the PyTorch framework.

In [None]:
from torch import nn


class LinearRegressionV1(nn.Module):
    """Linear regression model built from tensor primitives: ``y = Xw + b``.

    The weights and bias are plain tensors with ``requires_grad=True`` rather
    than ``nn.Parameter`` objects, so they are not reported by
    ``parameters()``; ``configure_optimizers`` hands them to the optimizer
    explicitly.

    Args:
        input_count: Number of input features (rows of the weight column).
        learning_rate: Step size passed to the optimizer.
        sigma: Standard deviation of the normal distribution used to
            initialize the weights.
    """

    def __init__(self, input_count, learning_rate, sigma=0.01):
        super().__init__()
        self.learning_rate = learning_rate
        # Draw the initial weights from a normal distribution centered at 0
        # with standard deviation sigma; the bias starts at zero.
        self.w = torch.normal(0, sigma, (input_count, 1), requires_grad=True)
        self.b = torch.zeros(1, requires_grad=True)

    def forward(self, X):
        """Return the predictions ``Xw + b`` for the feature matrix ``X``."""
        return torch.matmul(X, self.w) + self.b

    def loss(self, y_predicted, y):
        """Return the mean of the halved squared errors.

        NOTE(review): ``y`` should have the same shape as ``y_predicted``;
        a mismatched shape would be broadcast rather than rejected.
        """
        l = ((y_predicted - y) ** 2) / 2
        return l.mean()

    def configure_optimizers(self):
        """Return an ``SGD`` optimizer over the weights and bias."""
        return SGD([self.w, self.b], self.learning_rate)
        

# Training