# Introduction

Real-world datasets contain surprises such as missing data, outliers, and data entry errors. The code you write to analyze that data has its own set of surprises, such as logical errors, syntax errors, and conceptual errors. In the interest of progress, it can be helpful to reduce the unknowns to just your code before introducing additional complexity. In this example we will generate a synthetic dataset and form a base regression model for the task.

Based on the approach described in https://d2l.ai/chapter_linear-regression/synthetic-regression-data.html

# Generating Data

We'll generate data from a pure linear function and then pollute the data with noise.

In [14]:
import torch
import random

# y = Xw + b + noise
class SyntheticRegressionData():
    """Synthetic linear-regression dataset following ``y = Xw + b + noise``.

    Features are sampled with ``torch.randn``; labels are the linear function
    of those features plus ``torch.randn`` noise scaled by ``noise``. The first
    ``training_count`` rows form the training split and the remaining rows
    form the validation split.
    """

    def __init__(self, w, b, noise=0.1, training_count=1000, validation_count=1000, batch_size=64):
        """Generate the features ``X`` and labels ``y``.

        Args:
            w: Ground-truth weights, one per feature. May be a tensor of any
                numeric dtype or a plain sequence of numbers.
            b: Ground-truth bias added to every label.
            noise: Scale applied to the standard-normal label noise.
            training_count: Number of rows in the training split.
            validation_count: Number of rows in the validation split.
            batch_size: Batch size used by the dataloaders.
        """
        self.batch_size = batch_size
        self.training_count = training_count
        self.validation_count = validation_count
        self.observation_count = training_count + validation_count
        self.X = torch.randn(self.observation_count, len(w))
        # Kept under a separate name so the ``noise`` argument is not rebound.
        noise_term = torch.randn(self.observation_count, 1) * noise
        # Coerce the weights to the feature dtype: integer or float64 weights
        # (or a plain list) would otherwise make the matmul below fail.
        weights = torch.as_tensor(w, dtype=self.X.dtype).reshape((-1, 1))
        self.y = torch.matmul(self.X, weights) + b + noise_term

    def get_tensorloader(self, tensors, train, indices=slice(0, None)):
        """Wrap the selected rows of ``tensors`` in a ``DataLoader``.

        Args:
            tensors: Iterable of tensors, each indexed with ``indices``.
            train: Passed as the loader's ``shuffle`` flag.
            indices: Row selection applied to every tensor (default: all rows).

        Returns:
            A ``torch.utils.data.DataLoader`` over a ``TensorDataset`` of the
            selected rows, batched by ``self.batch_size``.
        """
        tensors = tuple(a[indices] for a in tensors)
        dataset = torch.utils.data.TensorDataset(*tensors)
        return torch.utils.data.DataLoader(dataset, self.batch_size, shuffle=train)

    def get_dataloader(self, train):
        """Return the training loader if ``train`` is truthy, else the validation loader."""
        if train:
            # Training data is in the front of the dataset
            i = slice(0, self.training_count)
        else:
            # Validation data is at the end of the dataset
            i = slice(self.training_count, None)
        return self.get_tensorloader((self.X, self.y), train, i)

    def training_dataloader(self):
        """Return a shuffled dataloader over the training split."""
        return self.get_dataloader(train=True)

    def validation_dataloader(self):
        """Return an unshuffled dataloader over the validation split."""
        return self.get_dataloader(train=False)
        

In [15]:
# Instantiate the generator with known parameters and show the first observation
data = SyntheticRegressionData(
    w=torch.tensor([5, -2.1]),
    b=3.1,
)
print(f"Features: {data.X[0]}", f"Label: {data.y[0]}", sep="\n")

Features: tensor([ 0.0265, -0.7153])
Label: tensor([4.9122])


In [16]:
# Pull a single mini-batch from the training dataloader and report its dimensions
X, y = next(
    iter(data.training_dataloader())
)
print(f"X shape: {X.shape}", f"y shape: {y.shape}", sep="\n")

X shape: torch.Size([64, 2])
y shape: torch.Size([64, 1])
