In [8]:
import os
import sys

# Make the directory above the notebook's working directory importable.
cwd = os.getcwd()
print("Current working directory:", cwd)

parent_dir = os.path.dirname(cwd)
already_importable = parent_dir in sys.path
if already_importable:
    print(f"{parent_dir} already in Python path")
else:
    # Prepend so this directory is searched before the existing entries.
    sys.path.insert(0, parent_dir)
    print(f"Added {parent_dir} to Python path")

Current working directory: /Users/jinceyang/Desktop/codebase/ml/learn_d2l/chap3
/Users/jinceyang/Desktop/codebase/ml/learn_d2l already in Python path


In [None]:
import torch
from d2l.linear_regression import SyntheticRegressionData

In [10]:
# Dedicated, explicitly seeded generator so the random draws that use it are repeatable.
SEED = 42
rng = torch.Generator()
rng.manual_seed(SEED)

```python
class SyntheticRegressionData:
    """Synthetic linear-regression dataset following ``y = X w + b + noise``.

    Features are drawn from a standard normal distribution and labels are
    produced by a fixed linear model plus zero-mean Gaussian noise. The first
    ``num_train`` rows form the training split; the remaining ``num_test``
    rows form the test split.

    Args:
        w: 1-D weight tensor; its length sets the number of feature columns.
        b: Bias tensor added to every label (broadcast against the
            ``(n, 1)`` label column).
        noise_std: Standard deviation of the Gaussian label noise.
        num_train: Number of training examples.
        num_test: Number of test examples.
        rng: Generator used for data generation and for every shuffling
            helper on this object. When omitted, a fresh generator seeded
            with 0 is created for this instance.
    """

    def __init__(self,
                 w: torch.Tensor,
                 b: torch.Tensor,
                 noise_std: float = 0.01,
                 num_train: int = 1000,
                 num_test: int = 100,
                 rng: "torch.Generator | None" = None) -> None:
        self.w: torch.Tensor = w
        self.b: torch.Tensor = b
        self.noise_std: float = noise_std
        self.num_train: int = num_train
        self.num_test: int = num_test
        self.n: int = self.num_test + self.num_train

        # A generator built in the signature is evaluated once, at definition
        # time, and would be shared (with its advancing state) by every
        # instance that omits ``rng``. Build the default per instance instead
        # so that "default" datasets are identical and independent.
        if rng is None:
            rng = torch.Generator().manual_seed(0)
        self.rng: torch.Generator = rng

        # Populated by generate().
        self.X: torch.Tensor
        self.noise: torch.Tensor
        self.y: torch.Tensor

        self.generate()

    def generate(self) -> None:
        """Draw features and noise from ``self.rng`` and compute the labels.

        Sets ``self.X`` with shape ``(n, len(w))`` and ``self.noise`` /
        ``self.y`` with shape ``(n, 1)``.
        """
        self.X = torch.randn((self.n, len(self.w)), generator=self.rng)
        self.noise = torch.normal(0, self.noise_std, (self.n, 1), generator=self.rng)
        # Reshape w to a column so the product is an (n, 1) label column.
        self.y = self.X @ self.w.reshape((-1, 1)) + self.b + self.noise

    def get_train_data(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(X, y)`` for the first ``num_train`` rows."""
        return self.X[:self.num_train, :], self.y[:self.num_train]

    def get_test_data(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(X, y)`` for the rows after the first ``num_train``."""
        return self.X[self.num_train:, :], self.y[self.num_train:]

    def get_all_data(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the full ``(X, y)`` pair, training rows followed by test rows."""
        return self.X, self.y

    def get_train_data_batch(self, batch_size: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return one random training batch, sampled without replacement.

        The batch holds at most ``num_train`` rows, because the indices are a
        prefix of a permutation of the training indices.
        """
        indices = torch.randperm(self.num_train, generator=self.rng)[:batch_size]
        return self.X[indices], self.y[indices]

    def get_train_data_batch_generator(self, batch_size: int) -> Generator[Tuple[torch.Tensor, torch.Tensor], None, None]:
        """Yield shuffled training batches covering each training row once.

        The final batch is smaller when ``batch_size`` does not divide
        ``num_train``.
        """
        indices = torch.randperm(self.num_train, generator=self.rng)
        for i in range(0, self.num_train, batch_size):
            batch_indices = indices[i:i+batch_size]
            yield self.X[batch_indices], self.y[batch_indices]

    def get_train_data_loader(self, batch_size: int) -> torch.utils.data.DataLoader:
        """Return a shuffling ``DataLoader`` over the training split.

        Shuffling draws from ``self.rng`` so the batch order is reproducible
        for a given generator state, like the other batch helpers.
        """
        dataset = torch.utils.data.TensorDataset(self.X[:self.num_train, :], self.y[:self.num_train])
        return torch.utils.data.DataLoader(
            dataset, batch_size=batch_size, shuffle=True, generator=self.rng
        )

    def get_test_data_loader(self) -> torch.utils.data.DataLoader:
        """Return a ``DataLoader`` that serves the whole test split as one unshuffled batch."""
        dataset = torch.utils.data.TensorDataset(self.X[self.num_train:, :], self.y[self.num_train:])
        return torch.utils.data.DataLoader(dataset, batch_size=self.num_test, shuffle=False)
```

In [11]:
# Build the synthetic dataset from known parameters, drawing from the seeded generator.
data = SyntheticRegressionData(
    w=torch.tensor([2, -3.4]),
    b=torch.tensor(4.2),
    num_test=100,
    num_train=1000,
    rng=rng,
)

# Unpack each split separately and report its shapes.
x_train, y_train = data.get_train_data()
x_test, y_test = data.get_test_data()
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

torch.Size([1000, 2]) torch.Size([1000, 1])
torch.Size([100, 2]) torch.Size([100, 1])


In [12]:
# Show the first generated example: its feature row and the matching label.
first_features, first_label = data.X[0], data.y[0]
print('features:', first_features, '\nlabel:', first_label)

features: tensor([1.9269, 1.4873]) 
label: tensor([2.9952])


In [13]:
# Peek at the first mini-batch from the hand-written batch generator.
train_batches = data.get_train_data_batch_generator(batch_size=10)
first_batch = next(train_batches, None)
if first_batch is not None:
    x_batch, y_batch = first_batch
    print(x_batch.shape, y_batch.shape)

torch.Size([10, 2]) torch.Size([10, 1])


In [14]:
# Peek at the first mini-batch served by the DataLoader-based helper.
train_loader = data.get_train_data_loader(batch_size=10)
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    x_batch, y_batch = first_batch
    print(x_batch.shape, y_batch.shape)

torch.Size([10, 2]) torch.Size([10, 1])
