# 3.3.2 Data Generator

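This notebook contains a class that generates a simulated dataset for the regression problem $Y=X^\top\beta + \epsilon$. The reference setup has $N=300$ observations on $p=31$ standard Gaussian variables, with pairwise correlations all equal to $0.85$. Note that the `DataGenerator` class below defaults to `ndim=11`, which is why the output at the end of the notebook has $11$ columns; pass `ndim=31` to reproduce the $p=31$ setting.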

For $10$ of the variables, the coefficients are drawn at random from a $N(0, 0.4)$ distribution; the coefficients of the remaining variables are zero.

The noise is distributed as $\epsilon \sim N(0, 6.25)$, resulting in a signal-to-noise ratio of $0.64$.

In [1]:
import numpy as np

In [2]:
class DataGenerator:
    """Simulate data for the linear model ``y = x @ beta + noise``.

    The rows of ``x`` are drawn from a multivariate normal distribution,
    ``beta`` is zero except for ``num_choice`` entries drawn from a normal
    distribution, and ``noise`` is i.i.d. normal.

    All settings are passed as keyword arguments; unknown keywords are
    ignored and positional arguments are accepted but unused.

    Keyword Args:
        ndim: number of predictor variables (default 11).
        ndata: number of observations (default 300).
        mu_x: mean vector of the predictors (default: zeros of length ndim).
        corr_x: common pairwise correlation used to build the default
            covariance matrix (default 0.85).
        cov_x: covariance matrix of the predictors (default: ones on the
            diagonal and ``corr_x`` everywhere else).
        num_choice: number of non-zero coefficients (default 10).
        mu_b: mean of the non-zero coefficients (default 0.0).
        var_b: variance of the non-zero coefficients (default 0.4).
        mu_n: mean of the noise (default 0.0).
        var_n: variance of the noise (default 6.25).
        random_index: if True, the non-zero coefficients are placed at
            ``num_choice`` distinct random positions; otherwise they occupy
            the first ``num_choice`` positions (default False).
        seed: optional seed for a private ``np.random.RandomState``. When
            omitted, samples are drawn from NumPy's global random state, so
            ``np.random.seed`` keeps working as before.

    Attributes:
        choice_index: indices of the non-zero coefficients, set by
            ``get_beta`` (None until then).
        sn: empirical signal-to-noise ratio of the last call to ``get_y``
            (None until then).
    """

    def __init__(self, *args, **kwargs):
        # data params
        self.ndim = kwargs.get('ndim', 11)
        self.ndata = kwargs.get('ndata', 300)

        # x params
        self.mu_x = kwargs.get('mu_x', np.zeros(self.ndim))
        self.corr_x = kwargs.get('corr_x', 0.85)
        cov = np.full((self.ndim, self.ndim), self.corr_x, dtype=float)
        np.fill_diagonal(cov, 1.0)
        self.cov_x = kwargs.get('cov_x', cov)

        # beta params
        self.num_choice = kwargs.get('num_choice', 10)
        self.mu_b = kwargs.get('mu_b', 0.0)
        self.var_b = kwargs.get('var_b', 0.4)
        self.random_index = kwargs.get('random_index', False)

        # noise params
        self.mu_n = kwargs.get('mu_n', 0.0)
        self.var_n = kwargs.get('var_n', 6.25)

        # Random source: a private RandomState when a seed is given, otherwise
        # the np.random module itself (i.e. the global state), which exposes
        # the same multivariate_normal / normal / choice functions.
        seed = kwargs.get('seed', None)
        self._rng = np.random if seed is None else np.random.RandomState(seed)

        # populated by get_beta() and get_y() respectively
        self.choice_index = None
        self.sn = None

    def get_x(self):
        """Draw the ``(ndata, ndim)`` predictor matrix."""
        return self._rng.multivariate_normal(self.mu_x, self.cov_x, self.ndata)

    def get_beta(self):
        """Draw the coefficient vector and record ``self.choice_index``.

        Returns:
            Array of length ``ndim`` that is zero everywhere except at
            ``self.choice_index``.

        Raises:
            ValueError: if ``num_choice`` exceeds ``ndim``.
        """
        if self.num_choice > self.ndim:
            raise ValueError(
                'num_choice (%d) cannot exceed ndim (%d)'
                % (self.num_choice, self.ndim)
            )

        beta = np.zeros(self.ndim)

        if self.random_index:
            # replace=False guarantees num_choice *distinct* positions
            self.choice_index = self._rng.choice(
                self.ndim, self.num_choice, replace=False
            )
        else:
            self.choice_index = np.arange(self.num_choice)

        beta[self.choice_index] = self._rng.normal(
            self.mu_b,
            np.sqrt(self.var_b),
            len(self.choice_index)
        )

        return beta

    def get_noise(self):
        """Draw ``ndata`` i.i.d. noise samples."""
        return self._rng.normal(self.mu_n, np.sqrt(self.var_n), self.ndata)

    def get_y(self, x, beta):
        """Return ``x @ beta`` plus fresh noise and update ``self.sn``."""
        noise = self.get_noise()
        y = x @ beta + noise
        self._get_noise_var(y, noise)

        return y

    def _get_noise_var(self, y, noise):
        """Store the empirical signal-to-noise ratio in ``self.sn``.

        The ratio is the variance of the noiseless signal ``y - noise``
        divided by the variance of the noise. Using ``var(y)`` in the
        numerator would also count the noise itself and overstate the ratio.
        """
        signal = y - noise
        self.sn = np.var(signal) / np.var(noise)

    def generate(self):
        """Generate one dataset and return the tuple ``(x, y, beta)``."""
        x = self.get_x()
        beta = self.get_beta()
        y = self.get_y(x, beta)
        return x, y, beta

In [3]:
# Seed NumPy's global random state so that Restart & Run All reproduces the
# same dataset (DataGenerator draws its samples from np.random).
np.random.seed(0)

datagen = DataGenerator()
x, y, beta = datagen.generate()

In [4]:
# Show the shapes of the predictors, the response and the coefficients.
print(*(arr.shape for arr in (x, y, beta)))

(300, 11) (300,) (11,)
