# Sketching Unconstrained Least Squares

Now that the sketching methods appear to be working, we are in a position to test how the count sketch transform can be used to speed up the solving of a constrained regression problem.  To begin with, we will focus on the unconstrained case and measure how the solve time varies with the size of the data as well as its density.

In [13]:
import numpy as np
import scipy as sp
from countSketch import countSketch_elt_stream 
from srht import srht_transform
from time import time
import itertools

import matplotlib.pyplot as plt
%matplotlib inline

from numpy.linalg import norm

In [43]:
def iterative_hessian_general(data, targets, sketch_size,
                      num_iters):
    '''
    Approximately solve the unconstrained least squares problem
    min_x ||data @ x - targets||_2 with an iterative Hessian sketch.

    Each iteration draws a fresh sketch S_A of the data via
    countSketch_elt_stream and solves the d x d system
        (S_A^T S_A) x_new = A^T y + S_A^T S_A x_old - A^T A x_old.

    INPUT:
    data - n x d matrix
    targets - n target variables (a length-n vector or an n x 1 array)
    sketch_size - number of rows requested from the sketch; converted
    to int before being passed to the sketching routine.
    num_iters - how many iterations to perform.
    Need log(1/eps) -- > num_iters for eps accuracy where
    eps is relative error in the semi-norm ||u-v||_A =
    1/sqrt(n)*||A(u-v)||_2.

    OUTPUT:
    x0 - flat length-d vector which approximately recovers the true
    solution to the least squares problem


    TO DO:
    Add functionality for lower bound on sketch size
    '''

    A = data
    d = A.shape[1]
    # Column-vector view of the targets so every product below is d x 1,
    # whether the caller passed shape (n,) or (n, 1).
    y = np.asarray(targets).reshape(-1, 1)
    m = int(sketch_size)  # sketching dimension

    # Quantities that do not depend on the sketch are computed once.
    ATy = A.T @ y
    covariance_mat = A.T @ A

    x0 = np.zeros(shape=(d, 1))
    for _ in range(int(num_iters)):
        # NOTE(review): the sketch is assumed to be a dense 2-D array with
        # d columns -- confirm against countSketch_elt_stream.
        S_A = countSketch_elt_stream(A, m)

        B = S_A.T @ S_A  # sketched approximation of A^T A
        z = ATy + S_A.T @ (S_A @ x0) - covariance_mat @ x0
        # rcond=None selects NumPy's current default cutoff and avoids the
        # FutureWarning raised by older versions when it is omitted.
        x0 = np.linalg.lstsq(B, z, rcond=None)[0]
    return np.ravel(x0)

The following function generates sparse regression data in the style of `sklearn.datasets.make_regression`, but with the added flexibility of being able to set the density (sparsity) of the design matrix as a parameter.

In [5]:
from scipy import sparse
from sklearn.utils import shuffle

def generate_sparse_data(n_samples, n_features, density,
                         n_targets=1, bias=0.0, tail_strength=0.5,
                         noise=0.0, permute=True, coef=True,
                         random_state=None):
    '''
    Generate a random regression problem with a sparse design matrix.
    Follow the setup from sklearn.datasets.make_regression except
    with sparse matrices and a density parameter.
    The output is generated by applying a random linear regression
    model with n_features regressors to the generated input and
    adding some gaussian noise with adjustable scale.


    Parameters
    ----------
    n_samples : int
        The number of samples.
    n_features : int
        The number of features.
    density : float in (0,1)
        density of the data to be generated.
    n_targets : int, optional (default=1)
        The number of regression targets, i.e., the dimension of the y output
        vector associated with a sample. By default, the output is a scalar.
    bias : float, optional (default=0.0)
        The bias term in the underlying linear model.
    tail_strength : float between 0.0 and 1.0, optional (default=0.5)
        Currently unused; accepted only so the signature mirrors
        sklearn.datasets.make_regression.
    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise applied to the output.
    permute : boolean, optional (default=True)
        Shuffle the samples and the features.
    coef : boolean, optional (default=True)
        If True, the coefficients of the underlying linear model are returned.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed of a new private generator;
        If RandomState instance, random_state is the random number generator;
        If None, a new, unpredictably seeded generator is used.
        The global `np.random` state is never modified.
    Returns
    -------
    X : dense NumPy array of shape [n_samples, n_features]
        The input samples. Generated as a sparse matrix with the requested
        density and converted with .toarray() before being returned.
    y : array of shape [n_samples] or [n_samples, n_targets]
        The output values.
    coef : array of shape [n_features] or [n_features, n_targets], optional
        The coefficient of the underlying linear model. It is returned only if
        coef is True.'''
    # One private generator drives every random draw below, so a fixed seed
    # reproduces the whole data set without touching the global RNG.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # CSR supports the row/column fancy indexing used for the permutation.
    X = sparse.random(m=n_samples, n=n_features, density=density,
                      format='csr', random_state=rng)

    # In future can add n_informative to extend data generation
    ground_truth = rng.rand(n_features, n_targets)

    y = X @ ground_truth + bias

    # Add noise
    if noise > 0.0:
        y += rng.normal(scale=noise, size=y.shape)

    # Randomly permute samples and features
    if permute:
        row_order = rng.permutation(n_samples)
        col_order = rng.permutation(n_features)
        X = X[row_order][:, col_order]
        y = y[row_order]
        # Coefficients follow their columns so that X @ coef still gives
        # the noise-free, bias-free targets.
        ground_truth = ground_truth[col_order]

    y = np.squeeze(y)
    X = X.toarray()
    if coef:
        return X, y, np.squeeze(ground_truth)
    return X, y
    

Now set up a parameter grid which will define instances of varying parameters over size, dimension, density, and sketch size.

In [41]:
# Experimental grid: each key lists the values tried for one parameter.
param_grid = dict([
    ('rows', [10000, 25000, 50000]),
    ('columns', [10, 100, 250]),
    ('density', [0.01, 0.1, 0.25]),  # 0.5 and 1.0 are currently disabled
    ('sketch type', ['CWT']),  # 'None' and 'SRHT' are currently disabled
])
param_grid

{'columns': [10, 100, 250],
 'density': [0.01, 0.1, 0.25],
 'rows': [10000, 25000, 50000],
 'sketch type': ['CWT']}

In [34]:
# Setup independent variables


# Experimental output: starts empty and is written to by later cells.
experiment_output = dict()

In [46]:
# Time one solve for every combination of rows, columns, density and
# sketch type in the parameter grid.
for n_rows, n_cols, density, method in itertools.product(param_grid['rows'],
                                                         param_grid['columns'],
                                                         param_grid['density'],
                                                         param_grid['sketch type']):

    # Reject unsupported sketch types up front; otherwise no solver runs and
    # the cleanup below fails with a misleading NameError on x_true.
    if method not in ('None', 'CWT'):
        raise ValueError("Unsupported sketch type: {}".format(method))

    sketch_size = 10*n_cols
    print(n_rows, n_cols, density, method)
    print("Sketch size: {}".format(sketch_size))

    ### Generate data
    data, target, coef = generate_sparse_data(n_rows, n_cols, density)

    if method == 'None':
        # Create the entry once; do not wipe it on every iteration.
        experiment_output.setdefault('None', {})
        # Include a loop here to determine number of samples to take
        start = time()
        # rcond=None avoids the FutureWarning older NumPy raises when the
        # argument is omitted.
        x_true = np.linalg.lstsq(data, target, rcond=None)[0]
        solve_time = time() - start
        print("System Solve time: {}".format(solve_time))
    else:  # method == "CWT"
        start = time()
        x_true = iterative_hessian_general(data, target, sketch_size, num_iters=10)
        solve_time = time() - start
        print("IHS CWT Solve time: {}".format(solve_time))

    del data, target, coef, x_true # so don't keep lots of test matrices in memory

10000 10 0.01 CWT
Sketch size: 100
IHS CWT Solve time: 0.008370161056518555
10000 10 0.1 CWT
Sketch size: 100
IHS CWT Solve time: 0.010008573532104492
10000 10 0.25 CWT
Sketch size: 100




IHS CWT Solve time: 0.013273477554321289
10000 100 0.01 CWT
Sketch size: 1000
IHS CWT Solve time: 0.049902915954589844
10000 100 0.1 CWT
Sketch size: 1000
IHS CWT Solve time: 0.06836700439453125
10000 100 0.25 CWT
Sketch size: 1000
IHS CWT Solve time: 0.09726858139038086
10000 250 0.01 CWT
Sketch size: 2500
IHS CWT Solve time: 0.3043227195739746
10000 250 0.1 CWT
Sketch size: 2500
IHS CWT Solve time: 0.32970380783081055
10000 250 0.25 CWT
Sketch size: 2500
IHS CWT Solve time: 0.38506054878234863
25000 10 0.01 CWT
Sketch size: 100
IHS CWT Solve time: 0.017129898071289062
25000 10 0.1 CWT
Sketch size: 100
IHS CWT Solve time: 0.02086019515991211
25000 10 0.25 CWT
Sketch size: 100
IHS CWT Solve time: 0.030331850051879883
25000 100 0.01 CWT
Sketch size: 1000
IHS CWT Solve time: 0.08072996139526367
25000 100 0.1 CWT
Sketch size: 1000
IHS CWT Solve time: 0.14963841438293457
25000 100 0.25 CWT
Sketch size: 1000
IHS CWT Solve time: 0.2191169261932373
25000 250 0.01 CWT
Sketch size: 2500
IHS CWT

In [38]:
experiment_output

{'None': {'columns': (250,),
  'density': (0.25,),
  'rows': 50000,
  'solution': (array([0.99726157, 0.95818364, 0.14274053, 0.49829881, 0.48931521,
          0.11056605, 0.08847292, 0.84914061, 0.86198591, 0.21746241,
          0.64373171, 0.82705849, 0.4232046 , 0.41585693, 0.95332526,
          0.81302164, 0.19700084, 0.31659645, 0.00263679, 0.74681964,
          0.12938689, 0.53088871, 0.22582044, 0.22191946, 0.16616623,
          0.73713707, 0.79740904, 0.02067645, 0.2362221 , 0.29004592,
          0.09656994, 0.21484963, 0.28730806, 0.93709924, 0.19805551,
          0.67384063, 0.69523739, 0.48000644, 0.59209633, 0.38222969,
          0.16415923, 0.41648813, 0.81788816, 0.28119272, 0.67427052,
          0.81429958, 0.18351942, 0.55953303, 0.14786285, 0.34166903,
          0.22046479, 0.04909992, 0.41421816, 0.11916088, 0.4407302 ,
          0.42683362, 0.67455984, 0.31059181, 0.8599357 , 0.42437931,
          0.34944468, 0.45012024, 0.63163496, 0.15973587, 0.3392525 ,
          