## SCAD, PyTorch, variable selection

In [1]:
!pip install torch



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from scad_linear import SCADLinear
from scaler_torch import StandardScaler, MinMaxScaler

In [3]:
# Global tensor settings used when building tensors below:
# double-precision floats, executed on the CPU.
dtype = torch.float64
device = torch.device("cpu")

**Part 1: SCAD class with real data**
This section uses the <a href="https://archive.ics.uci.edu/dataset/165/concrete+compressive+strength">concrete.csv dataset</a> to fit a linear model that predicts concrete compressive strength from the remaining features. The model is written in PyTorch.

In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [5]:
# Load the concrete data: 'strength' is the response, every other column is a predictor.
data = pd.read_csv('../data/concrete.csv')

y = data['strength'].values
x = data.drop(columns='strength').values

# Rescale the predictors with MinMaxScaler before converting them to tensors.
scaler = MinMaxScaler()
xscaled = scaler.fit_transform(x)

# Tensors consumed by the model cells below (scaled predictors, raw targets).
y_tensor = torch.tensor(y, dtype=dtype)
x_tensor = torch.tensor(xscaled, dtype=dtype)

In [6]:
# One model input per predictor column of x_tensor.
model = SCADLinear(
    input_size=x_tensor.shape[1],
    lambda_val=0.1,
    a_val=2.5,
)

In [7]:
# Fit on the 2-D design matrix as-is. The previous torch.squeeze(x_tensor) was a
# no-op for this data and would drop an axis for a single-row or single-column
# input; passing x_tensor directly also matches the predict() call below.
# The targets are reshaped to a column, presumably to match the model output
# shape -- TODO confirm against SCADLinear.
model.fit(x=x_tensor, y=y_tensor.unsqueeze(1), num_epochs=1000, learning_rate=0.09)
prediction = model.predict(x_tensor)

epoch: 100/1000, loss_with_scad: 147.24204059368722
epoch: 200/1000, loss_with_scad: 123.28070099508518
epoch: 300/1000, loss_with_scad: 115.8342783407946
epoch: 400/1000, loss_with_scad: 112.64318359704748
epoch: 500/1000, loss_with_scad: 110.9643283365862
epoch: 600/1000, loss_with_scad: 109.94583698654623
epoch: 700/1000, loss_with_scad: 109.26374004475271
epoch: 800/1000, loss_with_scad: 108.77680593079931
epoch: 900/1000, loss_with_scad: 108.41566513098944
epoch: 1000/1000, loss_with_scad: 108.14197763344379


In [8]:
# In-sample mean squared error: `prediction` was computed on the same x_tensor
# that the model was fitted on.
mse(y_tensor, prediction)

108.13959448494

In [9]:
# Inspect the coefficients of the fitted model.
model.get_coefficients()

Parameter containing:
tensor([[ 47.6537,  32.3728,  13.6233, -15.2120,  16.8919,   5.7837,   4.8289,
          40.0994]], dtype=torch.float64, requires_grad=True)

Based on the simulation design explained in class, generate 200 data sets where the input features have a strong correlation structure (you may consider a 0.9) and apply ElasticNet, SqrtLasso and SCAD to check which method produces the best approximation of an ideal solution, such as a "betastar" you design with a sparsity pattern of your choice.

In [11]:
import numpy as np
from scipy.linalg import toeplitz
import models_for_comparison

In [12]:
def make_correlated(num_samples, p, rho, rng=None):
    """Draw correlated Gaussian features with a Toeplitz covariance.

    Samples come from a zero-mean multivariate normal whose covariance
    between features i and j is ``rho ** |i - j|``.

    Parameters
    ----------
    num_samples : int
        Number of rows (observations) to draw.
    p : int
        Number of features (columns).
    rho : float
        Base of the covariance decay; covariance at lag k is ``rho ** k``.
    rng : numpy.random.Generator, optional
        Random generator to draw from. When omitted, NumPy's global random
        state is used, exactly as before, so existing callers are unaffected.
        Pass ``np.random.default_rng(seed)`` for reproducible draws.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_samples, p)``.
    """
    # First column of the covariance matrix: 1, rho, rho**2, ..., rho**(p-1).
    first_column = [rho ** lag for lag in range(p)]
    covariance = toeplitz(first_column)
    mean = np.zeros(p)
    # Both the legacy np.random module and a Generator expose multivariate_normal.
    sampler = np.random if rng is None else rng
    return sampler.multivariate_normal(mean, covariance, size=num_samples)

In [13]:
# Seed NumPy's global random state so the simulated data (and later draws made
# through np.random when the cells run in order) are the same on every run.
np.random.seed(0)

rho = 0.9  # covariance decay base passed to make_correlated
p = 30     # number of features
n = 200    # number of samples
x = make_correlated(n, p, rho)

In [14]:
# Hand-picked leading coefficients as a column vector; zeros mark inactive features.
beta = np.array([-1, 2, 3, 0, 0, 0, 0, 2, -1, 4, 3, 3, 3, 3, 3, 3, 0]).reshape(-1, 1)

# Pad with zeros so that betastar has one row per feature (p rows in total).
betastar = np.concatenate(
    [beta, np.zeros((p - len(beta), 1), dtype=beta.dtype)],
    axis=0,
)

# Linear response plus Gaussian noise scaled by 1.5.
y = x @ betastar + 1.5 * np.random.normal(size=(n, 1))

In [15]:
# Build the tensors with the notebook-wide dtype, as the Part 1 tensors do,
# instead of relying on whatever dtype the NumPy arrays happen to have.
x_tensor = torch.tensor(x, dtype=dtype, device=device)
y_tensor = torch.tensor(y, dtype=dtype, device=device)

In [16]:
# Display the simulated design matrix (the printout is abbreviated with "...").
x_tensor

tensor([[-7.1976e-01, -1.7404e+00, -1.8205e+00,  ..., -1.0479e+00,
         -9.7710e-01, -3.1249e-01],
        [ 1.1701e+00,  1.5021e+00,  7.7970e-01,  ...,  2.7051e+00,
          2.4982e+00,  1.8455e+00],
        [ 5.1497e-01,  5.6799e-01,  6.9492e-01,  ...,  1.4443e+00,
          1.1649e+00,  1.2818e+00],
        ...,
        [-4.6900e-01, -9.3511e-01, -4.6202e-01,  ...,  1.1954e+00,
          8.1938e-01,  7.8942e-02],
        [ 1.0626e+00,  3.0947e-01,  3.4042e-01,  ..., -4.2222e-01,
         -1.4043e-01,  1.7426e-01],
        [ 6.8952e-01, -1.1428e-01, -3.1684e-01,  ..., -4.9555e-02,
          1.0311e-03, -5.7636e-01]], dtype=torch.float64)

In [17]:
# Build the models to compare on the simulated data, one input per feature column.
# NOTE(review): the recorded run of this cell ended with
# "NameError: name 'device' is not defined". `device` is defined in this notebook,
# so the failing lookup is presumably inside one of the imported modules
# (likely models_for_comparison) -- confirm and fix there.
scad = SCADLinear(x_tensor.shape[1], 0.5, 3)
elastic = models_for_comparison.ElasticNet(input_size=x_tensor.shape[1], alpha=0.1, l1_ratio=0.6)

NameError: name 'device' is not defined

In [35]:
# The only `model` defined earlier in this notebook was sized for the concrete
# data, not for the simulated design matrix. Point `model` at the SCAD model
# created for the simulated data so this cell works on a fresh Restart & Run All.
model = scad
model.fit(x_tensor, y_tensor, num_epochs=1000, learning_rate=0.01)

epoch: 100/1000, loss_with_scad: 4.101080490200097
epoch: 200/1000, loss_with_scad: 3.0752120934690548
epoch: 300/1000, loss_with_scad: 2.736670963731683
epoch: 400/1000, loss_with_scad: 2.5357154640344546
epoch: 500/1000, loss_with_scad: 2.403333998776603
epoch: 600/1000, loss_with_scad: 2.3116572269358553
epoch: 700/1000, loss_with_scad: 2.246082463200986
epoch: 800/1000, loss_with_scad: 2.198034026937726
epoch: 900/1000, loss_with_scad: 2.1621371534173233
epoch: 1000/1000, loss_with_scad: 2.1348740927353442


In [36]:
# In-sample MSE on the simulated data; tensors are detached and converted to
# NumPy arrays before being handed to the metric (targets first, then predictions).
mse(
    y_tensor.detach().numpy(),
    model.predict(x_tensor).detach().numpy(),
)

2.134636308064832

Use the methods you implemented above to determine a variable selection for the Concrete data set with quadratic interaction terms (polynomial features of degree 2). To solve this, you should consider choosing the best weight for the penalty function. What is the ideal model size (number of variables with non-zero weights), and what is the cross-validated mean square error?