## SCAD, pytorch, variable selection

In [1]:
!pip install torch



In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
from scad_linear import SCADLinear
from scaler_torch import StandardScaler, MinMaxScaler

In [29]:
device = torch.device('cpu')
dtype = torch.float64

Create your own PyTorch class that implements the method of SCAD regularization and variable selection (smoothly clipped absolute deviations) for linear models. Your development should be based on the following references:
https://andrewcharlesjones.github.io/journal/scad.html
https://www.jstor.org/stable/27640214?seq=1

Test your method on a real data set and determine a variable selection based on features' importance, according to SCAD.

In [30]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [31]:
data = pd.read_csv('../data/customer_purchasing_behavior.csv')

x = np.squeeze(data.drop(columns=['loyalty_score', 'user_id', 'region'])).values
y = data['loyalty_score']

scaler = StandardScaler()
xscaled = scaler.fit_transform(x)

x_tensor = torch.tensor(xscaled, dtype= dtype)
y_tensor = torch.tensor(y, dtype= dtype)

In [32]:
model = SCADLinear(input_size= x_tensor.shape[1], lambda_val= 0.9, a_val= 3)
scaler = MinMaxScaler()

In [27]:
model.fit(x= torch.squeeze(x_tensor), y= y_tensor, num_epochs= 1000, learning_rate= 0.1)

  return F.mse_loss(input, target, reduction=self.reduction)


epoch: 100/1000, loss_with_scad: 50.65130939811225
epoch: 200/1000, loss_with_scad: 50.651299399387625
epoch: 300/1000, loss_with_scad: 50.651291467717115
epoch: 400/1000, loss_with_scad: 50.651285163577306
epoch: 500/1000, loss_with_scad: 50.65128014757402
epoch: 600/1000, loss_with_scad: 50.65127615403481
epoch: 700/1000, loss_with_scad: 50.65127297340397
epoch: 800/1000, loss_with_scad: 50.651270439678285
epoch: 900/1000, loss_with_scad: 50.65126842103348
epoch: 1000/1000, loss_with_scad: 50.65126681263918


Based on the simulation design explained in class, generate 200 data sets where the input features have a strong correlation structure (you may consider a 0.9) and apply ElasticNet, SqrtLasso and SCAD to check which method produces the best approximation of an ideal solution, such as a "betastar" you design with a sparsity pattern of your choice.

In [15]:
import numpy as np
from scipy.linalg import toeplitz

In [16]:
def make_correlated(num_samples, p, rho):
    vcor = []
    for i in range(p):
        vcor.append(rho**i)
    r = toeplitz(vcor)
    mu = np.repeat(0,p)
    x = np.random.multivariate_normal(mu, r, size=num_samples)
    return x

In [17]:
rho = 0.9
p = 20
n = 150
x = make_correlated(n, p, rho)

In [18]:
beta =np.array([-1,2,3,0,0,0,0,2,-1,4])
beta = beta.reshape(-1,1)
betastar = np.concatenate([beta,np.repeat(0,p-len(beta)).reshape(-1,1)],axis=0)
y = x@betastar + 1.5*np.random.normal(size=(n,1))

In [19]:
x_tensor = torch.tensor(x, device=device)
y_tensor = torch.tensor(y, device= device)

In [23]:
x_tensor

tensor([[-0.9694, -0.7039, -0.5928,  ..., -1.1205, -1.3873, -1.5827],
        [ 0.0087, -0.1929, -0.3554,  ..., -2.0655, -1.4800, -1.5179],
        [ 0.1829,  0.5509,  0.7710,  ...,  1.1026,  0.3366, -0.0863],
        ...,
        [ 0.4960,  0.6532,  1.1835,  ...,  0.6555,  0.5639,  1.3547],
        [-0.4167, -0.5749, -1.2126,  ..., -0.4091, -0.2421, -0.7258],
        [ 1.4866,  1.3494,  0.5041,  ...,  1.6545,  1.5291,  1.2364]],
       dtype=torch.float64)

In [20]:
model = SCADLinear(x_tensor.shape[1], 0.5, 3)

In [21]:
model.fit(x_tensor, y_tensor, num_epochs=1000, learning_rate=0.01)

Parameter containing:
tensor([[ 0.1200, -0.0796,  0.1907,  0.1057, -0.1398,  0.0507, -0.1303, -0.0908,
          0.0439, -0.1703,  0.1561, -0.1037,  0.2074, -0.0255, -0.0582, -0.1281,
         -0.1738, -0.1020,  0.1088,  0.0005]], dtype=torch.float64,
       requires_grad=True)
Parameter containing:
tensor([[ 0.2190,  0.0303,  0.3057,  0.2277, -0.0198,  0.1640, -0.0097,  0.0392,
          0.1720, -0.0341,  0.2740, -0.0046,  0.3004,  0.0568,  0.0107, -0.0538,
         -0.1053, -0.0428,  0.1616,  0.0612]], dtype=torch.float64,
       requires_grad=True)
Parameter containing:
tensor([[ 3.0019e-01,  1.2140e-01,  4.0104e-01,  3.2742e-01,  7.6751e-02,
          2.5476e-01,  8.6956e-02,  1.4459e-01,  2.7551e-01,  7.6425e-02,
          3.6794e-01,  7.2536e-02,  3.7100e-01,  1.1765e-01,  6.0105e-02,
          8.3934e-05, -5.6499e-02, -2.1680e-03,  1.9724e-01,  1.0316e-01]],
       dtype=torch.float64, requires_grad=True)
Parameter containing:
tensor([[ 0.3670,  0.1974,  0.4807,  0.4094,  0.1545

In [111]:
mse(model.predict(x_tensor).detach().numpy(), y_tensor.detach().numpy())

1.86320628220977

Use the methods you implemented above to determine a variable selection for the Concrete data set with quadratic interaction terms (polynomial features of degree 2). To solve this, you should consider choosing the best weight for the penalty function. What is the ideal model size (number of variables with non-zero weights), and what is the cross-validated mean square error?