In [34]:
import numpy as np
import time
from sklearn.decomposition import PCA, SparsePCA
from scipy.linalg import svd

# Load Data

In [3]:
hastie_synthetic = np.loadtxt('data/hastie_synthetic.csv')

## Run Baseline Algorithms

In [177]:
curr_dataset = hastie_synthetic
number_components = 3

### PCA

In [178]:
start = time.time()
pca = PCA(n_components=number_components, svd_solver='full')
PCAResult = pca.fit(curr_dataset)
pca_time_result = time.time() - start

### Sparse PCA

In [179]:
start = time.time()
spca = SparsePCA(n_components=number_components, method='cd')
SPCAResult = spca.fit(curr_dataset)
spca_time_result = time.time() - start



### Shen et al. Algorithm

In [183]:
def hard_thresh(vec, lam):
    return np.multiply(vec, np.abs(vec) > lam)

def soft_thresh(vec, lam):
    return np.multiply(vec - lam * np.sign(vec), np.abs(vec) > lam)

def scad(vec, lam, a=3.7):
    return np.multiply(np.multiply(vec - lam * np.sign(vec), np.abs(vec) > lam), np.abs(vec) < (2*lam)) + \
            np.multiply(vec, np.abs(vec) > a * lam) + \
            np.multiply(np.multiply(((a - 1) * vec - np.sign(vec) * lam * a )/ (a - 2), \
                                    np.abs(vec) > (2*lam)), np.abs(vec) < (a*lam))

def shen_alg(data, num_components, lam, threshold_fun):
    components = np.zeros((num_components, data.shape[1]))
    
    curr_data = data
    for i in range(0,num_components):
        (U, s, Vh) = svd(curr_data)
        curr_u = U.T[0]
        curr_v = Vh[0]
        curr_s = s[0]
        
        j = 0
        while(j < 1000):
            curr_v = threshold_fun(np.matmul(curr_data.T, curr_u), lam)
            curr_u = np.matmul(curr_data, curr_v)
            if(np.linalg.norm(curr_u) > 0):
                curr_u = curr_u / np.linalg.norm(curr_u)
            
            j = j + 1
        if(np.linalg.norm(curr_v) > 0):
                curr_v = curr_v / np.linalg.norm(curr_v)
        components[i] = curr_v
          
        curr_data = curr_data - curr_s * np.outer(curr_u, curr_v)

    return components

In [184]:
lam = 200
start = time.time()
shen_alg_result = shen_alg(curr_dataset, number_components, lam, scad)
shen_alg_time_result = time.time() - start

## Collect and Store Baseline Algorithm Data

In [27]:
time_result

0.0014750957489013672

In [185]:
shen_alg_result

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        , -0.70710678, -0.70710678],
       [ 0.        ,  0.        ,  0.        ,  0.        , -0.5       ,
        -0.5       , -0.5       , -0.5       ,  0.        ,  0.        ],
       [-0.5       , -0.5       , -0.5       , -0.5       ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ]])

In [38]:
(U, s, Vh) = svd(curr_dataset)

In [57]:
curr_dataset.shape

(200, 10)