# Comparative analysis with competing algorithms

In this comparative analysis, we compare SSVD (version 3) with the plain SVD and the Sparse PCA (SPCA) algorithm. Note that SPCA here uses a sparsity penalty weight (`alpha`) of 2.

The rank-1 decompositions produced by these three methods are compared. The true u and v are defined by the code in the next cell, and normally distributed noise is added to obtain the test data.

For accuracy, we will compare four indicators.

a. Average number of zeros in u and v

b. Average number of correctly identified 0s in u and v

c. Average number of correctly identified non-0s in u and v

d. Rate at which the positions of all 0s AND all non-0s are identified correctly (correct classification rate) in u and v

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import scipy.linalg as la
from functools import reduce
import seaborn as sns
import time
from SSVDversion3 import SSVD_layer
from sklearn.decomposition import SparsePCA

In [2]:
# Unnormalised true left singular vector: 10, 9, ..., 3, then 17 twos, then 75 zeros.
u_tilde = np.concatenate([
    np.arange(10, 2, -1),
    np.repeat(2, 17),
    np.repeat(0, 75),
])
# Unnormalised true right singular vector: six large entries, five 3s, five -3s, 34 zeros.
v_tilde = np.concatenate([
    np.array([10, -10, 8, -8, 5, -5]),
    np.repeat(3, 5),
    np.repeat(-3, 5),
    np.repeat(0, 34),
])

# Scale both vectors to unit Euclidean length.
u_true = u_tilde / np.linalg.norm(u_tilde)  # (100,)
v_true = v_tilde / np.linalg.norm(v_tilde)  # (50,)

# Noise-free rank-1 signal with singular value s.
s = 50
Xstar = np.outer(u_true, v_true) * s  # (100, 50)

# Parameters later handed to SSVD_layer.
lam_grid = np.linspace(0, 8, 81)
gamma1 = gamma2 = 2
num_layer = 2

# One noisy realisation of the signal (Gaussian noise, standard deviation 0.5).
noise = np.random.normal(loc=0, scale=0.5, size=Xstar.shape)
X = Xstar + noise  # (100, 50)


# SVD
U, _, VT = la.svd(X)
V = VT.T
# Y and Z are the column-stacked forms of X.T and X; both are fixed once X is given.
Y = X.T.reshape(-1, 1)  # (nd, 1)
Z = X.reshape(-1, 1)  # (nd, 1)

# Initial values: leading left/right singular vectors kept as column vectors.
u_old = U[:, :1]
v_old = V[:, :1]
n, d = u_old.shape[0], v_old.shape[0]

X.shape

(100, 50)

In [3]:
def evaluate(vec_result, vec_true):
    """Score how well the zero/non-zero pattern of an estimate matches the truth.

    Parameters
    ----------
    vec_result : array-like
        Estimated vector. It is flattened first, so a column vector of
        shape (n, 1), a 1-D array of shape (n,) or a plain list all work.
    vec_true : array-like
        Ground-truth vector, flattened the same way.

    Returns
    -------
    list
        ``[num_zero, num_zero_correct, num_nonzero_correct, correct_all]``:

        * num_zero (int): number of exact zeros in ``vec_result``.
        * num_zero_correct (int): zeros of ``vec_result`` located where
          ``vec_true`` is also zero.
        * num_nonzero_correct (int): non-zeros of ``vec_result`` located
          where ``vec_true`` is also non-zero.
        * correct_all (bool): True only when the zero positions of both
          vectors coincide exactly (same count and same indices).
    """
    result_flat = np.asarray(vec_result).ravel()
    true_flat = np.asarray(vec_true).ravel()

    # Sorted positions of the (exact) zeros and non-zeros in each vector.
    result_0index = np.flatnonzero(result_flat == 0)
    true_0index = np.flatnonzero(true_flat == 0)
    result_non0index = np.flatnonzero(result_flat != 0)
    true_non0index = np.flatnonzero(true_flat != 0)

    num_zero = int(result_0index.size)
    # Vectorised membership test; the explicit int() keeps the count an
    # integer even when there is nothing to count (np.sum([]) would give 0.0).
    num_zero_correct = int(np.isin(result_0index, true_0index).sum())
    num_nonzero_correct = int(np.isin(result_non0index, true_non0index).sum())
    # Both index arrays are sorted, so element-wise equality means identical
    # zero sets; arrays of different length compare as not equal.
    correct_all = bool(np.array_equal(result_0index, true_0index))

    return [num_zero, num_zero_correct, num_nonzero_correct, correct_all]

In [4]:
# Accumulated wall-clock seconds per method; all three start at zero.
time_ssvd = time_svd = time_spca = 0

In [5]:
# Monte-Carlo comparison of SSVD, plain SVD and SparsePCA on simu_time noisy
# copies of Xstar. For each method we accumulate the 8 criteria returned by
# evaluate() (4 for u followed by 4 for v) and the time spent in the
# decomposition itself.

# Parameters handed to SSVD_layer.
lam_grid = np.linspace(0, 8, 81)
gamma1 = gamma2 = 2
simu_time = 100  # number of simulated data sets

# Dedicated seeded generator so the summary table is reproducible and does
# not depend on how much global random state earlier cells consumed.
SIM_SEED = 42
sim_rng = np.random.RandomState(SIM_SEED)

# Reset the timers in this cell so re-running it does not double-count.
time_ssvd = time_svd = time_spca = 0.0

# Running totals of the 8 criteria; three independent arrays (no aliasing).
evaluate_all = np.zeros(8)
evaluate_all_svd = np.zeros(8)
evaluate_all_spca = np.zeros(8)

for sim_idx in range(simu_time):

    X = Xstar + sim_rng.normal(0, 0.5, Xstar.shape)  # (100, 50)

    # ssvd -- only the decomposition is timed; scoring happens afterwards
    start_time_ssvd = time.perf_counter()
    _, u, v, _, _, _ = SSVD_layer(X, lam_grid, gamma1, gamma2)
    time_ssvd += time.perf_counter() - start_time_ssvd
    criteria = evaluate(u, u_true) + evaluate(v, v_true)  # list combination
    evaluate_all = evaluate_all + np.asarray(criteria, dtype=float)

    # svd -- np.linalg.svd returns V transposed, so the leading right
    # singular vector is the first ROW of v_svd
    start_time_svd = time.perf_counter()
    u_svd, S_svd, v_svd = np.linalg.svd(X)
    time_svd += time.perf_counter() - start_time_svd
    criteria_svd = evaluate(u_svd[:, 0], u_true) + evaluate(v_svd[0, :], v_true)  # list combination
    evaluate_all_svd = evaluate_all_svd + np.asarray(criteria_svd, dtype=float)

    # SPCA -- fitting on X yields a sparse loading over the 50 columns (v);
    # fitting on X.T yields one over the 100 rows (u)
    start_time_spca = time.perf_counter()
    spca = SparsePCA(n_components=1, alpha=2, random_state=SIM_SEED)
    spca.fit(X)
    v_spca = spca.components_[0]
    spca.fit(X.T)
    u_spca = spca.components_[0]
    time_spca += time.perf_counter() - start_time_spca
    criteria_spca = evaluate(u_spca, u_true) + evaluate(v_spca, v_true)  # list combination
    evaluate_all_spca = evaluate_all_spca + np.asarray(criteria_spca, dtype=float)

In [6]:
# Turn the running totals into per-simulation averages and fold each
# 8-vector into 2 rows: the four criteria for u, then the four for v.
performance, performance_svd, performance_spca = (
    (totals / simu_time).reshape(2, -1)
    for totals in (evaluate_all, evaluate_all_svd, evaluate_all_spca)
)

In [7]:
# Column labels shared by the three summary tables (one per criterion).
_criteria_labels = [
    "Avg. # of 0s",
    "Avg. # of correctly identified 0s",
    "Avg. # of correctly identified non0s",
    "Rate of correctly identified all 0s",
]

# One two-row table (u, v) per method.
df1 = pd.DataFrame(performance, index=["u_ssvd", "v_ssvd"], columns=_criteria_labels)

df2 = pd.DataFrame(performance_svd, index=["u_svd", "v_svd"], columns=_criteria_labels)

df3 = pd.DataFrame(performance_spca, index=["u_spca", "v_spca"], columns=_criteria_labels)

In [8]:
# Stack the per-method tables (SSVD, SVD, SPCA) into one comparison table.
method_tables = [df1, df2, df3]
pd.concat(method_tables, sort=False)

Unnamed: 0,Avg. # of 0s,Avg. # of correctly identified 0s,Avg. # of correctly identified non0s,Rate of correctly identified all 0s
u_ssvd,74.85,74.85,25.0,0.85
v_ssvd,33.9,33.9,16.0,0.91
u_svd,0.0,0.0,25.0,0.0
v_svd,0.0,0.0,16.0,0.0
u_spca,74.98,74.98,25.0,0.98
v_spca,34.0,34.0,16.0,1.0


In [9]:
# Map a display label to the accumulated wall-clock seconds of each method.
times = dict([
    ('Time of SSVD', time_ssvd),
    ('Time of SVD', time_svd),
    ('Time of SPCA', time_spca),
])

In [10]:
# Single-column table of total run time per method.
timing_table = pd.DataFrame({'Time(sec)': times})
timing_table

Unnamed: 0,Time(sec)
Time of SPCA,5.600834
Time of SSVD,1.4888
Time of SVD,0.125581
