# Replication - High Dimensional Case2 - Table

Here we provide a notebook to replicate the summary tables for the high-dimensional case simulation. 

The notebook replicates the results in:
- /out/simulation/tables/sim_hd2*

The main script can be found at: 
- /scripts/simulation/tables/highdimensional_case2.py


## Please choose the setup for replication:

In [None]:
# suffix is appended to the 'probcox' file stem when the ProbCox result URLs are built.
# R_suffix is the file stem of the R-based result file that is loaded for the comparison table.
suffix = 'rank5' # options: 'rank5', 'rank50'
R_suffix = 'R_lasso_theta_1se' # options: 'R_lasso_theta', 'R_lasso_theta_1se', 'R_Alasso1_theta', 'R_Alasso1_theta_1se', 'R_Alasso2_theta', 'R_Alasso2_theta_1se', 'R_SCAD_theta', 'R_MCP_theta'

In [None]:
# Modules
# =======================================================================================================================
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Simulation identifier.
# NOTE(review): the visible cells do not reference this variable; they spell 'sim_hd2' out in their URLs.
sim_name = 'sim_hd2'

In [None]:
# Function
# =======================================================================================================================
def custom_mean(X, W, col_idx):
    '''
    - column-wise mean of an array, optionally restricted to the elements selected by a mask

    X :: 2-d array (or array-like); the mean is taken along axis=0, one value per column
    W :: mask with the same shape as X marking which elements enter the mean
         (cast to bool, so a 0/1 integer mask selects elements instead of being read as row positions)
    col_idx :: one flag per column - where truthy, W is applied to that column; otherwise the plain mean over all rows is used

    returns :: 1-d array with one mean per column
               (a masked column whose mask selects no element yields nan, as np.mean of an empty selection does)
    '''
    X = np.asarray(X)
    # Without the cast an integer mask would trigger integer (fancy) indexing below
    # and silently average the wrong rows.
    W = np.asarray(W, dtype=bool)
    assert X.shape == W.shape
    _, n_cols = X.shape

    return(np.asarray([
        np.mean(X[W[:, jj], jj]) if col_idx[jj] else np.mean(X[:, jj])
        for jj in range(n_cols)
    ]))


def custom_var(X, W, col_idx):
    '''
    - column-wise variance of an array, optionally restricted to the elements selected by a mask

    X :: 2-d array (or array-like); the variance is taken along axis=0, one value per column
    W :: mask with the same shape as X marking which elements enter the variance
         (cast to bool, so a 0/1 integer mask selects elements instead of being read as row positions)
    col_idx :: one flag per column - where truthy, W is applied to that column; otherwise the plain variance over all rows is used

    returns :: 1-d array with one variance per column (np.var default, i.e. ddof=0)
               (a masked column whose mask selects no element yields nan, as np.var of an empty selection does)
    '''
    X = np.asarray(X)
    # Without the cast an integer mask would trigger integer (fancy) indexing below
    # and silently use the wrong rows.
    W = np.asarray(W, dtype=bool)
    assert X.shape == W.shape
    _, n_cols = X.shape

    return(np.asarray([
        np.var(X[W[:, jj], jj]) if col_idx[jj] else np.var(X[:, jj])
        for jj in range(n_cols)
    ]))


In [None]:
# Simulation Settings
# =======================================================================================================================

I = 750 # denominator of the censoring rate computed below - presumably the number of individuals per run (TODO confirm)
P = 1000 # length of the true parameter vector theta

# True parameters: the listed non-zero effects first, padded with zeros up to length P.
# The padding is derived from P so the two settings cannot drift apart.
theta_nonzero = np.asarray([-0.5, 0.7, 1.2, 0.65, -0.9, 1.4, 0.2, -0.4, -1.3, 0.1])
theta = np.concatenate((theta_nonzero, np.zeros((P - theta_nonzero.shape[0],))))[:, None]

In [None]:
# Overall Parameters
# =======================================================================================================================

# Per-run summary file; columns are addressed by position only.
url = 'https://raw.githubusercontent.com/alexwjung/ProbCox/main/paper/ProbCox/out/simulation/sim_hd2/N_obs.txt'
N_obs = pd.read_csv(url, header=None, sep=';')

# Column 1 is reported as 'Obs'; column 2 divided by I is subtracted from one to give the censoring rate
# (so column 2 is presumably the number of events per run - TODO confirm).
obs_per_run = N_obs.iloc[:, 1]
censoring_per_run = 1 - N_obs.iloc[:, 2] / I

# min / median / max across runs
print('Obs: ', np.min(obs_per_run), np.median(obs_per_run), np.max(obs_per_run))
print('Censorpship: ', np.min(censoring_per_run), np.median(censoring_per_run), np.max(censoring_per_run))
#print('Tied Events', np.min(N_obs.iloc[:, 3]), np.median(N_obs.iloc[:, 3]), np.max(N_obs.iloc[:, 3]))

Obs:  750 750.0 750
Censorpship:  0.7026666666666667 0.7493333333333334 0.7973333333333333


In [None]:
# ProbCox Table
# =======================================================================================================================

def _unique_runs(frame):
    '''
    - reduce a raw result table to one complete row per key

    frame :: table as read from a result file; column 0 is the grouping key
             (presumably the run identifier - TODO confirm)

    returns :: table with rows containing missing values removed, the first row per key kept,
               and the last column dropped (NOTE(review): the content of that column is not visible from here)
    '''
    frame = frame.dropna(axis=0)
    frame = frame.groupby(0).first().reset_index()
    return(frame.iloc[:, :-1])


N_RUNS = 200 # number of distinct keys every result file is required to contain

res = np.zeros((P, 7))
res[:, 0] = theta[:, 0]

base_url = 'https://raw.githubusercontent.com/alexwjung/ProbCox/main/paper/ProbCox/out/simulation/sim_hd2/probcox' + suffix
url1 = base_url + '_theta.txt'
url2 = base_url + '_theta_lower.txt'
url3 = base_url + '_theta_upper.txt'

theta_est = pd.read_csv(url1, header=None, sep=';')
theta_est_lower = pd.read_csv(url2, header=None, sep=';')
theta_est_upper = pd.read_csv(url3, header=None, sep=';')

theta_est = _unique_runs(theta_est)
assert theta_est.shape[0] == N_RUNS

theta_est_lower = _unique_runs(theta_est_lower)
assert theta_est_lower.shape[0] == N_RUNS

theta_est_upper = _unique_runs(theta_est_upper)
assert theta_est_upper.shape[0] == N_RUNS

# Align the three tables on the key in column 0; column order after the merges is
# key | lower bounds | upper bounds | point estimates.
theta_bound = theta_est_lower.merge(theta_est_upper, how='inner', on=0)
theta_bound = theta_bound.merge(theta_est, how='inner', on=0)
assert theta_bound.shape[0] == N_RUNS # all three files must cover the same keys

theta_est = np.asarray(theta_bound.iloc[:, -P:]).astype(float)
theta_bound = np.asarray(theta_bound.iloc[:, 1:-P]).astype(float)
assert theta_bound.shape[1] == 2 * P # P lower bounds followed by P upper bounds

# Take the bounds from the merged table so they are row-aligned with theta_est by construction.
theta_est_lower = theta_bound[:, :P]
theta_est_upper = theta_bound[:, -P:]

W = np.sign(theta_est_lower) == np.sign(theta_est_upper) # non zero parameters estimates (based on HPD95%)
# Columns where the mask is applied: truly non-zero parameters flagged as non-zero in more than 5 rows;
# every other column is summarised over all rows.
col_idx = np.logical_and(np.squeeze(theta != 0), np.sum(W, axis=0) > 5)

theta_true = np.squeeze(theta)[None, :]

res[:, 1] = custom_mean(theta_est, W, col_idx)
res[:, 2] = np.sqrt(custom_var(theta_est, W, col_idx))
res[:, 3] = np.sqrt(custom_mean((theta_est - theta_true)**2, W, col_idx))

# average interval width (upper - lower)
res[:, 4] = custom_mean(theta_est_upper - theta_est_lower, W, col_idx)

# share of intervals that contain the true parameter
covered = np.logical_and(theta_true >= theta_est_lower, theta_true <= theta_est_upper)
res[:, 5] = custom_mean(covered, W, col_idx)

# share of rows in which the parameter is flagged as non-zero
res[:, 6] = np.mean(W, axis=0)

res = np.round(res, 2)

#pd.DataFrame(res) # full table with 0 parameters 
pd.DataFrame(res[:10, :])

# column headings
#$\theta$   $\bar{\hat{\theta}}$ 	$\overline{\sigma_{\hat{\theta}}}$	$RMSE$ 	$\overline{HPD}_{95\%}$	$Coverage_{95\%}$  $p_{|\hat{\theta}| > 0}$

Unnamed: 0,0,1,2,3,4,5,6
0,-0.5,-0.52,0.1,0.1,0.69,1.0,0.26
1,0.7,0.61,0.15,0.17,0.67,0.98,0.7
2,1.2,1.03,0.2,0.26,0.64,0.78,1.0
3,0.65,0.58,0.14,0.16,0.67,0.98,0.58
4,-0.9,-0.77,0.17,0.22,0.65,0.89,0.93
5,1.4,1.22,0.2,0.27,0.65,0.72,1.0
6,0.2,0.01,0.06,0.2,0.04,0.04,0.01
7,-0.4,-0.51,0.11,0.15,0.68,0.95,0.18
8,-1.3,-1.12,0.18,0.26,0.65,0.79,1.0
9,0.1,0.0,0.0,0.1,0.02,0.0,0.01


In [None]:
# Evaluating identification
# theta_bound holds the lower bounds in its first P columns and the upper bounds in the remaining ones.
theta_est_lower = theta_bound[:, :P]
theta_est_upper = theta_bound[:, P:]

# A parameter counts as identified in a row when both interval bounds share the same sign.
identified = np.sign(theta_est_lower) == np.sign(theta_est_upper)
n_identified = np.sum(identified, axis=1) # identified parameters per row
n_false = np.sum(identified * np.squeeze(theta == 0)[None, :], axis=1) # identified although the true value is zero

# mean count | standard deviation of the count across rows | mean count of false identifications (all rounded)
pd.DataFrame(np.array([[np.round(np.mean(n_identified)), np.round(np.sqrt(np.var(n_identified))), np.round(np.mean(n_false))]]))

# column headings
# number of covariates identified       standard error        falsly identified

Unnamed: 0,0,1,2
0,14.0,3.0,9.0


In [None]:
# R-Cox Table
# =======================================================================================================================

res = np.zeros((P, 7))
res[:, 0] = theta[:, 0]

url = 'https://raw.githubusercontent.com/alexwjung/ProbCox/main/paper/ProbCox/out/simulation/sim_hd2/' + R_suffix  + '.txt'

theta_est = pd.read_csv(url, header=None, sep=';')

# Drop rows with missing values, keep the first row per key in column 0, then strip the key column.
theta_est = theta_est.dropna(axis=0)
theta_est = theta_est.groupby(0).first().reset_index()
theta_est = np.asarray(theta_est.iloc[:, 1:])
assert theta_est.shape[0] == 200


W = theta_est!=0 # non zero parameter estimates (estimate is not exactly zero)
# Columns where the mask is applied: truly non-zero parameters estimated as non-zero in more than 5 rows;
# every other column is summarised over all rows.
col_idx = np.logical_and(np.squeeze(theta != 0), np.sum(W, axis=0) > 5)

res[:, 1] = custom_mean(theta_est, W, col_idx)
res[:, 2] = np.sqrt(custom_var(theta_est, W, col_idx))
res[:, 3] = np.sqrt(custom_mean((theta_est - theta[:, 0][None, :])**2, W, col_idx))

# share of rows in which the parameter is estimated as non-zero
res[:, 6] = np.mean(W, axis=0)

res = np.round(res, 2)

# pd.DataFrame(res) # full table with 0 parameters 
res = pd.DataFrame(res[:10, :])
# Columns 4 and 5 (interval width, coverage) are not filled for this table and are shown as '-'.
# Replace the columns outright: writing strings into the float columns through .iloc relies on
# silent upcasting, which pandas has deprecated.
res[4] = '-'
res[5] = '-'
res


# column headings
#$\theta$   $\bar{\hat{\theta}}$ 	$\overline{\sigma_{\hat{\theta}}}$	$RMSE$ 	$\overline{CI}_{95\%}$	$Coverage_{95\%}$ $p_{|\hat{\theta}| > 0}$

Unnamed: 0,0,1,2,3,4,5,6
0,-0.5,-0.31,0.43,0.46,-,-,0.12
1,0.7,0.33,0.71,0.8,-,-,0.3
2,1.2,0.38,0.63,1.04,-,-,0.82
3,0.65,0.38,0.75,0.8,-,-,0.24
4,-0.9,-0.31,0.64,0.88,-,-,0.58
5,1.4,0.45,0.77,1.23,-,-,0.97
6,0.2,0.26,0.37,0.37,-,-,0.04
7,-0.4,-0.38,0.64,0.64,-,-,0.12
8,-1.3,-0.38,0.65,1.12,-,-,0.94
9,0.1,-0.01,0.1,0.15,-,-,0.02


In [None]:
# Evaluating identification
# A parameter counts as identified in a row when its estimate is not exactly zero.
selected = theta_est != 0
n_selected = np.sum(selected, axis=1) # identified parameters per row
n_false_selected = np.sum(selected * np.squeeze(theta == 0)[None, :], axis=1) # identified although the true value is zero

# mean count | standard deviation of the count across rows | mean count of false identifications (all rounded)
pd.DataFrame(np.array([[np.round(np.mean(n_selected)), np.round(np.sqrt(np.var(n_selected))), np.round(np.mean(n_false_selected))]]))

# column headings
# number of covariates identified       standard error        falsly identified


Unnamed: 0,0,1,2
0,19.0,65.0,15.0
