# Replication - High Dimensional Case - Table

Here we provide a notebook to replicate the summary tables for the high-dimensional case simulation. 

The notebook replicates the results in:
- /out/simulation/tables/sim_hd*

The main script can be found at: 
- /scripts/simulation/tables/highdimensional_case.py


## Please choose the setup for replication:

In [1]:
# ProbCox fit to load - one of: 'rank5', 'rank50', 'rank50_b1024'
# (multivariate normal with rank 5 or rank 50; 'rank50_b1024' is the rank 50 fit with batch size 1024)
suffix = 'rank50_b1024'
# R fit to load - one of: 'R_lasso_theta', 'R_lasso_theta_1se', 'R_Alasso1_theta',
# 'R_Alasso1_theta_1se', 'R_Alasso2_theta', 'R_Alasso2_theta_1se'
R_suffix = 'R_lasso_theta_1se'

In [2]:
# Modules
# =======================================================================================================================
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Simulation identifier.
# NOTE(review): not referenced by the cells shown in this notebook - their URLs hard-code 'sim_hd' instead.
sim_name = 'sim_hd'

In [3]:
# Function
# =======================================================================================================================
def custom_mean(X, W, col_idx):
    '''
    - column-wise mean of an array, optionally restricted to the rows selected by a mask

    X :: 2-d array; the mean is taken along axis=0
    W :: mask with the same shape as X marking the elements that enter the mean (cast to bool)
    col_idx :: one flag per column - where true, column jj is averaged over the rows with W[:, jj] set,
               otherwise the plain mean over all rows is used

    returns :: 1-d array with one mean per column of X; a flagged column with an empty mask yields nan
    '''
    X = np.asarray(X)
    # cast to bool so that a 0/1 integer mask is applied as a mask and not as fancy row indices
    W = np.asarray(W, dtype=bool)
    col_idx = np.asarray(col_idx, dtype=bool)
    assert X.shape == W.shape
    _, M = X.shape

    return np.asarray([
        np.mean(X[W[:, jj], jj]) if col_idx[jj] else np.mean(X[:, jj])
        for jj in range(M)
    ])


def custom_var(X, W, col_idx):
    '''
    - column-wise variance of an array, optionally restricted to the rows selected by a mask

    X :: 2-d array; the variance is taken along axis=0
    W :: mask with the same shape as X marking the elements that enter the variance (cast to bool)
    col_idx :: one flag per column - where true, the variance of column jj uses only the rows with W[:, jj] set,
               otherwise the plain variance over all rows is used

    returns :: 1-d array with one variance (np.var defaults, i.e. ddof=0) per column of X;
               a flagged column with an empty mask yields nan
    '''
    X = np.asarray(X)
    # cast to bool so that a 0/1 integer mask is applied as a mask and not as fancy row indices
    W = np.asarray(W, dtype=bool)
    col_idx = np.asarray(col_idx, dtype=bool)
    assert X.shape == W.shape
    _, M = X.shape

    return np.asarray([
        np.var(X[W[:, jj], jj]) if col_idx[jj] else np.var(X[:, jj])
        for jj in range(M)
    ])


In [4]:
# Simulation Settings
# =======================================================================================================================

# I is the divisor used for the censoring rate and P the number of rows of the result tables
# (presumably the number of individuals and the number of covariates - confirm against the main script)
I, P = 1000, 10000

# parameter values read from the repository; a ';'-separated file without header
url = 'https://raw.githubusercontent.com/alexwjung/ProbCox/main/paper/ProbCox/out/simulation/sim_hd/theta.txt'
theta = pd.read_csv(url, header=None, sep=';').to_numpy()


In [5]:
# Overall Parameters
# =======================================================================================================================

url = 'https://raw.githubusercontent.com/alexwjung/ProbCox/main/paper/ProbCox/out/simulation/sim_hd/N_obs.txt'
N_obs = pd.read_csv(url, header=None, sep=';')

# column 1 is reported as 'Obs', column 2 divided by I gives the complement of the censoring rate
obs_per_run = N_obs.iloc[:, 1]
censoring_per_run = 1 - N_obs.iloc[:, 2] / I

# minimum, median and maximum over the rows of N_obs
summary_stats = (np.min, np.median, np.max)
print('Obs: ', *(stat(obs_per_run) for stat in summary_stats))
print('Censorpship: ', *(stat(censoring_per_run) for stat in summary_stats))
#print('Tied Events', np.min(N_obs.iloc[:, 3]), np.median(N_obs.iloc[:, 3]), np.max(N_obs.iloc[:, 3]))

Obs:  6896 7231.0 7613
Censorpship:  0.644 0.679 0.72


In [6]:
# ProbCox Table
# =======================================================================================================================

N_RUNS = 200  # number of distinct keys expected in every result file


def _load_probcox_estimates(url, n_runs=N_RUNS):
    '''
    - read one ProbCox result file and reduce it to one row per key in column 0

    url :: location of a ';'-separated file without header
    n_runs :: number of rows expected after cleaning

    returns :: DataFrame with the key in column 0 (presumably the run identifier) and the trailing column removed
    '''
    est = pd.read_csv(url, header=None, sep=';')
    est = est.dropna(axis=0)                    # discard rows with missing values
    est = est.groupby(0).first().reset_index()  # one row per key, sorted by key
    est = est.iloc[:, :-1]                      # the last column is not a parameter estimate
    assert est.shape[0] == n_runs
    return est


# columns of res: true theta, mean estimate, sd of estimates, RMSE, mean HPD95% width, HPD95% coverage,
# proportion of runs in which the parameter is identified
res = np.zeros((P, 7))
res[:, 0] = theta[:, 0]

base_url = 'https://raw.githubusercontent.com/alexwjung/ProbCox/main/paper/ProbCox/out/simulation/sim_hd/probcox' + suffix
url1 = base_url + '_theta.txt'
url2 = base_url + '_theta_lower.txt'
url3 = base_url + '_theta_upper.txt'

theta_est = _load_probcox_estimates(url1)
theta_est_lower = _load_probcox_estimates(url2)
theta_est_upper = _load_probcox_estimates(url3)

# align the three files on the key in column 0; resulting column order: key, lower, upper, estimate
theta_bound = theta_est_lower.merge(theta_est_upper, how='inner', on=0)
theta_bound = theta_bound.merge(theta_est, how='inner', on=0)
assert theta_bound.shape[0] == N_RUNS  # every key has to be present in all three files

theta_est = np.asarray(theta_bound.iloc[:, -P:]).astype(float)
theta_bound = np.asarray(theta_bound.iloc[:, 1:-P]).astype(float)

# take the bounds from the merged table so that their rows are aligned with theta_est
theta_est_lower = theta_bound[:, :P]
theta_est_upper = theta_bound[:, -P:]

W = np.sign(theta_est_lower) == np.sign(theta_est_upper) # non zero parameters estimates (based on HPD95%)
# true non-zero parameters that are identified in more than 5 runs - only these are summarised over the
# identified runs, all other parameters are summarised over all runs
col_idx = np.logical_and(np.squeeze(theta != 0), np.sum(W, axis=0) > 5)

theta_true = np.squeeze(theta)[None, :]
covered = np.logical_and(theta_true >= theta_bound[:, :P], theta_true <= theta_bound[:, -P:])

res[:, 1] = custom_mean(theta_est, W, col_idx)
res[:, 2] = np.sqrt(custom_var(theta_est, W, col_idx))
res[:, 3] = np.sqrt(custom_mean((theta_est - theta[:, 0][None, :])**2, W, col_idx))
res[:, 4] = custom_mean(theta_bound[:, -P:] - theta_bound[:, :P], W, col_idx)
res[:, 5] = custom_mean(covered, W, col_idx)
res[:, 6] = np.mean(W, axis=0)

res = np.round(res, 2)

#pd.DataFrame(res) # full table with 0 parameters
pd.DataFrame(np.concatenate((res[:10, :], res[5000:5010, :])))

# column headings
#$\theta$   $\bar{\hat{\theta}}$ 	$\overline{\sigma_{\hat{\theta}}}$	$RMSE$ 	$\overline{HPD}_{95\%}$	$Coverage_{95\%}$  $p_{|\hat{\theta}| > 0}$

Unnamed: 0,0,1,2,3,4,5,6
0,-0.71,-0.84,0.2,0.23,0.78,0.92,0.13
1,1.31,1.24,0.15,0.16,0.53,0.92,1.0
2,1.37,1.32,0.13,0.14,0.53,0.94,1.0
3,0.91,0.83,0.13,0.15,0.56,0.95,0.9
4,0.4,0.52,0.11,0.16,0.63,0.91,0.06
5,-0.19,-0.0,0.0,0.19,0.01,0.0,0.0
6,0.99,0.95,0.15,0.16,0.55,0.94,0.96
7,1.1,1.06,0.15,0.16,0.54,0.9,0.99
8,-1.36,-1.33,0.25,0.25,0.88,0.92,0.92
9,-1.16,-1.14,0.2,0.2,0.83,0.97,0.76


In [7]:
# Evaluating identification
# split the bounds at P (instead of a hard-coded 10000) so the cell follows the simulation settings
theta_est_lower = theta_bound[:, :P]
theta_est_upper = theta_bound[:, P:]

# a covariate counts as identified in a run when both HPD95% bounds have the same sign
identified = np.sign(theta_est_lower) == np.sign(theta_est_upper)
n_identified = np.sum(identified, axis=1)                                      # per run
n_falsely_identified = np.sum(identified * np.squeeze(theta == 0)[None, :], axis=1)  # per run, true theta == 0

pd.DataFrame(np.concatenate((
    np.round(np.mean(n_identified))[None, None],
    np.round(np.sqrt(np.var(n_identified)))[None, None],
    np.round(np.mean(n_falsely_identified))[None, None],
), axis=1))

# column headings
# number of covariates identified       standard error        falsly identified

Unnamed: 0,0,1,2
0,15.0,3.0,2.0


In [8]:
# R-Cox Table
# =======================================================================================================================

# columns of res: true theta, mean estimate, sd of estimates, RMSE, (unused), (unused),
# proportion of runs with a non-zero estimate
res = np.zeros((P, 7))
res[:, 0] = theta[:, 0]

url = 'https://raw.githubusercontent.com/alexwjung/ProbCox/main/paper/ProbCox/out/simulation/sim_hd/' + R_suffix  + '.txt'

theta_est = pd.read_csv(url, header=None, sep=';')

theta_est = theta_est.dropna(axis=0)                    # discard rows with missing values
theta_est = theta_est.groupby(0).first().reset_index()  # one row per key in column 0
theta_est = np.asarray(theta_est.iloc[:, 1:])           # drop the key column
assert theta_est.shape[0] == 200


W = theta_est!=0 # non zero parameter estimates of the R fit (no interval estimate is involved here)
# true non-zero parameters with a non-zero estimate in more than 5 runs - only these are summarised over
# the runs with a non-zero estimate, all other parameters are summarised over all runs
col_idx = np.logical_and(np.squeeze(theta != 0), np.sum(W, axis=0) > 5)

res[:, 1] = custom_mean(theta_est, W, col_idx)
res[:, 2] = np.sqrt(custom_var(theta_est, W, col_idx))
res[:, 3] = np.sqrt(custom_mean((theta_est - theta[:, 0][None, :])**2, W, col_idx))

res[:, 6] = np.mean(W, axis=0)

res = np.round(res, 2)

# pd.DataFrame(res) # full table with 0 parameters
res = pd.DataFrame(np.concatenate((res[:10, :], res[5000:5010, :])))
# Interval width and coverage are not filled for the R fits. Replace the whole columns by label:
# writing a string into a float column through .iloc is deprecated in recent pandas versions.
res[4] = '-'
res[5] = '-'
res


# column headings
#$\theta$   $\bar{\hat{\theta}}$ 	$\overline{\sigma_{\hat{\theta}}}$	$RMSE$ 	$\overline{CI}_{95\%}$	$Coverage_{95\%}$ $p_{|\hat{\theta}| > 0}$

Unnamed: 0,0,1,2,3,4,5,6
0,-0.71,-0.09,0.08,0.63,-,-,0.32
1,1.31,0.61,0.14,0.71,-,-,1.0
2,1.37,0.67,0.14,0.72,-,-,1.0
3,0.91,0.25,0.13,0.67,-,-,0.92
4,0.4,0.09,0.06,0.31,-,-,0.07
5,-0.19,-0.0,0.0,0.19,-,-,0.0
6,0.99,0.34,0.14,0.66,-,-,0.98
7,1.1,0.43,0.15,0.69,-,-,1.0
8,-1.36,-0.33,0.13,1.04,-,-,0.98
9,-1.16,-0.22,0.12,0.96,-,-,0.98


In [9]:
# Evaluating identification
# a covariate counts as identified in a run when its estimate is non-zero
nonzero_est = theta_est != 0
n_nonzero = np.sum(nonzero_est, axis=1)                                      # per run
n_false_nonzero = np.sum(nonzero_est * np.squeeze(theta == 0)[None, :], axis=1)  # per run, true theta == 0

identification_summary = (
    np.round(np.mean(n_nonzero)),
    np.round(np.sqrt(np.var(n_nonzero))),
    np.round(np.mean(n_false_nonzero)),
)
pd.DataFrame(np.concatenate([entry[None, None] for entry in identification_summary], axis=1))

# column headings
# number of covariates identified       standard error        falsly identified


Unnamed: 0,0,1,2
0,15.0,4.0,1.0
