In [2]:
import sys, os
from joblib import Parallel, delayed
import numpy as np
import matplotlib.pyplot as plt
import math
import scipy
from scipy.integrate import quad, nquad
from sklearn.preprocessing import normalize
from sktree.tree import DecisionTreeClassifier
from sktree.ensemble import HonestForestClassifier
from sktree.stats import FeatureImportanceForestClassifier
from sklearn.metrics import roc_auc_score
from scipy.stats import entropy
from sklearn.model_selection import StratifiedKFold

In [2]:
n_jobs = -1
test_size = 0.2
n_estimators = 500

### Calculate one dimention posterior with max_features = all, depth = 1
clf_Pertree = FeatureImportanceForestClassifier(
    estimator=HonestForestClassifier(
        n_estimators=n_estimators,
        max_features=0.3,
        bootstrap= False,
        tree_estimator=DecisionTreeClassifier(),
        # max_depth = 1,
        honest_fraction=0.5,
        n_jobs=n_jobs,

    ),
    test_size=test_size,
    # permute_per_tree=True,
    # sample_dataset_per_tree=False,
    stratify = True
)



In [4]:
n_jobs = -1
test_size = 0.2
n_estimators = 500

### Calculate one dimention posterior with max_features = all, depth = 1
clf_default = FeatureImportanceForestClassifier(
    estimator=HonestForestClassifier(
        n_estimators=n_estimators,
        # max_features=1/3,
        # bootstrap= False,
        # tree_estimator=DecisionTreeClassifier(),
        # # max_depth = 1,
        # honest_fraction=0.5,
        # n_jobs=n_jobs,

    ),
    # test_size=test_size,
    # permute_per_tree=True,
    # # sample_dataset_per_tree=False,
    # stratify = True
)



In [4]:
def statistcs_Reps_Pertree_Linear(clf,n=100,p=4096,ratio=0.5,metric = 'mi',reps = 1):
    clf.reset()
    # coeffs = np.array([np.exp(-0.0022 * (i + 30)) if i < 10 else 0 for i in range(p)])
    coeffs = np.array([1/(i+5) if i < 10 else 0 for i in range(p)])
    coeffs_noise = np.array([1 if i >=10 else 0 for i in range(p)])
    # print(coeffs)

    x_1 = np.random.normal(size=(n, p))
    noise = np.random.normal(size=(n, p))

    x_2 = x_1 * coeffs + noise*coeffs_noise
    x =  np.nan_to_num(np.float32(np.vstack((x_1,x_2))))
    y = np.array([0]*n+[1]*n).reshape(-1,1)

    if metric == 'auc':
        stats,pos,samples = clf.statistic(x, y, metric=metric, return_posteriors=True,max_fpr = 0.1)
    else:
        stats,pos,samples = clf.statistic(x, y, metric=metric,return_posteriors=True)
    clf.reset()
    POS = pos[:,:,0].reshape((n_estimators,2*n))

    np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Posterior_linear_new_dimension_n1024_{}_{}_{}.csv".format(metric,p, reps), POS, delimiter=",")
    np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Samples_linear_new_dimension_n1024_{}_{}_{}.csv".format(metric,p, reps), samples, delimiter=",")
    #POSs.append(POS)
    clf.reset()
    return stats

In [3]:
DIMENSIONS = [2**i for i in range(4, 13)]
print(len((DIMENSIONS)))
print(DIMENSIONS)
REPs = 10
print([int(0.3*i) for i in DIMENSIONS])
print([int(i/3) for i in DIMENSIONS])

9
[16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
[4, 9, 19, 38, 76, 153, 307, 614, 1228]
[5, 10, 21, 42, 85, 170, 341, 682, 1365]


In [6]:
Stats_Paucs_Dim_Linear = np.zeros((REPs,len(DIMENSIONS)))

for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_linear =[]
    for dim_i in DIMENSIONS:
        stat = statistcs_Reps_Pertree_Linear(clf_Pertree,n=1024,p = dim_i,ratio=0.5,metric = 'auc',reps = i)
        stats_paucs_samplesize_linear.append(stat) 
    print(stats_paucs_samplesize_linear)
    np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/linear_new_pAUC_dimension_n1024_default_{}.csv".format(i), stats_paucs_samplesize_linear, delimiter=",")
    Stats_Paucs_Dim_Linear[i,:] = stats_paucs_samplesize_linear

0
[0.9999799226459705, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9999949806614926, 0.999974903307463]
1
[1.0, 1.0, 0.9998694971988076, 1.0, 0.9999899613229852, 0.9999799226459705, 1.0, 1.0, 1.0]
2
[1.0, 1.0, 0.9999799226459705, 1.0, 0.9999949806614926, 1.0, 1.0, 0.9999899613229852, 1.0]
3


In [16]:
Stats_Paucs_Dim_Linear = np.zeros((REPs,len(DIMENSIONS)))
for i in range(REPs):
    pauc = np.array(np.genfromtxt('/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/linear_new_pAUC_dimension_n1024_{}.csv'.format(i),delimiter=','))
    Stats_Paucs_Dim_Linear[i,:] = pauc
print(Stats_Paucs_Dim_Linear)


Stats_MI_Dim_Linear = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_Dim_Linear = np.zeros((REPs,len(DIMENSIONS)))
for i in range(REPs):
    for samp in range(len(DIMENSIONS)):
        pos = np.genfromtxt('/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Posterior_linear_new_dimension_n1024_auc_{}_{}.csv'.format(DIMENSIONS[samp],i),delimiter=',')
        posterior_forest_0 = np.nanmean(pos, axis=0)
        posterior_forest_1 = np.ones(posterior_forest_0.shape)-posterior_forest_0
        posterior_forest = np.hstack((posterior_forest_0.reshape(-1,1),posterior_forest_1.reshape(-1,1)))

        stats_conen = np.mean(entropy(posterior_forest, base=np.exp(1), axis=1))

        H_Y = entropy([50,50], base=np.exp(1))
        stats_mi = H_Y - stats_conen

        Stats_Conen_Dim_Linear[i,samp] = stats_conen
        Stats_MI_Dim_Linear[i,samp] = stats_mi


[[1.         1.         0.99997992 1.         1.         0.99999498
  1.         1.         1.        ]
 [1.         0.99998996 0.99999498 1.         0.99999498 0.9999749
  0.99985946 1.         1.        ]
 [1.         1.         0.99998996 1.         1.         1.
  1.         1.         1.        ]
 [0.9999749  1.         0.99999498 1.         1.         1.
  1.         1.         1.        ]
 [1.         1.         1.         1.         1.         1.
  1.         1.         0.99994981]
 [1.         1.         1.         1.         1.         1.
  1.         0.99992471 1.        ]
 [1.         0.99999498 1.         1.         0.99999498 1.
  0.99998996 1.         1.        ]
 [1.         1.         0.99998996 1.         0.99999498 1.
  1.         0.99981428 0.99995985]
 [1.         1.         1.         1.         1.         1.
  1.         1.         1.        ]
 [0.99998494 1.         1.         1.         1.         1.
  0.99999498 1.         1.        ]]


In [17]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Linear_new_n1024_MIGHT.csv", Stats_Paucs_Dim_Linear, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Linear_new_n1024_MIGHT.csv", Stats_Conen_Dim_Linear, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Linear_new_n1024_MIGHT.csv", Stats_MI_Dim_Linear, delimiter=",")
Stats_MI_Dim_Linear

array([[0.57217262, 0.56604898, 0.55762191, 0.5599915 , 0.54908416,
        0.54551059, 0.55290321, 0.55230891, 0.54284667],
       [0.56478865, 0.55141957, 0.55643318, 0.55539923, 0.54493629,
        0.5375008 , 0.54340459, 0.53486529, 0.5407128 ],
       [0.57657585, 0.5565153 , 0.55832861, 0.55863309, 0.55416   ,
        0.55676446, 0.54758068, 0.53635601, 0.54138334],
       [0.56215426, 0.55630777, 0.55866426, 0.54411928, 0.54975863,
        0.54832448, 0.53731837, 0.54717955, 0.54693804],
       [0.56355294, 0.5598885 , 0.55217218, 0.55909925, 0.54959388,
        0.54727005, 0.54270661, 0.54193557, 0.54695437],
       [0.56472631, 0.56058816, 0.55433641, 0.56082533, 0.5507344 ,
        0.55757036, 0.55112946, 0.55037445, 0.54392623],
       [0.57221459, 0.56968767, 0.55767495, 0.55271065, 0.55566445,
        0.53755536, 0.55584089, 0.53619125, 0.55568073],
       [0.56601452, 0.55585176, 0.55587727, 0.56092691, 0.55309942,
        0.54589954, 0.54237124, 0.54915758, 0.53832338],


In [18]:
Stats_Paucs_Dim_Linear

array([[1.        , 1.        , 0.99997992, 1.        , 1.        ,
        0.99999498, 1.        , 1.        , 1.        ],
       [1.        , 0.99998996, 0.99999498, 1.        , 0.99999498,
        0.9999749 , 0.99985946, 1.        , 1.        ],
       [1.        , 1.        , 0.99998996, 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        ],
       [0.9999749 , 1.        , 0.99999498, 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        ],
       [1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 0.99994981],
       [1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 0.99992471, 1.        ],
       [1.        , 0.99999498, 1.        , 1.        , 0.99999498,
        1.        , 0.99998996, 1.        , 1.        ],
       [1.        , 1.        , 0.99998996, 1.        , 0.99999498,
        1.        , 1.        , 0.99981428, 0.99995985],


In [None]:
##Add KNN for linear
### add KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
REPs = 10
n = 1024

Stats_MI_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        neigh = KNeighborsClassifier(n_neighbors=int(np.sqrt(n))+1)
        # coeffs = np.array([np.exp(-0.0022 * (i + 30)) if i < 10 else 0 for i in range(p)])
        coeffs = np.array([1/(i+5) if i < 10 else 0 for i in range(p)])
        coeffs_noise = np.array([1 if i >=10 else 0 for i in range(p)])

        x_1 = np.random.normal(size=(n, p))
        noise = np.random.normal(size=(n, p))
        x_2 = x_1 * coeffs +noise*coeffs_noise
        x = np.vstack((x_1,x_2))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
        
        neigh.fit(x, y)
        posterior = neigh.predict_proba(x)

        stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
        # print(stats_conen)

        # _, counts = np.unique(y_true_final, return_counts=True)
        H_Y = entropy([50,50], base=np.exp(1))
        # print(H_Y)
        stats_mi = H_Y - stats_conen

        pauc = roc_auc_score(
                y, posterior[:,1], max_fpr=0.1
            )
        print(stats_conen,stats_mi,pauc)
        
        stats_paucs_samplesize_mu0.append(pauc)
        stats_mi_samplesize_mu0.append(stats_mi)
        stats_conen_samplesize_mu0.append(stats_conen)

    Stats_Pauc_DIM_Mu2_KNN[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_KNN[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_KNN[i,:] = stats_conen_samplesize_mu0
    

0
0.667355999091981 0.02579118146796433 0.5389776260589042
0.664655012921102 0.028492167638843324 0.5295779579564145
0.6707785377922386 0.02236864276770667 0.5333440541788368
0.6714264753714896 0.021720705188455702 0.5257729538505489
0.6692426480342681 0.023904532525677147 0.5208203193836642
0.6739503639879922 0.019196816571953046 0.5164019594267261
0.6719292679576261 0.0212179126023192 0.5291123004565649
0.6709370002346018 0.022210180325343498 0.5301684998629386
1
0.6690200288651361 0.024127151694809168 0.5193685182652676
0.6679290372700601 0.025218143289885142 0.5355694993097979
0.6707248346355088 0.022422345924436482 0.5432017741110846
0.6739414495378016 0.019205731022143735 0.5315295972322163
0.6665624468311143 0.026584733728830967 0.5440084246239082
0.6730081874174534 0.0201389931424919 0.5366956592970091
0.6707073269146357 0.022439853645309538 0.5473600688733553
0.6691538666585605 0.023993313901384816 0.530314028173461
2
0.6695584138727455 0.02358876668719978 0.5075709091989618
0

In [16]:
#### Add CV KNN

##Add KNN for linear
### add KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
REPs = 20
n = 1024

Stats_MI_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        neigh = KNeighborsClassifier(n_neighbors=int(np.sqrt(n))+1)
        # coeffs = np.array([np.exp(-0.0022 * (i + 30)) if i < 10 else 0 for i in range(p)])
        coeffs = np.array([1/(i+5) if i < 10 else 0 for i in range(p)])
        coeffs_noise = np.array([1 if i >=10 else 0 for i in range(p)])

        x_1 = np.random.normal(size=(n, p))
        noise = np.random.normal(size=(n, p))
        x_2 = x_1 * coeffs +noise*coeffs_noise
        x = np.vstack((x_1,x_2))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
        
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        conen_lst = []
        mi_lst = []
        pAUC_lst = []
        for train_ix, test_ix in cv.split(x,y):
            X_train, X_test = x[train_ix, :], x[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            neigh.fit(X_train,y_train)
            posterior = neigh.predict_proba(X_test)
 
            stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
            H_Y = entropy([50,50], base=np.exp(1))
            stats_mi = H_Y - stats_conen

            pauc = roc_auc_score(
                    y_test, posterior[:,1], max_fpr=0.1
                )
            print(stats_conen,stats_mi,pauc)
            conen_lst.append(stats_conen)
            mi_lst.append(stats_mi)
            pAUC_lst.append(pauc)

        stats_paucs_samplesize_mu0.append(np.mean(pAUC_lst))
        stats_mi_samplesize_mu0.append(np.mean(mi_lst))
        stats_conen_samplesize_mu0.append(np.mean(conen_lst))

    Stats_Pauc_DIM_Mu2_KNN[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_KNN[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_KNN[i,:] = stats_conen_samplesize_mu0
    



0
0.22817900636024807 0.4649681741996972 0.8526708583692209
0.24123466109880778 0.45191251946113753 0.9297666859268663
0.2449065453815966 0.44824063517834867 0.911333061982738
0.2492352018140857 0.4439119787458596 0.9086644044652521
0.2361017368814372 0.4570454436785081 0.9042966095295628
0.38887653502148173 0.30427064553846356 0.7679733241491593
0.36484435723915243 0.32830282332079286 0.6919042790704395
0.37568759758018083 0.31745958297976445 0.7662197659115275
0.36658337667716034 0.32656380388278494 0.7356488648090815
0.3736519610959949 0.3194952194639504 0.7061986626058212
0.5233980395037601 0.16974914105618522 0.558580418923573
0.5309155963056775 0.16223158425426776 0.5779376311093021
0.5067805305107496 0.18636665004919573 0.5690161105718888
0.5164706425165578 0.17667653804338745 0.599340741859431
0.5229884222443234 0.17015875831562188 0.5548440685645246
0.5913846191714697 0.10176256138847561 0.542230893265287
0.6047959862257942 0.08835119433415106 0.5280680135794573
0.580106076234

In [18]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Linear_new_n1024_KNN.csv", Stats_Pauc_DIM_Mu2_KNN, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Linear_new_n1024_KNN.csv", Stats_Conen_DIM_Mu2_KNN, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Linear_new_n1024_KNN.csv", Stats_MI_DIM_Mu2_KNN, delimiter=",")
Stats_Pauc_DIM_Mu2_KNN

array([[0.90134632, 0.73358898, 0.57194379, 0.53872654, 0.51346258,
        0.50523855, 0.49691842, 0.50286412, 0.50002636],
       [0.87792918, 0.69521983, 0.57780534, 0.53421349, 0.51629664,
        0.50709156, 0.49614418, 0.50331296, 0.49939652],
       [0.91465832, 0.70614572, 0.57006501, 0.53468483, 0.51813555,
        0.50639752, 0.50577453, 0.50188358, 0.50926027],
       [0.85803607, 0.70161084, 0.56411742, 0.53386352, 0.51293802,
        0.50238507, 0.5046607 , 0.50471052, 0.4978877 ],
       [0.84756845, 0.70391078, 0.56357073, 0.53581848, 0.50788051,
        0.50842   , 0.49621065, 0.49968329, 0.49697381],
       [0.90042923, 0.6965836 , 0.56426675, 0.53989134, 0.50663161,
        0.50889838, 0.50586448, 0.50344133, 0.49876784],
       [0.87248125, 0.69577216, 0.56005762, 0.52553173, 0.5143024 ,
        0.50254968, 0.4961623 , 0.50292646, 0.49624551],
       [0.90677497, 0.70857556, 0.59117443, 0.5300113 , 0.51473419,
        0.50688769, 0.49680281, 0.50497674, 0.4947798 ],


In [20]:
##Add Logistics Regression for linear
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
REPs = 50
n = 1024

Stats_MI_DIM_Mu2_LG = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_LG = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_LG = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        # coeffs = np.array([np.exp(-0.0022 * (i + 30)) if i < 10 else 0 for i in range(p)])
        coeffs = np.array([1/(i+5) if i < 10 else 0 for i in range(p)])
        coeffs_noise = np.array([1 if i >=10 else 0 for i in range(p)])

        x_1 = np.random.normal(size=(n, p))
        noise = np.random.normal(size=(n, p))
        x_2 = x_1* coeffs + noise*coeffs_noise

        # x_2 = x_1 * coeffs + noise*coeffs_noise
        x = np.float32(np.vstack((x_1,x_2)))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
        
        clf = LogisticRegression(penalty='l1',solver = 'liblinear')
        
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        conen_lst = []
        mi_lst = []
        pAUC_lst = []
        for train_ix, test_ix in cv.split(x,y):
            X_train, X_test = x[train_ix, :], x[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            clf.fit(X_train,y_train)
            posterior = clf.predict_proba(X_test)
 
            stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
            H_Y = entropy([50,50], base=np.exp(1))
            stats_mi = H_Y - stats_conen

            pauc = roc_auc_score(
                    y_test, posterior[:,1], max_fpr=0.1
                )
            
            conen_lst.append(stats_conen)
            mi_lst.append(stats_mi)
            pAUC_lst.append(pauc)

        stats_paucs_samplesize_mu0.append(np.mean(pAUC_lst))
        stats_mi_samplesize_mu0.append(np.mean(mi_lst))
        stats_conen_samplesize_mu0.append(np.mean(conen_lst))
        print(np.mean(conen_lst),np.mean(mi_lst),np.mean(pAUC_lst))

    Stats_Pauc_DIM_Mu2_LG[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_LG[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_LG[i,:] = stats_conen_samplesize_mu0
    

0
0.6896734109149981 0.0034737696449471224 0.47516221178975365
0.6846519514827383 0.008495229077207034 0.48182429864772863
0.6720831862177139 0.02106399434223143 0.487433090898152
0.6559842524074541 0.037162928152491205 0.4992693793131392
0.6126569222113893 0.08049025834855592 0.4939068899108533
0.488300277720619 0.2048469028393262 0.503249822424978
0.1313801755438646 0.5617670050160808 0.49927404506998324
0.2149263098131362 0.478220870746809 0.49588842456422755
0.27277242146188263 0.4203747590980626 0.4983653889245983
1
0.688306084035853 0.004841096524092192 0.48452233393435645
0.6883042890966828 0.0048428914632624 0.47956834382207747
0.6731883004287628 0.019958880131182456 0.49625591430723637
0.6605050373902942 0.032642143169651125 0.4915828518702011
0.6056028328682501 0.08754434769169514 0.4991777217477679
0.4753235932882931 0.2178235872716522 0.4963577014762823
0.13609551797630037 0.5570516625836449 0.4989958186223862
0.2054426247138342 0.4877045558461111 0.5008594201323724
0.27354

In [21]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Linear_new_n1024_LG.csv", Stats_Pauc_DIM_Mu2_LG, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Linear_new_n1024_LG.csv", Stats_Conen_DIM_Mu2_LG, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Linear_new_n1024_LG.csv", Stats_MI_DIM_Mu2_LG, delimiter=",")
Stats_MI_DIM_Mu2_LG

array([[0.00347377, 0.00849523, 0.02106399, 0.03716293, 0.08049026,
        0.2048469 , 0.56176701, 0.47822087, 0.42037476],
       [0.0048411 , 0.00484289, 0.01995888, 0.03264214, 0.08754435,
        0.21782359, 0.55705166, 0.48770456, 0.41960311],
       [0.00337674, 0.00595761, 0.01658854, 0.04054336, 0.09044397,
        0.20265756, 0.55608037, 0.47094287, 0.42155608],
       [0.00415028, 0.00741588, 0.02155321, 0.0472925 , 0.08794381,
        0.23055536, 0.56036579, 0.48246785, 0.42529127],
       [0.00406875, 0.00747311, 0.01697047, 0.03858683, 0.08416829,
        0.19708099, 0.55080304, 0.47941873, 0.41842539],
       [0.00544506, 0.00779335, 0.01529249, 0.03965597, 0.09165209,
        0.20518437, 0.55937398, 0.48823227, 0.42823658],
       [0.00489588, 0.00567521, 0.01577966, 0.04356635, 0.08415625,
        0.2030835 , 0.55964344, 0.47293924, 0.42671305],
       [0.00321174, 0.00615031, 0.01415291, 0.03849765, 0.09926063,
        0.21284491, 0.5621859 , 0.47154525, 0.42550392],


In [22]:
### Add SVM to the Linear
##Add Logistics Regression for linear
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_val_predict

REPs = 20
n = 1024

Stats_MI_DIM_Mu2_SVM = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_SVM = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_SVM = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        
        # coeffs = np.array([np.exp(-0.0022 * (i + 30)) if i < 10 else 0 for i in range(p)])
        coeffs = np.array([1/(i+5) if i < 10 else 0 for i in range(p)])
        
        coeffs_noise = np.array([1 if i >=10 else 0 for i in range(p)])

        x_1 = np.random.normal(size=(n, p))
        noise = np.random.normal(size=(n, p))

        x_2 = x_1 * coeffs + noise*coeffs_noise

        x = np.vstack((x_1,x_2))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
        
        model = SVC(probability=True,kernel = 'rbf')  # Set probability=True to enable probability estimates
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        conen_lst = []
        mi_lst = []
        pAUC_lst = []
        for train_ix, test_ix in cv.split(x,y):
            X_train, X_test = x[train_ix, :], x[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            model.fit(X_train,y_train)
            posterior = model.predict_proba(X_test)
 
            stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
            H_Y = entropy([50,50], base=np.exp(1))
            stats_mi = H_Y - stats_conen

            pauc = roc_auc_score(
                    y_test, posterior[:,1], max_fpr=0.1
                )
            # print(stats_conen,stats_mi,pauc)
            conen_lst.append(stats_conen)
            mi_lst.append(stats_mi)
            pAUC_lst.append(pauc)
        print(np.mean(conen_lst),np.mean(mi_lst),np.mean(pAUC_lst))

        stats_paucs_samplesize_mu0.append(np.mean(pAUC_lst))
        stats_mi_samplesize_mu0.append(np.mean(mi_lst))
        stats_conen_samplesize_mu0.append(np.mean(conen_lst))

    Stats_Pauc_DIM_Mu2_SVM[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_SVM[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_SVM[i,:] = stats_conen_samplesize_mu0
    

0
0.02551002374198542 0.6676371568179599 0.9986457026844677
0.17649708233672806 0.5166500982232172 0.9432175550329089
0.4919857111461151 0.20116146941383017 0.716768300356869
0.642726541238825 0.05042063932112031 0.5622227174472908
0.6868234323050193 0.006323748254925965 0.5183777409019277
0.6894652452230884 0.003681935336856945 0.5124618681813187
0.6918670766312968 0.0012801039286485238 0.497223997460846
0.6916402839785324 0.0015068965814128532 0.49465009586288566
0.692869860111569 0.0002773204483762726 0.49731277174090793
1
0.019304144238873378 0.6738430363210719 0.9994974488746624
0.16454864800555544 0.5285985325543898 0.9453238372903249
0.49514752008206997 0.1979996604778753 0.6917745162500331
0.6556062051437881 0.037540975416157155 0.544001156616565
0.6881967653453464 0.00495041521459898 0.5132148967793386
0.6883972550636523 0.004749925496292984 0.5139811859494416
0.6927089482094887 0.0004382323504565688 0.5016214886342777
0.6919047094080295 0.001242471151915825 0.5012317598068377

In [24]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Linear_new_n1024_SVM.csv", Stats_Pauc_DIM_Mu2_SVM, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Linear_new_n1024_SVM.csv", Stats_Conen_DIM_Mu2_SVM, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Linear_new_n1024_SVM.csv", Stats_MI_DIM_Mu2_SVM, delimiter=",")
Stats_Pauc_DIM_Mu2_SVM

array([[0.9986457 , 0.94321756, 0.7167683 , 0.56222272, 0.51837774,
        0.51246187, 0.497224  , 0.4946501 , 0.49731277],
       [0.99949745, 0.94532384, 0.69177452, 0.54400116, 0.5132149 ,
        0.51398119, 0.50162149, 0.50123176, 0.50504098],
       [0.99873349, 0.9416773 , 0.68316128, 0.55003975, 0.51867463,
        0.50099313, 0.49758584, 0.49784432, 0.49222335],
       [0.99870605, 0.9516282 , 0.68732437, 0.57474457, 0.51213692,
        0.50899791, 0.50129924, 0.50277999, 0.49809705],
       [0.9994228 , 0.93570833, 0.68097943, 0.57308392, 0.5309231 ,
        0.50977194, 0.51157366, 0.49995144, 0.5050126 ],
       [0.99933562, 0.94426539, 0.68065878, 0.56997745, 0.53565369,
        0.51839205, 0.50117586, 0.50154245, 0.49295458],
       [0.99892025, 0.94935333, 0.67840528, 0.55002723, 0.51848303,
        0.49979237, 0.4978769 , 0.50369227, 0.50029259],
       [0.99964896, 0.9437878 , 0.6875113 , 0.5479541 , 0.52148065,
        0.50210721, 0.50474643, 0.50135998, 0.49914871],


In [20]:
##Add RF for linear
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
REPs = 10
n = 1024

Stats_MI_DIM_Mu2_RF = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_RF = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_RF = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        # coeffs = np.array([np.exp(-0.0022 * (i + 30)) if i < 10 else 0 for i in range(p)])
        coeffs = np.array([1/(i+5) if i < 10 else 0 for i in range(p)])
        coeffs_noise = np.array([1 if i >=10 else 0 for i in range(p)])

        x_1 = np.random.normal(size=(n, p))
        noise = np.random.normal(size=(n, p))
        x_2 = x_1* coeffs + noise*coeffs_noise

        # x_2 = x_1 * coeffs + noise*coeffs_noise
        x = np.float32(np.vstack((x_1,x_2)))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
        
        clf = RandomForestClassifier(n_estimators = 500)
        
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        conen_lst = []
        mi_lst = []
        pAUC_lst = []
        for train_ix, test_ix in cv.split(x,y):
            X_train, X_test = x[train_ix, :], x[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            clf.fit(X_train,y_train)
            posterior = clf.predict_proba(X_test)
 
            stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
            H_Y = entropy([50,50], base=np.exp(1))
            stats_mi = H_Y - stats_conen

            pauc = roc_auc_score(
                    y_test, posterior[:,1], max_fpr=0.1
                )
            
            conen_lst.append(stats_conen)
            mi_lst.append(stats_mi)
            pAUC_lst.append(pauc)

        stats_paucs_samplesize_mu0.append(np.mean(pAUC_lst))
        stats_mi_samplesize_mu0.append(np.mean(mi_lst))
        stats_conen_samplesize_mu0.append(np.mean(conen_lst))
        print(np.mean(conen_lst),np.mean(mi_lst),np.mean(pAUC_lst))

    Stats_Pauc_DIM_Mu2_RF[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_RF[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_RF[i,:] = stats_conen_samplesize_mu0
    

0
0.07797750672104886 0.6151696738388963 1.0
0.123043218983153 0.5701039615767922 1.0
0.15388744908031643 0.5392597314796289 1.0
0.23563877989903453 0.4575084006609108 1.0
0.3278976569097354 0.3652495236502099 1.0
0.4334757657940546 0.25967141476589073 1.0
0.5253595976580281 0.16778758290191728 1.0
0.5948482060239425 0.09829897453600264 1.0
0.6404527491523102 0.05269443140763512 1.0
1
0.07471141710347293 0.6184357634564724 1.0
0.11589551719030491 0.5772516633696403 1.0
0.1561509287032162 0.5369962518567292 1.0
0.22869142409821608 0.4644557564617292 1.0


KeyboardInterrupt: 

In [21]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Linear_new_n1024_RF.csv", np.ones((10,Stats_Pauc_DIM_Mu2_RF)), delimiter=",")

## LOGARITHM

In [4]:
def statistcs_Reps_Pertree_Logarithm(clf,n=100,p=4096,ratio=0.5,metric = 'mi',reps = 1):
    clf.reset()
    # coeffs = np.array([np.exp(-0.072 * (i + 10)) if i < 10 else 0 for i in range(p)])
    # coeffs = np.array([10/(5+i) if i < 10 else 0 for i in range(p)])
    coeffs = np.array([np.exp(-0.2* (i + 1)) if i < 10 else 0 for i in range(p)])
    # coeffs = np.array([100/(i+1) if i < 10 else 0 for i in range(p)])
    # coeffs = np.array([1+100 if i < 10 else 0 for i in range(p)])
    coeffs_noise = np.array([1 if i >=10 else 0 for i in range(p)])
    # print(coeffs)

    x_1 = np.random.normal(size=(n, p))
    noise = np.random.normal(size=(n, p))

    x_2 = np.log((x_1 * coeffs+1)**2) + noise*coeffs_noise
    x = np.nan_to_num(np.float32(np.vstack((x_1,x_2))))
    y = np.array([0]*n+[1]*n).reshape(-1,1)

    if metric == 'auc':
        stats,pos,samples = clf.statistic(x, y, metric=metric, return_posteriors=True,max_fpr = 0.1)
    else:
        stats,pos,samples = clf.statistic(x, y, metric=metric,return_posteriors=True)
    clf.reset()
    POS = pos[:,:,0].reshape((n_estimators,2*n))

    np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Posterior_log_dimension_newcoef_n1024_{}_{}_{}.csv".format(metric,p, reps), POS, delimiter=",")
    np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Samples_log_dimension_newcoef_n1024_{}_{}_{}.csv".format(metric,p, reps), samples, delimiter=",")
    #POSs.append(POS)
    clf.reset()
    return stats

In [25]:
statistcs_Reps_Pertree_Logarithm(clf_Pertree,n=1024,p = 4096,ratio=0.5,metric = 'mi',reps = 1)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


0.37766493496896536

In [5]:
REPs = 10
Stats_Paucs_Samplesize_Log = np.zeros((REPs,len(DIMENSIONS)))

for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_log =[]
    for dim_i in DIMENSIONS:
        stat = statistcs_Reps_Pertree_Logarithm(clf_Pertree,n=1024,p = dim_i,ratio=0.5,metric = 'auc',reps = i)
        stats_paucs_samplesize_log.append(stat) 
    print(stats_paucs_samplesize_log)
    print(stat)
    np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/log_pAUC_dimension_newcoef_n1024_{}.csv".format(i), stats_paucs_samplesize_log, delimiter=",")
    Stats_Paucs_Samplesize_Log[i,:] = stats_paucs_samplesize_log

0


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


[0.8723772952431127, 0.8679934049907483, 0.8407012537906045, 0.840537623355263, 0.8563676131399054, 0.8409000195954975, 0.8385047912597656, 0.8404101321571752, 0.8519606339304071]
0.8519606339304071
1


In [None]:
REPs = 10
Stats_Paucs_Dim_Log = np.zeros((REPs,len(DIMENSIONS)))
for i in range(REPs):
    pauc = np.array(np.genfromtxt('/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/log_pAUC_dimension_newcoef_n1024_{}.csv'.format(i),delimiter=','))
    Stats_Paucs_Dim_Log[i,:] = pauc
print(Stats_Paucs_Dim_Log)


Stats_MI_Dim_Log = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_Dim_Log = np.zeros((REPs,len(DIMENSIONS)))
for i in range(REPs):
    for samp in range(len(DIMENSIONS)):
        pos = np.genfromtxt('/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Posterior_log_dimension_newcoef_n1024_auc_{}_{}.csv'.format(DIMENSIONS[samp],i),delimiter=',')
        posterior_forest_0 = np.nanmean(pos, axis=0)
        posterior_forest_1 = np.ones(posterior_forest_0.shape)-posterior_forest_0
        posterior_forest = np.hstack((posterior_forest_0.reshape(-1,1),posterior_forest_1.reshape(-1,1)))

        stats_conen = np.mean(entropy(posterior_forest, base=np.exp(1), axis=1))

        H_Y = entropy([50,50], base=np.exp(1))
        stats_mi = H_Y - stats_conen

        Stats_Conen_Dim_Log[i,samp] = stats_conen
        Stats_MI_Dim_Log[i,samp] = stats_mi


[[0.84737497 0.8670578  0.82899114 0.84804354 0.84721836 0.82185063
  0.85697295 0.81871454 0.79338395]
 [0.88029078 0.85081522 0.82797221 0.85251377 0.83882402 0.8280224
  0.83612261 0.82556192 0.82560409]
 [0.83753907 0.82902728 0.85798083 0.84456514 0.82670633 0.82428099
  0.84452499 0.82545351 0.80831749]
 [0.85162133 0.86569957 0.86471377 0.85365014 0.84248312 0.83803297
  0.83113841 0.82361442 0.83149177]
 [0.86282851 0.83958395 0.82242283 0.83509164 0.83325155 0.83486577
  0.79766946 0.82691614 0.8256081 ]
 [0.85854701 0.86147429 0.85264828 0.84412545 0.82669429 0.85251377
  0.82531598 0.81856296 0.8093605 ]
 [0.84232953 0.84889482 0.85374049 0.81007526 0.83376352 0.82165086
  0.82967879 0.83179494 0.82811376]
 [0.85288218 0.8607505  0.81845755 0.83612161 0.82369072 0.84141099
  0.81508857 0.82718217 0.8220514 ]
 [0.86620853 0.86509022 0.8465277  0.82808464 0.84565634 0.80259544
  0.80573253 0.83329371 0.8020393 ]
 [0.84599364 0.84341672 0.84301015 0.82454701 0.84163686 0.815226

FileNotFoundError: /Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Posterior_log_dimension_newcoef_n1024_auc_4096_1.csv not found.

In [None]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Log_n1024_newcoef_MIGHT.csv",Stats_Paucs_Dim_Log , delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Log_n1024_newcoef_MIGHT.csv", Stats_Conen_Dim_Log, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Log_n1024_newcoef_MIGHT.csv", Stats_MI_Dim_Log, delimiter=",")
Stats_Paucs_Dim_Log

array([[0.84737497, 0.8670578 , 0.82899114, 0.84804354, 0.84721836,
        0.82185063, 0.85697295, 0.81871454, 0.79338395],
       [0.88029078, 0.85081522, 0.82797221, 0.85251377, 0.83882402,
        0.8280224 , 0.83612261, 0.82556192, 0.82560409],
       [0.83753907, 0.82902728, 0.85798083, 0.84456514, 0.82670633,
        0.82428099, 0.84452499, 0.82545351, 0.80831749],
       [0.85162133, 0.86569957, 0.86471377, 0.85365014, 0.84248312,
        0.83803297, 0.83113841, 0.82361442, 0.83149177],
       [0.86282851, 0.83958395, 0.82242283, 0.83509164, 0.83325155,
        0.83486577, 0.79766946, 0.82691614, 0.8256081 ],
       [0.85854701, 0.86147429, 0.85264828, 0.84412545, 0.82669429,
        0.85251377, 0.82531598, 0.81856296, 0.8093605 ],
       [0.84232953, 0.84889482, 0.85374049, 0.81007526, 0.83376352,
        0.82165086, 0.82967879, 0.83179494, 0.82811376],
       [0.85288218, 0.8607505 , 0.81845755, 0.83612161, 0.82369072,
        0.84141099, 0.81508857, 0.82718217, 0.8220514 ],


In [4]:
##Add KNN for Logarithm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
REPs = 20
n = 1024

Stats_MI_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        neigh = KNeighborsClassifier(n_neighbors=int(np.sqrt(n))+1)
        # coeffs = np.array([np.exp(-0.072 * (i + 10)) if i < 10 else 0 for i in range(p)])
        coeffs = np.array([np.exp(-0.2* (i + 1)) if i < 10 else 0 for i in range(p)])
        coeffs_noise = np.array([1 if i >=10 else 0 for i in range(p)])
        # print(coeffs)

        x_1 = np.random.normal(size=(n, p))
        noise = np.random.normal(size=(n, p))

        x_2 = np.log((x_1 * coeffs+1)**2) + noise*coeffs_noise
        x = np.vstack((x_1,x_2))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
        
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        conen_lst = []
        mi_lst = []
        pAUC_lst = []
        for train_ix, test_ix in cv.split(x,y):
            X_train, X_test = x[train_ix, :], x[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            neigh.fit(X_train,y_train)
            posterior = neigh.predict_proba(X_test)
 
            stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
            H_Y = entropy([50,50], base=np.exp(1))
            stats_mi = H_Y - stats_conen

            pauc = roc_auc_score(
                    y_test, posterior[:,1], max_fpr=0.1
                )
            print(stats_conen,stats_mi,pauc)
            conen_lst.append(stats_conen)
            mi_lst.append(stats_mi)
            pAUC_lst.append(pauc)

        stats_paucs_samplesize_mu0.append(np.mean(pAUC_lst))
        stats_mi_samplesize_mu0.append(np.mean(mi_lst))
        stats_conen_samplesize_mu0.append(np.mean(conen_lst))

    Stats_Pauc_DIM_Mu2_KNN[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_KNN[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_KNN[i,:] = stats_conen_samplesize_mu0
    

0
0.5789894942239981 0.11415768633594714 0.707132827722076
0.56974809421908 0.12339908634086527 0.7459845330160619
0.5895467180613837 0.10360046249856159 0.6980807163655719
0.5811970627446301 0.11195011781531516 0.6998414256588387
0.5782776954122684 0.11486948514767692 0.7446333282992272
0.6310977215069983 0.06204945905294701 0.6194342687971167
0.6315662808796343 0.061580899680310996 0.6140331651948369
0.6214467575651118 0.07170042299483348 0.6422868593255894
0.638074748205026 0.05507243235491932 0.6128046360244502
0.6408387326399739 0.05230844791997136 0.6233989122458502
0.662112082042147 0.031035098517798287 0.5946833310088265
0.6490065069501262 0.0441406736098191 0.6083471617771377
0.6557218362080839 0.0374253443518614 0.5686230000939291
0.6535722522322238 0.03957492832772147 0.5893034684990813
0.658375941424359 0.03477123913558633 0.5915925394548063
0.66548086052044 0.02766632003950531 0.5564086653820208
0.6597999330583069 0.03334724750163842 0.5682779914596816
0.6613576802070211 0

In [5]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Log_n1024_KNN.csv", Stats_Pauc_DIM_Mu2_KNN, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Log_n1024_KNN.csv", Stats_Conen_DIM_Mu2_KNN, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Log_n1024_KNN.csv", Stats_MI_DIM_Mu2_KNN, delimiter=",")
Stats_MI_DIM_Mu2_KNN

array([[0.11359537, 0.06054233, 0.03738946, 0.03040316, 0.02040724,
        0.018995  , 0.01520943, 0.01557351, 0.01706572],
       [0.11870045, 0.06133847, 0.03472933, 0.02442813, 0.01649276,
        0.01592594, 0.01717268, 0.01551238, 0.01529144],
       [0.11512878, 0.05648296, 0.03512449, 0.02132156, 0.01893827,
        0.01678065, 0.01598066, 0.01544409, 0.01630161],
       [0.11545646, 0.06906208, 0.03796354, 0.02986408, 0.02040889,
        0.01698199, 0.01622925, 0.01594399, 0.01438475],
       [0.12224223, 0.06017543, 0.03564782, 0.02192301, 0.01749752,
        0.01810589, 0.01656305, 0.01366462, 0.01414056],
       [0.11435649, 0.05909776, 0.03468104, 0.02143582, 0.01837027,
        0.0166998 , 0.01487778, 0.01559969, 0.01415787],
       [0.111274  , 0.06111748, 0.03331382, 0.02300266, 0.01838977,
        0.0178038 , 0.01502495, 0.01521669, 0.0147219 ],
       [0.1215614 , 0.05814393, 0.03427238, 0.02654342, 0.01998143,
        0.01890766, 0.01679007, 0.01878107, 0.01542144],


In [6]:
##Add Logistics Regression for Logarithm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
REPs = 20
n = 1024

Stats_MI_DIM_Mu2_LG = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_LG = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_LG = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        # coeffs = np.array([np.exp(-0.072 * (i + 10)) if i < 10 else 0 for i in range(p)])
        coeffs = np.array([np.exp(-0.2* (i + 1)) if i < 10 else 0 for i in range(p)])
        coeffs_noise = np.array([1 if i >=10 else 0 for i in range(p)])
        # print(coeffs)

        x_1 = np.random.normal(size=(n, p))
        noise = np.random.normal(size=(n, p))

        x_2 = np.log((x_1 * coeffs+1)**2) + noise*coeffs_noise
        x = np.vstack((x_1,x_2))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
        
        clf = LogisticRegression(penalty='l1',solver = 'liblinear')
        
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        conen_lst = []
        mi_lst = []
        pAUC_lst = []
        for train_ix, test_ix in cv.split(x,y):
            X_train, X_test = x[train_ix, :], x[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            clf.fit(X_train,y_train)
            posterior = clf.predict_proba(X_test)
 
            stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
            H_Y = entropy([50,50], base=np.exp(1))
            stats_mi = H_Y - stats_conen

            pauc = roc_auc_score(
                    y_test, posterior[:,1], max_fpr=0.1
                )
            
            conen_lst.append(stats_conen)
            mi_lst.append(stats_mi)
            pAUC_lst.append(pauc)

        stats_paucs_samplesize_mu0.append(np.mean(pAUC_lst))
        stats_mi_samplesize_mu0.append(np.mean(mi_lst))
        stats_conen_samplesize_mu0.append(np.mean(conen_lst))
        print(np.mean(conen_lst),np.mean(mi_lst),np.mean(pAUC_lst))

    Stats_Pauc_DIM_Mu2_LG[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_LG[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_LG[i,:] = stats_conen_samplesize_mu0
    

0
0.6421479478546928 0.05099923270525255 0.6022196724270346
0.6401354698160683 0.053011710743877075 0.5970700274849913
0.6265786404839087 0.0665685400760366 0.5795852264948687
0.6178794228492672 0.07526775771067815 0.5528943347275352
0.5497966811272542 0.14335049943269093 0.5538789935962487
0.4283826301270435 0.2647645504329018 0.530433933803961
0.1332329022575746 0.5599142783023707 0.5060480487006787
0.22505435638024562 0.46809282417969966 0.5295916419105292
0.25903483097676877 0.43411234958317657 0.5233133749445175
1
0.639525276969378 0.053621903590567246 0.603135511382298
0.6381768401089956 0.05497034045094973 0.5885321835926204
0.6312108662109346 0.06193631434901077 0.5813237120515885
0.6052426469073361 0.0879045336526092 0.5587911760715432
0.5469572743040303 0.14618990625591494 0.550969525854739
0.4359518189513357 0.25719536160860956 0.5403285920648982
0.13999645745875494 0.5531507231011903 0.5146205174201557
0.22419926476508306 0.4689479157948623 0.5192831432958291
0.273204860609

In [10]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Log_n1024_LG.csv", Stats_Pauc_DIM_Mu2_LG, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Log_n1024_LG.csv", Stats_Conen_DIM_Mu2_LG, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Log_n1024_LG.csv", Stats_MI_DIM_Mu2_LG, delimiter=",")
Stats_Pauc_DIM_Mu2_LG

array([[0.60221967, 0.59707003, 0.57958523, 0.55289433, 0.55387899,
        0.53043393, 0.50604805, 0.52959164, 0.52331337],
       [0.60313551, 0.58853218, 0.58132371, 0.55879118, 0.55096953,
        0.54032859, 0.51462052, 0.51928314, 0.52083267],
       [0.58825611, 0.57272546, 0.57865502, 0.56266443, 0.53913397,
        0.5310891 , 0.51673668, 0.51164223, 0.53088768],
       [0.59388221, 0.58069887, 0.57353982, 0.5666089 , 0.55347166,
        0.52262886, 0.51291181, 0.51494552, 0.51994943],
       [0.59227934, 0.59331188, 0.5722051 , 0.56928072, 0.55295517,
        0.53185232, 0.51426549, 0.51815747, 0.52135744],
       [0.59150292, 0.57384278, 0.57333023, 0.55343127, 0.54984182,
        0.5353781 , 0.51646079, 0.51407413, 0.52206921],
       [0.5808149 , 0.59627808, 0.58188041, 0.56918108, 0.54521174,
        0.52041195, 0.52125719, 0.52792474, 0.52962123],
       [0.59313815, 0.58783993, 0.57280926, 0.55777625, 0.5488428 ,
        0.50569849, 0.51998221, 0.51579279, 0.51843729],


In [8]:
### Add SVM to the Logarithm
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_val_predict

REPs = 20
n = 1024

Stats_MI_DIM_Mu2_SVM = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_SVM = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_SVM = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        
        # coeffs = np.array([np.exp(-0.072 * (i + 10)) if i <10 else 0 for i in range(p)])
        coeffs = np.array([np.exp(-0.2* (i + 1)) if i < 10 else 0 for i in range(p)])
        coeffs_noise = np.array([1 if i >= 10 else 0 for i in range(p)])
        # print(coeffs)

        x_1 = np.random.normal(size=(n, p))
        noise = np.random.normal(size=(n, p))

        x_2 = np.log((x_1 * coeffs+1)**2) + noise*coeffs_noise
        x = np.vstack((x_1,x_2))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
        
        model = SVC(probability=True,kernel = 'rbf')  # Set probability=True to enable probability estimates
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        conen_lst = []
        mi_lst = []
        pAUC_lst = []
        for train_ix, test_ix in cv.split(x,y):
            X_train, X_test = x[train_ix, :], x[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            model.fit(X_train,y_train)
            posterior = model.predict_proba(X_test)
 
            stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
            H_Y = entropy([50,50], base=np.exp(1))
            stats_mi = H_Y - stats_conen

            pauc = roc_auc_score(
                    y_test, posterior[:,1], max_fpr=0.1
                )
            # print(stats_conen,stats_mi,pauc)
            conen_lst.append(stats_conen)
            mi_lst.append(stats_mi)
            pAUC_lst.append(pauc)
        print(np.mean(conen_lst),np.mean(mi_lst),np.mean(pAUC_lst))

        stats_paucs_samplesize_mu0.append(np.mean(pAUC_lst))
        stats_mi_samplesize_mu0.append(np.mean(mi_lst))
        stats_conen_samplesize_mu0.append(np.mean(conen_lst))

    Stats_Pauc_DIM_Mu2_SVM[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_SVM[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_SVM[i,:] = stats_conen_samplesize_mu0
    

0
0.34364705600319595 0.3495001245567493 0.7886136501627797
0.528587387790561 0.1645597927693842 0.6640126491123705
0.6218516966976892 0.071295483862256 0.6016346724669391
0.6603625912406162 0.03278458931932915 0.5880979612484338
0.666339668576472 0.026807511983473288 0.559074866365971
0.6736532931195776 0.019493887440367775 0.5516182502306787
0.680523272298308 0.012623908261637284 0.5375540015311049
0.6803478058125839 0.012799374747361324 0.5289903731930169
0.6884028339309932 0.004744346628952023 0.5152875978657846
1
0.34373839417254626 0.3494087863873991 0.7631955277492819
0.5085406138718658 0.18460656668807948 0.675386229509807
0.6274518336442829 0.06569534691566233 0.601984849796395
0.6617844074093127 0.03136277315063276 0.5738435829574635
0.6684381925336205 0.024708988026324864 0.5577049142083961
0.6780766293911057 0.015070551168839662 0.5425139466225138
0.6811763009591909 0.011970879600754446 0.5329350250385385
0.6894601684935789 0.0036870120663662796 0.5177067313979038
0.6910994

In [11]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Log_n1024_SVM.csv", Stats_Pauc_DIM_Mu2_SVM, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Log_n1024_SVM.csv", Stats_Conen_DIM_Mu2_SVM, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Log_n1024_SVM.csv", Stats_MI_DIM_Mu2_SVM, delimiter=",")
Stats_Pauc_DIM_Mu2_SVM

array([[0.78861365, 0.66401265, 0.60163467, 0.58809796, 0.55907487,
        0.55161825, 0.537554  , 0.52899037, 0.5152876 ],
       [0.76319553, 0.67538623, 0.60198485, 0.57384358, 0.55770491,
        0.54251395, 0.53293503, 0.51770673, 0.52014164],
       [0.76085731, 0.68622515, 0.62860348, 0.5745467 , 0.54323689,
        0.53610756, 0.53086576, 0.52875543, 0.51086139],
       [0.79083762, 0.67043856, 0.62309679, 0.57217742, 0.54890278,
        0.54882997, 0.54140743, 0.51438686, 0.51022654],
       [0.80672796, 0.67284302, 0.59488523, 0.57946607, 0.54431082,
        0.55258738, 0.53592393, 0.52698803, 0.51446164],
       [0.79404023, 0.66966601, 0.59643555, 0.59517156, 0.55602856,
        0.54222719, 0.53653717, 0.5286359 , 0.52357748],
       [0.77479755, 0.64670877, 0.62065089, 0.58304795, 0.5659436 ,
        0.54933976, 0.53864579, 0.52812543, 0.52080872],
       [0.75808419, 0.67199809, 0.60809914, 0.58566802, 0.54476407,
        0.55348738, 0.52695991, 0.52970767, 0.5128258 ],


In [28]:
Stats_Pauc_DIM_Mu2_SVM

array([[0.55248178, 0.51964134, 0.51972739, 0.51605383, 0.5251785 ,
        0.49662908, 0.5195676 , 0.49743142],
       [0.53844626, 0.5246679 , 0.52838292, 0.51367001, 0.51545096,
        0.52154171, 0.49628813, 0.49420884],
       [0.51942936, 0.52718394, 0.53231875, 0.50479916, 0.50579108,
        0.50668973, 0.50649131, 0.5021205 ],
       [0.54385022, 0.52119885, 0.5251895 , 0.52726586, 0.51615186,
        0.49990068, 0.50062228, 0.49454938],
       [0.5323103 , 0.51143648, 0.51865866, 0.51911994, 0.52907327,
        0.49971549, 0.49746291, 0.49978773],
       [0.53253524, 0.51156831, 0.53960988, 0.52414572, 0.49868855,
        0.51847527, 0.49981228, 0.51065663],
       [0.53615203, 0.53501925, 0.50686916, 0.52985596, 0.52020223,
        0.51571537, 0.51839599, 0.49024205],
       [0.53387332, 0.53974092, 0.5191278 , 0.50103618, 0.51317219,
        0.51404648, 0.51770682, 0.50065627],
       [0.5359538 , 0.53582001, 0.5292102 , 0.51747147, 0.52752558,
        0.50286669, 0.517971

In [None]:
##Add RF for log
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
REPs = 10
n = 1024

Stats_MI_DIM_Mu2_RF = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_RF = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_RF = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i) 
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS[-1:]:
        print(p)
        # coeffs = np.array([np.exp(-0.072 * (i + 10)) if i <10 else 0 for i in range(p)])
        # coeffs = np.array([100/(i+1) if i < 10 else 0 for i in range(p)])
        # coeffs = np.array([10/(5+i) if i < 10 else 0 for i in range(p)])
        coeffs = np.array([np.exp(-0.2* (i + 1)) if i < 10 else 0 for i in range(p)]) 
        
        
        coeffs_noise = np.array([1 if i >= 10 else 0 for i in range(p)])
        # print(coeffs)

        x_1 = np.random.normal(size=(n, p))
        noise = np.random.normal(size=(n, p))

        x_2 = np.log((x_1 * coeffs+1)**2) + noise*coeffs_noise
        
        x = np.float32(np.vstack((x_1,x_2)))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
        
        clf = RandomForestClassifier(n_estimators = 500)
        
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        conen_lst = []
        mi_lst = []
        pAUC_lst = []
        for train_ix, test_ix in cv.split(x,y):
            X_train, X_test = x[train_ix, :], x[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            clf.fit(X_train,y_train)
            posterior = clf.predict_proba(X_test)
 
            stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
            H_Y = entropy([50,50], base=np.exp(1))
            stats_mi = H_Y - stats_conen

            pauc = roc_auc_score(
                    y_test, posterior[:,1], max_fpr=0.1
                )
            
            conen_lst.append(stats_conen)
            mi_lst.append(stats_mi)
            pAUC_lst.append(pauc)

        stats_paucs_samplesize_mu0.append(np.mean(pAUC_lst))
        stats_mi_samplesize_mu0.append(np.mean(mi_lst))
        stats_conen_samplesize_mu0.append(np.mean(conen_lst))
        print(np.mean(conen_lst),np.mean(mi_lst),np.mean(pAUC_lst))

    Stats_Pauc_DIM_Mu2_RF[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_RF[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_RF[i,:] = stats_conen_samplesize_mu0
    

0
16
0.45502526691484124 0.23812191364510404 0.8513253127555789
32
0.5339014775856814 0.15924570297426383 0.8652299919236283
64
0.5589782318201957 0.1341689487397496 0.8520613411334856
128
0.6128942262281152 0.08025295433183018 0.8442605503130846
256
0.6453527792424915 0.04779440131745387 0.8436838981587405
512
0.6715528062619912 0.021594374297954078 0.845021548451256
1024
0.6815027186615324 0.011644461898412884 0.7945745484864221
2048
0.6869668330621692 0.006180347497776029 0.7552489147191335
4096
0.6897322606032743 0.003414919956671003 0.6939162207242986
1
16
0.4587659492124561 0.23438123134748917 0.8432352872416313
32
0.5338979986054889 0.15924918195445634 0.871221162726415
64
0.561950040557599 0.1311971400023463 0.851729003643699
128
0.6128304246621608 0.08031675589778456 0.8703809725318884
256
0.6521841827587462 0.04096299780119894 0.8305374568996594
512
0.6714631766362776 0.02168400392366767 0.8338392902222859
1024
0.6787966674893544 0.01435051307059081 0.85027834111993
2048
0.68

In [None]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Log_n1024_newcoef_RF.csv", Stats_Pauc_DIM_Mu2_RF, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Log_n1024_newcoef_RF.csv", Stats_Conen_DIM_Mu2_RF, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Log_n1024_newcoef_RF.csv", Stats_MI_DIM_Mu2_RF, delimiter=",")
Stats_Pauc_DIM_Mu2_RF

array([[0.85132531, 0.86522999, 0.85206134, 0.84426055, 0.8436839 ,
        0.84502155, 0.79457455, 0.75524891, 0.69391622],
       [0.84323529, 0.87122116, 0.851729  , 0.87038097, 0.83053746,
        0.83383929, 0.85027834, 0.77953492, 0.78281275],
       [0.80468109, 0.83079293, 0.86358596, 0.87574916, 0.83522773,
        0.82445393, 0.81208961, 0.83148272, 0.70913183],
       [0.84416286, 0.86087345, 0.85095684, 0.85140085, 0.83788398,
        0.8170462 , 0.81632664, 0.80964696, 0.75366667],
       [0.86097289, 0.85863539, 0.85761372, 0.8504262 , 0.8367497 ,
        0.81258868, 0.80244629, 0.80396417, 0.71982604]])

## Multi-Independent

In [6]:
def statistcs_Reps_Pertree_Gaussian(clf,n=100,p=4096,ratio=0.5,metric = 'mi',reps = 1):
    prob=0.5
    sep1=3 
    sep2=2
    clf.reset()
    rng = np.random.default_rng()
    sig = np.identity(p)
    u = rng.multivariate_normal(np.zeros(p), sig, size=n, method='cholesky')
    v = rng.multivariate_normal(np.zeros(p), sig, size=n, method='cholesky')
    u_2 = rng.binomial(1, prob, size=(n, p))
    v_2 = rng.binomial(1, prob, size=(n, p))

    x_1 = u / sep1 + sep2 * u_2 - 1
    x_2 = v / sep1 + sep2 * v_2 - 1
    x = np.nan_to_num(np.float32(np.vstack((x_1,x_2))))
    y = np.array([0]*n+[1]*n).reshape(-1,1)

    if metric == 'auc':
        stats,pos,samples = clf.statistic(x, y, metric=metric, return_posteriors=True,max_fpr = 0.1)
    else:
        stats,pos,samples = clf.statistic(x, y, metric=metric,return_posteriors=True)
    clf.reset()
    POS = pos[:,:,0].reshape((n_estimators,2*n))

    np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Posterior_MultiInd_dimension_n1024_{}_{}_{}.csv".format(metric,p, reps), POS, delimiter=",")
    np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Samples_MultiInd_dimension_n1024_{}_{}_{}.csv".format(metric,p, reps), samples, delimiter=",")
    #POSs.append(POS)
    clf.reset()
    return stats

In [7]:
REPs = 10
Stats_Paucs_Samplesize_Gaussian = np.zeros((REPs,len(DIMENSIONS)))

for i in range(5,REPs):
    print(i)
    stats_paucs_samplesize_gaussian =[]
    for dim_i in DIMENSIONS:
        stat = statistcs_Reps_Pertree_Gaussian(clf_Pertree,n=1024,p = dim_i,ratio=0.5,metric = 'auc',reps = i)
        stats_paucs_samplesize_gaussian.append(stat) 
    print(stats_paucs_samplesize_gaussian)
    np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/MultiInd_pAUC_dimension_n1024_{}.csv".format(i), stats_paucs_samplesize_gaussian, delimiter=",")
    Stats_Paucs_Samplesize_Gaussian[i,:] = stats_paucs_samplesize_gaussian

5
[0.4975883082339638, 0.5055299056203741, 0.5008448550575658, 0.5026989987021998, 0.5030674181486431, 0.494729293020148, 0.5022502698396382, 0.5006209925601357, 0.4948216488486842]
6
[0.49180904187654195, 0.5031306618138364, 0.4958887602153577, 0.4947774786698191, 0.4972018191688939, 0.5075446680972451, 0.5059334604363692, 0.5111927233244243, 0.5001381321957237]
7
[0.4937997115285773, 0.4967842102050781, 0.5003148129111842, 0.5022683394582648, 0.49834923995168584, 0.5027873390599301, 0.49999859458521795, 0.49495215164987666, 0.49721587331671463]
8
[0.5012052435623972, 0.4993129529451069, 0.49958700882761103, 0.5122999893991571, 0.49838839079204356, 0.5006079422800165, 0.5002023797286185, 0.5008137351588199, 0.4982950310958059]
9
[0.5035281934236225, 0.4940024928042763, 0.497730857447574, 0.5096517864026522, 0.503635607267681, 0.49930994134200246, 0.4959048220985814, 0.5030202363666735, 0.4931180853592722]


In [8]:
REPs = 10
Stats_Paucs_Samplesize_Gaussian = np.zeros((REPs,len(DIMENSIONS)))
for i in range(REPs):
    pauc = np.array(np.genfromtxt('/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/MultiInd_pAUC_dimension_n1024_{}.csv'.format(i),delimiter=','))
    Stats_Paucs_Samplesize_Gaussian[i,:] = pauc
print(Stats_Paucs_Samplesize_Gaussian)


Stats_MI_Dim_Gaussian = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_Dim_Gaussian = np.zeros((REPs,len(DIMENSIONS)))
for i in range(REPs):
    for samp in range(len(DIMENSIONS)):
        pos = np.genfromtxt('/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Posterior_MultiInd_dimension_n1024_auc_{}_{}.csv'.format(DIMENSIONS[samp],i),delimiter=',')
        posterior_forest_0 = np.nanmean(pos, axis=0)
        posterior_forest_1 = np.ones(posterior_forest_0.shape)-posterior_forest_0
        posterior_forest = np.hstack((posterior_forest_0.reshape(-1,1),posterior_forest_1.reshape(-1,1)))

        stats_conen = np.mean(entropy(posterior_forest, base=np.exp(1), axis=1))

        H_Y = entropy([50,50], base=np.exp(1))
        stats_mi = H_Y - stats_conen

        Stats_Conen_Dim_Gaussian[i,samp] = stats_conen
        Stats_MI_Dim_Gaussian[i,samp] = stats_mi


[[0.50033188 0.5043855  0.49957898 0.49375253 0.50805664 0.50051458
  0.49994439 0.49585362 0.5013488 ]
 [0.49965126 0.49655934 0.49540992 0.49503246 0.50778058 0.49787441
  0.49382481 0.50037304 0.50002269]
 [0.49352666 0.50306139 0.4952493  0.50267792 0.49539586 0.50383839
  0.4932265  0.50461337 0.50607501]
 [0.49246356 0.48901126 0.49931998 0.49728213 0.49796777 0.49980083
  0.49908407 0.50402912 0.49895357]
 [0.49328172 0.49800792 0.50138494 0.49559764 0.49988516 0.50164795
  0.49765256 0.50120625 0.50158571]
 [0.49758831 0.50552991 0.50084486 0.502699   0.50306742 0.49472929
  0.50225027 0.50062099 0.49482165]
 [0.49180904 0.50313066 0.49588876 0.49477748 0.49720182 0.50754467
  0.50593346 0.51119272 0.50013813]
 [0.49379971 0.49678421 0.50031481 0.50226834 0.49834924 0.50278734
  0.49999859 0.49495215 0.49721587]
 [0.50120524 0.49931295 0.49958701 0.51229999 0.49838839 0.50060794
  0.50020238 0.50081374 0.49829503]
 [0.50352819 0.49400249 0.49773086 0.50965179 0.50363561 0.49930

In [9]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_MultiInd_n1024_MIGHT.csv", Stats_Paucs_Samplesize_Gaussian, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_MultiInd_n1024_MIGHT.csv", Stats_Conen_Dim_Gaussian, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_MultiInd_n1024_MIGHT.csv", Stats_MI_Dim_Gaussian, delimiter=",")
Stats_Paucs_Samplesize_Gaussian

array([[0.50033188, 0.5043855 , 0.49957898, 0.49375253, 0.50805664,
        0.50051458, 0.49994439, 0.49585362, 0.5013488 ],
       [0.49965126, 0.49655934, 0.49540992, 0.49503246, 0.50778058,
        0.49787441, 0.49382481, 0.50037304, 0.50002269],
       [0.49352666, 0.50306139, 0.4952493 , 0.50267792, 0.49539586,
        0.50383839, 0.4932265 , 0.50461337, 0.50607501],
       [0.49246356, 0.48901126, 0.49931998, 0.49728213, 0.49796777,
        0.49980083, 0.49908407, 0.50402912, 0.49895357],
       [0.49328172, 0.49800792, 0.50138494, 0.49559764, 0.49988516,
        0.50164795, 0.49765256, 0.50120625, 0.50158571],
       [0.49758831, 0.50552991, 0.50084486, 0.502699  , 0.50306742,
        0.49472929, 0.50225027, 0.50062099, 0.49482165],
       [0.49180904, 0.50313066, 0.49588876, 0.49477748, 0.49720182,
        0.50754467, 0.50593346, 0.51119272, 0.50013813],
       [0.49379971, 0.49678421, 0.50031481, 0.50226834, 0.49834924,
        0.50278734, 0.49999859, 0.49495215, 0.49721587],


In [10]:
#### Add CV KNN

##Add KNN for linear
### add KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
REPs = 20
n = 1024

Stats_MI_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        
        neigh = KNeighborsClassifier(n_neighbors=int(np.sqrt(n))+1)
        prob=0.5
        sep1=3 
        sep2=2
        rng = np.random.default_rng()
        sig = np.identity(p)
        u = rng.multivariate_normal(np.zeros(p), sig, size=n, method='cholesky')
        v = rng.multivariate_normal(np.zeros(p), sig, size=n, method='cholesky')
        u_2 = rng.binomial(1, prob, size=(n, p))
        v_2 = rng.binomial(1, prob, size=(n, p))

        x_1 = u / sep1 + sep2 * u_2 - 1
        x_2 = v / sep1 + sep2 * v_2 - 1
        x = np.vstack((x_1,x_2))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
            
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        conen_lst = []
        mi_lst = []
        pAUC_lst = []
        for train_ix, test_ix in cv.split(x,y):
            X_train, X_test = x[train_ix, :], x[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            neigh.fit(X_train,y_train)
            posterior = neigh.predict_proba(X_test)
 
            stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
            H_Y = entropy([50,50], base=np.exp(1))
            stats_mi = H_Y - stats_conen

            pauc = roc_auc_score(
                    y_test, posterior[:,1], max_fpr=0.1
                )
            print(stats_conen,stats_mi,pauc)
            conen_lst.append(stats_conen)
            mi_lst.append(stats_mi)
            pAUC_lst.append(pauc)

        stats_paucs_samplesize_mu0.append(np.mean(pAUC_lst))
        stats_mi_samplesize_mu0.append(np.mean(mi_lst))
        stats_conen_samplesize_mu0.append(np.mean(conen_lst))

    Stats_Pauc_DIM_Mu2_KNN[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_KNN[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_KNN[i,:] = stats_conen_samplesize_mu0
    



0
0.6767068424907098 0.016440338069235505 0.5094028673977953
0.6789409418720245 0.014206238687920814 0.5069507498669339
0.6783062529378892 0.014840927622056133 0.5125290919147959
0.6780764585774379 0.015070721982507362 0.5060283420171663
0.6751716029092625 0.017975577650682806 0.4898442503516882
0.6755996538462787 0.01754752671366655 0.5087021353204546
0.678044392795367 0.015102787764578296 0.4902669577518508
0.6777395748439146 0.015407605716030726 0.5152862416709124
0.6786456744439934 0.014501506115951912 0.5185657284625074
0.6777710353065842 0.015376145253361062 0.49634891776400497
0.677317532227616 0.01582964833232925 0.5082631474581755
0.6781051519770839 0.015042028582861411 0.4969999629976461
0.678592054410686 0.014555126149259245 0.5204267225307333
0.6789763833554728 0.014170797204472452 0.5013819070632104
0.6790301976001589 0.014116982959786362 0.5064554020153205
0.6811952656065833 0.011951914953362008 0.49933936566580045
0.6776475949343894 0.015499585625555867 0.495402368577601

In [11]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_MultiInd_n1024_KNN.csv", Stats_Pauc_DIM_Mu2_KNN, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_MultiInd_n1024_KNN.csv", Stats_Conen_DIM_Mu2_KNN, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_MultiInd_n1024_KNN.csv", Stats_MI_DIM_Mu2_KNN, delimiter=",")
Stats_Pauc_DIM_Mu2_KNN

array([[0.50495106, 0.505834  , 0.50670543, 0.49465605, 0.50005732,
        0.5015768 , 0.50350038, 0.50317513, 0.49724174],
       [0.50769076, 0.51049121, 0.50735194, 0.50150926, 0.50528028,
        0.50187305, 0.49735625, 0.50890798, 0.50445693],
       [0.50185488, 0.50099251, 0.51061125, 0.49982176, 0.50109517,
        0.5028137 , 0.49812384, 0.50213997, 0.4947037 ],
       [0.49542079, 0.49891656, 0.49758644, 0.50752803, 0.49703232,
        0.49427678, 0.49445222, 0.49615168, 0.50361299],
       [0.49759764, 0.50543373, 0.49776968, 0.49969713, 0.50078788,
        0.49667875, 0.50652873, 0.50480255, 0.50274025],
       [0.49190439, 0.50144919, 0.50207082, 0.50385811, 0.50120824,
        0.50475943, 0.50213876, 0.49254217, 0.50302433],
       [0.50220414, 0.50630249, 0.49314092, 0.49733805, 0.50125198,
        0.49855691, 0.50098761, 0.50461054, 0.49824181],
       [0.51303157, 0.50285339, 0.49559269, 0.50302142, 0.50025495,
        0.50119062, 0.50090631, 0.50046726, 0.50293191],


In [12]:
##Add Logistics Regression for Multi-Independent
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
REPs = 10
n = 1024

Stats_MI_DIM_Mu2_LG = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_LG = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_LG = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        prob=0.5
        sep1=3 
        sep2=2
        rng = np.random.default_rng()
        sig = np.identity(p)
        u = rng.multivariate_normal(np.zeros(p), sig, size=n, method='cholesky')
        v = rng.multivariate_normal(np.zeros(p), sig, size=n, method='cholesky')
        u_2 = rng.binomial(1, prob, size=(n, p))
        v_2 = rng.binomial(1, prob, size=(n, p))

        x_1 = u / sep1 + sep2 * u_2 - 1
        x_2 = v / sep1 + sep2 * v_2 - 1
        x = np.vstack((x_1,x_2))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
        
        clf = LogisticRegression(penalty='l1',solver = 'liblinear')
        
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        conen_lst = []
        mi_lst = []
        pAUC_lst = []
        for train_ix, test_ix in cv.split(x,y):
            X_train, X_test = x[train_ix, :], x[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            clf.fit(X_train,y_train)
            posterior = clf.predict_proba(X_test)
 
            stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
            H_Y = entropy([50,50], base=np.exp(1))
            stats_mi = H_Y - stats_conen

            pauc = roc_auc_score(
                    y_test, posterior[:,1], max_fpr=0.1
                )
            
            conen_lst.append(stats_conen)
            mi_lst.append(stats_mi)
            pAUC_lst.append(pauc)

        stats_paucs_samplesize_mu0.append(np.mean(pAUC_lst))
        stats_mi_samplesize_mu0.append(np.mean(mi_lst))
        stats_conen_samplesize_mu0.append(np.mean(conen_lst))
        print(np.mean(conen_lst),np.mean(mi_lst),np.mean(pAUC_lst))

    Stats_Pauc_DIM_Mu2_LG[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_LG[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_LG[i,:] = stats_conen_samplesize_mu0
    

0
0.6895031102068655 0.003644070353079831 0.498800777707996


0.6853411336910856 0.007806046868859773 0.5014543655215303
0.6683365778051173 0.024810602754827805 0.5092350061913365
0.6520374456548284 0.04110973490511689 0.49800342441995743
0.5960479276980089 0.09709925286193632 0.501540252282384
0.47592950921361765 0.21721767134632763 0.5060303679379012
0.1301620519949034 0.5629851285650419 0.4947561190480137
0.2025240715467445 0.4906231090132008 0.4985534925952598
0.26649503731567453 0.42665214324427075 0.4967760848038142
1
0.6878601615271562 0.005287019032789053 0.5041314049023599
0.6840067152978587 0.009140465262086584 0.5078788671296816
0.671194943076458 0.02195223748348729 0.5082469707880648
0.6482708758685589 0.04487630469138639 0.5024323327126649
0.6079447328932941 0.08520244766665122 0.4917844616790954
0.4778963421053688 0.21525083845457652 0.5022278988930491
0.13615229274169505 0.5569948878182502 0.5057208931977563
0.20631028051525185 0.48683690004469343 0.5071117798695921
0.27759799244880956 0.41554918811113567 0.5092858383843221
2
0.689

In [13]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_MultiInd_n1024_LG.csv", Stats_Pauc_DIM_Mu2_LG, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_MultiInd_n1024_LG.csv", Stats_Conen_DIM_Mu2_LG, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_MultiInd_n1024_LG.csv", Stats_MI_DIM_Mu2_LG, delimiter=",")
Stats_MI_DIM_Mu2_LG

array([[0.00364407, 0.00780605, 0.0248106 , 0.04110973, 0.09709925,
        0.21721767, 0.56298513, 0.49062311, 0.42665214],
       [0.00528702, 0.00914047, 0.02195224, 0.0448763 , 0.08520245,
        0.21525084, 0.55699489, 0.4868369 , 0.41554919],
       [0.00330404, 0.00914898, 0.01740276, 0.03533922, 0.0998947 ,
        0.21875843, 0.56685097, 0.48071332, 0.41938317],
       [0.00344207, 0.00842072, 0.0212778 , 0.05033972, 0.08904313,
        0.21218728, 0.5614042 , 0.48881733, 0.42539412],
       [0.00691363, 0.01200426, 0.01824726, 0.04312054, 0.09504975,
        0.20198569, 0.56376795, 0.48684516, 0.42879277],
       [0.0031298 , 0.00614442, 0.02100138, 0.04319822, 0.09046276,
        0.22688508, 0.55934623, 0.49363392, 0.41765015],
       [0.00456197, 0.00788279, 0.01912097, 0.03376608, 0.083506  ,
        0.21556354, 0.56728061, 0.48998917, 0.4205223 ],
       [0.00483804, 0.01301579, 0.01911876, 0.0447483 , 0.10457977,
        0.21317485, 0.56470903, 0.4811737 , 0.42937533],


In [14]:
### Add SVM to the Multi-Independent
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_val_predict

REPs = 10
n = 1024

Stats_MI_DIM_Mu2_SVM = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_SVM = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_SVM = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        
        prob=0.5
        sep1=3 
        sep2=2
        rng = np.random.default_rng()
        sig = np.identity(p)
        u = rng.multivariate_normal(np.zeros(p), sig, size=n, method='cholesky')
        v = rng.multivariate_normal(np.zeros(p), sig, size=n, method='cholesky')
        u_2 = rng.binomial(1, prob, size=(n, p))
        v_2 = rng.binomial(1, prob, size=(n, p))

        x_1 = u / sep1 + sep2 * u_2 - 1
        x_2 = v / sep1 + sep2 * v_2 - 1
        x = np.vstack((x_1,x_2))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
        
        model = SVC(probability=True,kernel = 'rbf')  # Set probability=True to enable probability estimates
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        conen_lst = []
        mi_lst = []
        pAUC_lst = []
        for train_ix, test_ix in cv.split(x,y):
            X_train, X_test = x[train_ix, :], x[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            model.fit(X_train,y_train)
            posterior = model.predict_proba(X_test)
 
            stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
            H_Y = entropy([50,50], base=np.exp(1))
            stats_mi = H_Y - stats_conen

            pauc = roc_auc_score(
                    y_test, posterior[:,1], max_fpr=0.1
                )
            # print(stats_conen,stats_mi,pauc)
            conen_lst.append(stats_conen)
            mi_lst.append(stats_mi)
            pAUC_lst.append(pauc)
        print(np.mean(conen_lst),np.mean(mi_lst),np.mean(pAUC_lst))

        stats_paucs_samplesize_mu0.append(np.mean(pAUC_lst))
        stats_mi_samplesize_mu0.append(np.mean(mi_lst))
        stats_conen_samplesize_mu0.append(np.mean(conen_lst))

    Stats_Pauc_DIM_Mu2_SVM[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_SVM[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_SVM[i,:] = stats_conen_samplesize_mu0
    

0
0.6928743888689641 0.00027279169098115563 0.5015638266327539
0.6918121576360804 0.0013350229238648704 0.5032948327091391
0.6929924770807497 0.000154703479195506 0.4967914185208918
0.6926719159603515 0.0004752645995937499 0.5002509686049816
0.6923563892255384 0.0007907913344068262 0.5096287101208248
0.6929817563065961 0.0001654242533492667 0.5033389486602132
0.691400176150726 0.001747004409219244 0.499525750373414
0.6929685892918398 0.00017859126810551018 0.494512503425944
0.6927045373919626 0.00044264316798272587 0.4991525512174249
1
0.6923873855308299 0.0007597950291153178 0.4991218804600809
0.6925804323178004 0.0005667482421448211 0.4936791886985546
0.6920993729422131 0.001047807617732155 0.49918607099685736
0.6926706686545077 0.0004765119054376488 0.5035383933466308
0.6924909169721956 0.0006562635877496836 0.513361990903002
0.6928637427853681 0.0002834377745770889 0.4986111273960775
0.6930408194410818 0.00010636111886350719 0.4945193820105824
0.6921602473199979 0.00098693323994747

In [15]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_MultiInd_n1024_SVM.csv", Stats_Pauc_DIM_Mu2_SVM, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_MultiInd_n1024_SVM.csv", Stats_Conen_DIM_Mu2_SVM, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_MultiInd_n1024_SVM.csv", Stats_MI_DIM_Mu2_SVM, delimiter=",")
Stats_MI_DIM_Mu2_SVM

array([[0.00027279, 0.00133502, 0.0001547 , 0.00047526, 0.00079079,
        0.00016542, 0.001747  , 0.00017859, 0.00044264],
       [0.0007598 , 0.00056675, 0.00104781, 0.00047651, 0.00065626,
        0.00028344, 0.00010636, 0.00098693, 0.0007732 ],
       [0.00083422, 0.00027739, 0.00028954, 0.00016656, 0.00019818,
        0.00114227, 0.00050066, 0.00155977, 0.00032748],
       [0.00027284, 0.00024455, 0.00033138, 0.00023459, 0.00029469,
        0.00053727, 0.00042505, 0.00099957, 0.00092658],
       [0.00027964, 0.00089304, 0.00061564, 0.00028136, 0.00045904,
        0.00067307, 0.00062638, 0.00111795, 0.00134076],
       [0.00251825, 0.00113783, 0.00132112, 0.00024993, 0.00103749,
        0.00066499, 0.00072503, 0.00069168, 0.0012662 ],
       [0.00228418, 0.00170614, 0.00025902, 0.00121823, 0.00018588,
        0.00014951, 0.000337  , 0.00043538, 0.00079739],
       [0.00032137, 0.00164781, 0.00063059, 0.00020569, 0.00086265,
        0.00043219, 0.00201261, 0.00059864, 0.00106356],


In [4]:
##Add RF for Multi-Independent
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
REPs = 10
n = 1024

Stats_MI_DIM_Mu2_RF = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_RF = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_RF = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        prob=0.5
        sep1=3 
        sep2=2
        rng = np.random.default_rng()
        sig = np.identity(p)
        u = rng.multivariate_normal(np.zeros(p), sig, size=n, method='cholesky')
        v = rng.multivariate_normal(np.zeros(p), sig, size=n, method='cholesky')
        u_2 = rng.binomial(1, prob, size=(n, p))
        v_2 = rng.binomial(1, prob, size=(n, p))

        x_1 = u / sep1 + sep2 * u_2 - 1
        x_2 = v / sep1 + sep2 * v_2 - 1
        
        x = np.float32(np.vstack((x_1,x_2)))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
        
        clf = RandomForestClassifier(n_estimators = 500)
        
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        conen_lst = []
        mi_lst = []
        pAUC_lst = []
        for train_ix, test_ix in cv.split(x,y):
            X_train, X_test = x[train_ix, :], x[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]
            clf.fit(X_train,y_train)
            posterior = clf.predict_proba(X_test)
 
            stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
            H_Y = entropy([50,50], base=np.exp(1))
            stats_mi = H_Y - stats_conen

            pauc = roc_auc_score(
                    y_test, posterior[:,1], max_fpr=0.1
                )
            
            conen_lst.append(stats_conen)
            mi_lst.append(stats_mi)
            pAUC_lst.append(pauc)

        stats_paucs_samplesize_mu0.append(np.mean(pAUC_lst))
        stats_mi_samplesize_mu0.append(np.mean(mi_lst))
        stats_conen_samplesize_mu0.append(np.mean(conen_lst))
        print(np.mean(conen_lst),np.mean(mi_lst),np.mean(pAUC_lst))

    Stats_Pauc_DIM_Mu2_RF[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_RF[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_RF[i,:] = stats_conen_samplesize_mu0
    

0
0.6806853151067702 0.012461865453175047 0.5012001946901056
0.6826765949898713 0.01047058557007403 0.5011782040221455
0.6850016632324271 0.008145517327518248 0.5003099418274255
0.6855676337347377 0.007579546825207539 0.503192005434184
0.6867148035541704 0.006432377005775036 0.5168941789139528
0.6874835922489847 0.0056635883109605436 0.5046348942219424
0.6877586292472253 0.005388551312719892 0.5042509534452447
0.6873406774486063 0.005806503111338945 0.5079622352296272
0.6878908325805111 0.005256347979434151 0.502199134150547
1
0.6780550135200383 0.015092167039907012 0.4966732908135545
0.6829460516230746 0.010201128936870729 0.5113331801102878
0.6853283030736194 0.007818877486326037 0.5005411587284339
0.6863822262918754 0.006764954268069867 0.5007423684204193
0.6871616226473052 0.005985557912640216 0.5012192055784989
0.6871636573740099 0.005983523185935424 0.4993259997231827
0.6875408806002413 0.005606299959703987 0.49701737705007176
0.6876204915509191 0.005526689009026198 0.50164871803

In [5]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_MultiInd_n1024_RF.csv", Stats_Pauc_DIM_Mu2_RF, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_MultiInd_n1024_RF.csv", Stats_Conen_DIM_Mu2_RF, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_MultiInd_n1024_RF.csv", Stats_MI_DIM_Mu2_RF, delimiter=",")
Stats_MI_DIM_Mu2_RF

array([[0.01246187, 0.01047059, 0.00814552, 0.00757955, 0.00643238,
        0.00566359, 0.00538855, 0.0058065 , 0.00525635],
       [0.01509217, 0.01020113, 0.00781888, 0.00676495, 0.00598556,
        0.00598352, 0.0056063 , 0.00552669, 0.00500359],
       [0.01396621, 0.00961616, 0.00854924, 0.00746777, 0.00601722,
        0.00547405, 0.00561782, 0.00532845, 0.0050285 ],
       [0.01304458, 0.01010871, 0.00791208, 0.00715457, 0.00616821,
        0.0053779 , 0.00523424, 0.00517644, 0.00497658],
       [0.01498729, 0.01112   , 0.00838175, 0.00775511, 0.00638697,
        0.00584116, 0.00514083, 0.00540083, 0.00501682],
       [0.01274079, 0.01022657, 0.00794014, 0.00721613, 0.00638276,
        0.00562048, 0.00540932, 0.00518707, 0.00509916],
       [0.01373153, 0.01010556, 0.00774673, 0.00740873, 0.00640607,
        0.00601142, 0.00550144, 0.00514847, 0.00529248],
       [0.01399397, 0.01097884, 0.0083329 , 0.00687301, 0.00630959,
        0.00566256, 0.00545187, 0.00531499, 0.00507892],


## Gaussian with Mu= 0

In [16]:
def statistcs_Reps_Pertree_Gaussian(clf,n=100,p=4096,ratio=0.5,metric = 'mi',reps = 1):
    clf.reset()

    x_1 = np.random.normal(size=(n, p))
    x_2 = np.random.normal(size=(n, p))
    x = np.vstack((x_1,x_2))
    y = np.array([0]*n+[1]*n).reshape(-1,1)

    if metric == 'auc':
        stats,pos,samples = clf.statistic(x, y, metric=metric, return_posteriors=True,max_fpr = 0.1)
    else:
        stats,pos,samples = clf.statistic(x, y, metric=metric,return_posteriors=True)
    clf.reset()
    POS = pos[:,:,0].reshape((n_estimators,2*n))

    np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Posterior_Gaussian_dimension_{}_{}_{}.csv".format(metric,p, reps), POS, delimiter=",")
    np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Samples_Gaussian_dimension_{}_{}_{}.csv".format(metric,p, reps), samples, delimiter=",")
    #POSs.append(POS)
    clf.reset()
    return stats

In [17]:
REPs = 10
Stats_Paucs_Samplesize_Gaussian = np.zeros((REPs,len(DIMENSIONS)))

for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_gaussian =[]
    for dim_i in DIMENSIONS:
        stat = statistcs_Reps_Pertree_Gaussian(clf_Pertree,n=512,p = dim_i,ratio=0.5,metric = 'auc',reps = i)
        stats_paucs_samplesize_gaussian.append(stat) 
    print(stats_paucs_samplesize_gaussian)
    np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Gaussian_pAUC_dimension_{}.csv".format(i), stats_paucs_samplesize_gaussian, delimiter=",")
    Stats_Paucs_Samplesize_Gaussian[i,:] = stats_paucs_samplesize_gaussian

0
[0.49873673288445725, 0.5076791863692435, 0.49975264699835525, 0.5097551847759046, 0.49870862458881576, 0.5012624640213815, 0.501643933747944, 0.4992346512643914]
1
[0.4896095677425987, 0.5012223093133223, 0.5020093415912829, 0.5045190108449835, 0.511666548879523, 0.5030011628803454, 0.5008047003495065, 0.4920549894634046]
2
[0.5063862047697368, 0.49904592413651316, 0.5158988551089638, 0.5056232653166118, 0.5085746363589638, 0.5042580052425987, 0.5058802554481908, 0.5011741236636513]
3
[0.5023506566097862, 0.5021980687191612, 0.49660050241570725, 0.4962310791015625, 0.5033826326069079, 0.504904496042352, 0.5024831671463815, 0.49219553094161184]
4
[0.4971425909745066, 0.49937117727179275, 0.49349252801192434, 0.4960503829152961, 0.4982709382709704, 0.517316316303454, 0.512224699321546, 0.5172922234786184]
5
[0.5047117534436678, 0.5055228785464638, 0.4990700169613487, 0.49550427888569076, 0.5069965563322368, 0.5001019929584704, 0.5000979774876645, 0.5051815635279605]
6
[0.5065909937808

In [18]:
REPs = 10
Stats_Paucs_Samplesize_Gaussian = np.zeros((REPs,len(DIMENSIONS)))
for i in range(REPs):
    pauc = np.array(np.genfromtxt('/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Gaussian_pAUC_dimension_{}.csv'.format(i),delimiter=','))
    Stats_Paucs_Samplesize_Gaussian[i,:] = pauc
print(Stats_Paucs_Samplesize_Gaussian)


Stats_MI_Dim_Gaussian = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_Dim_Gaussian = np.zeros((REPs,len(DIMENSIONS)))
for i in range(REPs):
    for samp in range(len(DIMENSIONS)):
        pos = np.genfromtxt('/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Posterior_Gaussian_dimension_auc_{}_{}.csv'.format(DIMENSIONS[samp],i),delimiter=',')
        posterior_forest_0 = np.nanmean(pos, axis=0)
        posterior_forest_1 = np.ones(posterior_forest_0.shape)-posterior_forest_0
        posterior_forest = np.hstack((posterior_forest_0.reshape(-1,1),posterior_forest_1.reshape(-1,1)))

        stats_conen = np.mean(entropy(posterior_forest, base=np.exp(1), axis=1))

        H_Y = entropy([50,50], base=np.exp(1))
        stats_mi = H_Y - stats_conen

        Stats_Conen_Dim_Gaussian[i,samp] = stats_conen
        Stats_MI_Dim_Gaussian[i,samp] = stats_mi


[[0.49873673 0.50767919 0.49975265 0.50975518 0.49870862 0.50126246
  0.50164393 0.49923465]
 [0.48960957 0.50122231 0.50200934 0.50451901 0.51166655 0.50300116
  0.5008047  0.49205499]
 [0.5063862  0.49904592 0.51589886 0.50562327 0.50857464 0.50425801
  0.50588026 0.50117412]
 [0.50235066 0.50219807 0.4966005  0.49623108 0.50338263 0.5049045
  0.50248317 0.49219553]
 [0.49714259 0.49937118 0.49349253 0.49605038 0.49827094 0.51731632
  0.5122247  0.51729222]
 [0.50471175 0.50552288 0.49907002 0.49550428 0.50699656 0.50010199
  0.50009798 0.50518156]
 [0.50659099 0.4944201  0.49416311 0.49570104 0.50511732 0.4941872
  0.49893751 0.49862028]
 [0.51473437 0.49497825 0.49599818 0.5009854  0.50195714 0.49289824
  0.50071636 0.49790553]
 [0.49643185 0.50397692 0.49508265 0.5010898  0.50076053 0.5030373
  0.50445075 0.51897471]
 [0.49724298 0.49933504 0.49605038 0.49331183 0.50226232 0.49551633
  0.50608906 0.50077659]]


In [19]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Gaussian_MIGHT.csv", Stats_Paucs_Samplesize_Gaussian, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Gaussian_MIGHT.csv", Stats_Conen_Dim_Gaussian, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Gaussian_MIGHT.csv", Stats_MI_Dim_Gaussian, delimiter=",")
Stats_MI_Dim_Gaussian

array([[0.00109447, 0.00099941, 0.00090703, 0.00075109, 0.00063368,
        0.00057864, 0.00057527, 0.00050534],
       [0.00111564, 0.00099131, 0.00082842, 0.00070209, 0.00087623,
        0.00068686, 0.00051773, 0.00046166],
       [0.00152475, 0.00097356, 0.00087474, 0.00065733, 0.00054045,
        0.0005611 , 0.00055688, 0.00046382],
       [0.00129835, 0.00099383, 0.0007973 , 0.00069093, 0.00061737,
        0.00062601, 0.00048477, 0.0005211 ],
       [0.00105932, 0.00095172, 0.00084257, 0.00072036, 0.00061457,
        0.00053969, 0.00052834, 0.00051953],
       [0.00141689, 0.00105015, 0.0007923 , 0.00078335, 0.00059078,
        0.00055413, 0.00056896, 0.00044975],
       [0.00118541, 0.00093941, 0.00075312, 0.00067344, 0.00057568,
        0.00057726, 0.00050425, 0.00051943],
       [0.00196953, 0.00086612, 0.00101632, 0.00064436, 0.00062619,
        0.00056985, 0.0005391 , 0.00049488],
       [0.00107854, 0.0010114 , 0.00074186, 0.00071371, 0.00066014,
        0.00059194, 0.000511

In [25]:
##Add KNN for Gaussian Mu=0
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
REPs = 50
n = 512

Stats_MI_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        neigh = KNeighborsClassifier(n_neighbors=int(np.sqrt(n)+1))
        x_1 = np.random.normal(size=(n, p))
        x_2 = np.random.normal(size=(n, p))
        x = np.vstack((x_1,x_2))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
            
        neigh.fit(x, y)
        posterior = neigh.predict_proba(x)

        stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
        # print(stats_conen)

        # _, counts = np.unique(y_true_final, return_counts=True)
        H_Y = entropy([50,50], base=np.exp(1))
        # print(H_Y)
        stats_mi = H_Y - stats_conen

        pauc = roc_auc_score(
                y, posterior[:,1], max_fpr=0.1
            )
        print(stats_conen,stats_mi,pauc)
        
        stats_paucs_samplesize_mu0.append(pauc)
        stats_mi_samplesize_mu0.append(stats_mi)
        stats_conen_samplesize_mu0.append(stats_conen)

    Stats_Pauc_DIM_Mu2_KNN[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_KNN[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_KNN[i,:] = stats_conen_samplesize_mu0
    

0
0.6697978081072058 0.023349372452739492 0.5251437903591321
0.6683011902632663 0.024845990296679 0.5320428901412538
0.6690566783939094 0.02409050216603592 0.5382732527365304
0.6705149082775237 0.022632272282421595 0.5225977996863524
0.6723188709457866 0.020828309614158713 0.5250654005466547
0.6704524689811757 0.022694711578769566 0.5367597881116365
0.6647682056992053 0.028378974860740036 0.5334863136301298
0.6707451741171202 0.022402006442825084 0.5330453169973273
1
0.6702428409747457 0.02290433958519955 0.524993896484375
0.672330846240707 0.02081633431923824 0.5376793710809005
0.6706676427881517 0.022479537771793567 0.5199331750072562
0.6699907643596419 0.023156416200303398 0.5269650911030016
0.6603488390830832 0.032798341476862114 0.5348369790967373
0.6721154591217148 0.02103172143823051 0.5360671595523232
0.6698534774096618 0.02329370315028345 0.5315496626483597
0.6673240638243806 0.0258231167355647 0.5593491309198433
2
0.6687069009886581 0.024440279571287138 0.5304592302453662
0.6

In [26]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Gaussian_KNN.csv", Stats_Pauc_DIM_Mu2_KNN, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Gaussian_KNN.csv", Stats_Conen_DIM_Mu2_KNN, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Gaussian_KNN.csv", Stats_MI_DIM_Mu2_KNN, delimiter=",")
Stats_MI_DIM_Mu2_KNN

array([[0.02334937, 0.02484599, 0.0240905 , 0.02263227, 0.02082831,
        0.02269471, 0.02837897, 0.02240201],
       [0.02290434, 0.02081633, 0.02247954, 0.02315642, 0.03279834,
        0.02103172, 0.0232937 , 0.02582312],
       [0.02444028, 0.02114212, 0.022658  , 0.02156717, 0.02265819,
        0.02393238, 0.02197296, 0.02197006],
       [0.02441538, 0.01989728, 0.02171421, 0.02147757, 0.02271332,
        0.02181435, 0.02020164, 0.02093172],
       [0.02479513, 0.02058652, 0.02461301, 0.02419007, 0.02265418,
        0.02291377, 0.02760605, 0.02107438],
       [0.02264808, 0.02205587, 0.02235981, 0.02144698, 0.02120235,
        0.02367001, 0.02256111, 0.02437598],
       [0.02223379, 0.02311568, 0.02412675, 0.02243643, 0.02306882,
        0.02105824, 0.02425226, 0.02150768],
       [0.02217489, 0.01884852, 0.03254199, 0.02227797, 0.02554583,
        0.02396864, 0.02460682, 0.02860812],
       [0.02227905, 0.0210439 , 0.02366974, 0.02436252, 0.02282155,
        0.02408786, 0.019822

In [27]:
##Add Logistics Regression for Gaussian
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
REPs = 50
n = 512

Stats_MI_DIM_Mu2_LG = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_LG = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_LG = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        x_1 = np.random.normal(size=(n, p))
        x_2 = np.random.normal(size=(n, p))
        x = np.vstack((x_1,x_2))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
        
        clf = LogisticRegression().fit(x, y)
        print(clf.score(x, y))
        posterior = clf.predict_proba(x)

        stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
        # print(stats_conen)

        # _, counts = np.unique(y_true_final, return_counts=True)
        H_Y = entropy([50,50], base=np.exp(1))
        # print(H_Y)
        stats_mi = H_Y - stats_conen

        pauc = roc_auc_score(
                y, posterior[:,1], max_fpr=0.1
            )
        print(stats_conen,stats_mi,pauc)
        
        stats_paucs_samplesize_mu0.append(pauc)
        stats_mi_samplesize_mu0.append(stats_mi)
        stats_conen_samplesize_mu0.append(stats_conen)

    Stats_Pauc_DIM_Mu2_LG[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_LG[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_LG[i,:] = stats_conen_samplesize_mu0
    

0
0.55078125
0.682911833915122 0.010235346644823284 0.5111405222039473
0.6162109375
0.657888243435717 0.03525893712422834 0.5423005756578947
0.6416015625
0.6381500097792758 0.05499717078066946 0.5518694425884046
0.7255859375
0.5446236071332604 0.14852357342668487 0.6205339933696546
0.857421875
0.37018753939073756 0.3229596411692077 0.8247568230879934
1.0
0.0718285680544268 0.6213186125055185 1.0
1.0
0.028909294089689272 0.6642378864702561 1.0
1.0
0.014492766252783323 0.678654414307162 1.0
1
0.5810546875
0.6703067484749662 0.0228404320849791 0.5378915887129935
0.5869140625
0.6658568489730254 0.027290331586919847 0.5418869821648848
0.6494140625
0.628503823622639 0.06464335693730627 0.5733361495168585
0.734375
0.5385087957672966 0.1546383847926487 0.6666837993421053
0.8701171875
0.3406390260658394 0.3525081544941059 0.8477494089226973
1.0
0.07212963779308079 0.6210175427668645 1.0
1.0
0.027816302482864493 0.6653308780770808 1.0
1.0
0.013615746249396078 0.6795314343105492 1.0
2
0.595703125

In [28]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Gaussian_LG.csv", Stats_Pauc_DIM_Mu2_LG, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Gaussian_LG.csv", Stats_Conen_DIM_Mu2_LG, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Gaussian_LG.csv", Stats_MI_DIM_Mu2_LG, delimiter=",")
Stats_MI_DIM_Mu2_LG

array([[0.01023535, 0.03525894, 0.05499717, 0.14852357, 0.32295964,
        0.62131861, 0.66423789, 0.67865441],
       [0.02284043, 0.02729033, 0.06464336, 0.15463838, 0.35250815,
        0.62101754, 0.66533088, 0.67953143],
       [0.01845989, 0.03458058, 0.06843258, 0.14681411, 0.33362689,
        0.61890949, 0.6653845 , 0.67916013],
       [0.02567196, 0.03323327, 0.06101148, 0.17742793, 0.34329524,
        0.62779667, 0.66376453, 0.67925287],
       [0.02315946, 0.02335502, 0.07441942, 0.14373511, 0.33921706,
        0.6190148 , 0.6644526 , 0.67911628],
       [0.01484949, 0.02559304, 0.05173988, 0.12901855, 0.36607427,
        0.61945348, 0.66532962, 0.6789846 ],
       [0.01700411, 0.03838323, 0.06692991, 0.14535682, 0.34126231,
        0.62136329, 0.66622195, 0.67910008],
       [0.02003428, 0.02903271, 0.06556148, 0.14691734, 0.35593584,
        0.62155945, 0.66511512, 0.67931326],
       [0.01688067, 0.01822698, 0.05775926, 0.13395384, 0.32464867,
        0.61403835, 0.665263

In [29]:
### Add SVM to the Linear
##Add Logistics Regression for linear
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_val_predict

REPs = 20
n = 512

Stats_MI_DIM_Mu2_SVM = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_SVM = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_SVM = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        
        x_1 = np.random.normal(size=(n, p))
        x_2 = np.random.normal(size=(n, p))
        x = np.vstack((x_1,x_2))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
        
        model = SVC(probability=True,kernel = 'rbf')  # Set probability=True to enable probability estimates
        model.fit(x, y)
        posterior = model.predict_proba(x)

        stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
        # print(stats_conen)

        # _, counts = np.unique(y_true_final, return_counts=True)
        H_Y = entropy([50,50], base=np.exp(1))
        # print(H_Y)
        stats_mi = H_Y - stats_conen

        pauc = roc_auc_score(
                y, posterior[:,1], max_fpr=0.1
            )
        print(stats_conen,stats_mi,pauc)
        
        stats_paucs_samplesize_mu0.append(pauc)
        stats_mi_samplesize_mu0.append(stats_mi)
        stats_conen_samplesize_mu0.append(stats_conen)

    Stats_Pauc_DIM_Mu2_SVM[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_SVM[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_SVM[i,:] = stats_conen_samplesize_mu0
    

0
0.6876699399924245 0.005477240567520747 0.8365181370785362
0.6929344879177242 0.00021269264222112394 0.47368421052631576
0.6905161146477332 0.0026310659122120716 0.9763629311009457
0.6907382509375491 0.0024089296223961387 0.47368421052631576
0.6804245424411137 0.012722638118831564 0.47368421052631576
0.6925312478308829 0.0006159327290623517 0.999849419844778
0.6328653902190899 0.060281790340855346 0.47368421052631576
0.6893513835829761 0.00379579697696919 0.47368421052631576
1
0.6931471805599453 0.0 0.5
0.6823722516612779 0.01077492889866738 0.47368421052631576
0.6905853083394959 0.002561872220449346 0.9670470388312089
0.6896089242791554 0.003538256280789853 0.9871464779502467
0.6744229784075231 0.01872420215242221 0.47368421052631576
0.6926840598646573 0.0004631206952879774 0.47368421052631576
0.6359501182260179 0.057197062333927406 0.47368421052631576
0.692756365387221 0.00039081517272432365 1.0
2
0.6920803444537484 0.0010668361061968445 0.47368421052631576
0.6903095503485417 0.002

In [32]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Gaussian_SVM.csv", Stats_Pauc_DIM_Mu2_SVM, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Gaussian_SVM.csv", Stats_Conen_DIM_Mu2_SVM, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Gaussian_SVM.csv", Stats_MI_DIM_Mu2_SVM, delimiter=",")
Stats_MI_DIM_Mu2_SVM

array([[5.47724057e-03, 2.12692642e-04, 2.63106591e-03, 2.40892962e-03,
        1.27226381e-02, 6.15932729e-04, 6.02817903e-02, 3.79579698e-03],
       [0.00000000e+00, 1.07749289e-02, 2.56187222e-03, 3.53825628e-03,
        1.87242022e-02, 4.63120695e-04, 5.71970623e-02, 3.90815173e-04],
       [1.06683611e-03, 2.83763021e-03, 4.69469050e-03, 6.27760295e-04,
        5.75718133e-03, 0.00000000e+00, 7.26576427e-03, 1.68230642e-01],
       [2.93290581e-03, 3.17786950e-05, 0.00000000e+00, 6.88227452e-03,
        1.40411539e-02, 3.04446753e-03, 2.08282341e-01, 1.02618055e-02],
       [1.76259474e-03, 7.84130645e-03, 9.95512592e-04, 1.25848513e-03,
        2.51781592e-04, 1.04818251e-01, 3.86283793e-02, 7.88385003e-05],
       [9.04342675e-03, 3.30691869e-04, 1.17894458e-04, 5.26709642e-05,
        1.66917744e-03, 1.19449738e-04, 1.87034578e-05, 6.67072170e-02],
       [6.26221333e-04, 1.55039257e-02, 1.66125734e-03, 2.79366493e-02,
        4.22174137e-03, 4.94606316e-03, 4.69569675e-03, 1.

## Gaussian with Mu = 2

In [21]:
def statistcs_Reps_Pertree_Separate_Gaussian(clf,n=100,p=4096,ratio=0.5,metric = 'mi',reps = 1):
    clf.reset()

    x_1 = np.random.normal(loc = np.array([2]*1+[0]*(p-1)),scale = np.array([1]*p),size=(n,p))
    x_2 = np.random.normal(loc = np.array([-2]*1+[0]*(p-1)),scale = np.array([1]*p),size=(n,p))
    x = np.vstack((x_1,x_2))
    y = np.array([0]*n+[1]*n).reshape(-1,1)

    if metric == 'auc':
        stats,pos,samples = clf.statistic(x, y, metric=metric, return_posteriors=True,max_fpr = 0.1)
    else:
        stats,pos,samples = clf.statistic(x, y, metric=metric,return_posteriors=True)
    clf.reset()
    POS = pos[:,:,0].reshape((n_estimators,2*n))

    np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Posterior_Gaussian_mu2_dimension_{}_{}_{}.csv".format(metric,p, reps), POS, delimiter=",")
    np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Samples_Gaussian_mu2_dimension_{}_{}_{}.csv".format(metric,p, reps), samples, delimiter=",")
    #POSs.append(POS)
    clf.reset()
    return stats

In [22]:
REPs = 10
Stats_Paucs_Samplesize_Gaussian = np.zeros((REPs,len(DIMENSIONS)))

for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_gaussian =[]
    for dim_i in DIMENSIONS:
        stat = statistcs_Reps_Pertree_Separate_Gaussian(clf_Pertree,n=512,p = dim_i,ratio=0.5,metric = 'auc',reps = i)
        stats_paucs_samplesize_gaussian.append(stat) 
    print(stats_paucs_samplesize_gaussian)
    np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Gaussian_mu2_pAUC_dimension_{}.csv".format(i), stats_paucs_samplesize_gaussian, delimiter=",")
    Stats_Paucs_Samplesize_Gaussian[i,:] = stats_paucs_samplesize_gaussian

0
[0.9880057887027138, 0.9688640393708882, 0.9891060277035362, 0.9638567472759045, 0.9823881450452303, 0.9849299380653783, 0.9614595112047697, 0.9882667943050987]
1
[0.9787742213199013, 0.993992855674342, 0.9746784410978618, 0.9870621530633223, 0.9660732871607731, 0.9655030903063322, 0.9565646522923519, 0.9616602847450658]
2
[0.9816131591796875, 0.9818862111944902, 0.9907804790296053, 0.9718957198293585, 0.9870179828844572, 0.987841154399671, 0.9710685328433388, 0.9769752903988487]
3
[0.9835044459292763, 0.971923828125, 0.9868814568770559, 0.9849259225945723, 0.9775816264905428, 0.971056486430921, 0.9657962196751645, 0.968213533100329]
4
[0.986520064504523, 0.9575243498149671, 0.9814926950555098, 0.9674265008223684, 0.9694502981085527, 0.9885518927323191, 0.9729076184724506, 0.9862148887232731]
5
[0.9829663728412829, 0.9871545088918585, 0.9726024426912007, 0.9754012258429277, 0.9814043546977795, 0.9791075053967928, 0.9855362741570723, 0.9806614925986842]
6
[0.9790512888055098, 0.989346

In [4]:
REPs = 10
Stats_Paucs_Samplesize_Gaussian = np.zeros((REPs,len(DIMENSIONS)))
for i in range(REPs):
    pauc = np.array(np.genfromtxt('/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Gaussian_mu2_pAUC_dimension_{}.csv'.format(i),delimiter=','))
    Stats_Paucs_Samplesize_Gaussian[i,:] = pauc
print(Stats_Paucs_Samplesize_Gaussian)


Stats_MI_Dim_Gaussian = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_Dim_Gaussian = np.zeros((REPs,len(DIMENSIONS)))
for i in range(REPs):
    for samp in range(len(DIMENSIONS)):
        pos = np.genfromtxt('/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Posterior_Gaussian_mu2_dimension_auc_{}_{}.csv'.format(DIMENSIONS[samp],i),delimiter=',')
        posterior_forest_0 = np.nanmean(pos, axis=0)
        posterior_forest_1 = np.ones(posterior_forest_0.shape)-posterior_forest_0
        posterior_forest = np.hstack((posterior_forest_0.reshape(-1,1),posterior_forest_1.reshape(-1,1)))

        stats_conen = np.mean(entropy(posterior_forest, base=np.exp(1), axis=1))

        H_Y = entropy([50,50], base=np.exp(1))
        stats_mi = H_Y - stats_conen

        Stats_Conen_Dim_Gaussian[i,samp] = stats_conen
        Stats_MI_Dim_Gaussian[i,samp] = stats_mi


[[0.98800579 0.96886404 0.98910603 0.96385675 0.98238815 0.98492994
  0.96145951 0.98826679]
 [0.97877422 0.99399286 0.97467844 0.98706215 0.96607329 0.96550309
  0.95656465 0.96166028]
 [0.98161316 0.98188621 0.99078048 0.97189572 0.98701798 0.98784115
  0.97106853 0.97697529]
 [0.98350445 0.97192383 0.98688146 0.98492592 0.97758163 0.97105649
  0.96579622 0.96821353]
 [0.98652006 0.95752435 0.9814927  0.9674265  0.9694503  0.98855189
  0.97290762 0.98621489]
 [0.98296637 0.98715451 0.97260244 0.97540123 0.98140435 0.97910751
  0.98553627 0.98066149]
 [0.97905129 0.98934696 0.9856728  0.95886953 0.97436523 0.97075131
  0.97880233 0.9811313 ]
 [0.98641566 0.97049834 0.95205929 0.9933825  0.97144599 0.97940867
  0.97083965 0.98127988]
 [0.98348035 0.96680009 0.98327958 0.97608386 0.99090897 0.97751336
  0.98652006 0.97777437]
 [0.96959084 0.98162922 0.98102288 0.98405055 0.96034723 0.97569034
  0.98083014 0.96924551]]


In [5]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Gaussian_mu2_MIGHT.csv", Stats_Paucs_Samplesize_Gaussian, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Gaussian_mu2_MIGHT.csv", Stats_Conen_Dim_Gaussian, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Gaussian_mu2_MIGHT.csv", Stats_MI_Dim_Gaussian, delimiter=",")
Stats_MI_Dim_Gaussian

array([[0.44886788, 0.43915256, 0.43296477, 0.43015998, 0.4310698 ,
        0.39874803, 0.40946938, 0.39524884],
       [0.45542783, 0.4722959 , 0.43425727, 0.42980062, 0.42169526,
        0.37563462, 0.38116165, 0.41058263],
       [0.45496324, 0.46293445, 0.44681143, 0.43233152, 0.41692382,
        0.43764238, 0.39998851, 0.40669925],
       [0.44962667, 0.44452464, 0.45555366, 0.4333285 , 0.41752721,
        0.40835384, 0.37693711, 0.36600111],
       [0.45081239, 0.44569913, 0.43711476, 0.4348424 , 0.41067354,
        0.40611263, 0.3803455 , 0.40387625],
       [0.44078192, 0.45561755, 0.42968355, 0.44453988, 0.40805889,
        0.4243659 , 0.40205869, 0.37270787],
       [0.44997778, 0.45591402, 0.44603749, 0.42009164, 0.40414371,
        0.40923861, 0.41164945, 0.38828732],
       [0.45200319, 0.44462492, 0.43857315, 0.46125543, 0.41604002,
        0.4125767 , 0.40103427, 0.40135187],
       [0.44775796, 0.4424322 , 0.44311839, 0.44346724, 0.43352694,
        0.41941605, 0.387314

In [33]:
##Add KNN for Gaussian Mu=2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
REPs = 50
n = 512

Stats_MI_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_KNN = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        neigh = KNeighborsClassifier(n_neighbors=int(np.sqrt(n)+1))
        x_1 = np.random.normal(loc = np.array([2]*1+[0]*(p-1)),scale = np.array([1]*p),size=(n,p))
        x_2 = np.random.normal(loc = np.array([-2]*1+[0]*(p-1)),scale = np.array([1]*p),size=(n,p))
        x = np.vstack((x_1,x_2))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
                
        neigh.fit(x, y)
        posterior = neigh.predict_proba(x)

        stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
        # print(stats_conen)

        # _, counts = np.unique(y_true_final, return_counts=True)
        H_Y = entropy([50,50], base=np.exp(1))
        # print(H_Y)
        stats_mi = H_Y - stats_conen

        pauc = roc_auc_score(
                y, posterior[:,1], max_fpr=0.1
            )
        print(stats_conen,stats_mi,pauc)
        
        stats_paucs_samplesize_mu0.append(pauc)
        stats_mi_samplesize_mu0.append(stats_mi)
        stats_conen_samplesize_mu0.append(stats_conen)

    Stats_Pauc_DIM_Mu2_KNN[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_KNN[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_KNN[i,:] = stats_conen_samplesize_mu0
    

0
0.24178651067075063 0.45136066988919465 0.9835932279887952
0.3317732359417301 0.3613739446182152 0.9909069663599918
0.45065921120269864 0.24248796935724665 0.983270202258792
0.5344697401797021 0.15867744038024323 0.9473463199013158
0.6072427925076174 0.08590438805232792 0.921582071404708
0.6313704580544138 0.061776722505531456 0.855067192880731
0.6481209919549734 0.04502618860497187 0.798180980252144
0.6573062526185 0.03584092794144533 0.6715767308285362
1
0.24639402389293474 0.44675315666701054 0.9909912912469161
0.3601335311312196 0.3330136494287257 0.982195402446546
0.46103550501644097 0.23211167554350431 0.9666326422440379
0.5459310613363855 0.14721611922355982 0.9553127088044819
0.5934256966375329 0.09972148392241242 0.9145524376317075
0.6268887970270298 0.06625838353291547 0.8630186382092928
0.6480342173927502 0.04511296316719504 0.8015151907704787
0.6578523647870251 0.03529481577292015 0.6882946705957602
2
0.2564770111305996 0.43667016942934567 0.9849640695672286
0.36091023409

In [34]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Gaussian_mu2_KNN.csv", Stats_Pauc_DIM_Mu2_KNN, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Gaussian_mu2_KNN.csv", Stats_Conen_DIM_Mu2_KNN, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Gaussian_mu2_KNN.csv", Stats_MI_DIM_Mu2_KNN, delimiter=",")
Stats_MI_DIM_Mu2_KNN

array([[0.45136067, 0.36137394, 0.24248797, 0.15867744, 0.08590439,
        0.06177672, 0.04502619, 0.03584093],
       [0.44675316, 0.33301365, 0.23211168, 0.14721612, 0.09972148,
        0.06625838, 0.04511296, 0.03529482],
       [0.43667017, 0.33223695, 0.23711204, 0.16044714, 0.10404505,
        0.07085447, 0.04325459, 0.03561122],
       [0.45758389, 0.35585541, 0.21389948, 0.14563329, 0.09731376,
        0.06215233, 0.04762439, 0.03393847],
       [0.44871589, 0.35479532, 0.20920238, 0.16072326, 0.09756193,
        0.05918992, 0.0449999 , 0.03695313],
       [0.44947375, 0.34994924, 0.21371369, 0.14744315, 0.0869084 ,
        0.06439109, 0.04552902, 0.03630849],
       [0.47215135, 0.34378774, 0.2320173 , 0.15916884, 0.10100987,
        0.06205465, 0.04399639, 0.03682614],
       [0.43515948, 0.3600202 , 0.24543011, 0.15224498, 0.09595344,
        0.06175401, 0.04330879, 0.03277031],
       [0.45165629, 0.3409831 , 0.24344678, 0.1463897 , 0.09564489,
        0.06399182, 0.047898

In [35]:
##Add Logistics Regression for Logarithm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
REPs = 50
n = 512

Stats_MI_DIM_Mu2_LG = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_LG = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_LG = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        x_1 = np.random.normal(loc = np.array([2]*1+[0]*(p-1)),scale = np.array([1]*p),size=(n,p))
        x_2 = np.random.normal(loc = np.array([-2]*1+[0]*(p-1)),scale = np.array([1]*p),size=(n,p))
        x = np.vstack((x_1,x_2))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
        
        clf = LogisticRegression().fit(x, y)
        print(clf.score(x, y))
        posterior = clf.predict_proba(x)

        stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
        # print(stats_conen)

        # _, counts = np.unique(y_true_final, return_counts=True)
        H_Y = entropy([50,50], base=np.exp(1))
        # print(H_Y)
        stats_mi = H_Y - stats_conen

        pauc = roc_auc_score(
                y, posterior[:,1], max_fpr=0.1
            )
        print(stats_conen,stats_mi,pauc)
        
        stats_paucs_samplesize_mu0.append(pauc)
        stats_mi_samplesize_mu0.append(stats_mi)
        stats_conen_samplesize_mu0.append(stats_conen)

    Stats_Pauc_DIM_Mu2_LG[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_LG[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_LG[i,:] = stats_conen_samplesize_mu0
    

0
0.9853515625
0.05681231362480194 0.6363348669351433 0.9956191213507402
0.99609375
0.05613856510639618 0.6370086154535491 0.9985142758018093
1.0
0.04284301494146647 0.6503041656184788 1.0
1.0
0.029723805084918183 0.663423375475027 1.0
1.0
0.020922893541654237 0.6722242870182911 1.0
1.0
0.015783894794106725 0.6773632857658386 1.0
1.0
0.0114084374837446 0.6817387430762006 1.0
1.0
0.008092233507664536 0.6850549470522808 1.0
1
0.9912109375
0.04899153738541881 0.6441556431745264 0.9983938116776316
0.994140625
0.051464125096778404 0.6416830554631668 0.999598452919408
1.0
0.04947645686502736 0.643670723694918 1.0
1.0
0.02705686027397068 0.6660903202859746 1.0
1.0
0.02244660117331946 0.6707005793866259 1.0
1.0
0.014655163013448165 0.6784920175464971 1.0
1.0
0.011130293124578 0.6820168874353673 1.0
1.0
0.008397566949407128 0.6847496136105382 1.0
2
0.984375
0.06324458051294952 0.6299026000469957 0.9932700709292763
0.9931640625
0.05201137178864953 0.6411358087712957 0.9991969058388157
1.0
0.0341

In [36]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Gaussian_mu2_LG.csv", Stats_Pauc_DIM_Mu2_LG, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Gaussian_mu2_LG.csv", Stats_Conen_DIM_Mu2_LG, delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Gaussian_mu2_LG.csv", Stats_MI_DIM_Mu2_LG, delimiter=",")
Stats_MI_DIM_Mu2_LG

array([[0.63633487, 0.63700862, 0.65030417, 0.66342338, 0.67222429,
        0.67736329, 0.68173874, 0.68505495],
       [0.64415564, 0.64168306, 0.64367072, 0.66609032, 0.67070058,
        0.67849202, 0.68201689, 0.68474961],
       [0.6299026 , 0.64113581, 0.65903108, 0.664326  , 0.6725583 ,
        0.67735008, 0.68184978, 0.68493346],
       [0.62615286, 0.64481952, 0.65617658, 0.66641666, 0.67392572,
        0.67683392, 0.68169852, 0.68502822],
       [0.6322643 , 0.64375738, 0.6577571 , 0.6649493 , 0.67200832,
        0.67755809, 0.68179225, 0.68513379],
       [0.63148654, 0.63159643, 0.65553318, 0.66238203, 0.67048911,
        0.67769917, 0.68127683, 0.68514499],
       [0.62423273, 0.63293601, 0.65628539, 0.66446644, 0.67128614,
        0.67822431, 0.68160382, 0.68500402],
       [0.62683398, 0.64242367, 0.654641  , 0.66329881, 0.67124357,
        0.67807747, 0.68152645, 0.6851581 ],
       [0.64115661, 0.63177853, 0.65198864, 0.65981874, 0.67295261,
        0.67803335, 0.681971

In [37]:
### Add SVM to the Linear
##Add Logistics Regression for linear
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_val_predict

REPs = 20
n = 512

Stats_MI_DIM_Mu2_SVM = np.zeros((REPs,len(DIMENSIONS)))
Stats_Conen_DIM_Mu2_SVM = np.zeros((REPs,len(DIMENSIONS)))
Stats_Pauc_DIM_Mu2_SVM = np.zeros((REPs,len(DIMENSIONS)))


for i in range(0,REPs):
    print(i)
    stats_paucs_samplesize_mu0 =[]
    stats_mi_samplesize_mu0 =[]
    stats_conen_samplesize_mu0 =[]
    for p in DIMENSIONS:
        
        x_1 = np.random.normal(loc = np.array([2]*1+[0]*(p-1)),scale = np.array([1]*p),size=(n,p))
        x_2 = np.random.normal(loc = np.array([-2]*1+[0]*(p-1)),scale = np.array([1]*p),size=(n,p))
        x = np.vstack((x_1,x_2))
        y = np.array([0]*n+[1]*n).reshape(-1,1).ravel()
        
        model = SVC(probability=True,kernel = 'rbf')  # Set probability=True to enable probability estimates
        model.fit(x, y)
        posterior = model.predict_proba(x)

        stats_conen = np.mean(entropy(posterior, base=np.exp(1), axis=1))
        # print(stats_conen)

        # _, counts = np.unique(y_true_final, return_counts=True)
        H_Y = entropy([50,50], base=np.exp(1))
        # print(H_Y)
        stats_mi = H_Y - stats_conen

        pauc = roc_auc_score(
                y, posterior[:,1], max_fpr=0.1
            )
        print(stats_conen,stats_mi,pauc)
        
        stats_paucs_samplesize_mu0.append(pauc)
        stats_mi_samplesize_mu0.append(stats_mi)
        stats_conen_samplesize_mu0.append(stats_conen)

    Stats_Pauc_DIM_Mu2_SVM[i,:] = stats_paucs_samplesize_mu0
    Stats_MI_DIM_Mu2_SVM[i,:] = stats_mi_samplesize_mu0
    Stats_Conen_DIM_Mu2_SVM[i,:] = stats_conen_samplesize_mu0
    

0
0.04362946519378967 0.6495177153661557 0.9989158228824013
0.028265948477918346 0.664881232082027 0.9998594585217928
0.006378193671194318 0.686768986888751 1.0
0.0035938547381012278 0.689553325821844 1.0
0.0013516325062771528 0.6917955480536682 1.0
0.0002251074223215274 0.6929220731376238 1.0
6.829115641218437e-05 0.6930788894035331 1.0
3.5217299805441492e-06 0.6931436588299648 1.0
1
0.027788519774641363 0.6653586607853039 0.9996586849814968
0.016154889107645613 0.6769922914522997 1.0
0.008563155840204159 0.6845840247197411 1.0
0.002543043838933692 0.6906041367210116 1.0
0.004796918401037463 0.6883502621589078 1.0
0.0001732270445803237 0.692973953515365 1.0
9.91373215348165e-05 0.6930480432384105 1.0
2.378936487778576e-06 0.6931448016234575 1.0
2
0.049111107921227456 0.6440360726387179 0.9990764417146382
0.016030392213806213 0.6771167883461391 0.9999196905838816
0.012597235034458835 0.6805499455254864 0.9999598452919407
0.004342587653706648 0.6888045929062386 1.0
0.0026923198655652986

ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [38]:
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Paucs_Dim_Gaussian_mu2_SVM.csv", Stats_Pauc_DIM_Mu2_SVM[:16,:], delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_Conen_Dim_Gaussian_mu2_SVM.csv", Stats_Conen_DIM_Mu2_SVM[:16,:], delimiter=",")
np.savetxt("/Users/baiyuxin/Desktop/JHU/NDD/Cancer/vs_dimension/Stats_MI_Dim_Log_Gaussian_mu2_SVM.csv", Stats_MI_DIM_Mu2_SVM[:16,:], delimiter=",")
Stats_MI_DIM_Mu2_SVM[:16,:]

array([[0.64951772, 0.66488123, 0.68676899, 0.68955333, 0.69179555,
        0.69292207, 0.69307889, 0.69314366],
       [0.66535866, 0.67699229, 0.68458402, 0.69060414, 0.68835026,
        0.69297395, 0.69304804, 0.6931448 ],
       [0.64403607, 0.67711679, 0.68054995, 0.68880459, 0.69045486,
        0.69296144, 0.69311982, 0.69314611],
       [0.67328935, 0.67357606, 0.6840354 , 0.68925449, 0.69216803,
        0.69196485, 0.69313117, 0.69314569],
       [0.65211595, 0.67642285, 0.68826596, 0.68825391, 0.69286303,
        0.69222485, 0.69309591, 0.69314508],
       [0.65313005, 0.66856745, 0.68598161, 0.690691  , 0.69182909,
        0.69309072, 0.69311203, 0.69314588],
       [0.65959157, 0.6807123 , 0.68694318, 0.68965487, 0.69143388,
        0.69268101, 0.69311828, 0.69314607],
       [0.64766465, 0.66771929, 0.68320869, 0.68919589, 0.69256366,
        0.69236164, 0.69312763, 0.69314592],
       [0.66490579, 0.67765182, 0.68551565, 0.68971139, 0.69213285,
        0.69301026, 0.693125