In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot
from statsmodels.graphics.gofplots import qqplot
from scipy import stats
from tqdm import tqdm

  from collections import Sequence
  from collections import Iterable
  from collections import Mapping, namedtuple, defaultdict, Sequence


In [2]:
def gen(n):
    """Simulate ``n`` observations from the five-feature regression model.

    Every feature is drawn independently from U(0, 1) through the global
    NumPy random state, and the response is

        10 * sin(pi * x1 * x2) + 20 * (x3 - 0.05)**2 + 10 * x4 + 5 * x5 + noise

    with standard normal noise.

    Parameters
    ----------
    n : int
        Number of observations to draw.

    Returns
    -------
    X : ndarray of shape (n, 5)
        Feature matrix, columns in the order x1..x5.
    y : ndarray of shape (n,)
        Noisy response.
    """
    # Draw the columns one at a time so the random stream is consumed
    # feature by feature, followed by the noise.
    columns = [np.random.uniform(size = n).reshape((n, 1)) for _ in range(5)]
    X = np.hstack(columns)

    noise = np.random.randn(n).reshape((n, 1))

    c1, c2, c3, c4, c5 = columns
    signal = 10 * np.sin(np.pi * c1 * c2) + 20 * (c3 - 0.05) ** 2 + 10 * c4 + 5 * c5

    return X, (signal + noise).ravel()

In [3]:
def predict_var(self, X, method = "corrected"):
    """Estimate the variance of a fitted forest's prediction at each row of ``X``.

    Parameters
    ----------
    self : fitted ensemble
        Must expose ``n_estimators``, ``estimators_`` (each element having a
        ``predict`` method), ``n_samples_``, ``n_subsamples_`` and
        ``inbag_times_``. ``inbag_times_`` is indexed as
        ``[training sample, tree]``; presumably it holds how many times each
        sample was drawn for each tree -- confirm against the forest
        implementation.
    X : ndarray of shape (m, n_features)
        Query points.
    method : {"corrected", "BM", "IJ"}
        Which estimator to compute.

    Returns
    -------
    list
        * ``"BM"``: ``[variance, zeta1, zetan]``
        * ``"IJ"``: ``[diag(covariance), cov, zetan]``
        * ``"corrected"``: ``[scaled sigma_M^2, scaled SSr / C]``, both
          multiplied by ``n_subsamples_**2 / n_samples_``.

        Every entry is a Python ``float`` when it holds a single value (the
        one-query-row case used by the experiments) and an ndarray otherwise.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    n_trees = self.n_estimators
    m = X.shape[0]

    # Column t holds tree t's prediction for every query row.
    predict_all = np.zeros((m, n_trees))
    for t_idx in range(n_trees):
        predict_all[:, t_idx] = self.estimators_[t_idx].predict(X)
    pred = np.mean(predict_all, axis = 1)

    inbag_times_ = np.asarray(self.inbag_times_)

    if method == "BM":
        cond_exp_full = _inbag_weighted_means(predict_all, inbag_times_)

        zeta1_full = np.var(cond_exp_full, axis = 0)   # spread across training samples
        zetan_full = np.var(predict_all, axis = 1)     # spread across trees
        variance = zeta1_full * (self.n_subsamples_ ** 2) / self.n_samples_ + zetan_full / n_trees

        return [_float_if_single(variance), _float_if_single(zeta1_full), _float_if_single(zetan_full)]

    if method == "IJ":
        f_centered = predict_all - pred.reshape(m, 1)
        i_centered = inbag_times_ - np.mean(inbag_times_, axis = 1).reshape(self.n_samples_, 1)
        corr = np.dot(f_centered, i_centered.T) / n_trees
        cov = np.dot(corr, corr.T)
        zetan_full = np.cov(predict_all)
        covariance = cov + zetan_full / n_trees

        return [_float_if_single(np.diagonal(covariance)), _float_if_single(cov), _float_if_single(zetan_full)]

    if method == "corrected":
        # Drop samples that were never in-bag BEFORE averaging: np.average
        # raises ZeroDivisionError on an all-zero weight row, and the group
        # means must stay aligned row-for-row with nk below.
        inbag_used = inbag_times_[np.sum(inbag_times_, axis = 1) > 0]
        cond_exp = _inbag_weighted_means(predict_all, inbag_used)

        nk = np.sum(inbag_used, axis = 1)   # in-bag total per retained sample
        K = len(nk)
        C = np.sum(nk)

        # Between-group sum of squares, one value per query row.
        SSr = np.dot(((cond_exp - pred) ** 2).T, nk)

        # Within-group sum of squares over the trees that used sample i.
        SSe = np.zeros(m)
        for i in range(K):
            in_tree = inbag_used[i, :] >= 1
            SSe += np.sum((predict_all[:, in_tree].T - cond_exp[i, :]) ** 2, axis = 0)

        sigma_e_squared = SSe / (C - K)
        sigma_M_squared = (SSr - (K - 1) * sigma_e_squared) / (C - np.sum(nk ** 2) / C)
        sigma_M_squared_without = SSr / C

        scale = (self.n_subsamples_ ** 2) / self.n_samples_
        return [scale * _float_if_single(sigma_M_squared), scale * _float_if_single(sigma_M_squared_without)]

    raise ValueError("unknown method %r; expected 'corrected', 'BM' or 'IJ'" % (method,))


def _inbag_weighted_means(predict_all, inbag):
    """Return an array whose row i is the tree-prediction mean weighted by ``inbag[i, :]``."""
    means = np.zeros((inbag.shape[0], predict_all.shape[0]))
    for i in range(inbag.shape[0]):
        means[i, :] = np.average(predict_all, weights = inbag[i, :], axis = 1)
    return means


def _float_if_single(values):
    """Return a Python float for a one-element input, otherwise the values as an ndarray."""
    arr = np.asarray(values)
    return float(arr.reshape(-1)[0]) if arr.size == 1 else arr
        

### Test point 1: centre of the unit cube, x = (0.5, 0.5, 0.5, 0.5, 0.5)

In [4]:
# Simulation at the fixed test point with subsample size 100.
# For each forest size B the same protocol is repeated `rep` times:
# draw a fresh training set, fit the forest, record the prediction and the
# two variance estimates ("corrected" -> c, "IJ" -> uc).
np.random.seed(11)
rep = 500
n_sub = 100
test = np.array([[0.5, 0.5, 0.5, 0.5, 0.5]])

for n_trees in (500, 1000, 2500, 5000):

    print("========================= B = " + str(n_trees) + " =========================")

    pred = []
    var_uc = []
    var_c = []

    for r in tqdm(range(rep)):

        X, y = gen(500)

        # NOTE(review): fit() is called with n_subsamples/replace, so this
        # relies on a RandomForestRegressor that accepts those arguments.
        model = RandomForestRegressor(n_estimators = n_trees)
        model.fit(X, y, n_subsamples = n_sub, replace = True)
        pred.append(model.predict(test)[0])
        var_c.append(predict_var(model, test)[0])
        var_uc.append(predict_var(model, test, method = "IJ")[0])

    # Normality of the standardized predictions, then the ratio of the mean
    # estimated variance to the empirical variance of the predictions.
    print("kstest: " + str(stats.kstest(stats.zscore(pred), cdf = "norm")))
    print("variance ratio for c: " + str(np.mean(var_c) / np.var(pred)))
    print("variance ratio for uc: " + str(np.mean(var_uc) / np.var(pred)))

    # Coverage of the 1.96-sigma interval around each prediction, using the
    # mean prediction over all replications as the target.
    target = np.mean(pred)
    for label, variances in (("c", var_c), ("uc", var_uc)):
        covered = 0
        for r in range(rep):
            half_width = 1.96 * np.sqrt(variances[r])
            if pred[r] - half_width <= target <= pred[r] + half_width:
                covered += 1
        print("C.C. for " + label + ": " + str(covered / rep))


  0%|          | 0/500 [00:00<?, ?it/s]



100%|██████████| 500/500 [07:26<00:00,  1.12it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.02817183530313616, pvalue=0.822334451360713)
variance ratio for c: 1.4962019383865082
variance ratio for uc: 7.698478492230314
C.C. for c: 0.972
C.C. for c: 1.0


100%|██████████| 500/500 [32:13<00:00,  3.87s/it]    
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.03418969131895533, pvalue=0.6075215512234917)
variance ratio for c: 1.383949252857507
variance ratio for uc: 4.9539848670331805
C.C. for c: 0.964
C.C. for c: 1.0


100%|██████████| 500/500 [35:22<00:00,  4.25s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.026378250385301838, pvalue=0.8774485030044082)
variance ratio for c: 1.0327729094031193
variance ratio for uc: 2.306802305550519
C.C. for c: 0.942
C.C. for c: 0.992


100%|██████████| 500/500 [1:06:59<00:00,  8.04s/it]

kstest: KstestResult(statistic=0.031244887893097872, pvalue=0.7134698710987883)
variance ratio for c: 1.0305116735585065
variance ratio for uc: 1.7073297150650855
C.C. for c: 0.95
C.C. for c: 0.98





In [4]:
# Simulation at the fixed test point with subsample size 250.
# For each forest size B the same protocol is repeated `rep` times:
# draw a fresh training set, fit the forest, record the prediction and the
# two variance estimates ("corrected" -> c, "IJ" -> uc).
np.random.seed(11)
rep = 500
test = np.array([[0.5, 0.5, 0.5, 0.5, 0.5]])
n_sub = 250

for n_trees in (500, 1000, 2500, 5000):

    print("========================= B = " + str(n_trees) + " =========================")

    pred = []
    var_uc = []
    var_c = []

    for r in tqdm(range(rep)):

        X, y = gen(500)

        # NOTE(review): fit() is called with n_subsamples/replace, so this
        # relies on a RandomForestRegressor that accepts those arguments.
        model = RandomForestRegressor(n_estimators = n_trees)
        model.fit(X, y, n_subsamples = n_sub, replace = True)
        pred.append(model.predict(test)[0])
        var_c.append(predict_var(model, test)[0])
        var_uc.append(predict_var(model, test, method = "IJ")[0])

    # Normality of the standardized predictions, then the ratio of the mean
    # estimated variance to the empirical variance of the predictions.
    print("kstest: " + str(stats.kstest(stats.zscore(pred), cdf = "norm")))
    print("variance ratio for c: " + str(np.mean(var_c) / np.var(pred)))
    print("variance ratio for uc: " + str(np.mean(var_uc) / np.var(pred)))

    # Coverage of the 1.96-sigma interval around each prediction, using the
    # mean prediction over all replications as the target.
    target = np.mean(pred)
    for label, variances in (("c", var_c), ("uc", var_uc)):
        covered = 0
        for r in range(rep):
            half_width = 1.96 * np.sqrt(variances[r])
            if pred[r] - half_width <= target <= pred[r] + half_width:
                covered += 1
        print("C.C. for " + label + ": " + str(covered / rep))


  0%|          | 0/500 [00:00<?, ?it/s]



100%|██████████| 500/500 [08:08<00:00,  1.02it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.028305771321911755, pvalue=0.8179080919708579)
variance ratio for c: 2.7233822128076977
variance ratio for uc: 9.426501791712008
C.C. for c: 1.0
C.C. for c: 1.0


100%|██████████| 500/500 [17:12<00:00,  2.07s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.0308969723864862, pvalue=0.7263636142975307)
variance ratio for c: 2.0293553543652587
variance ratio for uc: 5.744796088019631
C.C. for c: 0.98
C.C. for c: 1.0


100%|██████████| 500/500 [43:08<00:00,  5.18s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.05858648623166196, pvalue=0.062074846781612424)
variance ratio for c: 1.2496289449140852
variance ratio for uc: 2.565490467880983
C.C. for c: 0.954
C.C. for c: 0.998


100%|██████████| 500/500 [1:21:13<00:00,  9.75s/it]

kstest: KstestResult(statistic=0.033727506346396874, pvalue=0.6270890036476992)
variance ratio for c: 1.0630173089783796
variance ratio for uc: 1.7179026957172823
C.C. for c: 0.95
C.C. for c: 0.974





In [5]:
# Simulation at the fixed test point with subsample size 500.
# For each forest size B the same protocol is repeated `rep` times:
# draw a fresh training set, fit the forest, record the prediction and the
# two variance estimates ("corrected" -> c, "IJ" -> uc).
np.random.seed(11)
rep = 500
test = np.array([[0.5, 0.5, 0.5, 0.5, 0.5]])
n_sub = 500

for n_trees in (500, 1000, 2500, 5000):

    print("========================= B = " + str(n_trees) + " =========================")

    pred = []
    var_uc = []
    var_c = []

    for r in tqdm(range(rep)):

        X, y = gen(500)

        # NOTE(review): fit() is called with n_subsamples/replace, so this
        # relies on a RandomForestRegressor that accepts those arguments.
        model = RandomForestRegressor(n_estimators = n_trees)
        model.fit(X, y, n_subsamples = n_sub, replace = True)
        pred.append(model.predict(test)[0])
        var_c.append(predict_var(model, test)[0])
        var_uc.append(predict_var(model, test, method = "IJ")[0])

    # Normality of the standardized predictions, then the ratio of the mean
    # estimated variance to the empirical variance of the predictions.
    print("kstest: " + str(stats.kstest(stats.zscore(pred), cdf = "norm")))
    print("variance ratio for c: " + str(np.mean(var_c) / np.var(pred)))
    print("variance ratio for uc: " + str(np.mean(var_uc) / np.var(pred)))

    # Coverage of the 1.96-sigma interval around each prediction, using the
    # mean prediction over all replications as the target.
    target = np.mean(pred)
    for label, variances in (("c", var_c), ("uc", var_uc)):
        covered = 0
        for r in range(rep):
            half_width = 1.96 * np.sqrt(variances[r])
            if pred[r] - half_width <= target <= pred[r] + half_width:
                covered += 1
        print("C.C. for " + label + ": " + str(covered / rep))


  0%|          | 0/500 [00:00<?, ?it/s]



100%|██████████| 500/500 [09:48<00:00,  1.18s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.03328755220999102, pvalue=0.646043725319522)
variance ratio for c: 4.543754971779067
variance ratio for uc: 10.723673386149006
C.C. for c: 1.0
C.C. for c: 1.0


100%|██████████| 500/500 [19:20<00:00,  2.32s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.04612795484193122, pvalue=0.23097018226369512)
variance ratio for c: 3.025081524438964
variance ratio for uc: 6.441242987320396
C.C. for c: 0.99
C.C. for c: 1.0


100%|██████████| 500/500 [48:29<00:00,  5.82s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.05149289467168949, pvalue=0.13627171142812514)
variance ratio for c: 1.6639710919494546
variance ratio for uc: 2.909403594799179
C.C. for c: 0.974
C.C. for c: 0.998


100%|██████████| 500/500 [1:42:23<00:00, 12.29s/it]

kstest: KstestResult(statistic=0.02381852295622988, pvalue=0.9392070238851798)
variance ratio for c: 1.165242608482857
variance ratio for uc: 1.7294340522524703
C.C. for c: 0.95
C.C. for c: 0.972





### Test point 2: a single point drawn uniformly from the unit cube (seed 11)

In [4]:
# Simulation at a randomly drawn test point with subsample size 100.
# For each forest size B the same protocol is repeated `rep` times:
# draw a fresh training set, fit the forest, record the prediction and the
# two variance estimates ("corrected" -> c, "IJ" -> uc).
np.random.seed(11)

# The test point is the first five uniform draws after seeding.
x1 = np.random.uniform(size = 1)[0]
x2 = np.random.uniform(size = 1)[0]
x3 = np.random.uniform(size = 1)[0]
x4 = np.random.uniform(size = 1)[0]
x5 = np.random.uniform(size = 1)[0]
test = np.array([[x1, x2, x3, x4, x5]])

rep = 500
n_sub = 100

for n_trees in (500, 1000, 2500, 5000):

    print("========================= B = " + str(n_trees) + " =========================")

    pred = []
    var_uc = []
    var_c = []

    for r in tqdm(range(rep)):

        X, y = gen(500)

        # NOTE(review): fit() is called with n_subsamples/replace, so this
        # relies on a RandomForestRegressor that accepts those arguments.
        model = RandomForestRegressor(n_estimators = n_trees)
        model.fit(X, y, n_subsamples = n_sub, replace = True)
        pred.append(model.predict(test)[0])
        var_c.append(predict_var(model, test)[0])
        var_uc.append(predict_var(model, test, method = "IJ")[0])

    # Normality of the standardized predictions, then the ratio of the mean
    # estimated variance to the empirical variance of the predictions.
    print("kstest: " + str(stats.kstest(stats.zscore(pred), cdf = "norm")))
    print("variance ratio for c: " + str(np.mean(var_c) / np.var(pred)))
    print("variance ratio for uc: " + str(np.mean(var_uc) / np.var(pred)))

    # Coverage of the 1.96-sigma interval around each prediction, using the
    # mean prediction over all replications as the target.
    target = np.mean(pred)
    for label, variances in (("c", var_c), ("uc", var_uc)):
        covered = 0
        for r in range(rep):
            half_width = 1.96 * np.sqrt(variances[r])
            if pred[r] - half_width <= target <= pred[r] + half_width:
                covered += 1
        print("C.C. for " + label + ": " + str(covered / rep))


  0%|          | 0/500 [00:00<?, ?it/s]



100%|██████████| 500/500 [07:08<00:00,  1.17it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.02062062338800047, pvalue=0.9835870209490137)
variance ratio for c: 1.564761004768483
variance ratio for uc: 8.387087703271844
C.C. for c: 0.964
C.C. for uc: 1.0


100%|██████████| 500/500 [15:09<00:00,  1.82s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.018931981045107737, pvalue=0.9939371189258969)
variance ratio for c: 1.3115531378904575
variance ratio for uc: 4.929019614099158
C.C. for c: 0.966
C.C. for c: 1.0


100%|██████████| 500/500 [36:16<00:00,  4.35s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.021971820921400598, pvalue=0.9692354385130867)
variance ratio for c: 1.1058144822144367
variance ratio for uc: 2.5635332039276197
C.C. for c: 0.946
C.C. for c: 0.998


100%|██████████| 500/500 [1:06:43<00:00,  8.01s/it]

kstest: KstestResult(statistic=0.021483530773912296, pvalue=0.9751249629044848)
variance ratio for c: 1.0460131565261819
variance ratio for uc: 1.7748834843695516
C.C. for c: 0.948
C.C. for c: 0.992





In [5]:
# Simulation at a randomly drawn test point with subsample size 250.
# For each forest size B the same protocol is repeated `rep` times:
# draw a fresh training set, fit the forest, record the prediction and the
# two variance estimates ("corrected" -> c, "IJ" -> uc).
np.random.seed(11)

# The test point is the first five uniform draws after seeding.
x1 = np.random.uniform(size = 1)[0]
x2 = np.random.uniform(size = 1)[0]
x3 = np.random.uniform(size = 1)[0]
x4 = np.random.uniform(size = 1)[0]
x5 = np.random.uniform(size = 1)[0]
test = np.array([[x1, x2, x3, x4, x5]])

rep = 500
n_sub = 250

for n_trees in (500, 1000, 2500, 5000):

    print("========================= B = " + str(n_trees) + " =========================")

    pred = []
    var_uc = []
    var_c = []

    for r in tqdm(range(rep)):

        X, y = gen(500)

        # NOTE(review): fit() is called with n_subsamples/replace, so this
        # relies on a RandomForestRegressor that accepts those arguments.
        model = RandomForestRegressor(n_estimators = n_trees)
        model.fit(X, y, n_subsamples = n_sub, replace = True)
        pred.append(model.predict(test)[0])
        var_c.append(predict_var(model, test)[0])
        var_uc.append(predict_var(model, test, method = "IJ")[0])

    # Normality of the standardized predictions, then the ratio of the mean
    # estimated variance to the empirical variance of the predictions.
    print("kstest: " + str(stats.kstest(stats.zscore(pred), cdf = "norm")))
    print("variance ratio for c: " + str(np.mean(var_c) / np.var(pred)))
    print("variance ratio for uc: " + str(np.mean(var_uc) / np.var(pred)))

    # Coverage of the 1.96-sigma interval around each prediction, using the
    # mean prediction over all replications as the target.
    target = np.mean(pred)
    for label, variances in (("c", var_c), ("uc", var_uc)):
        covered = 0
        for r in range(rep):
            half_width = 1.96 * np.sqrt(variances[r])
            if pred[r] - half_width <= target <= pred[r] + half_width:
                covered += 1
        print("C.C. for " + label + ": " + str(covered / rep))


  0%|          | 0/500 [00:00<?, ?it/s]



100%|██████████| 500/500 [08:05<00:00,  1.03it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.0259939386941917, pvalue=0.8881031341526038)
variance ratio for c: 2.823136086089743
variance ratio for uc: 9.98091494677174
C.C. for c: 0.994
C.C. for uc: 1.0


100%|██████████| 500/500 [15:55<00:00,  1.91s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.022508669474416154, pvalue=0.9617891495339396)
variance ratio for c: 1.9682507805326042
variance ratio for uc: 5.678876094518563
C.C. for c: 0.99
C.C. for c: 1.0


100%|██████████| 500/500 [39:28<00:00,  4.74s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.02525017844991373, pvalue=0.9073978056217468)
variance ratio for c: 1.288003692879182
variance ratio for uc: 2.717573383060217
C.C. for c: 0.966
C.C. for c: 0.996


100%|██████████| 500/500 [1:18:41<00:00,  9.44s/it]

kstest: KstestResult(statistic=0.020072509014780554, pvalue=0.9877714147438206)
variance ratio for c: 1.2058729915619544
variance ratio for uc: 1.9766509240813237
C.C. for c: 0.952
C.C. for c: 0.986





In [6]:
np.random.seed(11)

# Fixed test point: five successive Uniform(0, 1) draws taken right after
# seeding. The names x1..x5 are kept so the cell leaves the same variables behind.
x1, x2, x3, x4, x5 = (np.random.uniform(size = 1)[0] for _ in range(5))
test = np.array([[x1, x2, x3, x4, x5]])

rep = 500      # simulated training sets per forest size
n_sub = 500    # forwarded to fit() as n_subsamples

# One pass per forest size B, in the same order as the former copy-pasted
# sections, so every statement still runs in the original sequence.
for n_trees in (500, 1000, 2500, 5000):
    print("========================= B = " + str(n_trees) + " =========================")
    pred = []      # point prediction at `test`, one per replicate
    var_uc = []    # predict_var(..., method = "IJ") estimate, one per replicate
    var_c = []     # predict_var default-method estimate, one per replicate

    for r in tqdm(range(rep)):

        # Fresh training set for every replicate.
        X, y = gen(500)

        model = RandomForestRegressor(n_estimators = n_trees)
        model.fit(X, y, n_subsamples = n_sub, replace = True)
        pred.append(model.predict(test)[0])
        var_c.append(predict_var(model, test)[0])
        var_uc.append(predict_var(model, test, method = "IJ")[0])

    print("kstest: " + str(stats.kstest(stats.zscore(pred), cdf = "norm")))
    print("variance ratio for c: " + str(np.mean(var_c) / np.var(pred)))
    print("variance ratio for uc: " + str(np.mean(var_uc) / np.var(pred)))

    # Fraction of replicates whose interval pred[r] +/- 1.96 * sqrt(var)
    # contains the mean prediction over all replicates.
    target = np.mean(pred)
    count_c = 0
    count_uc = 0
    for r in range(rep):
        lower = pred[r] - 1.96 * np.sqrt(var_c[r])
        upper = pred[r] + 1.96 * np.sqrt(var_c[r])
        if lower <= target <= upper:
            count_c += 1
        lower = pred[r] - 1.96 * np.sqrt(var_uc[r])
        upper = pred[r] + 1.96 * np.sqrt(var_uc[r])
        if lower <= target <= upper:
            count_uc += 1
    print("C.C. for c: " + str(count_c / rep))
    # Label fixed: the uc coverage used to be printed as "C.C. for c" for
    # every forest size except the first.
    print("C.C. for uc: " + str(count_uc / rep))


  0%|          | 0/500 [00:00<?, ?it/s]



100%|██████████| 500/500 [09:45<00:00,  1.17s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.023866975662316547, pvalue=0.9382506816460875)
variance ratio for c: 4.912112606215685
variance ratio for uc: 11.670802506262733
C.C. for c: 1.0
C.C. for uc: 1.0


100%|██████████| 500/500 [19:16<00:00,  2.31s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.03067107916003886, pvalue=0.7346829729344141)
variance ratio for c: 2.783834512217878
variance ratio for uc: 5.95176802172595
C.C. for c: 0.996
C.C. for c: 1.0


100%|██████████| 500/500 [47:46<00:00,  5.73s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.0348793032065563, pvalue=0.5789942913803638)
variance ratio for c: 1.6592645481551322
variance ratio for uc: 2.923489509235412
C.C. for c: 0.978
C.C. for c: 0.994


100%|██████████| 500/500 [1:35:20<00:00, 11.44s/it]

kstest: KstestResult(statistic=0.025113330797066236, pvalue=0.9107487425804756)
variance ratio for c: 1.4412288724848288
variance ratio for uc: 2.1471384851750654
C.C. for c: 0.954
C.C. for c: 0.984





### test point 3

In [7]:
np.random.seed(7)

# Fixed test point: five successive Uniform(0, 1) draws taken right after
# seeding. The names x1..x5 are kept so the cell leaves the same variables behind.
x1, x2, x3, x4, x5 = (np.random.uniform(size = 1)[0] for _ in range(5))
test = np.array([[x1, x2, x3, x4, x5]])

rep = 500      # simulated training sets per forest size
n_sub = 100    # forwarded to fit() as n_subsamples

# One pass per forest size B, in the same order as the former copy-pasted
# sections, so every statement still runs in the original sequence.
for n_trees in (500, 1000, 2500, 5000):
    print("========================= B = " + str(n_trees) + " =========================")
    pred = []      # point prediction at `test`, one per replicate
    var_uc = []    # predict_var(..., method = "IJ") estimate, one per replicate
    var_c = []     # predict_var default-method estimate, one per replicate

    for r in tqdm(range(rep)):

        # Fresh training set for every replicate.
        X, y = gen(500)

        model = RandomForestRegressor(n_estimators = n_trees)
        model.fit(X, y, n_subsamples = n_sub, replace = True)
        pred.append(model.predict(test)[0])
        var_c.append(predict_var(model, test)[0])
        var_uc.append(predict_var(model, test, method = "IJ")[0])

    print("kstest: " + str(stats.kstest(stats.zscore(pred), cdf = "norm")))
    print("variance ratio for c: " + str(np.mean(var_c) / np.var(pred)))
    print("variance ratio for uc: " + str(np.mean(var_uc) / np.var(pred)))

    # Fraction of replicates whose interval pred[r] +/- 1.96 * sqrt(var)
    # contains the mean prediction over all replicates.
    target = np.mean(pred)
    count_c = 0
    count_uc = 0
    for r in range(rep):
        lower = pred[r] - 1.96 * np.sqrt(var_c[r])
        upper = pred[r] + 1.96 * np.sqrt(var_c[r])
        if lower <= target <= upper:
            count_c += 1
        lower = pred[r] - 1.96 * np.sqrt(var_uc[r])
        upper = pred[r] + 1.96 * np.sqrt(var_uc[r])
        if lower <= target <= upper:
            count_uc += 1
    print("C.C. for c: " + str(count_c / rep))
    # Label fixed: the uc coverage used to be printed as "C.C. for c" for
    # every forest size except the first.
    print("C.C. for uc: " + str(count_uc / rep))


  0%|          | 0/500 [00:00<?, ?it/s]



100%|██████████| 500/500 [06:51<00:00,  1.22it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.029418134411848706, pvalue=0.7798182987938815)
variance ratio for c: 1.4229462492706944
variance ratio for uc: 7.302035049039813
C.C. for c: 0.966
C.C. for uc: 1.0


100%|██████████| 500/500 [13:30<00:00,  1.62s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.047201616017295056, pvalue=0.20878701553806248)
variance ratio for c: 1.2472362056355415
variance ratio for uc: 4.443300161888869
C.C. for c: 0.964
C.C. for c: 1.0


100%|██████████| 500/500 [33:23<00:00,  4.01s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.029368060316007638, pvalue=0.7815786608483761)
variance ratio for c: 1.0347984046658076
variance ratio for uc: 2.273591859153554
C.C. for c: 0.944
C.C. for c: 0.994


100%|██████████| 500/500 [1:06:37<00:00,  8.00s/it]

kstest: KstestResult(statistic=0.024812152311877, pvalue=0.9178976314130891)
variance ratio for c: 0.9884080265342633
variance ratio for uc: 1.6431598399843
C.C. for c: 0.95
C.C. for c: 0.99





In [8]:
np.random.seed(7)

# Fixed test point: five successive Uniform(0, 1) draws taken right after
# seeding. The names x1..x5 are kept so the cell leaves the same variables behind.
x1, x2, x3, x4, x5 = (np.random.uniform(size = 1)[0] for _ in range(5))
test = np.array([[x1, x2, x3, x4, x5]])

rep = 500      # simulated training sets per forest size
n_sub = 250    # forwarded to fit() as n_subsamples

# One pass per forest size B, in the same order as the former copy-pasted
# sections, so every statement still runs in the original sequence.
for n_trees in (500, 1000, 2500, 5000):
    print("========================= B = " + str(n_trees) + " =========================")
    pred = []      # point prediction at `test`, one per replicate
    var_uc = []    # predict_var(..., method = "IJ") estimate, one per replicate
    var_c = []     # predict_var default-method estimate, one per replicate

    for r in tqdm(range(rep)):

        # Fresh training set for every replicate.
        X, y = gen(500)

        model = RandomForestRegressor(n_estimators = n_trees)
        model.fit(X, y, n_subsamples = n_sub, replace = True)
        pred.append(model.predict(test)[0])
        var_c.append(predict_var(model, test)[0])
        var_uc.append(predict_var(model, test, method = "IJ")[0])

    print("kstest: " + str(stats.kstest(stats.zscore(pred), cdf = "norm")))
    print("variance ratio for c: " + str(np.mean(var_c) / np.var(pred)))
    print("variance ratio for uc: " + str(np.mean(var_uc) / np.var(pred)))

    # Fraction of replicates whose interval pred[r] +/- 1.96 * sqrt(var)
    # contains the mean prediction over all replicates.
    target = np.mean(pred)
    count_c = 0
    count_uc = 0
    for r in range(rep):
        lower = pred[r] - 1.96 * np.sqrt(var_c[r])
        upper = pred[r] + 1.96 * np.sqrt(var_c[r])
        if lower <= target <= upper:
            count_c += 1
        lower = pred[r] - 1.96 * np.sqrt(var_uc[r])
        upper = pred[r] + 1.96 * np.sqrt(var_uc[r])
        if lower <= target <= upper:
            count_uc += 1
    print("C.C. for c: " + str(count_c / rep))
    # Label fixed: the uc coverage used to be printed as "C.C. for c" for
    # every forest size except the first.
    print("C.C. for uc: " + str(count_uc / rep))


  0%|          | 0/500 [00:00<?, ?it/s]



100%|██████████| 500/500 [08:05<00:00,  1.03it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.04320555403992665, pvalue=0.3004929502126097)
variance ratio for c: 2.743552875938113
variance ratio for uc: 9.596922636490302
C.C. for c: 0.994
C.C. for uc: 1.0


100%|██████████| 500/500 [15:58<00:00,  1.92s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.05889421516376803, pvalue=0.05985610347582242)
variance ratio for c: 1.8041087252444183
variance ratio for uc: 5.1754082778494155
C.C. for c: 0.984
C.C. for c: 1.0


100%|██████████| 500/500 [39:28<00:00,  4.74s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.03947738783502763, pvalue=0.41004489579816183)
variance ratio for c: 1.250110628436906
variance ratio for uc: 2.5461833437004446
C.C. for c: 0.956
C.C. for c: 0.996


100%|██████████| 500/500 [1:19:10<00:00,  9.50s/it]

kstest: KstestResult(statistic=0.027363394756617088, pvalue=0.8481941921693978)
variance ratio for c: 1.110784442617497
variance ratio for uc: 1.8044647273525896
C.C. for c: 0.952
C.C. for c: 0.986





In [9]:
np.random.seed(7)

# Fixed test point: five successive Uniform(0, 1) draws taken right after
# seeding. The names x1..x5 are kept so the cell leaves the same variables behind.
x1, x2, x3, x4, x5 = (np.random.uniform(size = 1)[0] for _ in range(5))
test = np.array([[x1, x2, x3, x4, x5]])

rep = 500      # simulated training sets per forest size
n_sub = 500    # forwarded to fit() as n_subsamples

# One pass per forest size B, in the same order as the former copy-pasted
# sections, so every statement still runs in the original sequence.
for n_trees in (500, 1000, 2500, 5000):
    print("========================= B = " + str(n_trees) + " =========================")
    pred = []      # point prediction at `test`, one per replicate
    var_uc = []    # predict_var(..., method = "IJ") estimate, one per replicate
    var_c = []     # predict_var default-method estimate, one per replicate

    for r in tqdm(range(rep)):

        # Fresh training set for every replicate.
        X, y = gen(500)

        model = RandomForestRegressor(n_estimators = n_trees)
        model.fit(X, y, n_subsamples = n_sub, replace = True)
        pred.append(model.predict(test)[0])
        var_c.append(predict_var(model, test)[0])
        var_uc.append(predict_var(model, test, method = "IJ")[0])

    print("kstest: " + str(stats.kstest(stats.zscore(pred), cdf = "norm")))
    print("variance ratio for c: " + str(np.mean(var_c) / np.var(pred)))
    print("variance ratio for uc: " + str(np.mean(var_uc) / np.var(pred)))

    # Fraction of replicates whose interval pred[r] +/- 1.96 * sqrt(var)
    # contains the mean prediction over all replicates.
    target = np.mean(pred)
    count_c = 0
    count_uc = 0
    for r in range(rep):
        lower = pred[r] - 1.96 * np.sqrt(var_c[r])
        upper = pred[r] + 1.96 * np.sqrt(var_c[r])
        if lower <= target <= upper:
            count_c += 1
        lower = pred[r] - 1.96 * np.sqrt(var_uc[r])
        upper = pred[r] + 1.96 * np.sqrt(var_uc[r])
        if lower <= target <= upper:
            count_uc += 1
    print("C.C. for c: " + str(count_c / rep))
    # Label fixed: the uc coverage used to be printed as "C.C. for c" for
    # every forest size except the first.
    print("C.C. for uc: " + str(count_uc / rep))


  0%|          | 0/500 [00:00<?, ?it/s]



100%|██████████| 500/500 [10:11<00:00,  1.22s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.044722644951249224, pvalue=0.2626863687402076)
variance ratio for c: 4.832196109815473
variance ratio for uc: 11.55616005543373
C.C. for c: 0.998
C.C. for uc: 1.0


100%|██████████| 500/500 [3:20:33<00:00, 24.07s/it]      
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.052025658972071176, pvalue=0.1289084616203641)
variance ratio for c: 2.7163480344944517
variance ratio for uc: 5.795737573273643
C.C. for c: 0.994
C.C. for c: 1.0


100%|██████████| 500/500 [48:17<00:00,  5.79s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

kstest: KstestResult(statistic=0.024166252053388515, pvalue=0.9321538919152805)
variance ratio for c: 1.5515205910685717
variance ratio for uc: 2.703408973760596
C.C. for c: 0.958
C.C. for c: 0.99


100%|██████████| 500/500 [1:36:02<00:00, 11.53s/it]

kstest: KstestResult(statistic=0.037844164764084975, pvalue=0.465758808136541)
variance ratio for c: 1.2448892863348302
variance ratio for uc: 1.8566865849772363
C.C. for c: 0.946
C.C. for c: 0.986





In [10]:
np.random.seed(7)

# Fixed test point: five successive Uniform(0, 1) draws taken right after
# seeding. The names x1..x5 are kept so the cell leaves the same variables behind.
x1, x2, x3, x4, x5 = (np.random.uniform(size = 1)[0] for _ in range(5))
test = np.array([[x1, x2, x3, x4, x5]])

rep = 500      # simulated training sets per forest size
n_sub = 500    # forwarded to fit() as n_subsamples

# One pass per forest size B, in the same order as the former copy-pasted
# sections, so every statement still runs in the original sequence.
for n_trees in (500, 1000, 2500, 5000):
    print("========================= B = " + str(n_trees) + " =========================")
    pred = []      # point prediction at `test`, one per replicate
    var_uc = []    # predict_var(..., method = "IJ") estimate, one per replicate
    var_c = []     # predict_var default-method estimate, one per replicate

    for r in tqdm(range(rep)):

        # Fresh training set for every replicate.
        X, y = gen(500)

        model = RandomForestRegressor(n_estimators = n_trees)
        model.fit(X, y, n_subsamples = n_sub, replace = True)
        pred.append(model.predict(test)[0])
        var_c.append(predict_var(model, test)[0])
        var_uc.append(predict_var(model, test, method = "IJ")[0])

    # This cell reports scipy's normaltest on the raw predictions rather than
    # a KS test on their z-scores.
    print("normal test: " + str(stats.normaltest(pred)))
    print("variance ratio for c: " + str(np.mean(var_c) / np.var(pred)))
    print("variance ratio for uc: " + str(np.mean(var_uc) / np.var(pred)))

    # Fraction of replicates whose interval pred[r] +/- 1.96 * sqrt(var)
    # contains the mean prediction over all replicates.
    target = np.mean(pred)
    count_c = 0
    count_uc = 0
    for r in range(rep):
        lower = pred[r] - 1.96 * np.sqrt(var_c[r])
        upper = pred[r] + 1.96 * np.sqrt(var_c[r])
        if lower <= target <= upper:
            count_c += 1
        lower = pred[r] - 1.96 * np.sqrt(var_uc[r])
        upper = pred[r] + 1.96 * np.sqrt(var_uc[r])
        if lower <= target <= upper:
            count_uc += 1
    print("C.C. for c: " + str(count_c / rep))
    # Label fixed: the uc coverage used to be printed as "C.C. for c" for
    # every forest size except the first.
    print("C.C. for uc: " + str(count_uc / rep))


  0%|          | 0/500 [00:00<?, ?it/s]



100%|██████████| 500/500 [09:56<00:00,  1.19s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

normal test: NormaltestResult(statistic=7.5323103919905865, pvalue=0.0231408645548115)
variance ratio for c: 4.832196109815473
variance ratio for uc: 11.55616005543373
C.C. for c: 0.998
C.C. for uc: 1.0


100%|██████████| 500/500 [19:33<00:00,  2.35s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

normal test: NormaltestResult(statistic=26.666752630546853, pvalue=1.619527180396347e-06)
variance ratio for c: 2.7163480344944517
variance ratio for uc: 5.795737573273643
C.C. for c: 0.994
C.C. for c: 1.0


100%|██████████| 500/500 [1:18:11<00:00,  9.38s/it]    
  0%|          | 0/500 [00:00<?, ?it/s]

normal test: NormaltestResult(statistic=6.721178942012414, pvalue=0.034714789550347507)
variance ratio for c: 1.5515205910685717
variance ratio for uc: 2.703408973760596
C.C. for c: 0.958
C.C. for c: 0.99


100%|██████████| 500/500 [5:30:05<00:00, 39.61s/it]      

normal test: NormaltestResult(statistic=13.445197419887922, pvalue=0.001203406841630118)
variance ratio for c: 1.2448892863348302
variance ratio for uc: 1.8566865849772363
C.C. for c: 0.946
C.C. for c: 0.986





In [11]:
np.random.seed(7)

# Test point built from five successive uniform draws made right after seeding.
x1, x2, x3, x4, x5 = (np.random.uniform(size = 1)[0] for _ in range(5))
test = np.array([[x1, x2, x3, x4, x5]])

rep = 500
n_sub = 500

print("========================= B = 500 =========================")
pred, var_uc, var_c = [], [], []

for r in tqdm(range(rep)):
    # New training set per replicate; the forest is fit with replace = False here.
    X, y = gen(500)
    model = RandomForestRegressor(n_estimators = 500)
    model.fit(X, y, n_subsamples = n_sub, replace = False)

    pred.append(model.predict(test)[0])
    var_c.append(predict_var(model, test)[0])
    var_uc.append(predict_var(model, test, method = "IJ")[0])

print("normal test: " + str(stats.normaltest(pred)))
print("variance ratio for c: " + str(np.mean(var_c) / np.var(pred)))
print("variance ratio for uc: " + str(np.mean(var_uc) / np.var(pred)))

# Count the replicates whose pred[r] +/- 1.96 * sqrt(var) interval contains the
# mean of all predictions, once per variance estimate.
target = np.mean(pred)
count_c = 0
count_uc = 0
for r in range(rep):
    lower = pred[r] - 1.96 * np.sqrt(var_c[r])
    upper = pred[r] + 1.96 * np.sqrt(var_c[r])
    if lower <= target <= upper:
        count_c += 1
    lower = pred[r] - 1.96 * np.sqrt(var_uc[r])
    upper = pred[r] + 1.96 * np.sqrt(var_uc[r])
    if lower <= target <= upper:
        count_uc += 1
print("C.C. for c: " + str(count_c / rep))
print("C.C. for uc: " + str(count_uc / rep))



print("========================= B = 1000 =========================")
# Per-replication results: the point prediction at `test` and the two variance
# estimates (first element of predict_var's result for the default method and
# for method="IJ"; "c"/"uc" presumably mean corrected/uncorrected).
pred, var_c, var_uc = [], [], []

for r in tqdm(range(rep)):

    # Fresh training sample of size 500 for every replication.
    X, y = gen(500)

    model = RandomForestRegressor(n_estimators = 1000)
    model.fit(X, y, n_subsamples = n_sub, replace = False)

    pred.append(model.predict(test)[0])
    var_c.append(predict_var(model, test)[0])
    var_uc.append(predict_var(model, test, method = "IJ")[0])

# Empirical variance of the predictions across replications; each ratio below
# compares the mean estimated variance against it.
emp_var = np.var(pred)
print("normal test: " + str(stats.normaltest(pred)))
print("variance ratio for c: " + str(np.mean(var_c) / emp_var))
print("variance ratio for uc: " + str(np.mean(var_uc) / emp_var))

# Coverage: fraction of replications whose interval pred +/- 1.96 * sqrt(var)
# contains the mean prediction over all replications.
# A negative variance estimate makes np.sqrt return NaN; comparisons with NaN
# are False, so such a replication is counted as a miss.
target = np.mean(pred)

count_c = 0
for center, v in zip(pred, var_c):
    half = 1.96 * np.sqrt(v)
    if center - half <= target <= center + half:
        count_c += 1

count_uc = 0
for center, v in zip(pred, var_uc):
    half = 1.96 * np.sqrt(v)
    if center - half <= target <= center + half:
        count_uc += 1

print("C.C. for c: " + str(count_c / rep))
# Label fixed: this line reports the coverage of the var_uc intervals and was
# previously printed under the duplicate label "C.C. for c: ".
print("C.C. for uc: " + str(count_uc / rep))



print("========================= B = 2500 =========================")
# Per-replication results: the point prediction at `test` and the two variance
# estimates (first element of predict_var's result for the default method and
# for method="IJ"; "c"/"uc" presumably mean corrected/uncorrected).
pred, var_c, var_uc = [], [], []

for r in tqdm(range(rep)):

    # Fresh training sample of size 500 for every replication.
    X, y = gen(500)

    model = RandomForestRegressor(n_estimators = 2500)
    model.fit(X, y, n_subsamples = n_sub, replace = False)

    pred.append(model.predict(test)[0])
    var_c.append(predict_var(model, test)[0])
    var_uc.append(predict_var(model, test, method = "IJ")[0])

# Empirical variance of the predictions across replications; each ratio below
# compares the mean estimated variance against it.
emp_var = np.var(pred)
print("normal test: " + str(stats.normaltest(pred)))
print("variance ratio for c: " + str(np.mean(var_c) / emp_var))
print("variance ratio for uc: " + str(np.mean(var_uc) / emp_var))

# Coverage: fraction of replications whose interval pred +/- 1.96 * sqrt(var)
# contains the mean prediction over all replications.
# A negative variance estimate makes np.sqrt return NaN; comparisons with NaN
# are False, so such a replication is counted as a miss.
target = np.mean(pred)

count_c = 0
for center, v in zip(pred, var_c):
    half = 1.96 * np.sqrt(v)
    if center - half <= target <= center + half:
        count_c += 1

count_uc = 0
for center, v in zip(pred, var_uc):
    half = 1.96 * np.sqrt(v)
    if center - half <= target <= center + half:
        count_uc += 1

print("C.C. for c: " + str(count_c / rep))
# Label fixed: this line reports the coverage of the var_uc intervals and was
# previously printed under the duplicate label "C.C. for c: ".
print("C.C. for uc: " + str(count_uc / rep))


print("========================= B = 5000 =========================")
# Per-replication results: the point prediction at `test` and the two variance
# estimates (first element of predict_var's result for the default method and
# for method="IJ"; "c"/"uc" presumably mean corrected/uncorrected).
pred, var_c, var_uc = [], [], []

for r in tqdm(range(rep)):

    # Fresh training sample of size 500 for every replication.
    X, y = gen(500)

    model = RandomForestRegressor(n_estimators = 5000)
    model.fit(X, y, n_subsamples = n_sub, replace = False)

    pred.append(model.predict(test)[0])
    var_c.append(predict_var(model, test)[0])
    var_uc.append(predict_var(model, test, method = "IJ")[0])

# Empirical variance of the predictions across replications; each ratio below
# compares the mean estimated variance against it.
emp_var = np.var(pred)
print("normal test: " + str(stats.normaltest(pred)))
print("variance ratio for c: " + str(np.mean(var_c) / emp_var))
print("variance ratio for uc: " + str(np.mean(var_uc) / emp_var))

# Coverage: fraction of replications whose interval pred +/- 1.96 * sqrt(var)
# contains the mean prediction over all replications.
# A negative variance estimate makes np.sqrt return NaN; comparisons with NaN
# are False, so such a replication is counted as a miss.
target = np.mean(pred)

count_c = 0
for center, v in zip(pred, var_c):
    half = 1.96 * np.sqrt(v)
    if center - half <= target <= center + half:
        count_c += 1

count_uc = 0
for center, v in zip(pred, var_uc):
    half = 1.96 * np.sqrt(v)
    if center - half <= target <= center + half:
        count_uc += 1

print("C.C. for c: " + str(count_c / rep))
# Label fixed: this line reports the coverage of the var_uc intervals and was
# previously printed under the duplicate label "C.C. for c: ".
print("C.C. for uc: " + str(count_uc / rep))


  0%|          | 0/500 [00:00<?, ?it/s]



100%|██████████| 500/500 [12:08<00:00,  1.46s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

normal test: NormaltestResult(statistic=15.082133863887938, pvalue=0.0005308309590496557)
variance ratio for c: -0.08730335692937251
variance ratio for uc: 0.00017460671385874507
C.C. for c: 0.0
C.C. for uc: 0.014


100%|██████████| 500/500 [24:03<00:00,  2.89s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

normal test: NormaltestResult(statistic=4.853205106558651, pvalue=0.08833644169533722)
variance ratio for c: -0.05249118792560302
variance ratio for uc: 0.00010498237585120603
C.C. for c: 0.0
C.C. for c: 0.008


100%|██████████| 500/500 [59:43<00:00,  7.17s/it]
  0%|          | 0/500 [00:00<?, ?it/s]

normal test: NormaltestResult(statistic=14.320214595182383, pvalue=0.0007769711816086631)
variance ratio for c: -0.019332619110539333
variance ratio for uc: 3.866523822107869e-05
C.C. for c: 0.0
C.C. for c: 0.002


100%|██████████| 500/500 [1:59:18<00:00, 14.32s/it]

normal test: NormaltestResult(statistic=42.627724615274616, pvalue=5.539967433449242e-10)
variance ratio for c: -0.010656924486475315
variance ratio for uc: 2.1313848972950592e-05
C.C. for c: 0.0
C.C. for c: 0.016



