In [1]:
import numpy as np
rng = np.random.default_rng()

In [2]:
def generate_sample_pool(total_features:int, subsample_amt:int, n_estimators:int) -> np.ndarray:
    pool = np.full(shape=(n_estimators, total_features), fill_value=False)
    for i in range(n_estimators):
        feat = np.arange(total_features) 
        rng.shuffle(feat)
        pool[i, :] = feat < subsample_amt

    return pool

# Monte Carlo
def money_carlo_prob_est(
    n_trials:int
    , total_features:int
    , subsample_amt:int
    , n_estimators:int
    , target_times:int
) -> float:
    
    count = 0
    for _ in range(n_trials):
        times = generate_sample_pool(total_features, subsample_amt, n_estimators).sum(axis=0)
        success = 1
        for number in times:
            if number < target_times:
                success = 0
                break
        
        count += success

    return count / n_trials


In [3]:
prob = 0.
n_estimators = 0
n_trials = 500
total_features = 100
subsample_amt = 60
target_times = 6

threshold = 0.95

while prob < threshold:
    n_estimators += 1
    prob = money_carlo_prob_est(
        n_trials=n_trials
        , total_features=total_features
        , subsample_amt = subsample_amt
        , n_estimators=n_estimators
        , target_times=target_times
    )

print(f"We need {n_estimators} estimators to have {threshold*100}% confidence that all features are considered "
      f"at least {target_times} times.")

We need 22 estimators to have 95.0% confidence that all features are considered at least 6 times.


In [4]:
def run_experiment(
    target_times:int
    , total_features:int
    , subsample_amt:int
    , n_trials:int
    , threshold:float
) -> int:
    prob = 0.
    n_estimators = 0

    while prob < threshold:
        n_estimators += 1
        prob = money_carlo_prob_est(
            n_trials=n_trials
            , total_features=total_features
            , subsample_amt = subsample_amt
            , n_estimators=n_estimators
            , target_times=target_times
        )

    return n_estimators

In [5]:
run_experiment(target_times=6,total_features=100,subsample_amt=60,n_trials=500,threshold=0.95)

22

In [6]:
%%timeit
run_experiment(target_times=6,total_features=100,subsample_amt=60,n_trials=500,threshold=0.95)

706 ms ± 20.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
