Reference paper: stability is stable 
Experiment: vary the value of rho and see the minimum sample size required for replicability

func getConvergenceSampleNum(hyperparams)
    For sample_num range(min_subsets_size, max_subsets_size, step):
        For range(repeat_num):
            random draw a subset of the dataset, whose size = sample_num
            Use algorithm 10 to get a model
        Check whether the models we get are 'replicable' with respect to each other, according to the hyperparams (see the definition on page 14 of the paper).
        if converged, return current sample_num

func experiment(): 
# Vary rho and find the minimum sample size required for replicability.
# To keep it simple, we fix the values of the other hyperparameters, such as alpha and beta.
    for rho in range(min_rho, max_rho, step):
        sample_num = getConvergenceSampleNum(rho)
        theoretical_sample_num = getTheoreticalSampleNum(rho) # compute according to algorithm 10 in the paper
        print("Hyperparams: ", hyperparams, "Sample size: ", sample_num)
    plot(hyperparams, theoretical_sample_num) # draw the curve, where x-axis is the hyperparams and y-axis is the sample size
    plot(hyperparams, sample_num) # draw the curve, where x-axis is the hyperparams and y-axis is the sample size

In [1]:
import pandas as pd
import numpy as np
import config
import Algorithm10 as a10


##### Config Variables #####
np = <module 'numpy' from '/opt/anaconda3/envs/learn/lib/python3.12/site-packages/numpy/__init__.py'>
dataset_path = ./dataset/Invistico_Airline.csv
model_path = ./models/
max_depth = 3
random_seed = 42
selected_features = ['Class', 'Seat comfort', 'Food and drink', 'Cleanliness', 'satisfaction']
rho = 0.3
alpha = 0.3
beta = 0.1
num_H = 10
m = 100
m_up_bound = 9040.567619605657
tau_up_bound = 0.03908650337129266
tau = 3.908650337129266e-06
############################


In [2]:
# Peek at the raw dataset. The path is taken from the shared config so this
# cell reads the same file that is handed to a10.load_dataset later on,
# instead of keeping a second hard-coded copy of the path.
data = pd.read_csv(config.dataset_path)
data.head()

Unnamed: 0,satisfaction,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,...,4,2,2,0,2,4,2,5,0,0.0


In [3]:
# (number of rows, number of columns) of the DataFrame loaded above.
data.shape

(129880, 22)

In [4]:
# function to check if two decision trees are equal
def are_trees_equal(tree1, tree2, attributes_to_check=None, ignored_params=()):
    """Return True when two fitted trees agree on parameters and tree arrays.

    With the default arguments the comparison is strict: every estimator
    parameter returned by ``get_params()`` must match, and so must every one
    of the low-level ``tree_`` arrays listed below, including the per-node
    statistics ``impurity``, ``n_node_samples``, ``weighted_n_node_samples``
    and ``value``.

    NOTE(review): those per-node statistics come from the data the tree was
    fitted on, so two trees fitted on different subsets will normally differ
    under the strict default even when they encode the same splits. To ask
    "is this the same hypothesis?", pass only the structural attributes, e.g.
    ``attributes_to_check=('children_left', 'children_right', 'feature',
    'threshold')`` and ``ignored_params=('random_state',)``.

    Args:
        tree1: First fitted estimator; must expose ``tree_`` and ``get_params()``.
        tree2: Second fitted estimator; same requirements as ``tree1``.
        attributes_to_check: Iterable of ``tree_`` attribute names to compare.
            ``None`` (default) selects the full strict list.
        ignored_params: Parameter name, or iterable of names, to leave out of
            the ``get_params()`` comparison. Empty by default.

    Returns:
        bool: True if all compared parameters and arrays are equal.

    Raises:
        ValueError: If either estimator has no ``tree_`` attribute.
    """
    # Check that both trees are fitted
    if not hasattr(tree1, 'tree_') or not hasattr(tree2, 'tree_'):
        raise ValueError("Both trees must be fitted before comparison.")

    if attributes_to_check is None:
        attributes_to_check = (
            'children_left', 'children_right',
            'feature', 'threshold',
            'impurity', 'n_node_samples', 'weighted_n_node_samples',
            'value',
        )

    # A bare string would otherwise be split into single characters by set().
    if isinstance(ignored_params, str):
        ignored_params = (ignored_params,)
    skipped = set(ignored_params)

    # Compare parameters (minus anything the caller asked to ignore)
    params1 = {key: val for key, val in tree1.get_params().items() if key not in skipped}
    params2 = {key: val for key, val in tree2.get_params().items() if key not in skipped}
    if params1 != params2:
        return False

    t1 = tree1.tree_
    t2 = tree2.tree_

    # Compare structure and splitting rules; array_equal also returns False
    # on a shape mismatch, so trees with different node counts are unequal.
    return all(
        np.array_equal(getattr(t1, attr), getattr(t2, attr))
        for attr in attributes_to_check
    )


In [5]:
def getConvergenceSampleNum(min_subset_size, max_subset_size, repeat_num, rho, sample_size_step=1):
    """Find the first sample size at which repeated runs agree often enough.

    For every sample size in
    ``range(min_subset_size, max_subset_size + 1, sample_size_step)`` the
    learner from ``a10`` is run ``repeat_num`` times, run ``i`` being seeded
    with ``config.random_seed + i``. The resulting trees are compared pairwise
    with ``are_trees_equal`` and the fraction of equal unordered pairs is
    taken as the empirical agreement probability.

    Args:
        min_subset_size: Smallest sample size to try (inclusive).
        max_subset_size: Largest sample size to try (inclusive).
        repeat_num: Number of independent runs per sample size; must be >= 2
            so that there is at least one pair to compare.
        rho: Tolerated disagreement; a sample size is accepted when the
            agreement fraction is >= ``1 - rho``.
        sample_size_step: Increment between consecutive sample sizes.

    Returns:
        int: The first accepted sample size, or -1 if none in the range
        (including an empty range) was accepted.

    Raises:
        ValueError: If ``repeat_num`` is smaller than 2.
    """
    from itertools import combinations

    # Fail before any training: with fewer than two runs there are no pairs,
    # and the agreement fraction below would divide by zero.
    if repeat_num < 2:
        raise ValueError("repeat_num must be at least 2 to compare pairs of trees.")

    pair_total = repeat_num * (repeat_num - 1) / 2
    # Stays None when the range is empty, so the final message can still be printed.
    prob = None

    for sample_size in range(min_subset_size, max_subset_size + 1, sample_size_step):
        # Get datasets of size sample_size by sampling from the original dataset.
        replicable_tree_list = []
        for i in range(repeat_num):
            print(f"sample size: {sample_size}, repeat: {i}")
            # NOTE(review): every repetition passes a different seed to all three
            # a10 calls. If replicable_learner's random_seed is meant to be the
            # randomness shared across runs, it should probably be the same for
            # all repetitions -- confirm against Algorithm10.
            X_train, X_test, y_train, y_test = a10.load_dataset(config.dataset_path, sample_size=sample_size, test_size=0.01, random_state=config.random_seed+i)
            H = a10.build_candidate_trees(X_train, y_train, max_depth=config.max_depth, num_trees=config.num_H, random_state=config.random_seed+i)
            tree = a10.replicable_learner(X_train, y_train, H, random_seed=config.random_seed+i)
            replicable_tree_list.append(tree)

        # Estimate the probability that two runs return the same tree.
        # NOTE(review): are_trees_equal is called with its strict defaults, which
        # also compare per-node statistics and all estimator parameters; trees
        # fitted on different subsets may therefore never count as equal.
        same_tree_count = 0
        for (i, tree_i), (j, tree_j) in combinations(enumerate(replicable_tree_list), 2):
            if are_trees_equal(tree_i, tree_j):
                print(f"tree {i} and tree {j} are the same")
                same_tree_count += 1

        prob = same_tree_count / pair_total
        if prob >= 1-rho:
            print(f"replicable at sample size: {sample_size}, prob: {prob}")
            return sample_size

    # prob is the agreement fraction of the last sample size tried (None if none was tried).
    print(f"not replicable at sample size between {min_subset_size} and {max_subset_size}, prob: {prob}")
    return -1

            
        
    

In [6]:
# Print the configured upper bound on the sample size for reference, then
# search sample sizes 10000, 11000 and 12000 with 10 repetitions each.
print("theoretical sample size: ", config.m_up_bound)
# NOTE(review): despite its name, ans_dict receives the integer returned by
# getConvergenceSampleNum (a sample size, or -1 when nothing qualified), not a dict.
ans_dict = getConvergenceSampleNum(min_subset_size=10000, max_subset_size=12000, repeat_num=10, rho=config.rho, sample_size_step=1000)

theoretical sample size:  9040.567619605657
sample size: 10000, repeat: 0
Train set size: 10000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 203.12it/s]


k: 19187 v max candidates: 0.2875940516665116 v min candidates: 0.21260268629834947
sample size: 10000, repeat: 1
Train set size: 10000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 217.53it/s]


k: 19187 v max candidates: 0.2868935445473655 v min candidates: 0.2119021791792034
sample size: 10000, repeat: 2
Train set size: 10000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 218.85it/s]


k: 19187 v max candidates: 0.27899495124635354 v min candidates: 0.20400358587819142
sample size: 10000, repeat: 3
Train set size: 10000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 212.80it/s]


k: 19187 v max candidates: 0.3012952525434234 v min candidates: 0.2263038871752613
sample size: 10000, repeat: 4
Train set size: 10000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 221.19it/s]


k: 19187 v max candidates: 0.2921948515566217 v min candidates: 0.2172034861884596
sample size: 10000, repeat: 5
Train set size: 10000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 121.24it/s]


k: 19187 v max candidates: 0.28919354148670756 v min candidates: 0.21420217611854547
sample size: 10000, repeat: 6
Train set size: 10000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 194.65it/s]


k: 19187 v max candidates: 0.2878933538750073 v min candidates: 0.21290198850684522
sample size: 10000, repeat: 7
Train set size: 10000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 194.61it/s]


k: 19187 v max candidates: 0.2824939078757416 v min candidates: 0.2075025425075795
sample size: 10000, repeat: 8
Train set size: 10000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 204.49it/s]


k: 19187 v max candidates: 0.30759428630577496 v min candidates: 0.23260292093761287
sample size: 10000, repeat: 9
Train set size: 10000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 136.35it/s]


k: 19187 v max candidates: 0.3024946402922432 v min candidates: 0.2275032749240811
sample size: 11000, repeat: 0
Train set size: 11000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 175.64it/s]


k: 19187 v max candidates: 0.28799405166651154 v min candidates: 0.21300268629834943
sample size: 11000, repeat: 1
Train set size: 11000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 185.09it/s]


k: 19187 v max candidates: 0.2880844536382746 v min candidates: 0.21309308827011253
sample size: 11000, repeat: 2
Train set size: 11000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 185.80it/s]


k: 19187 v max candidates: 0.27926767851908085 v min candidates: 0.20427631315091874
sample size: 11000, repeat: 3
Train set size: 11000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 123.95it/s]


k: 19187 v max candidates: 0.30181343436160524 v min candidates: 0.22682206899344315
sample size: 11000, repeat: 4
Train set size: 11000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 179.67it/s]


k: 19187 v max candidates: 0.2916312151929854 v min candidates: 0.2166398498248233
sample size: 11000, repeat: 5
Train set size: 11000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 185.38it/s]


k: 19187 v max candidates: 0.2893571778503439 v min candidates: 0.21436581248218184
sample size: 11000, repeat: 6
Train set size: 11000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 176.98it/s]


k: 19187 v max candidates: 0.28862971751137095 v min candidates: 0.21363835214320887
sample size: 11000, repeat: 7
Train set size: 11000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 126.39it/s]


k: 19187 v max candidates: 0.28326663514846884 v min candidates: 0.20827526978030675
sample size: 11000, repeat: 8
Train set size: 11000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 180.82it/s]


k: 19187 v max candidates: 0.3094488317603204 v min candidates: 0.2344574663921583
sample size: 11000, repeat: 9
Train set size: 11000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 181.32it/s]


k: 19187 v max candidates: 0.31244918574678865 v min candidates: 0.23745782037862653
sample size: 12000, repeat: 0
Train set size: 12000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 152.39it/s]


k: 19187 v max candidates: 0.28966071833317825 v min candidates: 0.21466935296501613
sample size: 12000, repeat: 1
Train set size: 12000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 118.47it/s]


k: 19187 v max candidates: 0.2880768778806988 v min candidates: 0.2130855125125367
sample size: 12000, repeat: 2
Train set size: 12000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 164.97it/s]


k: 19187 v max candidates: 0.28199495124635354 v min candidates: 0.20700358587819143
sample size: 12000, repeat: 3
Train set size: 12000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 170.51it/s]


k: 19187 v max candidates: 0.30241191921009014 v min candidates: 0.22742055384192805
sample size: 12000, repeat: 4
Train set size: 12000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 169.58it/s]


k: 19187 v max candidates: 0.29224485155662183 v min candidates: 0.21725348618845972
sample size: 12000, repeat: 5
Train set size: 12000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 120.91it/s]


k: 19187 v max candidates: 0.3139102081533743 v min candidates: 0.2389188427852122
sample size: 12000, repeat: 6
Train set size: 12000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 167.93it/s]


k: 19187 v max candidates: 0.28716002054167394 v min candidates: 0.21216865517351186
sample size: 12000, repeat: 7
Train set size: 12000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 170.11it/s]


k: 19187 v max candidates: 0.2861605745424082 v min candidates: 0.2111692091742461
sample size: 12000, repeat: 8
Train set size: 12000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 163.54it/s]


k: 19187 v max candidates: 0.3099942863057749 v min candidates: 0.23500292093761282
sample size: 12000, repeat: 9
Train set size: 12000 Test set size: 1299


100%|██████████| 10/10 [00:00<00:00, 118.70it/s]

k: 19187 v max candidates: 0.28899464029224314 v min candidates: 0.21400327492408103
not replicable at sample size between 10000 and 12000, prob: 0.0





In [7]:
# Show the search result: the first accepted sample size, or -1 if none was accepted.
ans_dict

-1