In [2]:
import inspect

import pandas as pd
from numpy import sqrt, e, log
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [242]:
def get_error_bound(rho, d, m, delta):
    """
    Evaluate the theoretical error bound

        (2 / rho) * sqrt(2 * d * log(e * m / d) / m) + sqrt(log(1 / delta) / (2 * m))

    Args:
        rho:    margin
        d:      VC dimension
        m:      sample size
        delta:  confidence
    Output:
        error:  theoretical error bound (sum of the two terms above)
    """
    # Term depending on the VC dimension and sample size, scaled by 2 / rho
    complexity = sqrt((2 * d * log(e * m / d)) / m)
    margin_term = complexity * 2 / rho
    # Term depending only on the confidence parameter and the sample size
    confidence_term = sqrt(log(1 / delta) / (2 * m))
    return margin_term + confidence_term


def create_dataset(m, d, class_sep=1, random_state=42):
    """
    Generate a synthetic binary classification problem and split it in half.

    2*m samples with d-1 features are drawn with `make_classification`
    (5% of the labels are flipped), the labels are mapped to {-1, +1}, and
    the data is split 50/50 so that each half holds m samples.

    Args:
        m:              sample size (of each of the train and test halves)
        d:              VC dimension (the data has d-1 features)
        class_sep:      class separation
        random_state:   random seed used for generating the samples; the
                        train/test split always uses the fixed seed 42
    Output:
        X_train:    training data
        X_test:     test data
        y_train:    training labels
        y_test:     test labels
    """
    features, labels = make_classification(
        n_samples=2 * m,
        n_features=d - 1,
        n_classes=2,
        class_sep=class_sep,
        flip_y=0.05,
        shuffle=False,
        random_state=random_state,
    )
    # Change class labels from {0, 1} to {-1, +1}
    signed_labels = 2 * labels - 1
    # Split the dataset into equally sized training and test sets
    halves = train_test_split(
        features, signed_labels, test_size=0.5, random_state=42
    )
    # train_test_split yields [X_train, X_test, y_train, y_test]
    return tuple(halves)


def evaluate_error(X_train, X_test, y_train, y_test, T=50):
    """
    Evaluate the error of the AdaBoost classifier
    Args:
        X_train:    training data
        X_test:     test data
        y_train:    training labels
        y_test:     test labels
        T:          number of iterations
    Output:
        R_train:    training error
        R_test:     test error
    Note:
        The values are returned in the order (R_train, R_test).
    """
    # scikit-learn renamed `base_estimator` to `estimator` and later removed
    # the old keyword, so choose the keyword from the installed signature.
    accepted = inspect.signature(AdaBoostClassifier).parameters
    ada_kwargs = {'n_estimators': T, 'random_state': 42}
    weak_learner_key = 'estimator' if 'estimator' in accepted else 'base_estimator'
    ada_kwargs[weak_learner_key] = Perceptron(penalty=None)
    # Request discrete SAMME only where the default would be 'SAMME.R', which
    # needs predict_proba (Perceptron has none). Newer releases deprecate the
    # `algorithm` argument and always run SAMME.
    algorithm_param = accepted.get('algorithm')
    if algorithm_param is not None and algorithm_param.default == 'SAMME.R':
        ada_kwargs['algorithm'] = 'SAMME'

    # Train and fit AdaBoost with perceptron as base estimator
    clf = AdaBoostClassifier(**ada_kwargs)
    clf.fit(X_train, y_train)
    # Calculate the training and test error (error = 1 - accuracy)
    R_train = 1 - clf.score(X_train, y_train)
    R_test  = 1 - clf.score(X_test, y_test)

    return R_train, R_test

In [247]:
# The margin and the confidence parameter stay fixed during this sweep
rho = 15
delta = 0.05

# Code for the experiment 3.3: for several sample sizes m, sweep the
# VC dimension d and record the gap between test and training error
# together with the theoretical bound.
range_d = range(10, 1000, 100)
experiment_d = {}
error_bounds_d = {}
for m in (500, 1000, 1500, 2000):
    gaps = []
    bounds = []
    for d in tqdm(range_d):
        X_train, X_test, y_train, y_test = create_dataset(m, d)
        R_train, R_test = evaluate_error(X_train, X_test, y_train, y_test)
        gaps.append(R_test - R_train)
        bounds.append(get_error_bound(rho, d, m, delta))
    experiment_d[m], error_bounds_d[m] = gaps, bounds
    print(f'Finished for the value m = {m}')

# One column per m, plus the swept d values
df_experiment_d = pd.DataFrame(experiment_d).assign(d=range_d)
# df_experiment_d.to_csv('data/experiment_d.csv', index=False)

df_error_bounds_d = pd.DataFrame(error_bounds_d).assign(d=range_d)
# df_error_bounds_d.to_csv('data/bounds_d.csv', index=False)

100%|██████████| 10/10 [00:01<00:00,  9.63it/s]


Finished for the value m = 500


100%|██████████| 10/10 [00:11<00:00,  1.15s/it]


Finished for the value m = 1000


100%|██████████| 10/10 [00:20<00:00,  2.05s/it]


Finished for the value m = 1500


100%|██████████| 10/10 [00:22<00:00,  2.21s/it]

Finished for the value m = 2000





In [248]:
# The margin and the confidence parameter stay fixed during this sweep
rho = 15
delta = 0.05

# Code for the experiment 3.2: for several VC dimensions d, sweep the
# sample size m and record the gap between test and training error
# together with the theoretical bound.
range_m = range(100, 10000, 10)
experiment_m = {}
error_bounds_m = {}
for d in (25, 50, 75, 100):
    gaps = []
    bounds = []
    for m in tqdm(range_m):
        X_train, X_test, y_train, y_test = create_dataset(m, d)
        R_train, R_test = evaluate_error(X_train, X_test, y_train, y_test)
        gaps.append(R_test - R_train)
        bounds.append(get_error_bound(rho, d, m, delta))
    experiment_m[d], error_bounds_m[d] = gaps, bounds
    print(f'Finished for the value d = {d}')

# One column per d, plus the swept m values
df_error_bounds_m = pd.DataFrame(error_bounds_m).assign(m=range_m)
# df_error_bounds_m.to_csv('data/bounds_m.csv', index=False)

df_experiment_m = pd.DataFrame(experiment_m).assign(m=range_m)
# df_experiment_m.to_csv('data/experiment_m.csv', index=False)

100%|██████████| 3/3 [00:00<00:00, 49.48it/s]


Finished for the value d = 25


100%|██████████| 3/3 [00:00<00:00, 39.78it/s]


Finished for the value d = 50


100%|██████████| 3/3 [00:00<00:00, 30.59it/s]


Finished for the value d = 75


100%|██████████| 3/3 [00:00<00:00, 14.06it/s]

Finished for the value d = 100





In [226]:
# Fix the confidence parameter; the VC-dimension d and the number of samples m
# are set for each configuration by the loop below.
delta = 0.05

# Code for the experiment 3.1: average training and test error of AdaBoost
# as a function of the number of boosting iterations T.
range_t = range(1, 30)
n_iter = 100
for i, (d, m) in enumerate([(100, 500), (50, 1000)]):
    # Running sums over the n_iter repetitions, one slot per value in range_t
    test_errors = [0.0] * len(range_t)
    train_errors = [0.0] * len(range_t)
    for j in tqdm(range(n_iter)):
        # A fresh dataset per repetition, seeded by the repetition index
        X_train, X_test, y_train, y_test = create_dataset(m, d, class_sep=0.5, random_state=j+1337)
        for k, T in enumerate(range_t):
            # evaluate_error returns (R_train, R_test), in that order
            R_train, R_test = evaluate_error(X_train, X_test, y_train, y_test, T)
            test_errors[k] += R_test
            train_errors[k] += R_train
    # Turn the sums into averages over the repetitions
    test_errors = [total / n_iter for total in test_errors]
    train_errors = [total / n_iter for total in train_errors]
    df_number_of_trees = pd.DataFrame({
        'T': list(range_t),
        'test_error': test_errors,
        'train_error': train_errors
    })
    # One file per (d, m) configuration
    df_number_of_trees.to_csv(f'data/experiment_T{i}.csv', index=False)

100%|██████████| 100/100 [01:40<00:00,  1.01s/it]
100%|██████████| 100/100 [01:43<00:00,  1.03s/it]


In [3]:
# Code to evaluate the experimental confidence parameter: for every
# configuration, the percentage of runs in which the theoretical bound is
# strictly larger than the measured gap between test and training error.

df_error_bounds_d = pd.read_csv('data/bounds_d.csv')
df_experiment_d   = pd.read_csv('data/experiment_d.csv')
df_error_bounds_m = pd.read_csv('data/bounds_m.csv')
df_experiment_m   = pd.read_csv('data/experiment_m.csv')

# Column names are strings because the frames come from CSV headers.
# Scale to percent first and round last, so no floating-point residue such as
# 98.99900000000001 appears; float() keeps the printed dict free of numpy
# scalar reprs.
delta_d = {
    col: float(round(100 * (df_error_bounds_d[col] > df_experiment_d[col]).mean(), 3))
    for col in ['500', '1000', '1500', '2000']
}

delta_m = {
    col: float(round(100 * (df_error_bounds_m[col] > df_experiment_m[col]).mean(), 3))
    for col in ['25', '50', '75', '100']
}

print(delta_d)
print(delta_m)

{'500': 82.513, '1000': 99.397, '1500': 100.0, '2000': 100.0}
{'25': 100.0, '50': 99.9, '75': 99.7, '100': 98.99900000000001}
