In [7]:

import numpy as np

import matplotlib.pyplot as plt
import tensorflow as tf
from collections import Counter
import random
import bisect

In [8]:
def load_mnist():
    """Fetch the MNIST ``.npz`` archive and return its train/test arrays.

    The archive is obtained through ``tf.keras.utils.get_file`` and opened
    with ``np.load``; the arrays stored under the keys ``x_train``,
    ``y_train``, ``x_test`` and ``y_test`` are read out of it.

    Returns
    -------
    tuple
        ``((X_train, y_train), (X_test, y_test))``.
    """
    data = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz'
    path = tf.keras.utils.get_file('mnist.npz', data)
    # Read every array while the archive is still open; the context manager
    # closes the file as soon as the function returns.
    with np.load(path) as f:
        X_train, y_train = f['x_train'], f['y_train']
        X_test, y_test = f['x_test'], f['y_test']
        return (X_train, y_train), (X_test, y_test)

# Select digits 0 and 1
def select_digits(X, y):
    """Keep only the samples labelled 0 or 1 and recode their labels.

    Parameters
    ----------
    X : ndarray
        Samples, indexed along the first axis in step with ``y``.
    y : ndarray
        Digit label of each sample.

    Returns
    -------
    tuple
        ``(X_selected, y_selected)`` where ``y_selected`` is ``-1`` for
        digit 0 and ``1`` for digit 1.
    """
    is_zero = y == 0
    keep = is_zero | (y == 1)
    # Among the kept samples, zeros become -1 and everything else (ones) +1.
    signed_labels = np.where(is_zero[keep], -1, 1)
    return X[keep], signed_labels

# Split train and validation sets
def train_val_split(X, y, val_size=1000):
    """Hold out the first ``val_size`` samples of each class for validation.

    Parameters
    ----------
    X : ndarray
        Samples, indexed along the first axis in step with ``y``.
    y : ndarray
        Labels; only samples labelled ``-1`` or ``1`` are used.
    val_size : int, optional
        Number of leading samples taken from each class for validation.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val)``. In every array the ``-1``
        samples come first, followed by the ``1`` samples.
    """
    negative = y == -1
    positive = y == 1
    X_neg, y_neg = X[negative], y[negative]
    X_pos, y_pos = X[positive], y[positive]

    # Head of each class goes to validation, the remainder to training.
    X_val = np.concatenate([X_neg[:val_size], X_pos[:val_size]])
    y_val = np.concatenate([y_neg[:val_size], y_pos[:val_size]])
    X_train = np.concatenate([X_neg[val_size:], X_pos[val_size:]])
    y_train = np.concatenate([y_neg[val_size:], y_pos[val_size:]])
    return X_train, y_train, X_val, y_val

# PCA for dimensionality reduction
def pca_transform(X, n_components=5, fit_on=None):
    """Project samples onto the leading principal components.

    Parameters
    ----------
    X : ndarray
        Samples to project, one per entry along the first axis. Inputs with
        more than two dimensions are flattened to one row per sample.
    n_components : int, optional
        Number of principal directions to keep.
    fit_on : ndarray or None, optional
        Data from which the mean and the principal directions are estimated.
        ``None`` (the default) estimates them from ``X`` itself. Pass the
        training set here when projecting validation or test data, so that
        every split is expressed in the same coordinate system.

    Returns
    -------
    ndarray
        The centred samples projected onto the ``n_components`` directions
        of largest variance, in order of decreasing variance.
    """
    def _as_rows(data):
        # Flatten each sample into a 1D row when the input has extra axes.
        data = np.asarray(data)
        if data.ndim > 2:
            data = data.reshape(data.shape[0], -1)
        return data

    X = _as_rows(X)
    reference = X if fit_on is None else _as_rows(fit_on)

    reference_mean = np.mean(reference, axis=0)
    # atleast_2d keeps eigh usable when there is a single feature, for which
    # np.cov returns a 0-d array.
    cov_matrix = np.atleast_2d(np.cov(reference - reference_mean, rowvar=False))
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # eigh returns eigenvalues in ascending order; take the largest first.
    order = np.argsort(eigenvalues)[::-1]
    components = eigenvectors[:, order[:n_components]]

    # Centre with the reference mean (not X's own mean) so that held-out
    # data lands in the same coordinates as the data the basis came from.
    return np.dot(X - reference_mean, components)

# Train decision stump on weighted data
def train_decision_stump(X, y, weights):
    """Find the single-feature threshold with the lowest weighted error.

    A stump predicts ``np.sign(X[:, feature] - threshold)``. Every distinct
    value of every feature is tried as a threshold. A sample whose feature
    value equals the threshold is predicted ``0`` and therefore counts as an
    error for any label other than ``0``.

    The error of all candidate thresholds of a feature is obtained from
    per-value weight sums and cumulative sums, instead of re-scoring every
    sample for every candidate. Errors are accumulated in a different order
    than a direct per-candidate sum, so they can differ in the last
    floating-point digits.

    Parameters
    ----------
    X : ndarray
        Two-dimensional array, one row per sample and one column per
        feature. Values are assumed to be finite (no NaN).
    y : ndarray
        Label of each sample.
    weights : ndarray
        Weight of each sample.

    Returns
    -------
    tuple
        ``(best_feature, best_threshold, min_weighted_error)``; the error is
        the misclassified weight divided by the total weight. Returns
        ``(None, None, inf)`` when there is no candidate threshold or no
        candidate has an error below ``inf``.
    """
    best_feature = None
    best_threshold = None
    min_weighted_error = float('inf')

    total_weight = np.sum(weights)
    # Weight each sample adds to the error under each possible prediction.
    cost_if_positive = weights * (y != 1)
    cost_if_negative = weights * (y != -1)
    cost_if_zero = weights * (y != 0)

    for feature in range(X.shape[1]):
        # `values` are the sorted candidate thresholds; `group[i]` is the
        # position of sample i's value within `values`.
        values, group = np.unique(X[:, feature], return_inverse=True)
        n_values = values.size
        if n_values == 0:
            continue

        positive_cost = np.bincount(group, weights=cost_if_positive, minlength=n_values)
        negative_cost = np.bincount(group, weights=cost_if_negative, minlength=n_values)
        zero_cost = np.bincount(group, weights=cost_if_zero, minlength=n_values)

        # Samples strictly below the threshold are predicted -1 ...
        below = np.concatenate(([0.0], np.cumsum(negative_cost)[:-1]))
        # ... and samples strictly above it are predicted +1.
        suffix = np.cumsum(positive_cost[::-1])[::-1]
        above = np.append(suffix[1:], 0.0)

        errors = (below + zero_cost + above) / total_weight

        # argmin returns the first minimum, i.e. the smallest threshold among
        # ties; the strict '<' keeps the earliest feature among ties.
        candidate = np.argmin(errors)
        if errors[candidate] < min_weighted_error:
            min_weighted_error = errors[candidate]
            best_feature = feature
            best_threshold = values[candidate]

    return best_feature, best_threshold, min_weighted_error

# Update weights based on alpha and predictions
def update_weights(weights, alpha, predictions, y):
    """Scale the weights of misclassified samples by ``exp(alpha)``, in place.

    Parameters
    ----------
    weights : ndarray
        Per-sample weights; modified in place.
    alpha : float
        Exponent applied to samples whose prediction differs from the label.
    predictions : ndarray
        Predicted label of each sample.
    y : ndarray
        True label of each sample.

    Returns
    -------
    None
    """
    misclassified = np.not_equal(predictions, y)
    # exp(alpha * False) is 1, so correctly classified samples keep their weight.
    np.multiply(weights, np.exp(alpha * misclassified), out=weights)

# AdaBoost training
def adaboost_train(X_train, y_train, num_iterations=300, verbose=True):
    """Fit an AdaBoost ensemble of decision stumps.

    Each round fits a stump on the current sample weights, gives it the vote
    ``alpha = log((1 - error) / error)`` and passes the stump's predictions
    to ``update_weights`` so the next round focuses on the samples it got
    wrong.

    Parameters
    ----------
    X_train : ndarray
        Two-dimensional array, one row per sample and one column per feature.
    y_train : ndarray
        Label of each sample; the stumps predict ``np.sign(...)`` values, so
        labels are expected to be ``-1`` / ``1``.
    num_iterations : int, optional
        Number of boosting rounds, i.e. stumps in the ensemble.
    verbose : bool, optional
        When true (the default), print the number of samples and, for every
        round, the chosen feature, threshold and weighted error.

    Returns
    -------
    tuple
        ``(alphas, stumps)``: the vote of each stump and the matching
        ``(feature, threshold)`` pairs, in training order.

    Raises
    ------
    ValueError
        If no stump could be fitted (``train_decision_stump`` returned no
        feature), e.g. because ``X_train`` has no columns.
    """
    num_samples = len(X_train)
    if verbose:
        print(num_samples)
    weights = np.ones(num_samples) / num_samples
    alphas = []
    stumps = []
    # Keeps log((1 - error) / error) finite when a stump is perfect (error 0)
    # or always wrong (error 1).
    error_floor = 1e-10

    for _ in range(num_iterations):
        # Train decision stump
        feature, threshold, error = train_decision_stump(X_train, y_train, weights)
        if verbose:
            print(feature, threshold, error)
        if feature is None:
            raise ValueError("no decision stump could be fitted to the training data")

        # Calculate alpha
        error = np.clip(error, error_floor, 1 - error_floor)
        alpha = np.log((1 - error) / error)
        alphas.append(alpha)

        # Update weights
        predictions = np.sign(X_train[:, feature] - threshold)
        update_weights(weights, alpha, predictions, y_train)
        # Rescale to sum to one so repeated multiplications cannot overflow
        # over many rounds; the stump error is a ratio of weights, so it is
        # unaffected by this scaling.
        weights /= np.sum(weights)

        # Store the stump
        stumps.append((feature, threshold))

    return alphas, stumps

# Evaluate ensemble on data
def evaluate_ensemble(X, alphas, stumps):
    """Classify samples by the weighted vote of the decision stumps.

    Parameters
    ----------
    X : ndarray
        Two-dimensional array, one row per sample and one column per feature.
    alphas : sequence of float
        Vote of each stump.
    stumps : sequence of tuple
        ``(feature, threshold)`` of each stump, aligned with ``alphas``.

    Returns
    -------
    ndarray
        Sign of the summed votes for each sample: ``1.0``, ``-1.0``, or
        ``0.0`` where the votes cancel exactly.
    """
    votes = np.zeros(len(X))

    for position, stump_weight in enumerate(alphas):
        column, cutoff = stumps[position]
        # Each stump votes +1 above its threshold, -1 below it, 0 exactly on it.
        votes += stump_weight * np.sign(X[:, column] - cutoff)

    return np.sign(votes)

# Load and preprocess data
(X_train, y_train), (X_test, y_test) = load_mnist()
X_train, y_train = select_digits(X_train, y_train)
X_test, y_test = select_digits(X_test, y_test)

# Split train and validation sets
X_train, y_train, X_val, y_val = train_val_split(X_train, y_train)

# PCA for dimensionality reduction.
# The mean and the principal directions are estimated on the training set
# only and reused for validation and test. Running a separate PCA on each
# split would give every split its own basis, so thresholds learned on the
# training projection would be applied to unrelated coordinates.
N_COMPONENTS = 5
X_train_flat = X_train.reshape(len(X_train), -1)
pca_mean = X_train_flat.mean(axis=0)
pca_eigenvalues, pca_eigenvectors = np.linalg.eigh(
    np.cov(X_train_flat - pca_mean, rowvar=False)
)
# eigh returns eigenvalues in ascending order; keep the largest ones.
pca_components = pca_eigenvectors[:, np.argsort(pca_eigenvalues)[::-1][:N_COMPONENTS]]


def project_to_train_basis(X):
    """Flatten each sample and project it onto the training-set PCA basis."""
    return np.dot(X.reshape(len(X), -1) - pca_mean, pca_components)


X_train_pca = project_to_train_basis(X_train)
X_val_pca = project_to_train_basis(X_val)
X_test_pca = project_to_train_basis(X_test)

# Train AdaBoost
NUM_ROUNDS = 10
alphas, stumps = adaboost_train(X_train_pca, y_train, NUM_ROUNDS)

# Evaluate on validation set
y_val_pred = evaluate_ensemble(X_val_pca, alphas, stumps)
val_accuracy = np.mean(y_val_pred == y_val)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

# Evaluate on test set
y_test_pred = evaluate_ensemble(X_test_pca, alphas, stumps)
test_accuracy = np.mean(y_test_pred == y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Plotting validation accuracy vs. number of trees.
# Score the ensemble truncated to its first k stumps for every k, so the
# curve shows how accuracy evolves as stumps are added.
tree_counts = range(1, len(alphas) + 1)
val_accuracies = [
    np.mean(evaluate_ensemble(X_val_pca, alphas[:k], stumps[:k]) == y_val)
    for k in tree_counts
]

plt.figure()
plt.plot(tree_counts, val_accuracies)
plt.xlabel('Number of Trees')
plt.ylabel('Validation Accuracy')
plt.title('AdaBoost Validation Accuracy vs. Number of Trees')
plt.show()

10665
0 188.9382058693906 0.00478199718706048
0 636.9126392767417 0.2318441791640342
0 -295.9838904829736 0.19309618374583792


KeyboardInterrupt: 