In [45]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
import numpy as np
import math
from typing import Tuple
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [60]:
class Node:
    def __init__(self, feature_index: int = None, threshold: float = None, value: int = None, left: 'Node' = None, right: 'Node' = None):
        self.feature_index=feature_index
        self.threshold=threshold
        self.value=value
        self.left=left
        self.right=right

class DecisionTree:
    def __init__(self):
        pass

    def fit(self, X : np.ndarray, Y : np.ndarray, W : np.ndarray):
        self.n_classes=len(set(Y))
        self.n_features=X.shape[1]
        self.node, alpha, best_wrong_indices=self.grow_trees(X, Y, W)
        return alpha, best_wrong_indices

    def grow_trees(self, X : np.ndarray, Y : np.ndarray, W : np.ndarray, depth : int=0):
        if(depth==1):
            return Node(value=np.argmax(np.bincount(Y))), None, None

        best_loss, best_wrong_indices, best_feature, best_threshold=self._find_best_stump(X, Y, W)
        if(best_loss==0):
            best_loss=1e-6
        alpha=np.log((1-best_loss)/best_loss)
        left_indices, right_indices=self._split(X[: , best_feature], best_threshold)
        left_node, _, _=self.grow_trees(X[left_indices], Y[left_indices], W, depth+1)
        right_node, _, _=self.grow_trees(X[right_indices], Y[right_indices], W, depth+1)
        return Node(feature_index=best_feature, threshold=best_threshold, left=left_node, right=right_node) ,alpha, best_wrong_indices

    def predict(self, X : np.ndarray):
         return np.array([self._traverse_tree(x, self.node) for x in X])

    def _traverse_tree(self, X : np.ndarray, node: Node):
        if(node.left is None and node.right is None):
            return node.value
        if X[node.feature_index] <= node.threshold:
            return self._traverse_tree(X, node.left)
        else:
            return self._traverse_tree(X, node.right)



    def _find_best_stump(self, X : np.ndarray, Y : np.ndarray, W: np.ndarray):
        best_loss=1.1
        best_feature, best_threshold, best_wrong_indices=None, None, None

        for feature_index in range(self.n_features):
            sorted_indices=np.argsort(X[:,feature_index])
            sorted_x=X[sorted_indices]
            # sorted_y=Y[sorted_indices]
            thresholds=np.unique(sorted_x[:, feature_index])
            for threshold_index in range(len(thresholds)-1):
                threshold=(thresholds[threshold_index]+thresholds[threshold_index+1])/2
                left_indice,right_indice=self._split(X[:, feature_index], threshold)
                loss, wrong_indices=self._calc_loss(Y[left_indice], Y[right_indice], W)
                if(loss<best_loss):
                    best_loss=loss
                    best_wrong_indices=wrong_indices
                    best_feature=feature_index
                    best_threshold=threshold

        return best_loss, best_wrong_indices, best_feature, best_threshold



    def _calc_loss(self, y_left: np.ndarray , y_right : np.ndarray , W : np.ndarray):
        left_val=np.argmax(np.bincount(y_left))
        right_val=1-left_val
        loss=1
        left_misclassified_indices=np.where(y_left!=left_val)
        right_misclassified_indices=np.where(y_right!=right_val)
        misclassified_indices=np.concatenate((left_misclassified_indices[0], right_misclassified_indices[0]))
        sum_misclassified=np.sum(W[misclassified_indices])
        sum_weights=np.sum(W)
        loss=sum_misclassified/sum_weights
        return loss, misclassified_indices


    def _split(self, X : np.ndarray , threshold : float):
        left_indice=np.where(X<=threshold)[0]
        right_indice=np.where(X>threshold)[0]
        return left_indice, right_indice


class BoostTree:
    def __init__(self):
        self.trees=[]
        self.alphas=[]
        self.weights=None

    def _add(self, X : np.ndarray, Y: np.ndarray):
        if(len(self.trees)==0):
            self.weights=np.full(X.shape[0], 1/X.shape[0])

        tree=DecisionTree()
        alpha, wrong_indices=tree.fit(X, Y, self.weights)
        self.alphas.append(alpha)
        self.trees.append(tree)
        self.weights[wrong_indices]*=np.exp(alpha)

    def predict(self, X : np.ndarray):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.array([np.sign(self.alphas[i]*predictions[i]) for i in range(len(self.alphas))])







**QUESTION** **1**

In [61]:
def load_and_preprocess_data(selected_classes=[0, 1], random_subset_size=None):
    data = np.load('/content/drive/MyDrive/SML_A4/mnist.npz')
    x_train, y_train = data['x_train'], data['y_train']
    x_test, y_test = data['x_test'], data['y_test']

    train_mask = np.isin(y_train, selected_classes)
    x_train = x_train[train_mask].reshape(train_mask.sum(), -1)
    y_train = y_train[train_mask]

    test_mask = np.isin(y_test, selected_classes)
    x_test = x_test[test_mask].reshape(test_mask.sum(), -1)
    y_test = y_test[test_mask]

    if random_subset_size is not None:
        random_indices = np.random.choice(len(x_train), size=random_subset_size, replace=False)
        x_train = x_train[random_indices]
        y_train = y_train[random_indices]

    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=1000, stratify=y_train, random_state=42)

    return x_train, y_train, x_val, y_val, x_test, y_test

def apply_pca(x_train, x_val, x_test, pca_dim=5):
    pca = PCA(n_components=pca_dim)
    x_train = pca.fit_transform(x_train)
    x_val = pca.transform(x_val)
    x_test = pca.transform(x_test)
    return x_train, x_val, x_test

def train_and_evaluate(x_train, y_train, x_val, y_val, x_test, y_test, epochs=50):
    tree = BoostTree()
    for _ in tqdm(range(epochs), desc='Training'):
        tree._add(x_train, y_train)

    y_pred_val = tree.predict(x_val)
    val_accuracy = 100 * np.mean(y_pred_val == y_val)
    print(f"Validation Accuracy: {val_accuracy:.2f}%")

    y_pred_test = tree.predict(x_test)
    test_accuracy = 100 * np.mean(y_pred_test == y_test)
    print(f"Test Accuracy: {test_accuracy:.2f}%")

def main():
    x_train, y_train, x_val, y_val, x_test, y_test = load_and_preprocess_data()
    x_train, x_val, x_test = apply_pca(x_train, x_val, x_test)
    train_and_evaluate(x_train, y_train, x_val, y_val, x_test, y_test)

if __name__ == '__main__':
    main()

Training: 100%|██████████| 50/50 [16:14<00:00, 19.49s/it]


Validation Accuracy: 98.18%
Test Accuracy: 98.47%
