In [1]:
import numpy as np
from sklearn.decomposition import PCA

def calculate_gini(labels):
    """Return the Gini impurity of a collection of integer class labels.

    The impurity is ``1 - sum(p_c ** 2)``, where ``p_c`` is the fraction of
    entries that carry class ``c``.  An empty collection is reported as ``0``.
    """
    total = len(labels)
    if total == 0:
        return 0
    # np.bincount gives one occurrence count per class id 0..max(labels).
    class_fractions = np.bincount(labels) / total
    return 1 - np.sum(class_fractions ** 2)

def _gini_from_counts(counts, total):
    """Gini impurity of a group described by its per-class occurrence counts."""
    fractions = counts / total
    return 1 - np.sum(fractions ** 2)


def find_best_split(feature, labels):
    """Find the threshold on one feature that minimises the weighted Gini impurity.

    Candidate thresholds are the midpoints between consecutive *distinct*
    sorted feature values, so the partition that is scored is exactly the
    partition ``feature <= threshold`` / ``feature > threshold`` that a caller
    obtains when applying the returned value.

    Parameters
    ----------
    feature : 1-D array-like of numbers, one value per sample.
    labels : 1-D array-like of non-negative integer class ids, aligned with
        ``feature``.

    Returns
    -------
    (best_gini, best_split_value)
        The lowest weighted impurity and its threshold.  When no valid split
        exists (fewer than two samples, or all feature values equal) the
        result is ``(float('inf'), None)``.
    """
    # Work in float so the midpoint sum cannot overflow for small integer dtypes.
    feature = np.asarray(feature, dtype=float)
    labels = np.asarray(labels)
    n_samples = len(feature)

    best_gini = float('inf')
    best_split_value = None
    if n_samples < 2:
        return best_gini, best_split_value

    order = np.argsort(feature)
    sorted_feature = feature[order]
    sorted_labels = labels[order]

    # Running class counts on each side of the candidate boundary; updating
    # them incrementally avoids re-counting every label for every candidate.
    right_counts = np.bincount(sorted_labels)
    left_counts = np.zeros_like(right_counts)

    for i in range(n_samples - 1):
        label = sorted_labels[i]
        left_counts[label] += 1
        right_counts[label] -= 1

        low, high = sorted_feature[i], sorted_feature[i + 1]
        if low == high:
            # A threshold cannot separate equal values; skip this boundary.
            continue

        n_left = i + 1
        n_right = n_samples - n_left
        gini = (n_left * _gini_from_counts(left_counts, n_left)
                + n_right * _gini_from_counts(right_counts, n_right)) / n_samples
        if gini < best_gini:
            split_value = 0.5 * (low + high)
            if not split_value < high:
                # Midpoint rounded up to `high` (adjacent floats): fall back to
                # `low`, which still yields exactly this left/right partition.
                split_value = low
            best_gini = gini
            best_split_value = split_value
    return best_gini, best_split_value

def grow_decision_tree(features, labels, max_nodes):
    """Grow a binary decision tree by repeatedly splitting the more impure child.

    The best (dimension, threshold) pair is chosen with ``find_best_split``
    over every column of ``features``.  Samples with
    ``features[:, dim] <= threshold`` go left, the rest go right.  If
    ``max_nodes`` allows it, only the child with the larger Gini impurity is
    split further (the left one on ties); the other child becomes a leaf that
    predicts its majority class.

    Parameters
    ----------
    features : 2-D array, one row per sample.
    labels : 1-D array of non-negative integer class ids, aligned with rows.
    max_nodes : number of further splits allowed below this node; ``0`` (or
        less) turns both children into leaves.

    Returns
    -------
    dict
        Either a split node with keys ``'split_dim'``, ``'split_value'``,
        ``'left'``, ``'gini_left'``, ``'gini_right'``, ``'right'``, or a leaf
        ``{'class': c}`` when the samples cannot be separated.

    Raises
    ------
    ValueError
        If ``labels`` is empty.
    """
    if len(labels) == 0:
        raise ValueError("cannot grow a decision tree from an empty sample set")

    n_features = features.shape[1]
    # Progress trace; kept so the output matches the recorded cell outputs.
    print("hII", features.shape)

    def majority_leaf(leaf_labels):
        # Leaf node predicting the most frequent class among leaf_labels.
        return {'class': np.argmax(np.bincount(leaf_labels))}

    best_split_dim = None
    best_split_value = None
    best_gini = float('inf')
    for dim in range(n_features):
        gini, split_value = find_best_split(features[:, dim], labels)
        if gini < best_gini:
            best_gini = gini
            best_split_dim = dim
            best_split_value = split_value

    # No usable split (e.g. a single sample or no features): stop here.
    if best_split_dim is None or best_split_value is None:
        return majority_leaf(labels)

    left_indices = features[:, best_split_dim] <= best_split_value
    right_indices = ~left_indices
    # A threshold that leaves one side empty separates nothing.
    if left_indices.all() or not left_indices.any():
        return majority_leaf(labels)

    left_labels = labels[left_indices]
    right_labels = labels[right_indices]
    gini_left = calculate_gini(left_labels)
    gini_right = calculate_gini(right_labels)

    may_expand = max_nodes > 0
    expand_left = may_expand and gini_left >= gini_right
    expand_right = may_expand and not expand_left

    if expand_left:
        left_child = grow_decision_tree(features[left_indices], left_labels, max_nodes - 1)
    else:
        left_child = majority_leaf(left_labels)

    if expand_right:
        right_child = grow_decision_tree(features[right_indices], right_labels, max_nodes - 1)
    else:
        right_child = majority_leaf(right_labels)

    return {
        'split_dim': best_split_dim,
        'split_value': best_split_value,
        'left': left_child,
        'gini_left': gini_left,
        'gini_right': gini_right,
        'right': right_child,
    }
def classify_sample(sample, node):
    """Walk the tree from ``node`` down to a leaf and return that leaf's class.

    At every split node the sample goes left when
    ``sample[split_dim] <= split_value`` and right otherwise; a leaf is any
    node that has a ``'class'`` key.
    """
    current = node
    while 'class' not in current:
        goes_left = sample[current['split_dim']] <= current['split_value']
        current = current['left'] if goes_left else current['right']
    return current['class']


def classify_samples(samples, node):
    """Classify every row of ``samples`` with the tree rooted at ``node``.

    Prints ``samples.shape`` first, then returns the per-row predictions as a
    plain Python list in row order.
    """
    print(samples.shape)
    return [classify_sample(row, node) for row in samples]

# Load the MNIST train/test arrays from the local .npz archive.
# The path is a raw string so the Windows backslashes ("\S", "\A", "\m") are
# taken literally instead of being parsed as invalid escape sequences, which
# recent Python versions warn about.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
with np.load(r"C:\Shared_archcraft\SML\Assignment3\mnist.npz") as data:
    x_train, y_train = data['x_train'], data['y_train']
    x_test, y_test = data['x_test'], data['y_test']


def pca(x):
    """Project ``x`` onto its own first 10 principal components.

    The samples are mean-centred, flattened to one row per sample, and a new
    ``PCA(n_components=10)`` is fitted on, and then applied to, that same
    data.  The shape of the projection is printed before it is returned.

    Note that the projection is fitted on whatever array is passed in, so two
    different inputs are generally projected onto different components.
    """
    centred = x - np.mean(x, axis=0)
    flat = centred.reshape(centred.shape[0], -1)

    model = PCA(n_components=10)
    model.fit(flat)
    projected = model.transform(flat)

    print(projected.shape)
    return projected



# Keep only the training samples whose label is 0, 1 or 2.
train_mask = np.isin(y_train, [0, 1, 2])
x_train_012 = x_train[train_mask]
y_train_012 = y_train[train_mask]

# Flatten every sample into a single feature row.
x_train_012 = x_train_012.reshape(x_train_012.shape[0], -1)

mean_vec = np.mean(x_train_012, axis=0)

# np.cov treats rows as variables, hence the transpose to get a
# feature-by-feature covariance matrix.
cov_mat = np.cov(x_train_012.T)

# The covariance matrix is symmetric, so use eigh: it is designed for
# symmetric/Hermitian input and returns real eigenvalues, whereas the general
# eig solver can return complex values caused purely by round-off.
eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

# Order the components from largest to smallest eigenvalue and keep the top 10.
sorted_indices = np.argsort(eig_vals)[::-1]
top_eig_vecs = eig_vecs[:, sorted_indices[:10]]

# NOTE(review): mean_vec / top_eig_vecs are not consumed by any cell visible
# here; the reduced features used downstream come from pca() below.
x_train_reduced = pca(x_train_012)
print(x_train_reduced.shape)

(18623, 10)
(18623, 10)


In [2]:

def q1():
    """Grow a tree on the reduced 0/1/2 training set with ``max_nodes=1``.

    The resulting tree dictionary is printed and returned.
    """
    tree = grow_decision_tree(x_train_reduced, y_train_012, 1)
    print(tree)
    return tree


node = q1()

hII (18623, 10)
hII (11923, 10)
{'split_dim': 0, 'split_value': -624.6830522025067, 'left': {'class': 1}, 'gini_left': 0.09149993316997107, 'gini_right': 0.528841468983415, 'right': {'split_dim': 1, 'split_value': 183.30346470912838, 'left': {'class': 0}, 'gini_left': 0.2631231332541133, 'gini_right': 0.20659939939939953, 'right': {'class': 2}}}


In [3]:
def q2_helper(x_test_012, y_test_012, node):
    """Evaluate a single tree on the given test samples and print its accuracy.

    The samples are reduced with ``pca`` (which fits on the samples it is
    given), classified with the tree rooted at ``node``, and the overall and
    per-class (0, 1, 2) accuracies are printed.  Nothing is returned.
    """
    # NOTE(review): pca() is fitted on the test samples here, not on the
    # training data the tree was grown from - confirm this is intended.
    x_test_reduced = pca(x_test_012)
    print(node)
    predicted = np.array(classify_samples(x_test_reduced, node))

    total_accuracy = np.mean(predicted == y_test_012)

    # Accuracy per class: fraction of the samples of that class predicted correctly.
    class_wise_accuracy = {
        digit: np.mean(predicted[y_test_012 == digit] == digit)
        for digit in [0, 1, 2]
    }

    print("Total Accuracy:", total_accuracy)
    print("Class-wise Accuracy:", class_wise_accuracy)
def q2(node):
    """Evaluate the tree rooted at ``node`` on the test samples labelled 0, 1 or 2."""
    is_012 = np.isin(y_test, [0, 1, 2])
    q2_helper(x_test[is_012], y_test[is_012], node)


q2(node)

(3147, 10)
{'split_dim': 0, 'split_value': -624.6830522025067, 'left': {'class': 1}, 'gini_left': 0.09149993316997107, 'gini_right': 0.528841468983415, 'right': {'split_dim': 1, 'split_value': 183.30346470912838, 'left': {'class': 0}, 'gini_left': 0.2631231332541133, 'gini_right': 0.20659939939939953, 'right': {'class': 2}}}
(3147, 10)
Total Accuracy: 0.899904671115348
Class-wise Accuracy: {0: 0.9428571428571428, 1: 0.9480176211453745, 2: 0.8062015503875969}


In [4]:
def majority_vote(predictions):
    """Return the class id that occurs most often in ``predictions``.

    Ties are resolved in favour of the smallest class id, because the first
    maximum of the per-class vote counts is taken.
    """
    votes_per_class = np.bincount(predictions)
    return np.argmax(votes_per_class)
def predict_majority_vote(x_test, trees):
    """Predict a class for every sample by majority vote over ``trees``.

    Each sample is classified by every tree and the most frequent answer is
    kept.  The predictions are returned as a list in sample order.
    """
    return [
        majority_vote([classify_sample(sample, tree) for tree in trees])
        for sample in x_test
    ]
def q3_helper(num_trees=5, sample_size=500, max_nodes=1, seed=None):
    """Train a bagged ensemble of small decision trees on bootstrap samples.

    For each tree, ``sample_size`` training rows are drawn with replacement
    from ``x_train_012`` / ``y_train_012``, reduced with ``pca`` and passed to
    ``grow_decision_tree``.

    Parameters
    ----------
    num_trees : number of trees to grow (default 5).
    sample_size : rows drawn, with replacement, per bootstrap sample (default 500).
    max_nodes : ``max_nodes`` argument forwarded to ``grow_decision_tree`` (default 1).
    seed : optional seed for the bootstrap sampling.  ``None`` (the default)
        keeps the sampling unseeded; pass an integer for reproducible draws.

    Returns
    -------
    list of tree dictionaries, one per bootstrap sample.
    """
    rng = np.random.default_rng(seed)

    def bootstrap_sample(x_train, y_train, sample_size=sample_size):
        # Draw row indices with replacement and apply them to both arrays.
        indices = rng.choice(len(x_train), size=sample_size, replace=True)
        return x_train[indices], y_train[indices]

    trees = []
    for _ in range(num_trees):
        x_sampled, y_sampled = bootstrap_sample(x_train_012, y_train_012)
        # NOTE(review): pca() fits a new projection on every bootstrap sample,
        # so each tree is grown in its own component basis - confirm intended.
        x_sampled = pca(x_sampled)
        trees.append(grow_decision_tree(x_sampled, y_sampled, max_nodes))
    return trees


trees = q3_helper()

(500, 10)
hII (500, 10)
hII (311, 10)
(500, 10)
hII (500, 10)
hII (309, 10)
(500, 10)
hII (500, 10)
hII (312, 10)
(500, 10)
hII (500, 10)
hII (308, 10)
(500, 10)
hII (500, 10)
hII (327, 10)


In [5]:
def q3(trees):
    """Evaluate the majority-vote ensemble on the test samples labelled 0, 1 or 2.

    The selected test samples are reduced with ``pca``, classified by majority
    vote over ``trees``, and the overall and per-class accuracies are printed.
    Nothing is returned.
    """
    is_012 = np.isin(y_test, [0, 1, 2])
    y_true = y_test[is_012]

    x_reduced = pca(x_test[is_012])
    predicted = np.array(predict_majority_vote(x_reduced, trees))

    total_accuracy = np.mean(predicted == y_true)

    # Accuracy per class: fraction of the samples of that class predicted correctly.
    class_wise_accuracy = {
        digit: np.mean(predicted[y_true == digit] == digit)
        for digit in [0, 1, 2]
    }

    print("Total Accuracy:", total_accuracy)
    print("Class-wise Accuracy:", class_wise_accuracy)


q3(trees)

(3147, 10)
Total Accuracy: 0.9062599300921512
Class-wise Accuracy: {0: 0.9714285714285714, 1: 0.9726872246696036, 2: 0.7713178294573644}
