# Final

In [24]:
import secrets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from cvxopt import matrix
from cvxopt import solvers

from pathlib import Path
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics.pairwise import euclidean_distances

solvers.options['show_progress'] = False
sns.set_context("paper")

In [25]:
data_path = Path().resolve().parent / "data"

In [26]:
# Random seed for reproducibility
# secrets.randbits(128) # 208905213533139122735706682150229709525
rng = np.random.default_rng(208905213533139122735706682150229709525)

In [27]:
def get_data(data_path, labels, n_train_label, n_test_label, rng):
    """
    Returns train and test data for some labels.

    Parameters
    ----------

    data_path: pathlib.PosixPath
        Path of datasets

    labels: list
        Digits from MNIST set

    n_train_label: int
        Number of train samples for each label

    n_test_label: int
        Number of test samples for each label

    rng: numpy.random._generator.Generator
        Seed for reproducibility

    Returns
    -------
    tuple of np.array
        X_train, y_train, X_test, y_test

    """
    train_list = []  # Auxiliary list of train datasets
    for f_train in data_path.glob("train*.txt"):
        if f_train.stem.removeprefix("train") not in map(str, labels):
            continue
        raw_train = np.loadtxt(f_train)
        if n_train_label is not None:
            indices_train = rng.choice(raw_train.shape[0], n_train_label, replace=False)
            raw_train = raw_train[indices_train, :]
        target_train = raw_train[:, [0]]  # Target values, i.e. digit
        features_train = raw_train[:, 1:] / 255
        train_list.append(np.hstack((target_train, features_train)))
    train_data = np.vstack(train_list)  # Concatenate train datasets

    test_list = []
    for f_test in data_path.glob("test*.txt"):
        if f_test.stem.removeprefix("test") not in map(str, labels):
            continue
        raw_test = np.loadtxt(f_test)
        if n_test_label is not None:
            indices_test = rng.choice(raw_test.shape[0], n_test_label, replace=False)
            raw_test = raw_test[indices_test, :]
        target_test = raw_test[:, [0]]
        features_test = raw_test[:, 1:] / 255
        test_list.append(np.hstack((target_test, features_test)))
    test_data = np.vstack(test_list)
    X_train = train_data[:, 1:]
    y_train = train_data[:, 0].astype(int)
    X_test = test_data[:, 1:]
    y_test = test_data[:, 0].astype(int)
    return X_train, y_train, X_test, y_test

1. SVM

In [28]:
def radial_basis(X1, X2, gamma):
    K = np.exp(-gamma * euclidean_distances(X1, X2, squared=True))
    return K


class SVM_binary():
    def __init__(self, C, gamma):
        self.C = C
        self.gamma = gamma

    def train(self, X_train, y_train):
        n_train = X_train.shape[0]
        self.X_train = X_train
        self.y_train = y_train
        K_train = radial_basis(X_train, X_train, self.gamma)
        M = matrix(np.outer(y_train, y_train) * K_train)
        e = matrix(np.ones(shape=(n_train, 1), dtype=float))
        G = matrix(np.identity(n=n_train, dtype=float))
        h = matrix(self.C * e)
        A = matrix(y_train.reshape(1, -1).astype(float))
        b = matrix(0.0)
        self.sol = solvers.qp(M, e, G, h, A, b)
        self.alpha = np.array(self.sol["x"]).flatten()
        self.I = np.argwhere(self.alpha > 0).flatten()
        alpha_b_idx = self.alpha[(0 < self.alpha) & (self.alpha < self.C)].argmax()
        self.b = np.sum(
            y_train[self.I] *  self.alpha[self.I] * K_train[self.I, alpha_b_idx]
        )

    def predict(self, X_test):
        n_test = X_test.shape[0]
        K_test = radial_basis(self.X_train, X_test, self.gamma)
        class_number = (
            np.sum(
                self.y_train[self.I, np.newaxis]
                * self.alpha[self.I, np.newaxis]
                * K_test[self.I, :],
                axis=0
            )
            - self.b * np.ones(shape=(n_test))
        )
        y_pred = np.where(class_number > 0, 1, -1)
        return y_pred

    def accuracy(self, X_test, y_test):
        y_pred = self.predict(X_test)
        return np.mean(y_pred == y_test)

In [29]:
gamma = 0.03
C = 100

2. Classification 3 vs 6

In [30]:
labels = np.array([3, 6])
n_train_label = 500
n_test_label = 500
X_train_36, y_train_36, X_test_36, y_test_36 = get_data(
    data_path,
    labels,
    n_train_label,
    n_test_label,
    rng
)
min_label = np.min(labels)
y_train_bin_36 = np.where(y_train_36 == min_label, -1, 1)  # y_i \in {-1, 1}
y_test_bin_36 = np.where(y_test_36 == min_label, -1, 1)  # y_i \in {-1, 1}

In [31]:
svm_36 = SVM_binary(C=C, gamma=gamma)
svm_36.train(X_train_36, y_train_bin_36)

3. Accuracy 3 vs 6

In [32]:
svm_36.accuracy(X_test_36, y_test_bin_36)

0.846

4. Reduction of training examples

In [33]:
factors = [0.75, 0.90, 0.95]
n_train = X_train_36.shape[0]

In [34]:
X_train_36_dict = {}
y_train_36_dict = {}
for factor in factors:
    n_per_label = int((n_train - n_train * factor) / len(labels))
    X_train_tmp_list = []
    y_train_tmp_list = []
    for label in labels:
        mask_train = y_train_36 == label
        y_train_tmp = y_train_36[mask_train]
        X_train_tmp = X_train_36[mask_train, :]
        idx = rng.choice(y_train_tmp.shape[0], n_per_label, replace=False)
        X_train_tmp_list.append(X_train_tmp[idx, :])
        y_train_tmp_list.append(y_train_tmp[idx])
    X_train_36_dict[factor] = np.vstack(X_train_tmp_list)
    y_train_36_dict[factor] = np.concatenate(y_train_tmp_list)

In [35]:
for factor, X in X_train_36_dict.items():
    print(f"{factor} has {X.shape[0]} samples.")

0.75 has 250 samples.
0.9 has 100 samples.
0.95 has 50 samples.


In [36]:
pred_36 = {}
test_accuracy_36 = {}
for factor in factors:
    X_train = X_train_36_dict[factor]
    y_train = y_train_36_dict[factor]
    min_label = np.min(labels)
    y_train_bin = np.where(y_train == min_label, -1, 1)
    y_test_bin = np.where(y_test_36 == min_label, -1, 1)
    svm = SVM_binary(C=C, gamma=gamma)
    svm.train(X_train, y_train_bin)
    pred_36[factor] = svm.predict(X_test_36)
    test_accuracy_36[factor] = svm.accuracy(X_test_36, y_test_bin)


ValueError: attempt to get argmax of an empty sequence

In [37]:
test_accuracy_36

{0.75: 0.961, 0.9: 0.682}

5. Reduction of pixeles uniformly

In [50]:
n_features = X_train_36.shape[1]
X_train_36_pixel_red_dict = {}
X_test_36_pixel_red_dict = {}
for factor in factors:
    n_features_new = int(n_features * (1 - factor))
    idx = np.linspace(0, n_features, num=n_features_new, endpoint=False, dtype=int)
    X_train_36_pixel_red_dict[factor] = X_train_36[:, idx]
    X_test_36_pixel_red_dict[factor] = X_test_36[:, idx]

In [51]:
pred_pixel_36 = {}
test_accuracy_pixel_36 = {}
for factor in factors:
    X_train = X_train_36_pixel_red_dict[factor]
    X_test = X_test_36_pixel_red_dict[factor]
    svm = SVM_binary(C=C, gamma=gamma)
    svm.train(X_train, y_train_bin_36)
    pred_pixel_36[factor] = svm.predict(X_test)
    test_accuracy_pixel_36[factor] = svm.accuracy(X_test, y_test_bin_36)

In [52]:
test_accuracy_pixel_36

{0.75: 0.885, 0.9: 0.854, 0.95: 0.651}