In [45]:
from scipy.io import loadmat
import numpy as np

In [46]:
# Load the MNISTmini MATLAB file; the result is indexed by MATLAB variable name.
data = loadmat("MNISTmini.mat")

# Pull out the feature matrices (*_fea) and ground-truth labels (*_gnd)
# for the train and test portions in one pass.
train_fea, train_gnd, test_fea, test_gnd = (
    data[key] for key in ("train_fea1", "train_gnd1", "test_fea1", "test_gnd1")
)

# Shapes recorded by the original author (uncomment a print to re-check):
#   train_fea (60000, 100)   train_gnd (60000, 1)
#   test_fea  (10000, 100)   test_gnd  (10000, 1)

In [47]:
# Glue the label column onto the right-hand side of the feature matrix so
# that each row carries its own label in the last column.
train_samples = np.column_stack([train_fea, train_gnd])

In [None]:
from sklearn.utils import resample

# --- Sub-sampling configuration ---------------------------------------------
N_SUB_SAMPLES = 3000   # rows drawn (without replacement) from train_samples
N_PER_SPLIT = 1000     # rows in each of the train / validation / test partitions
N_PER_DIGIT = 50       # test rows kept per digit for the digit-5 / digit-8 evaluation
N_FEATURES = 100       # feature columns; the label is stored in the column right after them

# Labels in this dataset run 1-10 rather than 0-9: digit d is stored as class d + 1
# (check with np.unique(sub_samples[:, -1])).
DIGIT_5_CLASS = 6
DIGIT_8_CLASS = 9

# The slicing below addresses the label by position N_FEATURES; make sure that
# really is the last column of the stacked matrix.
assert train_samples.shape[1] == N_FEATURES + 1, (
    f"expected {N_FEATURES} feature columns plus 1 label column, "
    f"got {train_samples.shape[1]} columns"
)


def _split_features_labels(rows):
    """Split stacked rows into a (features, labels) pair."""
    return rows[:, :N_FEATURES], rows[:, N_FEATURES]


def _rows_with_label(rows, label):
    """Return the rows whose label column equals `label`."""
    return rows[rows[:, N_FEATURES] == label]


# Draw the working subset once; a fixed random_state keeps the partitions reproducible.
sub_samples = resample(train_samples, replace=False, n_samples=N_SUB_SAMPLES, random_state=20)

# Consecutive, non-overlapping partitions: train / validation / test.
sub_train_fea, sub_train_gnd = _split_features_labels(sub_samples[:N_PER_SPLIT])
sub_val_fea, sub_val_gnd = _split_features_labels(sub_samples[N_PER_SPLIT:2 * N_PER_SPLIT])
_sub_test = sub_samples[2 * N_PER_SPLIT:3 * N_PER_SPLIT]
sub_test_fea, sub_test_gnd = _split_features_labels(_sub_test)

# digit-5 samples (taken from the test partition only).
# The original author recorded 86 matching rows out of 1000 for this seed.
full_sample_digit_5 = _rows_with_label(_sub_test, DIGIT_5_CLASS)
assert len(full_sample_digit_5) >= N_PER_DIGIT, (
    f"only {len(full_sample_digit_5)} digit-5 rows in the test partition, need {N_PER_DIGIT}"
)
test_digit_5_fea, test_digit_5_gnd = _split_features_labels(full_sample_digit_5[:N_PER_DIGIT])

# digit-8 samples (taken from the test partition only).
# The original author recorded 115 matching rows out of 1000 for this seed.
full_sample_digit_8 = _rows_with_label(_sub_test, DIGIT_8_CLASS)
assert len(full_sample_digit_8) >= N_PER_DIGIT, (
    f"only {len(full_sample_digit_8)} digit-8 rows in the test partition, need {N_PER_DIGIT}"
)
test_digit_8_fea, test_digit_8_gnd = _split_features_labels(full_sample_digit_8[:N_PER_DIGIT])


In [49]:
# Logistic Classifier:

# Task: 
#   - Binary classification (digit-5 vs. digit-8)

# Suggestions:
#   - Use L2 regularization w/ hyperparameter determined via cross-validation
#   - Use 'liblinear' training algorithm (solver)

In [50]:
# Train / validation / test split (33/33/33): conventional X/y aliases for
# the partitions built from the sub-sample.
X_train, y_train = sub_train_fea, sub_train_gnd   # training set
X_val, y_val = sub_val_fea, sub_val_gnd           # validation set
X_test, y_test = sub_test_fea, sub_test_gnd       # test set (multi-class)

# Single-digit test subsets used for the per-digit scores.
X_test_digit_5, y_test_digit_5 = test_digit_5_fea, test_digit_5_gnd   # digit-5 / class 6
X_test_digit_8, y_test_digit_8 = test_digit_8_fea, test_digit_8_gnd   # digit-8 / class 9


In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier  
from sklearn.multiclass import OneVsRestClassifier

import time

In [52]:
# Inspect the label distribution of the training split to judge whether the
# classes are balanced or imbalanced.
classes, counts = np.unique(y_train, return_counts=True)
proportions = counts / len(y_train)  # fraction of the training split per class

# Absolute number of samples per class
for idx in range(len(classes)):
    print(f'Class {classes[idx]}: {counts[idx]} samples')

print()

# The same distribution expressed as percentages
for idx in range(len(classes)):
    print(f'Class {classes[idx]}: {proportions[idx]*100:.2f}%')

Class 1: 106 samples
Class 2: 102 samples
Class 3: 107 samples
Class 4: 101 samples
Class 5: 87 samples
Class 6: 95 samples
Class 7: 96 samples
Class 8: 101 samples
Class 9: 105 samples
Class 10: 100 samples

Class 1: 10.60%
Class 2: 10.20%
Class 3: 10.70%
Class 4: 10.10%
Class 5: 8.70%
Class 6: 9.50%
Class 7: 9.60%
Class 8: 10.10%
Class 9: 10.50%
Class 10: 10.00%


In [58]:
# One-vs-rest classifier with logistic regression as base estimator
ovr_clf = OneVsRestClassifier(
    LogisticRegression(
        # liblinear implements a trust region newton method
        # n_jobs = 1 (1 core, no parallelism)
        penalty='l2', solver='liblinear', max_iter=300, random_state=20, verbose=0
    )
)

# time.perf_counter() is the clock intended for measuring short intervals;
# time.time() is wall-clock time and can be too coarse for sub-millisecond spans.
# Only the fit itself is timed (estimator construction is excluded).
start = time.perf_counter()
ovr_clf.fit(X_train, y_train) # train classifier on multi-class training samples (1000 samples)
stop = time.perf_counter()

train_time = stop - start

print(f"Training time (OvR) (1000 training samples): {train_time:.3f}s")

#ovr_clf.predict(X_test) # test on multi-class samples from test set
#ovr_clf_score = ovr_clf.score(X_test, y_test)
#print(ovr_clf_score)

# Time prediction only; the returned labels are discarded here because the
# accuracy is computed separately via score() below.
start = time.perf_counter()
ovr_clf.predict(X_test_digit_5) # test classifier on digit-5 samples only from test set (50 samples)
ovr_clf.predict(X_test_digit_8) # test classifier on digit-8 samples only from test set (50 samples)
end = time.perf_counter()

inference_time = end - start

print(f'Inference time (OvR) (100 total test samples): {inference_time:.5f}s')

# Accuracy of the 10-class model restricted to samples whose true label is
# digit-5 (resp. digit-8).
ovr_clf_score_5 = ovr_clf.score(X_test_digit_5, y_test_digit_5)
ovr_clf_score_8 = ovr_clf.score(X_test_digit_8, y_test_digit_8)

# Score for digit-5, digit-8 classification over test set w/ one-vs-all classifier
print(f"digit-5 (50 test samples): {ovr_clf_score_5:.3f}")
print(f"digit-8 (50 test samples): {ovr_clf_score_8:.3f}")

Training time (OvR) (1000 training samples): 0.457s
Inference time (OvR) (100 total test samples): 0.00092s
digit-5 (50 test samples): 0.860
digit-8 (50 test samples): 0.660


In [67]:
# One-vs-one classifier with logistic regression as base estimator
ovo_clf = OneVsOneClassifier(
    LogisticRegression(
        penalty='l2', solver='liblinear', max_iter=300, random_state=20, verbose=0
    )
)

# time.perf_counter() is the clock intended for measuring short intervals;
# time.time() is wall-clock time and can be too coarse for sub-millisecond spans.
# Only the fit itself is timed (estimator construction is excluded).
start = time.perf_counter()
ovo_clf.fit(X_train, y_train) # train classifier on multi-class training samples (1000 samples)
stop = time.perf_counter()

train_time = stop - start

print(f"Training time (OvO) (1000 training samples): {train_time:.3f}s")

# Time prediction only; the returned labels are discarded here because the
# accuracy is computed separately via score() below.
start = time.perf_counter()
ovo_clf.predict(X_test_digit_5) # test classifier on digit-5 samples only from test set (50 samples)
ovo_clf.predict(X_test_digit_8) # test classifier on digit-8 samples only from test set (50 samples)
end = time.perf_counter()

inference_time = end - start

print(f'Inference time (OvO) (100 total test samples): {inference_time:.5f}s')

# Accuracy of the 10-class model restricted to samples whose true label is
# digit-5 (resp. digit-8).
ovo_clf_score_5 = ovo_clf.score(X_test_digit_5, y_test_digit_5)
ovo_clf_score_8 = ovo_clf.score(X_test_digit_8, y_test_digit_8)

# Score for digit-5, digit-8 classification w/ one-vs-one classifier
print(f"digit-5 (50 test samples): {ovo_clf_score_5:.3f}")
print(f"digit-8 (50 test samples): {ovo_clf_score_8:.3f}")

#ovo_clf.predict(X_test) # test on multi-class samples
#ovo_clf_score = ovo_clf.score(X_test, y_test)
#print(ovo_clf_score)

Training time (OvO) (1000 training samples): 0.076s
Inference time (OvO) (100 total test samples): 0.00629s
digit-5 (50 test samples): 0.960
digit-8 (50 test samples): 0.820


In [None]:
# Things to explore: 
# (1) Why the classifier performs better on digit-5 compared to digit-8.
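# (2) Why is the one-vs-one classifier slower at test time than one-vs-all? Likely because OvO evaluates k(k-1)/2 = 45 pairwise classifiers per prediction versus k = 10 for OvR, rather than a difference in sample dimensions (both are given the same test inputs).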
# (3) Track the training and inference time for both types of classifiers.
# (4) Use cross-validation to optimize the regularization hyperparameter.
# (5) Plot validation vs training error

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

# Multi-class random forest trained on the same 1000-sample training split
# as the logistic-regression classifiers, so the numbers are comparable.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# time.perf_counter() is the clock intended for measuring short intervals;
# time.time() is wall-clock time and can be too coarse for sub-millisecond spans.
# Only the fit itself is timed (estimator construction is excluded).
start = time.perf_counter()
model.fit(X_train, y_train)
stop = time.perf_counter()

train_time = stop - start

# Labels below name the model actually being timed (previously mislabelled "OvR").
print(f"Training time (Random Forest) (1000 training samples): {train_time:.3f}s")

#model.predict(X_test) # test on multi-class samples from test set
#rfc_score = model.score(X_test, y_test)
#print(rfc_score)

# Time prediction only; the returned labels are discarded here because the
# accuracy is computed separately via score() below.
start = time.perf_counter()
model.predict(X_test_digit_5) # test classifier on digit-5 samples only from test set (50 samples)
model.predict(X_test_digit_8) # test classifier on digit-8 samples only from test set (50 samples)
end = time.perf_counter()

inference_time = end - start

print(f'Inference time (Random Forest) (100 total test samples): {inference_time:.5f}s')

# Accuracy of the 10-class model restricted to samples whose true label is
# digit-5 (resp. digit-8).
rfc_score_5 = model.score(X_test_digit_5, y_test_digit_5)
rfc_score_8 = model.score(X_test_digit_8, y_test_digit_8)

# Score for digit-5, digit-8 classification over test set w/ random forest classifier
print(f"digit-5 (50 test samples): {rfc_score_5:.3f}")
print(f"digit-8 (50 test samples): {rfc_score_8:.3f}")

Training time (OvR) (1000 training samples): 0.205s
Inference time (OvR) (100 total test samples): 0.00495s
digit-5 (50 test samples): 0.980
digit-8 (50 test samples): 0.900
