In [1]:
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
import numpy as np

In [2]:
def l2_logistic_regression(train_data, test_data, train_labels, test_labels):
    
    # Train model
    model = LogisticRegression(penalty='l2', random_state=42)
    model.fit(train_data, train_labels)

    # Test model
    y_train_pred = model.predict(train_data)
    y_test_pred = model.predict(test_data)

    # Evaluate model
    print('\nLogistic Regression - \nTrain Accuracy: ', round(np.sum(y_train_pred == train_labels) / train_data.shape[0], 4))
    print('Test Accuracy: ', round(np.sum(y_test_pred == test_labels) / test_data.shape[0], 4))

In [3]:
def decision_tree_classifier(train_data, test_data, train_labels, test_labels):

    # Train model
    model = DecisionTreeClassifier(random_state=42)
    model.fit(train_data, train_labels)
    
    # Test model
    y_train_pred = model.predict(train_data)
    y_test_pred = model.predict(test_data)

    # Evaluate model
    print('\nDecision Tree - \nTrain Accuracy: ', round(np.sum(y_train_pred == train_labels) / train_data.shape[0], 4))
    print('Test Accuracy: ', round(np.sum(y_test_pred == test_labels) / test_data.shape[0], 4))

In [4]:
def pca(data, labels, n_components):

    model = PCA(n_components=n_components)
    model.fit(data)
    X = model.transform(data)

    # Split into training and testing datasets
    new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(X, labels, test_size=0.20, random_state=42)
    
    l2_logistic_regression(new_X_train, new_X_test, new_y_train, new_y_test)
    decision_tree_classifier(new_X_train, new_X_test, new_y_train, new_y_test)

In [5]:
def my_pca(data, d):
    u = np.mean(data, axis=0)
    cov = np.dot((data - u).T, (data - u)) / (data.shape[0] - 1)
    eigen_values, eigen_vectors = np.linalg.eig(cov)
    indices = np.argsort(abs(eigen_values))[::-1]

    output = np.zeros((eigen_values.shape[0], d))
    for i in range(eigen_values.shape[0]):
        for j in range(d):
            output[i][j] = eigen_vectors[i][indices[j]]

    return output

In [6]:
def run_my_pca(X_train, X_test, y_train, y_test, d):
    
    matrix_w = my_pca(X_train, d)
    
    new_X_train = np.dot(X_train, matrix_w)
    new_X_test = np.dot(X_test, matrix_w)
    
    l2_logistic_regression(new_X_train, new_X_test, y_train, y_test)
    decision_tree_classifier(new_X_train, new_X_test, y_train, y_test)

In [7]:
# Fetch data
mnist_dataset = fetch_mldata('MNIST original')
mnist_data = mnist_dataset.data
mnist_labels = mnist_dataset.target

In [8]:
# Split into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(mnist_data, mnist_labels, test_size=0.20, random_state=42)

In [9]:
print('Before PCA')
l2_logistic_regression(X_train, X_test, y_train, y_test)
decision_tree_classifier(X_train, X_test, y_train, y_test)

Before PCA

Logistic Regression - 
Train Accuracy:  0.9327
Test Accuracy:  0.9179

Decision Tree - 
Train Accuracy:  1.0
Test Accuracy:  0.8737


In [10]:
print('PCA using library')
for d in [5, 20]:
    print('d=' + str(d))
    pca(mnist_data, mnist_labels, d)
    print('--------------')

PCA using library
d=5

Logistic Regression - 
Train Accuracy:  0.645
Test Accuracy:  0.6509

Decision Tree - 
Train Accuracy:  1.0
Test Accuracy:  0.6644
--------------
d=20

Logistic Regression - 
Train Accuracy:  0.8606
Test Accuracy:  0.8637

Decision Tree - 
Train Accuracy:  1.0
Test Accuracy:  0.8469
--------------


In [11]:
print('PCA own implementation')
for d in [5, 20]:
    print('d=' + str(d))
    run_my_pca(X_train, X_test, y_train, y_test, d)
    print('--------------')

PCA own implementation
d=5


  # Remove the CWD from sys.path while we load stuff.



Logistic Regression - 
Train Accuracy:  0.6449
Test Accuracy:  0.6514

Decision Tree - 
Train Accuracy:  1.0
Test Accuracy:  0.6618
--------------
d=20

Logistic Regression - 
Train Accuracy:  0.8596
Test Accuracy:  0.864

Decision Tree - 
Train Accuracy:  1.0
Test Accuracy:  0.85
--------------


# With normalizing

In [12]:
# Normalize data
norm_mnist_data = np.divide(mnist_data, 255)
norm_X_train = np.divide(X_train, 255)
norm_X_test = np.divide(X_test, 255)

print(norm_mnist_data.shape)
print(norm_X_train.shape)
print(norm_X_test.shape)

(70000, 784)
(56000, 784)
(14000, 784)


In [13]:
print('Before PCA')
l2_logistic_regression(norm_X_train, norm_X_test, y_train, y_test)
decision_tree_classifier(norm_X_train, norm_X_test, y_train, y_test)

Before PCA

Logistic Regression - 
Train Accuracy:  0.9287
Test Accuracy:  0.9178

Decision Tree - 
Train Accuracy:  1.0
Test Accuracy:  0.8737


In [14]:
print('PCA using library')
for d in [5, 20]:
    print('d=' + str(d))
    pca(norm_mnist_data, mnist_labels, d)
    print('--------------')

PCA using library
d=5

Logistic Regression - 
Train Accuracy:  0.6449
Test Accuracy:  0.6509

Decision Tree - 
Train Accuracy:  1.0
Test Accuracy:  0.6656
--------------
d=20

Logistic Regression - 
Train Accuracy:  0.8608
Test Accuracy:  0.8639

Decision Tree - 
Train Accuracy:  1.0
Test Accuracy:  0.8468
--------------


In [15]:
print('PCA own implementation')
for d in [5, 20]:
    print('d=' + str(d))
    run_my_pca(norm_X_train, norm_X_test, y_train, y_test, d)
    print('--------------')

PCA own implementation
d=5


  # Remove the CWD from sys.path while we load stuff.



Logistic Regression - 
Train Accuracy:  0.6452
Test Accuracy:  0.6527

Decision Tree - 
Train Accuracy:  1.0
Test Accuracy:  0.6618
--------------
d=20

Logistic Regression - 
Train Accuracy:  0.861
Test Accuracy:  0.8646

Decision Tree - 
Train Accuracy:  1.0
Test Accuracy:  0.85
--------------
