In [1]:
from __future__ import division, print_function, absolute_import
from sklearn.linear_model import LogisticRegression
from tensorflow.examples.tutorials.mnist import input_data
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def l2_logistic_regression(train_data, test_data, train_labels, test_labels):
    
    # Train model
    model = LogisticRegression(penalty='l2', random_state=42)
    model.fit(train_data, train_labels)    
    
    # Test model
    y_train_pred = model.predict(train_data)
    y_test_pred = model.predict(test_data)

    # Evaluate model
    print('Logistic Regression - \nTrain Accuracy: ', round(np.sum(y_train_pred == train_labels) / train_data.shape[0], 4))
    print('Test Accuracy: ', round(np.sum(y_test_pred == test_labels) / test_data.shape[0], 4))

In [3]:
def knn(k, distance_matrix, train_labels, test_labels):

    predicted_labels = []       # List of predicted labels for test set 
    for i in range(distance_matrix.shape[0]):
          
        # Store indices of sorted neighbors
        neighbors = np.argsort(distance_matrix[i])
            
        # Store labels of k nearest neighbors
        k_nearest_labels = []   
        for j in range(k):
            k_nearest_labels.append(train_labels[neighbors[j]])

        # Store value of the max no. of k_nearest_labels
        predicted_labels.append(max(k_nearest_labels, key=k_nearest_labels.count))

    # Print accuracy
    print('k=%d:' % (k), str(round(accuracy_score(test_labels, np.asarray(predicted_labels)) * 100, 2)), '%')

In [4]:
def find_k_nearest_neighbors(idx, k, distance_matrix, train_labels, test_labels):

    # Store indices of sorted neighbors
    neighbors = np.argsort(distance_matrix[idx])

    # Store labels of k nearest neighbors
    k_nearest_labels = []   
    for j in range(k):
        k_nearest_labels.append(train_labels[neighbors[j]])

    # Store value of the max no. of k_nearest_labels
    predicted_label = max(k_nearest_labels, key=k_nearest_labels.count)

    print('Indices of neighbors')
    print(neighbors[:k])
    print('Labels of neighbors')
    print(k_nearest_labels)
    print('Predicted Label')
    print(predicted_label)
    print('Correct Label')
    print(test_labels[idx], '\n')

In [5]:
def knn_classification(y_train, y_test, original_distance_matrix, encoder_distance_matrix, decoder_distance_matrix):
    
    K = 5
    k_nearest_neighbors = 10
    
    # Classification on original data
    print('kNN on Original Image')
    knn(K, original_distance_matrix, y_train, y_test)

    # Classification on encoder data
    print('kNN on Encoder Image')
    knn(K, encoder_distance_matrix, y_train, y_test)

    # Classification on reconstructed data
    print('kNN on Decoder Image')
    knn(K, decoder_distance_matrix, y_train, y_test)

    for idx in [0, 400, 800, 1200, 1600, 1950]:
        print('k-Nearest Neighbors of Original Image')
        find_k_nearest_neighbors(idx, k_nearest_neighbors, original_distance_matrix, y_train, y_test)
        print('k-Nearest Neighbors of Encoder Image')
        find_k_nearest_neighbors(idx, k_nearest_neighbors, encoder_distance_matrix, y_train, y_test)
        print('k-Nearest Neighbors of Decoder Image')
        find_k_nearest_neighbors(idx, k_nearest_neighbors, decoder_distance_matrix, y_train, y_test)
        print('--------------------')

In [6]:
# Fetch data
spambase_path = abspath('datasets', 'spambase.data')
spambase_dataset = np.loadtxt(open(spambase_path, 'rb'), delimiter=',')

# Data and labels
spambase_data = spambase_dataset[:, list(range(0, spambase_dataset.shape[1] - 1))]
spambase_labels = spambase_dataset[:, spambase_dataset.shape[1] - 1]

print(spambase_dataset.shape)
print(spambase_data.shape)
print(spambase_labels.shape)

(4601, 58)
(3680, 57)
(921, 57)
(3680,)
(921,)


In [None]:
# Split into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(spambase_data, spambase_labels, test_size=0.20, random_state=42)

print(X_train.shape)
print(X_test.shape)

In [7]:
# Training Parameters
learning_rate = 0.01
num_steps = 5000
batch_size = 100

display_step = 1000
examples_to_show = 10

# Network Parameters
num_hidden_1 = 300 # 1st layer num features
num_hidden_2 = 100 # 2nd layer num features (the latent dim)
num_input = 57 # MNIST data input (img shape: 28*28)

# tf Graph input (only pictures)
X = tf.placeholder("float", [None, num_input])

weights = {
    'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1])),
    'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2])),
    'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1])),
    'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input])),
}
biases = {
    'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1])),
    'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2])),
    'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1])),
    'decoder_b2': tf.Variable(tf.random_normal([num_input])),
}

In [8]:
# Building the encoder
def encoder(x):
    # Encoder Hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']),
                                   biases['encoder_b1']))
    # Encoder Hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']),
                                   biases['encoder_b2']))
    return layer_2


# Building the decoder
def decoder(x):
    # Decoder Hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']),
                                   biases['decoder_b1']))
    # Decoder Hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']),
                                   biases['decoder_b2']))
    return layer_2

# Construct model
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

# Prediction
y_pred = decoder_op
# Targets (Labels) are the input data.
y_true = X

# Define loss and optimizer, minimize the squared error
loss = tf.reduce_mean(tf.pow(y_true - y_pred, 2))
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

In [9]:
# Start Training
# Start a new TF session
sess = tf.Session()

# Run the initializer
sess.run(init)

train_data_batches = [X_train[i:i + batch_size] for i in range(0, len(X_train), batch_size)]
train_labels_batches = [y_train[i:i + batch_size] for i in range(0, len(y_train), batch_size)]

# Training
for i in range(1, num_steps+1):
    # Prepare Data
    # Get the next batch of MNIST data (only images are needed, not labels)
    batch_x, _ = [train_data_batches[i % len(train_data_batches)], train_labels_batches[i % len(train_labels_batches)]]

    # Run optimization op (backprop) and cost op (to get loss value)
    _, l = sess.run([optimizer, loss], feed_dict={X: batch_x})
    # Display logs per step
    if i % display_step == 0 or i == 1:
        print('Step %i: Minibatch Loss: %f' % (i, l))

Step 1: Minibatch Loss: 5017.149414
Step 1000: Minibatch Loss: 5007.533203
Step 2000: Minibatch Loss: 6763.244141
Step 3000: Minibatch Loss: 5666.380371
Step 4000: Minibatch Loss: 4431.377930
Step 5000: Minibatch Loss: 4728.243652


In [10]:
# Classification on original data
l2_logistic_regression(X_train, X_test, y_train, y_test)

Logistic Regression - 
Train Accuracy:  0.9323
Test Accuracy:  0.9229


In [11]:
# Classification on encoder data
encoder_X_train = sess.run(encoder_op, feed_dict={X: X_train})
encoder_X_test = sess.run(encoder_op, feed_dict={X: X_test})
l2_logistic_regression(encoder_X_train, encoder_X_test, y_train, y_test)

Logistic Regression - 
Train Accuracy:  0.9141
Test Accuracy:  0.9197


In [12]:
# Classification on decoder data
decoder_X_train = sess.run(decoder_op, feed_dict={X: X_train})
decoder_X_test = sess.run(decoder_op, feed_dict={X: X_test})
l2_logistic_regression(decoder_X_train, decoder_X_test, y_train, y_test)

Logistic Regression - 
Train Accuracy:  0.8913
Test Accuracy:  0.8903


In [None]:
original_cosine_distance_matrix = cosine_distances(X_test, X_train)
encoder_cosine_distance_matrix = cosine_distances(encoder_X_test, encoder_X_train)
decoder_cosine_distance_matrix = cosine_distances(decoder_X_test, decoder_X_train)
original_euclidean_distance_matrix = euclidean_distances(X_test, X_train)
encoder_euclidean_distance_matrix = euclidean_distances(encoder_X_test, encoder_X_train)
decoder_euclidean_distance_matrix = euclidean_distances(decoder_X_test, decoder_X_train)

In [None]:
print('-------------------------------------------------')
print('Using Euclidean distances ....')
print('-------------------------------------------------')
knn_classification(y_train, y_test, original_cosine_distance_matrix, encoder_cosine_distance_matrix, decoder_cosine_distance_matrix)

In [None]:
print('-------------------------------------------------')
print('Using Cosine distances ....')
print('-------------------------------------------------')
knn_classification(y_train, y_test, original_euclidean_distance_matrix, encoder_euclidean_distance_matrix, decoder_euclidean_distance_matrix)

In [13]:
sampled_X_train = []
sampled_y_train = []
sampled_encoder_X_train = []
sampled_decoder_X_train = []

for i in range(len(X_train)):
    if i % 5 == 0:
        sampled_X_train.append(X_train[i])
        sampled_y_train.append(y_train[i])
        sampled_encoder_X_train.append(encoder_X_train[i])
        sampled_decoder_X_train.append(decoder_X_train[i])
        
sampled_X_test = []
sampled_y_test = []
sampled_encoder_X_test = []
sampled_decoder_X_test = []
for i in range(len(X_test)):
    if i % 5 == 0:
        sampled_X_test.append(X_test[i])
        sampled_y_test.append(y_test[i])
        sampled_encoder_X_test.append(encoder_X_test[i])
        sampled_decoder_X_test.append(decoder_X_test[i])

sampled_X_train = np.array(sampled_X_train)
sampled_X_test = np.array(sampled_X_test)
sampled_y_train = np.array(sampled_y_train)  #np.argmax(np.array(sampled_y_train), axis=1)
sampled_y_test = np.array(sampled_y_test) #np.argmax(np.array(sampled_y_test), axis=1)
sampled_encoder_X_train = np.array(sampled_encoder_X_train)
sampled_encoder_X_test = np.array(sampled_encoder_X_test)
sampled_decoder_X_train = np.array(sampled_decoder_X_train)
sampled_decoder_X_test = np.array(sampled_decoder_X_test)

original_cosine_distance_matrix = cosine_distances(sampled_X_test, sampled_X_train)
encoder_cosine_distance_matrix = cosine_distances(sampled_encoder_X_test, sampled_encoder_X_train)
decoder_cosine_distance_matrix = cosine_distances(sampled_decoder_X_test, sampled_decoder_X_train)
original_euclidean_distance_matrix = euclidean_distances(sampled_X_test, sampled_X_train)
encoder_euclidean_distance_matrix = euclidean_distances(sampled_encoder_X_test, sampled_encoder_X_train)
decoder_euclidean_distance_matrix = euclidean_distances(sampled_decoder_X_test, sampled_decoder_X_train)

In [14]:
print(original_euclidean_distance_matrix)
print(encoder_euclidean_distance_matrix)
print(decoder_euclidean_distance_matrix)

[[1536.27219477   11.20622613 1154.43116325 ...   66.33055457
   889.57855481  327.36906868]
 [1521.73691156    8.05833159 1140.04342586 ...   52.24044864
   875.21359243  312.89898934]
 [1129.4601315   397.88167209  768.07116835 ...  346.20287501
   508.76535554  104.0276995 ]
 ...
 [1690.66916339 3072.22410592 1941.13313301 ... 3017.1854029
  2201.16552569 2763.18594568]
 [ 624.78451704  910.03373646  259.37757022 ...  855.70611026
   111.84825221  593.51604769]
 [1481.58888749   44.44959302 1100.10257122 ...   17.07870598
   835.32952424  272.64531845]]
[[5.1629634 5.031575  4.9819    ... 5.6266074 4.1949472 4.354979 ]
 [4.702338  4.835774  4.4555364 ... 4.8411703 4.6504254 4.659995 ]
 [3.0190423 4.294355  3.2951205 ... 3.738743  3.2982914 3.2968235]
 ...
 [4.2286787 4.0948896 2.6621609 ... 3.8360438 3.2271783 3.3011303]
 [3.3571708 4.6037397 3.225803  ... 4.0259132 3.6999373 3.8256848]
 [5.264831  4.6121182 4.670916  ... 5.0390463 5.1290693 4.7809005]]
[[2.20773    1.7322783  2.143

In [15]:
knn_classification(sampled_y_train, sampled_y_test, original_cosine_distance_matrix, encoder_cosine_distance_matrix, decoder_cosine_distance_matrix)

kNN on Original Image
k=5: 70.81 %
kNN on Encoder Image
k=5: 85.95 %
kNN on Decoder Image
k=5: 79.46 %
k-Nearest Neighbors of Original Image
Indices of neighbors
[649 427 247  67 505 169 425  33 230  53]
Labels of neighbors
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Predicted Label
0.0
Correct Label
0.0 

k-Nearest Neighbors of Encoder Image
Indices of neighbors
[ 67 427 247 505 169 649 425 230  33 438]
Labels of neighbors
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Predicted Label
0.0
Correct Label
0.0 

k-Nearest Neighbors of Decoder Image
Indices of neighbors
[505 247 169 427 649  67  33 230 425 499]
Labels of neighbors
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Predicted Label
0.0
Correct Label
0.0 

--------------------
k-Nearest Neighbors of Original Image
Indices of neighbors
[365 212 458 200 345 184 398  89  21 228]
Labels of neighbors
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Predicted Label
0.0
Correct Label
0.0 

k-Nearest Neighbors of Encoder

In [16]:
knn_classification(sampled_y_train, sampled_y_test, original_euclidean_distance_matrix, encoder_euclidean_distance_matrix, decoder_euclidean_distance_matrix)

kNN on Original Image
k=5: 72.97 %
kNN on Encoder Image
k=5: 88.11 %
kNN on Decoder Image
k=5: 81.08 %
k-Nearest Neighbors of Original Image
Indices of neighbors
[427  67 649 505 169 247  33 425 230 438]
Labels of neighbors
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Predicted Label
0.0
Correct Label
0.0 

k-Nearest Neighbors of Encoder Image
Indices of neighbors
[ 67 505 169 427 247 649  33 425 230 438]
Labels of neighbors
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Predicted Label
0.0
Correct Label
0.0 

k-Nearest Neighbors of Decoder Image
Indices of neighbors
[649  67 169 505 247 427 425  33 230 499]
Labels of neighbors
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Predicted Label
0.0
Correct Label
0.0 

--------------------
k-Nearest Neighbors of Original Image
Indices of neighbors
[ 21 200 212  89 184 345 365 398 458 228]
Labels of neighbors
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Predicted Label
0.0
Correct Label
0.0 

k-Nearest Neighbors of Encoder