In [1]:
from __future__ import division, print_function, absolute_import
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from tensorflow.examples.tutorials.mnist import input_data
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

Instructions for updating:
Use the retry module or similar alternatives.


In [2]:
def l2_logistic_regression(train_data, test_data, train_labels, test_labels):
    """Fit an L2-penalised logistic regression and print train/test accuracy.

    Args:
        train_data: feature matrix used to fit the classifier (rows are samples).
        test_data: feature matrix used only for evaluation.
        train_labels: labels aligned with the rows of ``train_data``.
        test_labels: labels aligned with the rows of ``test_data``.

    Returns:
        None. Both accuracies are printed, rounded to four decimals.
    """
    # Fixed random_state keeps the fitted model repeatable between runs
    classifier = LogisticRegression(penalty='l2', random_state=42)
    classifier.fit(train_data, train_labels)

    def hit_rate(features, labels):
        # Fraction of rows whose predicted label equals the reference label
        hits = np.sum(classifier.predict(features) == labels)
        return round(hits / features.shape[0], 4)

    train_accuracy = hit_rate(train_data, train_labels)
    test_accuracy = hit_rate(test_data, test_labels)

    print('Logistic Regression - \nTrain Accuracy: ', train_accuracy)
    print('Test Accuracy: ', test_accuracy)

In [3]:
def knn(k, distance_matrix, train_labels, test_labels):
    """Classify every test sample by majority vote among its k nearest training samples.

    Args:
        k: number of neighbours that vote; must satisfy 1 <= k <= number of training samples.
        distance_matrix: 2-D array-like where entry [i, j] is the distance between
            test sample i and training sample j.
        train_labels: labels of the training samples, one per column of ``distance_matrix``.
        test_labels: true labels of the test samples, one per row of ``distance_matrix``.

    Returns:
        The accuracy as a float in [0, 1]. The same figure is also printed as a percentage.

    Raises:
        ValueError: if the matrix is not 2-D or has no rows, if ``k`` is out of range,
            or if the label sequences do not match the matrix dimensions.
    """
    # Also turns np.matrix input into a plain ndarray, so each row below is 1-D
    distance_matrix = np.asarray(distance_matrix)
    if distance_matrix.ndim != 2:
        raise ValueError('distance_matrix must be 2-D, got %d dimension(s)' % distance_matrix.ndim)

    n_test, n_train = distance_matrix.shape
    if n_test == 0:
        raise ValueError('distance_matrix has no test rows')
    if not 1 <= k <= n_train:
        raise ValueError('k must be between 1 and %d, got %r' % (n_train, k))
    if len(train_labels) != n_train:
        raise ValueError('expected %d train labels, got %d' % (n_train, len(train_labels)))
    if len(test_labels) != n_test:
        raise ValueError('expected %d test labels, got %d' % (n_test, len(test_labels)))

    predicted_labels = []       # Predicted label of each test sample, in row order
    for distances in distance_matrix:

        # Indices of the k closest training samples, nearest first
        nearest = np.argsort(distances)[:k]
        k_nearest_labels = [train_labels[j] for j in nearest]

        # Majority vote; on a tie max() keeps the label that occurs first,
        # i.e. the one belonging to the nearest of the tied neighbours
        predicted_labels.append(max(k_nearest_labels, key=k_nearest_labels.count))

    # Fraction of test samples whose predicted label equals the true label
    accuracy = float(np.mean(np.asarray(predicted_labels) == np.asarray(test_labels)))

    # Print accuracy
    print('k=%d:' % (k), str(round(accuracy * 100, 2)), '%')
    return accuracy

In [4]:
def find_k_nearest_neighbors(idx, k, distance_matrix, train_labels, test_labels):
    """Print the k nearest training neighbours of one test sample and the voted label.

    Args:
        idx: row of ``distance_matrix`` (and position in ``test_labels``) to inspect.
        k: number of neighbours to report and to use in the vote.
        distance_matrix: indexable by ``idx``; the selected row holds the distances
            from that test sample to every training sample.
        train_labels: labels of the training samples, indexed like the row entries.
        test_labels: true labels of the test samples.

    Returns:
        None. Neighbour indices, their labels, the predicted label and the true
        label are printed.
    """
    # Training indices ordered by increasing distance to the chosen test sample
    ranked = np.argsort(distance_matrix[idx])

    # Labels of the k closest training samples, nearest first
    votes = [train_labels[ranked[pos]] for pos in range(k)]

    # Majority label; on a tie max() keeps the candidate that appears first in `votes`
    winner = max(votes, key=votes.count)

    report = (
        ('Indices of neighbors', ranked[:k]),
        ('Labels of neighbors', votes),
        ('Predicted Label', winner),
    )
    for heading, value in report:
        print(heading)
        print(value)
    print('Correct Label')
    print(test_labels[idx], '\n')

In [5]:
def knn_classification(y_train, y_test, original_distance_matrix, encoder_distance_matrix, decoder_distance_matrix,
                       k=5, neighbor_k=10, sample_indices=(0, 400, 800, 1200, 1600, 1950)):
    """Run the kNN evaluation on the original, encoded and reconstructed representations.

    First prints the kNN accuracy for each representation, then prints the
    nearest neighbours of a few selected test samples for each representation.

    Args:
        y_train: labels of the training samples.
        y_test: labels of the test samples.
        original_distance_matrix: test-to-train distances computed on the original features.
        encoder_distance_matrix: test-to-train distances computed on the encoder output.
        decoder_distance_matrix: test-to-train distances computed on the decoder output.
        k: number of neighbours used for the accuracy figures (default 5, the
            value that used to be hard-coded).
        neighbor_k: number of neighbours listed for each inspected sample (default 10).
        sample_indices: test-sample indices to inspect; every index must be a
            valid row of the distance matrices. Defaults to the indices that
            used to be hard-coded.

    Returns:
        None. All results are printed.
    """
    # (display name, distance matrix) pairs, in the order they are reported
    representations = (
        ('Original', original_distance_matrix),
        ('Encoder', encoder_distance_matrix),
        ('Decoder', decoder_distance_matrix),
    )

    # Accuracy of kNN classification on each representation
    for name, distance_matrix in representations:
        print('kNN on %s Image' % name)
        knn(k, distance_matrix, y_train, y_test)

    # Neighbour listing for each selected test sample, one representation after another
    for idx in sample_indices:
        for name, distance_matrix in representations:
            print('k-Nearest Neighbors of %s Image' % name)
            find_k_nearest_neighbors(idx, neighbor_k, distance_matrix, y_train, y_test)
        print('--------------------')
        print('--------------------')

In [8]:
# Download (or load from the local cache) both predefined splits of 20 Newsgroups
ng_train = fetch_20newsgroups(subset='train')
ng_test = fetch_20newsgroups(subset='test')

# Raw documents of each split
X_train = ng_train.data
X_test = ng_test.data

# `target` holds one integer per document, indexing into `target_names`;
# translate those integers into the newsgroup names used as labels below
y_train = [ng_train.target_names[label_id] for label_id in ng_train.target]
y_test = [ng_test.target_names[label_id] for label_id in ng_test.target]

# Sanity check: documents and labels must have matching lengths per split
for collection in (X_train, X_test, y_train, y_test):
    print(len(collection))

11314
7532
11314
7532


In [9]:
# Learn the TF-IDF vocabulary and weights on the training documents only,
# then project both splits into that space
tfidf = TfidfVectorizer().fit(X_train)
vect_X_train, vect_X_test = [tfidf.transform(documents) for documents in (X_train, X_test)]

# (n_documents, n_features) of each split
for vectorized in (vect_X_train, vect_X_test):
    print(vectorized.shape)

(11314, 130107)
(7532, 130107)


In [10]:
# Reduce data to top K features
K = 1000
# `k` is keyword-only in current scikit-learn releases; passing it by name
# works on both old and new versions
model = SelectKBest(chi2, k=K)

model.fit(vect_X_train, y_train)
importances = model.scores_      # chi-squared score of each feature

# Feature indices ordered from highest to lowest score
indices = np.argsort(importances)[::-1]

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; use whichever this installation provides
if hasattr(tfidf, 'get_feature_names_out'):
    features = tfidf.get_feature_names_out()
else:
    features = tfidf.get_feature_names()

# Store top K features (the terms themselves, best first)
new_features = [features[feature_idx] for feature_idx in indices[:K]]

# Convert train and test data to vectors restricted to those K terms
# NOTE(review): the terms were selected from a vectorizer without stop-word
# filtering, while this one drops English stop words, so any selected stop
# word would presumably end up as an all-zero column - confirm this is intended
new_tfidf = TfidfVectorizer(stop_words='english', vocabulary=new_features)
new_tfidf.fit(X_train)

vect_X_train = new_tfidf.transform(X_train)
vect_X_test = new_tfidf.transform(X_test)

In [11]:
# Densify the sparse TF-IDF matrices. toarray() yields a plain ndarray, whereas
# todense() yields the legacy np.matrix type that current scikit-learn
# estimators and metrics refuse to accept.
X_train = vect_X_train.toarray()
X_test = vect_X_test.toarray()

# Labels as arrays so they can be compared element-wise and indexed by position
y_train = np.array(y_train)
y_test = np.array(y_test)

In [14]:
# Fix the graph-level seed so the random weight initialisation below is
# repeatable from run to run (the logistic regression is seeded with 42 as well)
tf.set_random_seed(42)

# Training Parameters
learning_rate = 0.01
num_steps = 1000
batch_size = 256

display_step = 1000     # the loss is printed at step 1 and at every multiple of this
examples_to_show = 10   # not referenced by the cells visible in this notebook

# Network Parameters
num_hidden_1 = 200 # width of the single hidden layer (the encoder output)
num_hidden_2 = 300 # not referenced by the single-layer network built below
num_input = K # input width = number of selected TF-IDF features

# tf Graph input: a batch of feature vectors, any batch size
X = tf.placeholder("float", [None, num_input])

# One affine layer for the encoder and one for the decoder, randomly initialised
weights = {
    'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1])),
    'decoder_h1': tf.Variable(tf.random_normal([num_hidden_1, num_input])),
}
biases = {
    'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1])),
    'decoder_b1': tf.Variable(tf.random_normal([num_input])),
}

In [15]:
# Building the encoder
def encoder(x):
    """Map ``x`` to the hidden representation: affine layer followed by a sigmoid."""
    hidden_weights = weights['encoder_h1']
    hidden_bias = biases['encoder_b1']
    pre_activation = tf.add(tf.matmul(x, hidden_weights), hidden_bias)
    return tf.nn.sigmoid(pre_activation)

# Building the decoder
def decoder(x):
    """Map a hidden representation back to input space: affine layer followed by a sigmoid."""
    output_weights = weights['decoder_h1']
    output_bias = biases['decoder_b1']
    pre_activation = tf.add(tf.matmul(x, output_weights), output_bias)
    return tf.nn.sigmoid(pre_activation)

# Wire the autoencoder: input -> hidden code -> reconstruction
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

# The reconstruction is the prediction ...
y_pred = decoder_op
# ... and the input itself is the target
y_true = X

# Mean squared reconstruction error, minimised with RMSProp
squared_error = tf.pow(y_true - y_pred, 2)
loss = tf.reduce_mean(squared_error)
rmsprop = tf.train.RMSPropOptimizer(learning_rate)
optimizer = rmsprop.minimize(loss)   # note: this is the training op, not the optimizer object

# Op that assigns every variable its initial value
init = tf.global_variables_initializer()

In [16]:
# Start Training
# Open a TF session; it is left open because later cells reuse `sess`
sess = tf.Session()

# Give every variable its initial value
sess.run(init)

# Pre-slice the training set into consecutive mini-batches (the last one may be shorter)
train_data_batches = [X_train[start:start + batch_size] for start in range(0, len(X_train), batch_size)]
train_labels_batches = [y_train[start:start + batch_size] for start in range(0, len(y_train), batch_size)]

# Training
for i in range(1, num_steps+1):
    # Cycle through the pre-sliced batches; the autoencoder only needs the
    # features, so the label batches are not fed to the graph
    batch_x = train_data_batches[i % len(train_data_batches)]

    # One optimisation step; also fetch the loss for logging
    _, l = sess.run([optimizer, loss], feed_dict={X: batch_x})

    # Log the first step and then every `display_step` steps
    if i == 1 or i % display_step == 0:
        print('Step %i: Minibatch Loss: %f' % (i, l))

Step 1: Minibatch Loss: 0.445287


KeyboardInterrupt: 

In [14]:
# Baseline: logistic regression on the original TF-IDF features
l2_logistic_regression(
    train_data=X_train,
    test_data=X_test,
    train_labels=y_train,
    test_labels=y_test,
)

Logistic Regression - 
Train Accuracy:  0.8984
Test Accuracy:  0.782


In [15]:
# Push both splits through the encoder, then classify the hidden codes
encoder_X_train, encoder_X_test = [
    sess.run(encoder_op, feed_dict={X: split}) for split in (X_train, X_test)
]
l2_logistic_regression(
    train_data=encoder_X_train,
    test_data=encoder_X_test,
    train_labels=y_train,
    test_labels=y_test,
)

Logistic Regression - 
Train Accuracy:  0.7538
Test Accuracy:  0.6178


In [16]:
# Push both splits through the full autoencoder, then classify the reconstructions
decoder_X_train, decoder_X_test = [
    sess.run(decoder_op, feed_dict={X: split}) for split in (X_train, X_test)
]
l2_logistic_regression(
    train_data=decoder_X_train,
    test_data=decoder_X_test,
    train_labels=y_train,
    test_labels=y_test,
)

Logistic Regression - 
Train Accuracy:  0.2885
Test Accuracy:  0.2147


In [17]:
# (test, train) feature pairs for the original, encoded and reconstructed data
feature_pairs = [
    (X_test, X_train),
    (encoder_X_test, encoder_X_train),
    (decoder_X_test, decoder_X_train),
]

# Test-to-train cosine distances for each representation
(original_cosine_distance_matrix,
 encoder_cosine_distance_matrix,
 decoder_cosine_distance_matrix) = [
    cosine_distances(test_features, train_features)
    for test_features, train_features in feature_pairs
]

# Test-to-train Euclidean distances for each representation
(original_euclidean_distance_matrix,
 encoder_euclidean_distance_matrix,
 decoder_euclidean_distance_matrix) = [
    euclidean_distances(test_features, train_features)
    for test_features, train_features in feature_pairs
]

In [18]:
# Show the Euclidean distance matrices (numpy abbreviates large arrays when printing)
for euclidean_matrix in (original_euclidean_distance_matrix,
                         encoder_euclidean_distance_matrix,
                         decoder_euclidean_distance_matrix):
    print(euclidean_matrix)

[[1.29699703 1.34797196 1.15252602 ... 1.32501457 1.35570769 1.40048142]
 [1.3100872  1.34037821 1.2068342  ... 1.3346829  1.32227752 1.3799027 ]
 [1.38766882 1.39077191 1.32665566 ... 1.38965758 1.38643113 1.40949877]
 ...
 [1.24641856 1.34704568 1.12103862 ... 1.30716256 1.33975397 1.39929128]
 [1.38396084 1.30463927 1.33261582 ... 1.38534656 1.38871811 1.40174279]
 [1.34798992 1.38806177 1.2848917  ... 1.35293219 1.28842777 1.40647872]]
[[2.982163  2.7886112 2.4877844 ... 2.9892602 2.881203  2.8749177]
 [2.9845629 2.8570328 2.4858747 ... 2.968079  2.8960345 3.0808465]
 [3.1438835 3.1569703 2.7153494 ... 3.0679073 3.1852374 3.055724 ]
 ...
 [3.0133624 2.7740948 2.1846032 ... 2.961271  2.8668973 3.1065128]
 [3.1748478 2.7347307 2.7060602 ... 2.8281305 2.9967136 2.9330983]
 [3.1365168 3.0460536 2.7134213 ... 2.9467225 3.1675787 2.974955 ]]
[[0.07473096 0.12327917 0.05606524 ... 0.08899052 0.0782226  0.08769509]
 [0.26221386 0.27654484 0.25315392 ... 0.26639938 0.26212654 0.27184245]
 [

In [19]:
# kNN evaluation using cosine distances
knn_classification(
    y_train=y_train,
    y_test=y_test,
    original_distance_matrix=original_cosine_distance_matrix,
    encoder_distance_matrix=encoder_cosine_distance_matrix,
    decoder_distance_matrix=decoder_cosine_distance_matrix,
)

kNN on Original Image
k=5: 65.56 %
kNN on Encoder Image
k=5: 62.31 %
kNN on Decoder Image
k=5: 26.23 %
k-Nearest Neighbors of Original Image
Indices of neighbors
[5272 9048 2577 2297 6959 5540 8841 9228 9229 7763]
Labels of neighbors
[7, 12, 3, 6, 5, 1, 10, 19, 19, 10]
Predicted Label
10
Correct Label
7 

k-Nearest Neighbors of Encoder Image
Indices of neighbors
[5272 9048 2577 5540 6959 2297 6677 8841 2833 7148]
Labels of neighbors
[7, 12, 3, 1, 5, 6, 13, 10, 6, 10]
Predicted Label
6
Correct Label
7 

k-Nearest Neighbors of Decoder Image
Indices of neighbors
[6677 5464 5364  572 4264 8475 8876  487 2443   86]
Labels of neighbors
[13, 3, 15, 15, 15, 0, 16, 19, 8, 2]
Predicted Label
15
Correct Label
7 

--------------------
k-Nearest Neighbors of Original Image
Indices of neighbors
[11195  9076  3577   974   310  9224  8091  7260  4586 10026]
Labels of neighbors
[0, 0, 0, 0, 0, 0, 15, 0, 0, 0]
Predicted Label
0
Correct Label
0 

k-Nearest Neighbors of Encoder Image
Indices of neighbors


In [20]:
# kNN evaluation using Euclidean distances
knn_classification(
    y_train=y_train,
    y_test=y_test,
    original_distance_matrix=original_euclidean_distance_matrix,
    encoder_distance_matrix=encoder_euclidean_distance_matrix,
    decoder_distance_matrix=decoder_euclidean_distance_matrix,
)

kNN on Original Image
k=5: 65.18 %
kNN on Encoder Image
k=5: 62.9 %
kNN on Decoder Image
k=5: 26.41 %
k-Nearest Neighbors of Original Image
Indices of neighbors
[5272 9048 2577 2297 6959 5540 8841 9228 9229 7763]
Labels of neighbors
[7, 12, 3, 6, 5, 1, 10, 19, 19, 10]
Predicted Label
10
Correct Label
7 

k-Nearest Neighbors of Encoder Image
Indices of neighbors
[5272 9048 2577 5540 6959 2297 8841 6677 2833 7148]
Labels of neighbors
[7, 12, 3, 1, 5, 6, 10, 13, 6, 10]
Predicted Label
6
Correct Label
7 

k-Nearest Neighbors of Decoder Image
Indices of neighbors
[6677 5464 5364 4264 8876 8475  572 3504 2443  487]
Labels of neighbors
[13, 3, 15, 15, 16, 0, 15, 2, 8, 19]
Predicted Label
15
Correct Label
7 

--------------------
k-Nearest Neighbors of Original Image
Indices of neighbors
[11195  9076  3577   974  9622   310  9224  8091  7260  4586]
Labels of neighbors
[0, 0, 0, 0, 18, 0, 0, 15, 0, 0]
Predicted Label
0
Correct Label
0 

k-Nearest Neighbors of Encoder Image
Indices of neighbors
