In [3]:
#__Import the required libraries 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.datasets import fetch_20newsgroups

In [5]:
#Loading the dataset and transforming it
newsgroups = fetch_20newsgroups(subset='train')
vectorizer = TfidfVectorizer(stop_words='english',min_df=2)
train = vectorizer.fit_transform(newsgroups.data)
X_train = train.toarray()
Y_train=newsgroups.target

newsgroups2 = fetch_20newsgroups(subset='test')
test = vectorizer.transform(newsgroups2.data)
X_test = test.toarray()
Y_test=newsgroups2.target

In [7]:
#__all the constants
learning_rate = 0.01
num_steps = 30000
batch_size = 256
display_step = 1000
examples_to_show = 10


## AutoEncoder for 20NG with K=20

In [8]:
#Autoencoder with K = 20
input_dim = len(X_train[0])
hidden_dim =20
epoch = 10
batch_size = 100

x = tf.placeholder(dtype=tf.float32, shape=[None, input_dim])
        
with tf.name_scope('encode'):
    weights = tf.Variable(tf.random_normal([input_dim, hidden_dim], dtype=tf.float32), name='weights')
    biases = tf.Variable(tf.zeros([hidden_dim]), name='biases')
    encoded = tf.nn.sigmoid(tf.matmul(x, weights) + biases)
            
with tf.name_scope('decode'):
    weights = tf.Variable(tf.random_normal([hidden_dim, input_dim], dtype=tf.float32), name='weights')
    biases = tf.Variable(tf.zeros([input_dim]), name='biases')
    decoded = tf.nn.sigmoid(tf.matmul(encoded, weights) + biases)
            
loss = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(x,decoded))))
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
saver = tf.train.Saver()

def get_batch(X, size):
    a = np.random.choice(len(X), size, replace=False)
    return X[a]

def train(data):
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        for i in range(epoch):
            for j in range(np.shape(data)[0] // batch_size):
                batch_data = get_batch(data, batch_size)
                l, _ = sess.run([loss, train_op], feed_dict={x:batch_data})
            print('epoch {0}: loss = {1}'.format(i+1, l))
            saver.save(sess, './autoencoder.ckpt')
        saver.save(sess, './autoencoder.ckpt')
            
def test(data):
    with tf.Session() as sess:
        saver.restore(sess, "./autoencoder.ckpt")
        hidden, reconstructed = sess.run([encoded, decoded], feed_dict={x:data})
        return hidden, reconstructed

In [9]:
train(X_train)
h,r=test(X_train)
q,p=test(X_test)

epoch 1: loss = 0.005129413679242134
epoch 2: loss = 0.004350527189671993
epoch 3: loss = 0.004268299322575331
epoch 4: loss = 0.004255563020706177
epoch 5: loss = 0.004227327182888985
epoch 6: loss = 0.0042236484587192535
epoch 7: loss = 0.004212360829114914
epoch 8: loss = 0.004208816215395927
epoch 9: loss = 0.00421184254810214
epoch 10: loss = 0.00420741131529212
INFO:tensorflow:Restoring parameters from ./autoencoder.ckpt
INFO:tensorflow:Restoring parameters from ./autoencoder.ckpt


In [10]:
print("Logistic Regression")

print("\nORIGINAL DATASET :")
loReg = LogisticRegression(penalty='l2',solver = 'lbfgs',multi_class='multinomial')
loReg.fit(X_train,Y_train)
score = loReg.score(X_train, Y_train)
print(" The training accuracy is ",score)
score = loReg.score(X_test, Y_test)
print(" The testing accuracy is ",score)

print("\nRECONSTRUCTED DATASET :")
loReg = LogisticRegression(penalty='l2',solver = 'lbfgs',multi_class='multinomial')
loReg.fit(r,Y_train)
score = loReg.score(r, Y_train)
print(" The training accuracy is ",score)
score = loReg.score(p, Y_test)
print(" The testing accuracy is ",score)

Logistic Regression

ORIGINAL DATASET :
    The training accuracy is  0.9777267102704614
    The testing accuracy is  0.8327137546468402

RECONSTRUCTED DATASET :
    The training accuracy is  0.0839667668375464
    The testing accuracy is  0.07713754646840149


## AutoEncoder for 20NG with K=200

In [14]:
input_dim = len(X_train[0])
hidden_dim =200
epoch = 10
batch_size = 100

x = tf.placeholder(dtype=tf.float32, shape=[None, input_dim])
        
with tf.name_scope('encode'):
    weights = tf.Variable(tf.random_normal([input_dim, hidden_dim], dtype=tf.float32), name='weights')
    biases = tf.Variable(tf.zeros([hidden_dim]), name='biases')
    encoded = tf.nn.sigmoid(tf.matmul(x, weights) + biases)
            
with tf.name_scope('decode'):
    weights = tf.Variable(tf.random_normal([hidden_dim, input_dim], dtype=tf.float32), name='weights')
    biases = tf.Variable(tf.zeros([input_dim]), name='biases')
    decoded = tf.nn.sigmoid(tf.matmul(encoded, weights) + biases)
            
loss = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(x,decoded))))
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
saver = tf.train.Saver()

def get_batch(X, size):
    a = np.random.choice(len(X), size, replace=False)
    return X[a]

def train(data):
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        for i in range(epoch):
            for j in range(np.shape(data)[0] // batch_size):
                batch_data = get_batch(data, batch_size)
                l, _ = sess.run([loss, train_op], feed_dict={x:batch_data})
            print('epoch {0}: loss = {1}'.format(i+1, l))
            saver.save(sess, './autoencoder.ckpt')
        saver.save(sess, './autoencoder.ckpt')
            
def test(data):
    with tf.Session() as sess:
        saver.restore(sess, "./autoencoder.ckpt")
        hidden, reconstructed = sess.run([encoded, decoded], feed_dict={x:data})
        return hidden, reconstructed

train(X_train)
h,r=test(X_train)
q,p=test(X_test)

print("Logistic Regression")

print("\nORIGINAL DATASET :")
loReg = LogisticRegression(penalty='l2',solver = 'lbfgs',multi_class='multinomial')
loReg.fit(X_train,Y_train)
score = loReg.score(X_train, Y_train)
print(" The training accuracy is ",score)
score = loReg.score(X_test, Y_test)
print(" The testing accuracy is ",score)

print("\nRECONSTRUCTED DATASET :")
loReg = LogisticRegression(penalty='l2',solver = 'lbfgs',multi_class='multinomial')
loReg.fit(r,Y_train)
score = loReg.score(r, Y_train)
print(" The training accuracy is ",score)
score = loReg.score(p, Y_test)
print(" The testing accuracy is ",score)

epoch 1: loss = 0.134831041097641
epoch 2: loss = 0.10174984484910965
epoch 3: loss = 0.08981727808713913
epoch 4: loss = 0.0828646644949913
epoch 5: loss = 0.0779142826795578
epoch 6: loss = 0.07406099885702133
epoch 7: loss = 0.06997630000114441
epoch 8: loss = 0.06699789315462112
epoch 9: loss = 0.0646982342004776
epoch 10: loss = 0.062023572623729706
INFO:tensorflow:Restoring parameters from ./autoencoder.ckpt
INFO:tensorflow:Restoring parameters from ./autoencoder.ckpt
Logistic Regression

ORIGINAL DATASET :
    The training accuracy is  0.9777267102704614
    The testing accuracy is  0.8327137546468402

RECONSTRUCTED DATASET :
    The training accuracy is  0.1060632844263744
    The testing accuracy is  0.06558682952734997
