In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from collections import Counter
from sklearn.datasets import fetch_20newsgroups

In [4]:
# Restrict the corpus to three newsgroups.
categories = [
    "comp.graphics",
    "sci.space",
    "rec.sport.baseball",
]

# Options shared by both splits: same category filter, and the same
# ('headers', 'footers', 'quotes') parts removed from every post.
_fetch_kwargs = dict(
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

newsgroups_train = fetch_20newsgroups(subset='train', **_fetch_kwargs)
newsgroups_test = fetch_20newsgroups(subset='test', **_fetch_kwargs)

In [5]:
# Report how many documents each split contains.
for label, split in (('total texts in train:', newsgroups_train),
                     ('total texts in test:', newsgroups_test)):
    print(label, len(split.data))

('total texts in train:', 1774)
('total texts in test:', 1180)


In [6]:
# Sanity check: show the first training document and its target value.
print('text',newsgroups_train.data[0])
print('category:',newsgroups_train.target[0])

('text', u"\nBy '8 grey level images' you mean 8 items of 1bit images?\nIt does work(!), but it doesn't work if you have more than 1bit\nin your screen and if the screen intensity is non-linear.\n\nWith 2 bit per pixel; there could be 1*c_1 + 4*c_2 timing,\nthis gives 16 levels, but they are linear if screen intensity is\nlinear.\nWith 1*c_1 + 2*c_2 it works, but we have to find the best\ncompinations -- there's 10 levels, but 16 choises; best 10 must be\nchosen. Different compinations for the same level, varies a bit, but\nthe levels keeps their order.\n\nReaders should verify what I wrote... :-)")
('category:', 0)


In [7]:
# Token-frequency table over the train and test splits combined. Tokens come
# from a plain split on single spaces and are lower-cased before counting.
vocab = Counter()

for dataset in (newsgroups_train, newsgroups_test):
    for text in dataset.data:
        vocab.update(token.lower() for token in text.split(' '))

In [8]:
# Vocabulary size: the number of distinct keys held in the `vocab` counter.
print("Total words:",len(vocab))

('Total words:', 85451)


In [9]:
# Number of distinct vocabulary entries; used as the length of each
# bag-of-words vector and as the network's input width.
total_words = len(vocab)

def get_word_2_index(vocab):
    """Map each lower-cased entry of `vocab` to its position in iteration order.

    Positions start at 0. If two entries lower-case to the same string, the
    later position overwrites the earlier one.

    Args:
        vocab: iterable of strings (iterating a Counter/dict yields its keys).

    Returns:
        dict mapping lower-cased string -> integer position.
    """
    return {entry.lower(): position for position, entry in enumerate(vocab)}

# Build the token -> column lookup once at module level; get_batch reads it
# as a global.
word2index = get_word_2_index(vocab)

# Spot-check a single entry of the lookup.
print("Index of the word 'one':",word2index['one'])

("Index of the word 'one':", 1854)


In [10]:
def get_batch(data,i,batch_size,n_classes=20):
    """Return the i-th mini-batch as (bag-of-words matrix, one-hot label matrix).

    Reads the module-level globals `total_words` (vector width) and
    `word2index` (token -> column lookup).

    Args:
        data: object exposing `.data` (sequence of texts) and `.target`
            (sequence of integer labels), sliced in parallel.
        i: zero-based batch index; the batch covers
            `[i*batch_size, i*batch_size + batch_size)`.
        batch_size: maximum number of texts in the batch; the last batch of a
            dataset may be shorter.
        n_classes: width of each one-hot label row. Defaults to 20, the width
            that used to be hard-coded, so existing callers see no change.

    Returns:
        Tuple `(batches, results)` of float arrays with shapes
        `(n, total_words)` and `(n, n_classes)`, where `n` is the number of
        texts in the slice. An empty slice yields `(0, total_words)` and
        `(0, n_classes)` rather than 1-D empty arrays, so the result still
        matches a `[None, width]` placeholder.
    """
    start = i * batch_size
    stop = start + batch_size
    texts = data.data[start:stop]
    categories = data.target[start:stop]

    # Preallocate the full matrix instead of collecting per-text vectors in a
    # list and copying them afterwards; this roughly halves peak memory.
    batches = np.zeros((len(texts), total_words), dtype=float)
    for row, text in enumerate(texts):
        # Same tokenisation as the vocabulary build: split on single spaces,
        # then lower-case.
        for word in text.split(' '):
            column = word2index.get(word.lower())
            # Tokens missing from the vocabulary are skipped instead of
            # raising KeyError.
            if column is not None:
                batches[row, column] += 1

    results = np.zeros((len(categories), n_classes), dtype=float)
    for row, category in enumerate(categories):
        # A label outside [0, n_classes) leaves its row all zeros, which is
        # what the former scan over range(20) produced for such labels.
        if 0 <= category < n_classes:
            results[row, int(category)] = 1.

    return batches, results

In [11]:
# Shape check on the feature matrix of training batch 1:
# rows = texts in the batch, columns = total_words.
print("Each batch has 128 texts and each matrix has elements (words):",get_batch(newsgroups_train,1,128)[0].shape)

('Each batch has 128 texts and each matrix has elements (words):', (128, 85451))


In [12]:
# Shape check on the one-hot label matrix of test batch 1.
# NOTE: the message says "3 categories", but get_batch emits 20 columns per
# row as called here, so the printed shape is (128, 20), not (128, 3).
print("Each batch has 128 texts and each matrix has 3 categories:",get_batch(newsgroups_test,1,128)[1].shape)

('Each batch has 128 texts and each matrix has 3 categories:', (128, 20))


In [13]:
# --- Optimisation settings ---
learning_rate = 0.05    # step size handed to the optimizer
training_epochs = 5     # number of passes over the training split
batch_size = 128        # texts per optimisation step
display_step = 1        # print the loss every `display_step` epochs

# --- Network dimensions ---
n_hidden = 100          # units in the single hidden layer
n_input = total_words   # one input feature per vocabulary entry
n_classes = 20          # width of the one-hot label rows fed to tf_output

# Graph inputs; the leading None leaves the batch dimension unspecified.
tf_input = tf.placeholder(dtype=tf.float32, shape=[None, n_input], name="input")
tf_output = tf.placeholder(dtype=tf.float32, shape=[None, n_classes], name="output")

In [14]:
def neuron_network(tf_input,weights,biases):
    """Build a feed-forward graph with one ReLU hidden layer.

    Args:
        tf_input: input tensor, multiplied by `weights['h1']`.
        weights: mapping with keys 'h1' (input -> hidden) and
            'out' (hidden -> output).
        biases: mapping with keys 'b1' (hidden) and 'out' (output).

    Returns:
        The output-layer tensor `hidden @ weights['out'] + biases['out']`;
        no activation is applied to it here.
    """
    # Hidden layer: affine transform followed by ReLU.
    hidden_pre_activation = tf.add(tf.matmul(tf_input, weights['h1']), biases['b1'])
    hidden = tf.nn.relu(hidden_pre_activation)

    # Output layer: affine transform only.
    return tf.add(tf.matmul(hidden, weights['out']), biases['out'])


In [15]:
# Trainable parameters, each initialised from tf.random_normal.
weights = dict(
    h1=tf.Variable(tf.random_normal([n_input, n_hidden])),
    out=tf.Variable(tf.random_normal([n_hidden, n_classes])),
)

biases = dict(
    b1=tf.Variable(tf.random_normal([n_hidden])),
    out=tf.Variable(tf.random_normal([n_classes])),
)

# Forward pass: un-normalised class scores, one row per input text.
logits = neuron_network(tf_input, weights, biases)

# Loss: softmax cross-entropy against the one-hot labels, averaged over the batch.
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_output)
loss_op = tf.reduce_mean(per_example_loss)

# Training op: one Adam step that minimises loss_op.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss_op)


In [16]:
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    num_train = len(newsgroups_train.data)
    # Ceiling division so the final, shorter batch is trained on as well;
    # truncating division silently dropped the trailing texts every epoch.
    # Computed once because it does not change between epochs.
    total_batch = (num_train + batch_size - 1) // batch_size

    # Training cycle
    for epoch in range(training_epochs):
        avg = 0.

        # Loop over all batches
        for i in range(total_batch):
            batch_x, batch_y = get_batch(newsgroups_train, i, batch_size)

            # Run the optimisation op and the loss op together. The train op
            # itself yields no value, so only the loss is kept.
            loss = sess.run([loss_op, optimizer],
                            feed_dict={tf_input: batch_x, tf_output: batch_y})[0]

            # Weight each batch by its size so the shorter last batch does
            # not skew the per-example mean loss for the epoch.
            avg += loss * len(batch_x) / num_train

        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "loss=", \
                "{:.9f}".format(avg))

    print("Optimization Finished!")

    # Test model: a prediction is correct when the arg-max of the logits
    # matches the arg-max of the one-hot label.
    correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(tf_output, 1))

    # Accuracy = mean of the 0/1 correctness indicators.
    accuracy = tf.reduce_mean(tf.cast(correct_pred, "float"))

    total_test_data = len(newsgroups_test.target)

    # The whole test split is evaluated as a single batch.
    batch_x_test, batch_y_test = get_batch(newsgroups_test, 0, total_test_data)

    print("Accuracy:", accuracy.eval({tf_input: batch_x_test, tf_output: batch_y_test}))
    


('Epoch:', '0001', 'loss=', '83.265316743')
('Epoch:', '0002', 'loss=', '4.690325517')
('Epoch:', '0003', 'loss=', '6.089372072')
('Epoch:', '0004', 'loss=', '4.622743740')
('Epoch:', '0005', 'loss=', '1.333185417')
Optimization Finished!
('Accuracy:', 0.79067796)
