In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
#!pip install -q tensorflow-gpu==2.0.0-beta1
import tensorflow as tf
from tensorflow import keras
from collections import Counter
import numpy as np
import time

print(tf.__version__)

1.13.1


# Merge labeled 0 and 1 data and add documents as a list

In [None]:
df_labeled = pd.concat([df_label_1, df_label_00], axis = 0)
df_labeled.describe()

In [11]:
df_docs = pd.read_csv('docs.csv', names = ["ID", 'output'])
df_docs.head()

Unnamed: 0,ID,output
0,0,advocate industry enterprise journal mechanica...
1,1,steam ship great britain vavt lif pzwexoe irl ...
2,2,speak gently peak gently better far rule love ...
3,3,valuable recent publications harper brothers b...
4,4,advocate industry enterprise journal mechanica...


In [61]:
df_total = pd.merge(df_labeled, df_docs, on = 'ID')
df_total.head(2)

Unnamed: 0,ID,label,phrases,output
0,62757,1,"['direct_current', 'alternating_current', 'dir...",direct current era return discussing general q...
1,40648,1,"['induction_motors', 'induction_motors', 'sing...",electric traction long distance railways ton a...


In [15]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12264 entries, 0 to 12263
Data columns (total 4 columns):
ID         12264 non-null int64
label      12264 non-null int64
phrases    6132 non-null object
output     12232 non-null object
dtypes: int64(2), object(2)
memory usage: 479.1+ KB


# Building Feedforward Neural Network

In [67]:
vocab = Counter()
train, test = train_test_split(df_total, test_size=0.3)

In [68]:
train.head()

Unnamed: 0,ID,label,phrases,output
9680,115888,0,,tion interested geometers whether independent ...
3983,41850,1,"['electric_machine', 'manufactur_ing', 'electr...",curtain roll support jones fin tain shade adju...
10007,150754,0,,horizontal linear polarization left irciji pol...
8683,152991,0,,begin voyage discovery takes quarks cosmos bey...
11379,167951,0,,reviews sight mind oncoming crisis misuse hidd...


In [69]:
for text in train.output:
    for word in str(text).split(' '):
        vocab[word] += 1
for text in test.output:
    for word in str(text).split(' '):
        vocab[word] += 1   

In [70]:
print('Total words in vocabulary: {}'.format(len(vocab)))

Total words in vocabulary: 255177


In [71]:
total_words = len(vocab)

def get_word_2_index(vocab):
    word2index = {}
    for i, word in enumerate(vocab):
        word2index[word] = i
    return word2index

word2index = get_word_2_index(vocab)

print('Index of the word AUTOMATION:', word2index['automation'])

Index of the word AUTOMATION: 17562


In [72]:
def text_to_vector(text):
    layer = np.zeros(total_words, dtype = float)
    for word in str(text).split(' '):
        if word in word2index.keys():
            layer[word2index[word]] += 1
    return layer

def category_to_vector(category):
    y = np.zeros((2), dtype = float)
    if category == 0:
        y[0] = 1.
    else:
        y[1] = 1.
    return y

In [73]:
def get_batch( df, i, batch_size):
    batches = []
    results = []
    texts = df.output[i*batch_size: i*batch_size+batch_size]
    categories = df.label[i*batch_size: i*batch_size+batch_size]
    
    for text in texts:
        layer = text_to_vector(text)
        batches.append(layer)
        
    for category in categories:
        y = category_to_vector(category)
        results.append(y)
        
    return np.array(batches), np.array(results)

In [74]:
#documents
get_batch(train, 1, 100)[0].shape

(100, 255177)

In [75]:
#labels
get_batch(train, 1, 100)[1].shape

(100, 2)

In [76]:
learning_rate = 0.01
training_epochs = 10
batch_size = 150
display_step = 1

#network parameters
n_hidden_1 = 100
n_hidden_2 = 100
n_input = total_words
n_classes = 2

input_tensor = tf.placeholder(tf.float32, [None, n_input], name = 'input')
output_tensor = tf.placeholder(tf.float32, [None, n_classes], name = 'output')


In [77]:
def multilayer_perceptron(input_tensor, weights, biases):
    layer_1_multiplication = tf.matmul(input_tensor, weights['h1'])
    layer_1_addition = tf.add(layer_1_multiplication, biases['b1'])
    layer_1 = tf.nn.relu(layer_1_addition)
    
    #relu activation layer
    layer_2_multiplication = tf.matmul(layer_1, weights['h2'])
    layer_2_addition = tf.add(layer_2_multiplication, biases['b2'])
    layer_2 = tf.nn.relu(layer_2_addition)
    
    #outpur layer
    out_layer_multiplication = tf.matmul(layer_2, weights['out'])
    out_layer_addition = out_layer_multiplication + biases['out']
    
    return out_layer_addition    

In [78]:
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}

biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

#constract model
prediction = multilayer_perceptron(input_tensor, weights, biases)

#define loss and optimizer
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits = prediction, labels = output_tensor))
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(loss)

init = tf.global_variables_initializer()

saver = tf.train.Saver()


In [79]:
with tf.Session() as sess:
    sess.run(init)
    
    #training
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(len(train.output)/batch_size)
        for i in range(total_batch):
            batch_x, batch_y = get_batch(train, i, batch_size)
            c, _ = sess.run([loss, optimizer], feed_dict = {input_tensor: batch_x, output_tensor: batch_y})
            #compute avg loss
            avg_cost += c / total_batch
        if epoch%display_step == 0:
            print('Epoch: ', '%04d'% (epoch+1), "loss = " \
                 "{:.9f}".format(avg_cost))
    print("Optimization finished.")
    
    #test model
    correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(output_tensor, 1))
    
    #calc accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    total_test_data = len(test.label)
    batch_x_test, batch_y_test = get_batch(test, 0, total_test_data)
    print('Accuracy: ', accuracy.eval({input_tensor: batch_x_test, output_tensor: batch_y_test}))
    
    save_path = saver.save(sess, "/tmp/model6133.ckpt")
    print("Model saved to path: %s" % save_path)

Epoch:  0001 loss = 142.990193551
Epoch:  0002 loss = 8.457810515
Epoch:  0003 loss = 1.514389339
Epoch:  0004 loss = 0.999445257
Epoch:  0005 loss = 0.521440702
Epoch:  0006 loss = 0.562979542
Epoch:  0007 loss = 0.865961109
Epoch:  0008 loss = 0.725709166
Epoch:  0009 loss = 0.886099337
Epoch:  0010 loss = 0.952546857
Optimization finished.
Accuracy:  0.9024457
Model saved to path: /tmp/model6133.ckpt


# Making a prediction

In [123]:
#slice of documents
last = 1000

#range must start from 1
for i in range(1, 171):
    
    df_run = df_docs._slice(slice(i-1 + (i-1)*last, last*i+i-1), 0)
    
    x_10_texts, y_10_correct_labels = get_batch(df_run, 0, 1000*i)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        saver.restore(sess, "/tmp/model6133.ckpt")
        print("Model restored.")
        print('Slice #', i, "range: ", i-1 + (i-1)*last, "-", last*i+i-1)
        classification = sess.run(tf.argmax(prediction, 1), feed_dict = {input_tensor: x_10_texts})
        df_docs['label'][df_docs._slice(slice(i-1 + (i-1)*last, last*i+i-1), 0).index] = classification
        

INFO:tensorflow:Restoring parameters from /tmp/model6133.ckpt
Model restored.
Slice # 120 range:  119119 - 120119


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


INFO:tensorflow:Restoring parameters from /tmp/model6133.ckpt
Model restored.
Slice # 121 range:  120120 - 121120
INFO:tensorflow:Restoring parameters from /tmp/model6133.ckpt
Model restored.
Slice # 122 range:  121121 - 122121
INFO:tensorflow:Restoring parameters from /tmp/model6133.ckpt
Model restored.
Slice # 123 range:  122122 - 123122
INFO:tensorflow:Restoring parameters from /tmp/model6133.ckpt
Model restored.
Slice # 124 range:  123123 - 124123
INFO:tensorflow:Restoring parameters from /tmp/model6133.ckpt
Model restored.
Slice # 125 range:  124124 - 125124
INFO:tensorflow:Restoring parameters from /tmp/model6133.ckpt
Model restored.
Slice # 126 range:  125125 - 126125
INFO:tensorflow:Restoring parameters from /tmp/model6133.ckpt
Model restored.
Slice # 127 range:  126126 - 127126
INFO:tensorflow:Restoring parameters from /tmp/model6133.ckpt
Model restored.
Slice # 128 range:  127127 - 128127
INFO:tensorflow:Restoring parameters from /tmp/model6133.ckpt
Model restored.
Slice # 12