# In this document, I continue to strive for high accuracy in predicting the classification of documents based on their anonymized words

## However, in contrast to the previous effort, I take the order of the words into account here and employ a Recurrent Neural Network (specifically an LSTM). This ends up being important, because my resultant accuracy is 95.5%

In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import random
df = pd.read_csv("shuffled-full-set-hashed.csv", header=None)

### It's worth pointing out that my TF-IDF vectorizer here is much smaller, and that's because I re-trained it using only single words, not n-grams of up to 3. I did this for the size and speed of the vectorizer, but also because, we are taking word order into account for the LSTM beyond the n-gram length.

In [2]:
encoder = pickle.load(open('../server/webservice/pickles/encoder.pkl', 'rb'))
smaller_vectorizer = pickle.load(open('./smaller_vectorizer.pkl', 'rb'))
new_lsa = pickle.load(open('./new_lsa.pkl', 'rb'))

### I begin by encoding the label for each row as a one-hot vector based on the encoded integer

In [3]:
def get_label_vect(label):
    vect = [0] * output_size
    vect[label - 1] = 1
    return vect

output_size = len(encoder.classes_)
sample = df[0:10000]
sample.dropna(inplace=True)
sample_y = sample[0]
encoded_y = np.array([get_label_vect(l) for l in encoder.transform(sample_y)])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [None]:
def chunks(l, n):
    """ Break a list into sized sub-lists """
    for i in range(0, len(l), n):
        new_chunk = l[i:i+n]
        if len(new_chunk) < n:
            new_chunk = ([''] * (n - len(new_chunk))) + new_chunk
        yield new_chunk

def word_2_vect(word):
    """ For a word, transform into a wordvec """
    return new_lsa.transform(smaller_vectorizer.transform(pd.Series([word])))[0]        

def get_batches(x, y, batch_size=300):
    """ Break up input into batch-sized tensors """
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

### Break the documents up into 20 word sized chunks, each associated with the correct label vector

In [4]:
chunk_size = 20
training_set = []
training_vects = []  
        
doc_arr = [x for x in sample[1]]
doc_arr[0].split(' ')
doc_words = [doc.split(' ') for doc in doc_arr]

for i in range(len(doc_words)):
    for chunk in chunks(doc_words[i], chunk_size):
        training_set.append((encoded_y[i], chunk))     
        
random.shuffle(training_set)        

### Now, with our previously saved vectorizers, convert each of those words into 100-dimension vectors. Don't forget to save!

In [7]:
i = 0
for label, chunk in training_set:
    training_vects.append((label, [word_2_vect(word) for word in chunk]))
    if i%500 == 0:
        print('{} of {}'.format(i, len(training_set)))
    i += 1
    

i = 0
for ch in chunks(training_vects, 80000):
    pickle.dump(ch, open('training_vects_big_{}.pkl'.format(i), 'wb'))
    i += 1

In [14]:
lstm_size = 256
lstm_layers = 2
batch_size = 500
learning_rate = 0.001

In [15]:
# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    inputs_ = tf.placeholder("float", [None, chunk_size, 100])
    labels_ = tf.placeholder(tf.int32, [None, output_size])
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

### Now we begin building our LSTM, First we create our two layers, each wrapped in a dropout layer

In [16]:
with graph.as_default():
    with tf.name_scope("RNN_layers"):
        def lstm_cell():
            lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size, reuse=tf.get_variable_scope().reuse)
            return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)

        cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])
        initial_state = cell.zero_state(batch_size, tf.float32)
        
    outputs, final_state = tf.nn.dynamic_rnn(cell, inputs_,
                                             initial_state=initial_state)        

W1003 20:04:28.423163 140736238351232 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

W1003 20:04:28.424947 140736238351232 deprecation.py:323] From <ipython-input-16-ab0169ed8749>:5: BasicLSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
W1003 20:04:28.431388 140736238351232 deprecation.py:323] From <ipython-input-16-ab0169ed8749>:10: MultiRNNCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instruction

### Now we generate predictions, an output vector based on a logistic activation. The cost is the mean squared error with our one-hot label vectors, and our optimization algorithm, ADAM.

In [18]:
with graph.as_default():
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], output_size, activation_fn=tf.sigmoid)
    cost = tf.losses.mean_squared_error(labels_, predictions)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

W1003 20:04:43.100605 140736238351232 deprecation.py:323] From /Users/boaz.reisman/.virtualenvs/datascience/lib/python3.6/site-packages/tensorflow/python/ops/losses/losses_impl.py:121: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [19]:
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

### Now we break up our cleaned data set into a training group, a validation group, and a testing group.

In [23]:
train_prop = 0.8

set_size = x.shape[0]

split_idx = int(set_size*0.8)
train_x, rest_x = x[:split_idx], x[split_idx:]
train_y, rest_y = y[:split_idx], y[split_idx:]

val_prop = 0.5
rest_size = rest_x.shape[0]
val_idx = int(val_prop * rest_size)
val_x, test_x = rest_x[:val_idx], rest_x[val_idx:]
val_y, test_y = rest_y[:val_idx], rest_y[val_idx:]

### And run it through our LSTM

In [27]:
epochs = 10

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (batch_x, batch_y) in enumerate(get_batches(train_x, train_y, batch_size), 1):

            feed = {inputs_: batch_x,
                    labels_: batch_y,
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                
                for batch_val_x, batch_val_y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: batch_val_x,
                            labels_: batch_val_y,
                            keep_prob: 1, 
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)

                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))

            
            iteration +=1
            saver.save(sess, "checkpoints/final_sentiment.ckpt")
    saver.save(sess, "checkpoints/final_sentiment.ckpt")

Epoch: 0/10 Iteration: 5 Train loss: 0.195
Epoch: 0/10 Iteration: 10 Train loss: 0.061
Epoch: 0/10 Iteration: 15 Train loss: 0.062
Epoch: 0/10 Iteration: 20 Train loss: 0.061
Epoch: 0/10 Iteration: 25 Train loss: 0.058
Val acc: 0.929
Epoch: 0/10 Iteration: 30 Train loss: 0.060
Epoch: 0/10 Iteration: 35 Train loss: 0.060
Epoch: 0/10 Iteration: 40 Train loss: 0.058
Epoch: 0/10 Iteration: 45 Train loss: 0.057
Epoch: 0/10 Iteration: 50 Train loss: 0.059
Val acc: 0.929
Epoch: 0/10 Iteration: 55 Train loss: 0.057
Epoch: 0/10 Iteration: 60 Train loss: 0.057
Epoch: 0/10 Iteration: 65 Train loss: 0.057
Epoch: 0/10 Iteration: 70 Train loss: 0.058
Epoch: 0/10 Iteration: 75 Train loss: 0.058
Val acc: 0.929
Epoch: 0/10 Iteration: 80 Train loss: 0.058
Epoch: 0/10 Iteration: 85 Train loss: 0.057
Epoch: 0/10 Iteration: 90 Train loss: 0.057
Epoch: 0/10 Iteration: 95 Train loss: 0.056
Epoch: 0/10 Iteration: 100 Train loss: 0.057
Val acc: 0.929
Epoch: 0/10 Iteration: 105 Train loss: 0.057
Epoch: 0/10 Ite

Epoch: 3/10 Iteration: 865 Train loss: 0.045
Epoch: 3/10 Iteration: 870 Train loss: 0.041
Epoch: 3/10 Iteration: 875 Train loss: 0.041
Val acc: 0.947
Epoch: 3/10 Iteration: 880 Train loss: 0.043
Epoch: 3/10 Iteration: 885 Train loss: 0.043
Epoch: 3/10 Iteration: 890 Train loss: 0.040
Epoch: 3/10 Iteration: 895 Train loss: 0.040
Epoch: 3/10 Iteration: 900 Train loss: 0.042
Val acc: 0.948
Epoch: 3/10 Iteration: 905 Train loss: 0.041
Epoch: 3/10 Iteration: 910 Train loss: 0.043
Epoch: 3/10 Iteration: 915 Train loss: 0.040
Epoch: 3/10 Iteration: 920 Train loss: 0.039
Epoch: 3/10 Iteration: 925 Train loss: 0.041
Val acc: 0.948
Epoch: 3/10 Iteration: 930 Train loss: 0.041
Epoch: 3/10 Iteration: 935 Train loss: 0.041
Epoch: 3/10 Iteration: 940 Train loss: 0.039
Epoch: 3/10 Iteration: 945 Train loss: 0.042
Epoch: 3/10 Iteration: 950 Train loss: 0.043
Val acc: 0.949
Epoch: 3/10 Iteration: 955 Train loss: 0.040
Epoch: 3/10 Iteration: 960 Train loss: 0.040
Epoch: 3/10 Iteration: 965 Train loss: 0

Val acc: 0.952
Epoch: 6/10 Iteration: 1705 Train loss: 0.037
Epoch: 6/10 Iteration: 1710 Train loss: 0.036
Epoch: 6/10 Iteration: 1715 Train loss: 0.037
Epoch: 6/10 Iteration: 1720 Train loss: 0.037
Epoch: 6/10 Iteration: 1725 Train loss: 0.037
Val acc: 0.952
Epoch: 6/10 Iteration: 1730 Train loss: 0.037
Epoch: 6/10 Iteration: 1735 Train loss: 0.036
Epoch: 6/10 Iteration: 1740 Train loss: 0.040
Epoch: 6/10 Iteration: 1745 Train loss: 0.037
Epoch: 6/10 Iteration: 1750 Train loss: 0.036
Val acc: 0.952
Epoch: 6/10 Iteration: 1755 Train loss: 0.034
Epoch: 6/10 Iteration: 1760 Train loss: 0.038
Epoch: 6/10 Iteration: 1765 Train loss: 0.038
Epoch: 6/10 Iteration: 1770 Train loss: 0.041
Epoch: 6/10 Iteration: 1775 Train loss: 0.036
Val acc: 0.952
Epoch: 6/10 Iteration: 1780 Train loss: 0.040
Epoch: 6/10 Iteration: 1785 Train loss: 0.040
Epoch: 6/10 Iteration: 1790 Train loss: 0.036
Epoch: 6/10 Iteration: 1795 Train loss: 0.039
Epoch: 6/10 Iteration: 1800 Train loss: 0.038
Val acc: 0.952
Epoch

Epoch: 9/10 Iteration: 2545 Train loss: 0.037
Epoch: 9/10 Iteration: 2550 Train loss: 0.036
Val acc: 0.953
Epoch: 9/10 Iteration: 2555 Train loss: 0.035
Epoch: 9/10 Iteration: 2560 Train loss: 0.034
Epoch: 9/10 Iteration: 2565 Train loss: 0.041
Epoch: 9/10 Iteration: 2570 Train loss: 0.036
Epoch: 9/10 Iteration: 2575 Train loss: 0.038
Val acc: 0.953
Epoch: 9/10 Iteration: 2580 Train loss: 0.038
Epoch: 9/10 Iteration: 2585 Train loss: 0.038
Epoch: 9/10 Iteration: 2590 Train loss: 0.040
Epoch: 9/10 Iteration: 2595 Train loss: 0.036
Epoch: 9/10 Iteration: 2600 Train loss: 0.037
Val acc: 0.953
Epoch: 9/10 Iteration: 2605 Train loss: 0.034
Epoch: 9/10 Iteration: 2610 Train loss: 0.036
Epoch: 9/10 Iteration: 2615 Train loss: 0.038
Epoch: 9/10 Iteration: 2620 Train loss: 0.033
Epoch: 9/10 Iteration: 2625 Train loss: 0.035
Val acc: 0.954
Epoch: 9/10 Iteration: 2630 Train loss: 0.038
Epoch: 9/10 Iteration: 2635 Train loss: 0.035
Epoch: 9/10 Iteration: 2640 Train loss: 0.038
Epoch: 9/10 Iteratio

### And run the test data set through it...

In [179]:
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, "checkpoints/final_sentiment.ckpt")
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (bat_test_x, bat_test_y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: bat_test_x,
                labels_: bat_test_y,
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

Test accuracy: 0.954


### 95.4% Accuracy! Good time to push this live.