In [2]:
import numpy as np
import tensorflow as tf
import pandas as pd
df = pd.read_csv("shuffled-full-set-hashed.csv", header=None)

In [3]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [4]:
encoder = pickle.load(open('../server/webservice/pickles/encoder.pkl', 'rb'))
vectorizer = pickle.load(open('../server/webservice/pickles/vectorizer.pkl', 'rb'))
lsa = pickle.load(open('../server/webservice/pickles/lsa.pkl', 'rb'))
knn_lsa = pickle.load(open('../server/webservice/pickles/knn_lsa.pkl', 'rb'))

In [5]:
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=10000,
                min_df=2, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [107]:
df.head()

Unnamed: 0,0,1
0,DELETION OF INTEREST,e04a09c87692 d6b72e591b91 5d066f0246f1 ed41171...
1,RETURNED CHECK,a3b334c6eefd be95012ebf2b 41d67080e078 ff1c26e...
2,BILL,586242498a88 9ccf259ca087 54709b24b45f 6bf9c0c...
3,BILL,cd50e861f48b 6ca2dd348663 d38820625542 f077614...
4,BILL,9db5536263d8 1c303d15eb65 3f89b4673455 b73e657...


In [172]:
output_size = len(encoder.classes_)

In [173]:
sample = df[0:10000]
sample.dropna(inplace=True)
sample_y = sample[0]

def get_label_vect(label):
    vect = [0] * output_size
    vect[label - 1] = 1
    return vect
encoded_y = np.array([get_label_vect(l) for l in encoder.transform(sample_y)])
encoded_y

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [174]:
chunk_size = 20
def chunks(l, n):
    # For item i in a range that is a length of l,
    for i in range(0, len(l), n):
        # Create an index range for l of n items:
        new_chunk = l[i:i+n]
        if len(new_chunk) < n:
            new_chunk = ([''] * (n - len(new_chunk))) + new_chunk
        yield new_chunk
        
doc_arr = [x for x in sample[1]]
doc_arr[0].split(' ')
doc_words = [doc.split(' ') for doc in doc_arr]


In [219]:
training_set = []
len(doc_words), len(encoded_y)
for i in range(len(doc_words)):
    for chunk in chunks(doc_words[i], chunk_size):
        training_set.append((encoded_y[i], chunk))
        
training_vects = []        

In [220]:
# Now for each word, get the tfidf vector
def word_2_vect(word):
    return lsa.transform(vectorizer.transform(pd.Series([word])))[0]

i = 0
for label, chunk in training_set:
    training_vects.append((label, [word_2_vect(word) for word in chunk]))
    if i%500 == 0:
        print('{} of {}'.format(i, len(training_set)))
    i += 1

training_vects[0]

0 of 170380
500 of 170380
1000 of 170380
1500 of 170380
2000 of 170380
2500 of 170380
3000 of 170380
3500 of 170380
4000 of 170380
4500 of 170380
5000 of 170380
5500 of 170380
6000 of 170380
6500 of 170380
7000 of 170380
7500 of 170380
8000 of 170380
8500 of 170380
9000 of 170380
9500 of 170380
10000 of 170380
10500 of 170380
11000 of 170380
11500 of 170380
12000 of 170380
12500 of 170380
13000 of 170380
13500 of 170380
14000 of 170380
14500 of 170380
15000 of 170380
15500 of 170380
16000 of 170380
16500 of 170380
17000 of 170380
17500 of 170380
18000 of 170380
18500 of 170380
19000 of 170380
19500 of 170380
20000 of 170380
20500 of 170380
21000 of 170380
21500 of 170380
22000 of 170380
22500 of 170380
23000 of 170380
23500 of 170380
24000 of 170380
24500 of 170380
25000 of 170380
25500 of 170380
26000 of 170380
26500 of 170380
27000 of 170380
27500 of 170380
28000 of 170380
28500 of 170380
29000 of 170380
29500 of 170380
30000 of 170380
30500 of 170380
31000 of 170380
31500 of 170380


KeyboardInterrupt: 

In [221]:
len(training_vects)

35646

In [222]:
import pickle
pickle.dump(training_vects, open('training_vects.pkl', 'wb'))

In [178]:
"""
Take about 100 lines, just as a toy example
Encode the label
Break the string up into an array of 20 words get the remainder as another chunk with empty str as the first 
    20 - n
For each word in the array, get its truncated tfidf vector so one of those 20 word arrays should correspond to 
    a 20x100 array
Then build the LSTM
"""

'\nTake about 100 lines, just as a toy example\nEncode the label\nBreak the string up into an array of 20 words get the remainder as another chunk with empty str as the first \n    20 - n\nFor each word in the array, get its truncated tfidf vector so one of those 20 word arrays should correspond to \n    a 20x100 array\nThen build the LSTM\n'

In [223]:
lstm_size = 256
lstm_layers = 2
batch_size = 100
learning_rate = 0.001

In [224]:
# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    inputs_ = tf.placeholder("float", [None, chunk_size, 100])
    labels_ = tf.placeholder(tf.int32, [None, output_size])
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

In [225]:
with graph.as_default():
    with tf.name_scope("RNN_layers"):
        def lstm_cell():
            # Your basic LSTM cell
            lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size, reuse=tf.get_variable_scope().reuse)
            # Add dropout to the cell
            return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)

        # Stack up multiple LSTM layers, for deep learning
        cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])

        # Getting an initial state of all zeros
        initial_state = cell.zero_state(batch_size, tf.float32)

In [226]:
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell, inputs_,
                                             initial_state=initial_state)

In [227]:
with graph.as_default():
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], output_size, activation_fn=tf.sigmoid)
    cost = tf.losses.mean_squared_error(labels_, predictions)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [237]:
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [229]:
y = np.array([tup[0] for tup in training_vects])
y.shape

(35646, 14)

In [230]:
x = np.array([tup[1] for tup in training_vects])
x.shape

(35646, 20, 100)

In [231]:
def get_batches(x, y, batch_size=300):
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

In [232]:
train_prop = 0.8

set_size = x.shape[0]

split_idx = int(set_size*0.8)
train_x, rest_x = x[:split_idx], x[split_idx:]
train_y, rest_y = y[:split_idx], y[split_idx:]

val_prop = 0.5
rest_size = rest_x.shape[0]
val_idx = int(val_prop * rest_size)
val_x, test_x = rest_x[:val_idx], rest_x[val_idx:]
val_y, test_y = rest_y[:val_idx], rest_y[val_idx:]

In [233]:
train_y.shape, y.shape, val_y.shape, rest_y.shape

((28516, 14), (35646, 14), (3565, 14), (7130, 14))

In [234]:
val_y[:10]

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [235]:
epochs = 10

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (batch_x, batch_y) in enumerate(get_batches(train_x, train_y, batch_size), 1):

            feed = {inputs_: batch_x,
                    labels_: batch_y,
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                
                for batch_val_x, batch_val_y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: batch_val_x,
                            labels_: batch_val_y,
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)

                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))

            
            iteration +=1
            saver.save(sess, "checkpoints/sentiment.ckpt")
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/10 Iteration: 5 Train loss: 0.213
Epoch: 0/10 Iteration: 10 Train loss: 0.061
Epoch: 0/10 Iteration: 15 Train loss: 0.059
Epoch: 0/10 Iteration: 20 Train loss: 0.061
Epoch: 0/10 Iteration: 25 Train loss: 0.076
Val acc: 0.929
Epoch: 0/10 Iteration: 30 Train loss: 0.078
Epoch: 0/10 Iteration: 35 Train loss: 0.058
Epoch: 0/10 Iteration: 40 Train loss: 0.065
Epoch: 0/10 Iteration: 45 Train loss: 0.049
Epoch: 0/10 Iteration: 50 Train loss: 0.066
Val acc: 0.929
Epoch: 0/10 Iteration: 55 Train loss: 0.071
Epoch: 0/10 Iteration: 60 Train loss: 0.066
Epoch: 0/10 Iteration: 65 Train loss: 0.060
Epoch: 0/10 Iteration: 70 Train loss: 0.072
Epoch: 0/10 Iteration: 75 Train loss: 0.070
Val acc: 0.929
Epoch: 0/10 Iteration: 80 Train loss: 0.066
Epoch: 0/10 Iteration: 85 Train loss: 0.059
Epoch: 0/10 Iteration: 90 Train loss: 0.074
Epoch: 0/10 Iteration: 95 Train loss: 0.059
Epoch: 0/10 Iteration: 100 Train loss: 0.072
Val acc: 0.929
Epoch: 0/10 Iteration: 105 Train loss: 0.056
Epoch: 0/10 Ite

Epoch: 3/10 Iteration: 865 Train loss: 0.052
Epoch: 3/10 Iteration: 870 Train loss: 0.053
Epoch: 3/10 Iteration: 875 Train loss: 0.049
Val acc: 0.929
Epoch: 3/10 Iteration: 880 Train loss: 0.083
Epoch: 3/10 Iteration: 885 Train loss: 0.068
Epoch: 3/10 Iteration: 890 Train loss: 0.050
Epoch: 3/10 Iteration: 895 Train loss: 0.060
Epoch: 3/10 Iteration: 900 Train loss: 0.045
Val acc: 0.929
Epoch: 3/10 Iteration: 905 Train loss: 0.070
Epoch: 3/10 Iteration: 910 Train loss: 0.063
Epoch: 3/10 Iteration: 915 Train loss: 0.065
Epoch: 3/10 Iteration: 920 Train loss: 0.059
Epoch: 3/10 Iteration: 925 Train loss: 0.065
Val acc: 0.929
Epoch: 3/10 Iteration: 930 Train loss: 0.066
Epoch: 3/10 Iteration: 935 Train loss: 0.059
Epoch: 3/10 Iteration: 940 Train loss: 0.046
Epoch: 3/10 Iteration: 945 Train loss: 0.074
Epoch: 3/10 Iteration: 950 Train loss: 0.054
Val acc: 0.929
Epoch: 3/10 Iteration: 955 Train loss: 0.066
Epoch: 3/10 Iteration: 960 Train loss: 0.051
Epoch: 3/10 Iteration: 965 Train loss: 0

Val acc: 0.937
Epoch: 5/10 Iteration: 1705 Train loss: 0.024
Epoch: 5/10 Iteration: 1710 Train loss: 0.031
Epoch: 6/10 Iteration: 1715 Train loss: 0.065
Epoch: 6/10 Iteration: 1720 Train loss: 0.035
Epoch: 6/10 Iteration: 1725 Train loss: 0.047
Val acc: 0.939
Epoch: 6/10 Iteration: 1730 Train loss: 0.043
Epoch: 6/10 Iteration: 1735 Train loss: 0.084
Epoch: 6/10 Iteration: 1740 Train loss: 0.072
Epoch: 6/10 Iteration: 1745 Train loss: 0.040
Epoch: 6/10 Iteration: 1750 Train loss: 0.048
Val acc: 0.939
Epoch: 6/10 Iteration: 1755 Train loss: 0.031
Epoch: 6/10 Iteration: 1760 Train loss: 0.065
Epoch: 6/10 Iteration: 1765 Train loss: 0.053
Epoch: 6/10 Iteration: 1770 Train loss: 0.071
Epoch: 6/10 Iteration: 1775 Train loss: 0.048
Val acc: 0.942
Epoch: 6/10 Iteration: 1780 Train loss: 0.053
Epoch: 6/10 Iteration: 1785 Train loss: 0.056
Epoch: 6/10 Iteration: 1790 Train loss: 0.041
Epoch: 6/10 Iteration: 1795 Train loss: 0.042
Epoch: 6/10 Iteration: 1800 Train loss: 0.082
Val acc: 0.937
Epoch

Epoch: 8/10 Iteration: 2545 Train loss: 0.024
Epoch: 8/10 Iteration: 2550 Train loss: 0.058
Val acc: 0.944
Epoch: 8/10 Iteration: 2555 Train loss: 0.038
Epoch: 8/10 Iteration: 2560 Train loss: 0.018
Epoch: 8/10 Iteration: 2565 Train loss: 0.022
Epoch: 9/10 Iteration: 2570 Train loss: 0.078
Epoch: 9/10 Iteration: 2575 Train loss: 0.026
Val acc: 0.945
Epoch: 9/10 Iteration: 2580 Train loss: 0.037
Epoch: 9/10 Iteration: 2585 Train loss: 0.041
Epoch: 9/10 Iteration: 2590 Train loss: 0.086
Epoch: 9/10 Iteration: 2595 Train loss: 0.075
Epoch: 9/10 Iteration: 2600 Train loss: 0.038
Val acc: 0.945
Epoch: 9/10 Iteration: 2605 Train loss: 0.041
Epoch: 9/10 Iteration: 2610 Train loss: 0.030
Epoch: 9/10 Iteration: 2615 Train loss: 0.061
Epoch: 9/10 Iteration: 2620 Train loss: 0.052
Epoch: 9/10 Iteration: 2625 Train loss: 0.068
Val acc: 0.945
Epoch: 9/10 Iteration: 2630 Train loss: 0.049
Epoch: 9/10 Iteration: 2635 Train loss: 0.055
Epoch: 9/10 Iteration: 2640 Train loss: 0.055
Epoch: 9/10 Iteratio

In [240]:
test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, "checkpoints/sentiment.ckpt")
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (bat_test_x, bat_test_y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: bat_test_x,
                labels_: bat_test_y,
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

Test accuracy: 0.943
