In [2]:
import numpy as np
import tensorflow as tf
import pandas as pd
df = pd.read_csv("shuffled-full-set-hashed.csv", header=None)

In [3]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [4]:
encoder = pickle.load(open('../server/webservice/pickles/encoder.pkl', 'rb'))
vectorizer = pickle.load(open('../server/webservice/pickles/vectorizer.pkl', 'rb'))
lsa = pickle.load(open('../server/webservice/pickles/lsa.pkl', 'rb'))
knn_lsa = pickle.load(open('../server/webservice/pickles/knn_lsa.pkl', 'rb'))

In [5]:
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=10000,
                min_df=2, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [6]:
df.head()

Unnamed: 0,0,1
0,DELETION OF INTEREST,e04a09c87692 d6b72e591b91 5d066f0246f1 ed41171...
1,RETURNED CHECK,a3b334c6eefd be95012ebf2b 41d67080e078 ff1c26e...
2,BILL,586242498a88 9ccf259ca087 54709b24b45f 6bf9c0c...
3,BILL,cd50e861f48b 6ca2dd348663 d38820625542 f077614...
4,BILL,9db5536263d8 1c303d15eb65 3f89b4673455 b73e657...


In [11]:
sample = df[0:100]
sample_y = sample[0]
encoded_y = encoder.transform(sample_y)
encoded_y

array([ 7, 13,  1,  1,  1, 11, 11,  1,  4,  1,  1, 11,  4,  1, 11,  6,  4,
       11,  5, 10,  3,  3, 11,  4,  3, 11,  4,  1,  1, 12, 13,  1,  3,  7,
       11, 10,  3,  3, 12,  7,  1, 11,  3,  4,  1, 11,  3,  1,  1, 12,  1,
       12,  3,  1, 11, 13,  1, 11,  4, 11,  1,  1, 12,  1,  3,  4,  3,  4,
        3,  8,  7,  1,  1,  4,  1, 11,  3, 10,  1,  3, 11, 11, 11, 12,  1,
       12,  8,  1,  4, 11, 12,  7,  7,  1,  3,  3,  1,  4,  3, 12])

In [35]:
chunk_size = 20
def chunks(l, n):
    # For item i in a range that is a length of l,
    for i in range(0, len(l), n):
        # Create an index range for l of n items:
        new_chunk = l[i:i+n]
        if len(new_chunk) < n:
            new_chunk = ([''] * (n - len(new_chunk))) + new_chunk
        yield new_chunk
        
doc_arr = [x for x in sample[1]]
doc_words = [doc.split(' ') for doc in doc_arr]
training_set = []
len(doc_words), len(encoded_y)
for i in range(len(doc_words)):
    for chunk in chunks(doc_words[i], chunk_size):
        training_set.append((encoded_y[i], chunk))

training_set


[(7,
  ['e04a09c87692',
   'd6b72e591b91',
   '5d066f0246f1',
   'ed41171e2e6d',
   '59260a2781dc',
   'ec56ff31bb7a',
   '1cf70e99f986',
   '7d7400d32c11',
   'fbe7c05e32d5',
   '6b0cb5728b14',
   '54709b24b45f',
   '25c57acdf805',
   '8bd6e6f02cbc',
   '31cbd98f4b3c',
   'f7548baf29d4',
   'bd0972f16400',
   '1b43925e3c28',
   'b2c878a75d7e',
   '59260a2781dc',
   'cf4fc632eed2']),
 (7,
  ['25c57acdf805',
   'b4221b1edff9',
   'de9738ee8b24',
   '135307dba198',
   '19e9f3592995',
   '1cf70e99f986',
   '266dc1fd820c',
   'b73e657498f2',
   'f1ec22325b37',
   '1fa87d60c46c',
   'e4a319284bf9',
   '6b343f522f78',
   '60fb2adbbb87',
   '37428698b32e',
   'd03283541bce',
   '59f0408bc81b',
   '9ccf259ca087',
   '54709b24b45f',
   '8bd6e6f02cbc',
   '63198bea516d']),
 (7,
  ['7991590bf0b6',
   '2575240863a4',
   'b5ed9af384f4',
   '87b8193a0183',
   '3e5199ae28ae',
   '094e2de7e1cd',
   '422068f04236',
   '25c57acdf805',
   '179dce4734b4',
   'd38820625542',
   '7cd2e94152fb',
   'f1413aff

In [52]:
# Now for each word, get the tfidf vector
def word_2_vect(word):
    return lsa.transform(vectorizer.transform(pd.Series([word])))[0]


training_vects = []

for label, chunk in training_set:
    training_vects.append((label, [word_2_vect(word) for word in chunk]))

training_vects[0]

(7, [array([ 0.08960233, -0.00748783, -0.00058986,  0.05938175, -0.12760809,
          0.03329331,  0.01501089, -0.01228236, -0.01097976,  0.09372074,
         -0.00042565, -0.04883901, -0.03243717, -0.04086047,  0.01062963,
         -0.07297007, -0.14343412, -0.07510015,  0.01631904, -0.04688877,
         -0.02663099,  0.03933323, -0.10316763,  0.03483377,  0.08032407,
         -0.04567542,  0.02828497,  0.01095692, -0.13252986,  0.19941989,
          0.01186697,  0.12988176,  0.02840109, -0.05922385, -0.06007388,
          0.00622728,  0.07550943, -0.07479682,  0.06536086,  0.01164758,
          0.0172462 ,  0.00536241,  0.02114648, -0.03503942,  0.07921029,
          0.15787231, -0.00349768, -0.02853324,  0.00718559, -0.01943849,
         -0.03561621, -0.04511181,  0.01890446,  0.08670718,  0.05073979,
         -0.01248845,  0.06290013, -0.06206476,  0.04023385,  0.01402097,
          0.15232352,  0.04681264, -0.04817635,  0.02173046,  0.06184984,
         -0.01303118, -0.04213028, 

In [7]:
"""
Take about 100 lines, just as a toy example
Encode the label
Break the string up into an array of 20 words get the remainder as another chunk with empty str as the first 
    20 - n
For each word in the array, get its truncated tfidf vector so one of those 20 word arrays should correspond to 
    a 20x100 array
Then build the LSTM
"""

In [53]:
lstm_size = 256
lstm_layers = 2
batch_size = 500
learning_rate = 0.001

In [54]:
# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
#     inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
#     labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    inputs_ = tf.placeholder("float", [None, chunk_size, 100])
    labels_ = tf.placeholder(tf.int32, [None, None])
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

In [55]:
with graph.as_default():
    with tf.name_scope("RNN_layers"):
        def lstm_cell():
            # Your basic LSTM cell
            lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size, reuse=tf.get_variable_scope().reuse)
            # Add dropout to the cell
            return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)

        # Stack up multiple LSTM layers, for deep learning
        cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])

        # Getting an initial state of all zeros
        initial_state = cell.zero_state(batch_size, tf.float32)

W1003 11:28:54.835916 140735741789056 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

W1003 11:28:54.836936 140735741789056 deprecation.py:323] From <ipython-input-55-ab0169ed8749>:5: BasicLSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
W1003 11:28:54.842984 140735741789056 deprecation.py:323] From <ipython-input-55-ab0169ed8749>:10: MultiRNNCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instruction

In [58]:
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell, inputs_,
                                             initial_state=initial_state)

W1003 11:31:09.624288 140735741789056 deprecation.py:323] From <ipython-input-58-94a791623b59>:3: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
W1003 11:31:09.965821 140735741789056 deprecation.py:506] From /Users/boaz.reisman/.virtualenvs/datascience/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1003 11:31:09.975109 140735741789056 deprecation.py:506] From /Users/boaz.reisman/.virtualenvs/datascience/lib/python3.6/site-packages/tensorflow/python/ops/rnn_cell_impl.py:738: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will 

In [59]:
with graph.as_default():
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    cost = tf.losses.mean_squared_error(labels_, predictions)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

W1003 11:31:38.247855 140735741789056 deprecation.py:323] From /Users/boaz.reisman/.virtualenvs/datascience/lib/python3.6/site-packages/tensorflow/python/ops/losses/losses_impl.py:121: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [60]:
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32)

In [69]:
y = np.array([tup[0] for tup in training_vects])
y.shape

(1780,)

In [68]:
x = np.array([tup[1] for tup in training_vects])
x.shape

(1780, 20, 100)

In [82]:
def get_batches(x, y, batch_size=300):
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

In [89]:
train_prop = 0.8

set_size = x.shape[0]

split_idx = int(set_size*0.8)
train_x, rest_x = x[:split_idx], x[split_idx:]
train_y, rest_y = y[:split_idx], y[split_idx:]

val_prop = 0.5
rest_size = rest_x.shape[0]
val_idx = int(val_prop * rest_size)
val_x, test_x = rest_x[:val_idx], rest_x[val_idx:]
val_y, test_y = rest_y[:val_idx], rest_x[val_idx:]

In [90]:
train_x.shape, val_x.shape, test_y.shape

((1424, 20, 100), (178, 20, 100), (178, 20, 100))

In [99]:
# batch_y = y[:batch_size]
# batch_x = x[:batch_size]
# val_y = y[batch_size:(2 * batch_size)]
# val_x = x[batch_size:(2 * batch_size)]
batch_size=500

In [106]:
epochs = 10

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (batch_x, batch_y) in enumerate(get_batches(train_x, train_y, batch_size), 1):

            feed = {inputs_: batch_x,
                    labels_: batch_y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%2==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                
                for batch_val_x, batch_val_y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: batch_val_x,
                            labels_: batch_val_y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)

                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))

            
            iteration +=1
            saver.save(sess, "checkpoints/sentiment.ckpt")
    saver.save(sess, "checkpoints/sentiment.ckpt")

Val acc: nan
Val acc: nan
Epoch: 2/10 Iteration: 5 Train loss: 52.583
Val acc: nan
Val acc: nan
Epoch: 4/10 Iteration: 10 Train loss: 29.990
Val acc: nan
Val acc: nan
Val acc: nan
Epoch: 7/10 Iteration: 15 Train loss: 49.700
Val acc: nan
Val acc: nan
Epoch: 9/10 Iteration: 20 Train loss: 29.990
Val acc: nan


In [76]:
batch_size

500