In [1]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
import os
from gensim.models.word2vec import Word2Vec


In [2]:
def build_embeds(data_path, embed_dim=300):
    """
    Load the word2vec embeddings as a read-only memory map, building the
    on-disk cache first if it does not exist yet.

    Args:
        data_path: directory containing 'GoogleNews-vectors-negative300.bin'.
            The cache files 'embed.dat' (float64 matrix) and 'embed.vocab'
            (one word per line, in row order) are created there when
            'embed.dat' is missing.
        embed_dim: width of each embedding vector. Must match the cached
            vectors; the default matches the 300-d Google News vectors.

    Returns:
        (embeddings, vocab_list, vocab_dict, avg_rare_word) where
        embeddings is the memory-mapped (n_words, embed_dim) matrix,
        vocab_list is the list of words in row order, vocab_dict maps each
        word to its row index, and avg_rare_word is the mean of the last 20
        rows of the matrix.
    """
    binary_file = os.path.join(data_path,
                               'GoogleNews-vectors-negative300.bin')
    w2v_dat = os.path.join(data_path,'embed.dat')
    w2v_vocab = os.path.join(data_path,'embed.vocab')

    if not os.path.exists(w2v_dat):
        print("Caching word embeddings in memmapped format.\nPlease be patient...")
        # NOTE(review): newer gensim releases expose this loader on
        # KeyedVectors rather than Word2Vec -- confirm against the installed
        # version.
        wv = Word2Vec.load_word2vec_format(
            binary_file,binary=True)
        fp = np.memmap(w2v_dat, dtype=np.double,
                       mode='w+', shape=wv.syn0.shape)
        fp[:] = wv.syn0[:]
        with open(w2v_vocab, "w") as f:
            for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                print(w, file=f)
        # dropping the last reference flushes the memmap to disk
        del fp, wv

    # Derive the number of rows from the cache size instead of hard-coding
    # 3000000, so smaller or different caches load correctly as well.
    bytes_per_row = embed_dim * np.dtype(np.float64).itemsize
    n_rows = os.path.getsize(w2v_dat) // bytes_per_row

    # create word embeddings and mapping of vocabulary item to index
    embeddings = np.memmap(w2v_dat, dtype=np.float64,
                           mode="r", shape=(n_rows, embed_dim))
    with open(w2v_vocab) as f:
        # a real list (not a lazy map) so it is still populated when returned
        vocab_list = [line.strip() for line in f]
    vocab_dict = {w: i for i, w in enumerate(vocab_list)}

    # mean of 20 rarest words, used as a stand-in for pairwise distances
    # if a word is out-of-vocabulary
    avg_rare_word = np.asarray(embeddings[-20:]).mean(axis=0)

    return embeddings,vocab_list,vocab_dict,avg_rare_word


In [3]:
# Directory holding the word2vec binary and its caches. Set the
# EMBED_DATA_DIR environment variable to point somewhere else without
# editing the notebook; the previous hard-coded location stays the default.
embed,v_list,vocab,rare = build_embeds(
    os.environ.get('EMBED_DATA_DIR', '/Users/Rutherford/Desktop/data'))

In [5]:
def context_win(phrase, win):
    '''
    Build one context window of vocabulary indices per word of a phrase.

    phrase : whitespace-separated string; every token is looked up in the
             module-level `vocab` mapping and replaced by -1 when missing
    win    : int, size of the window (must be odd and positive)

    return : a list with one entry per token, each a list of `win` indices
             centred on that token; positions that fall outside the phrase
             are filled with -1
    '''
    # out-of-vocabulary tokens share the -1 index with the padding
    word_ids = [vocab.get(token, -1) for token in phrase.split()]

    # window length must be odd and positive
    assert win % 2 == 1
    assert win > 0

    # pad with window_size//2 -1's on each side
    half = win // 2
    padded = half * [-1] + word_ids + half * [-1]

    windows = []
    for start in range(len(word_ids)):
        windows.append(padded[start:start + win])

    assert len(windows) == len(word_ids)
    return windows

In [6]:
test_string = "this is a phrase that I want to break into parts"

In [7]:
test_context = context_win(test_string,5)

In [8]:
# could have the input be a tensor rank 3? does that lose the point?
# (height= #words in sentence, width=size of context window, depth = 300)

In [9]:
embed[[i for i in test_context[0]]]

memmap([[ 0.04516602, -0.04516602, -0.00393677, ...,  0.07958984,
         0.07226562,  0.01300049],
       [ 0.04516602, -0.04516602, -0.00393677, ...,  0.07958984,
         0.07226562,  0.01300049],
       [ 0.109375  ,  0.140625  , -0.03173828, ...,  0.00765991,
         0.12011719, -0.1796875 ],
       [ 0.00704956, -0.07324219,  0.171875  , ...,  0.01123047,
         0.1640625 ,  0.10693359],
       [ 0.04516602, -0.04516602, -0.00393677, ...,  0.07958984,
         0.07226562,  0.01300049]])

In [10]:
test_context

[[-1, -1, 28, 4, -1],
 [-1, 28, 4, -1, 7880],
 [28, 4, -1, 7880, 3],
 [4, -1, 7880, 3, 20],
 [-1, 7880, 3, 20, 189],
 [7880, 3, 20, 189, -1],
 [3, 20, 189, -1, 843],
 [20, 189, -1, 843, 69],
 [189, -1, 843, 69, 1286],
 [-1, 843, 69, 1286, -1],
 [843, 69, 1286, -1, -1]]

In [11]:
# 300 stacks, 1 row (first word in sentence), 5 columns (len of context window)
embed[[i for i in test_context[0]]].T.reshape((300,1,5))

memmap([[[ 0.04516602,  0.04516602,  0.109375  ,  0.00704956,  0.04516602]],

       [[-0.04516602, -0.04516602,  0.140625  , -0.07324219, -0.04516602]],

       [[-0.00393677, -0.00393677, -0.03173828,  0.171875  , -0.00393677]],

       ..., 
       [[ 0.07958984,  0.07958984,  0.00765991,  0.01123047,  0.07958984]],

       [[ 0.07226562,  0.07226562,  0.12011719,  0.1640625 ,  0.07226562]],

       [[ 0.01300049,  0.01300049, -0.1796875 ,  0.10693359,  0.01300049]]])

In [12]:
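# Stacking the (300, 1, 5) blocks along the last axis gives (300, 1, 55), which is not a useful layout;
# an easy fix would be to arrange the result as (300, # words, window size) instead.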

In [13]:
# One (300, 1, 5) block per word: the embeddings of its context window,
# transposed so that the embedding dimension comes first.
dic = {
    position: embed[list(window)].T.reshape((300,1,5))
    for position, window in enumerate(test_context)
}

In [14]:
# Stack the per-word blocks along the third axis in word order. Sorting the
# keys makes the order explicit instead of relying on dict iteration order,
# and a single dstack avoids re-copying the growing array at every step.
x = np.dstack([dic[word] for word in sorted(dic)])

In [15]:
# Alternative layout: the axis order should not matter as long as the data stays intact.
# Stacking the untransposed windows gives (window size, 300, # words in sentence) = (5, 300, 11);
# its transpose (.T) has shape (11, 300, 5).

In [16]:
# One (window size, 300) matrix per word: the embeddings of its context window.
dic = {
    position: embed[list(window)]
    for position, window in enumerate(test_context)
}

In [17]:
# Stack the per-word matrices along a new third axis in word order. Sorting
# the keys makes the order explicit instead of relying on dict iteration
# order, and a single dstack avoids re-copying the growing array every step.
x = np.dstack([dic[word] for word in sorted(dic)])

In [18]:
# Alternative, following http://deeplearning.net/tutorial/rnnslu.html:
# concatenate the embeddings of every word in a context window into a single row,
# i.e., the result has shape [# words in sentence, 300 * window size]

In [19]:
# One row per word: the embeddings of its context window laid end to end.
# All rows are built first and stacked once, instead of re-copying the
# growing matrix with a vstack per word.
tut = np.vstack([
    np.concatenate([embed[word] for word in window])
    for window in test_context
])

In [20]:
def embedder(cntxt_array):
    """
    Turn context windows of vocabulary indices into a feature matrix.

    Each window (a sequence of row indices into the module-level `embed`
    matrix) becomes one output row holding the embeddings of its words laid
    end to end.

    Args:
        cntxt_array: sequence of windows, as produced by `context_win`.

    Returns:
        2-D array with one row per window. A single window yields a
        (1, width) matrix rather than a flat vector, and an empty input
        yields an empty (0, 0) array instead of raising.
    """
    rows = [np.concatenate([embed[word] for word in window])
            for window in cntxt_array]
    if not rows:
        return np.zeros((0, 0))
    # stack once: avoids the quadratic copying of a vstack per row
    return np.vstack(rows)

In [21]:
from nltk.corpus import brown

In [22]:
sentences = brown.sents()

In [23]:
# Feature matrix (window size 5) for each of the first 10000 Brown sentences,
# keyed by the sentence's position in the corpus.
dic = {
    i: embedder(context_win(' '.join(sent),5))
    for i, sent in enumerate(sentences[:10000])
}


In [24]:
#6008 -- AZ - Dot Product with GRU

In [29]:
import re

In [43]:
puncs = '[!@#$%^&*()_+=:;"/,]+'

In [52]:
test_string2 = "this # (isn't) ==: anything / to $ awe$ome write % home & 'bout, but here goez nuttin'"

In [56]:
test_string2 = re.sub(puncs,'',test_string2)

In [57]:
test_string2.split()

['this',
 "isn't",
 'anything',
 'to',
 'aweome',
 'write',
 'home',
 "'bout",
 'but',
 'here',
 'goez',
 "nuttin'"]

In [73]:
np.asarray([embed[vocab[word]] if word in vocab else embed[-1] for word in test_string2.split()]).shape

(12, 300)

In [95]:
dic = {1:2,2:1,3:1,4:2,'dog':1,500:1,51:0,50:0}

In [97]:
for k in dic.keys():
    print(k)

50
1
2
3
4
51
500
dog


In [106]:
#Y = np.asarray([1,2,1,2,2,2,1,2,1,1,1,1,1,2,1,1,2]).reshape([-1,1])
#np.asarray([[1,0] if target == 1 else [0,1] for target in Y])

In [143]:
from __future__ import print_function
import os
import re
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import euclidean_distances,confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
import pandas as pd
from pyemd import emd
import tensorflow as tf
from gensim.models.word2vec import Word2Vec


class error_checker():
    """
    Sentence error classifier backed by pretrained word2vec embeddings.

    Instantiating the class loads the embeddings (building the memory-mapped
    cache on first use); the instance can then be trained, used to make
    predictions, and inspected for performance.

    Expects data_path upon instantiation: a directory that must contain, at
    the very least, the pretrained Google News vectors binary file
    ('GoogleNews-vectors-negative300.bin'). The cache files 'embed.dat' and
    'embed.vocab' are created in that directory if 'embed.dat' does not
    exist. Training reads 'sentences.csv' from the same directory: no header,
    column 0 holds the error label (1 for minor, 2 for major) and column 1
    the sentence. Training writes the model checkpoint to 'model.ckpt'.

    NOTE(review): the class is mid-refactor. `_build_graph` still declares a
    12-wide input and calls `self._batch_norm_wrapper`, which is not defined
    in this class, while `_train_data_generator` now yields one
    (n_words, embedding width) matrix per sentence; `predict` still calls
    `_data_generator` with two strings although it accepts one. These need
    to be reconciled before `train`/`predict` can run end to end.
    """
    def __init__(self,data_path):

        self.data_path = data_path
        self._save_path = os.path.join(self.data_path,'model.ckpt')
        self.epsilon = 1e-4
        # characters stripped from sentences before the vocabulary lookup
        self.puncs = '[!@#$%^&*()_+=:;"/,]+'

        self.training_set = os.path.join(self.data_path,'sentences.csv')

        binary_file = os.path.join(self.data_path,
                                   'GoogleNews-vectors-negative300.bin')
        w2v_dat = os.path.join(self.data_path,'embed.dat')
        w2v_vocab = os.path.join(self.data_path,'embed.vocab')

        if not os.path.exists(w2v_dat):
            print("Caching word embeddings in memmapped format.                     Please be patient...")
            wv = Word2Vec.load_word2vec_format(
                binary_file,binary=True)
            fp = np.memmap(w2v_dat, dtype=np.double,
                           mode='w+', shape=wv.syn0.shape)
            fp[:] = wv.syn0[:]
            with open(w2v_vocab, "w") as f:
                for _, w in sorted((voc.index, word)
                                   for word, voc in wv.vocab.items()):
                    print(w, file=f)
            # dropping the last reference flushes the memmap to disk
            del fp, wv

        # Derive the row count from the cache size rather than hard-coding
        # 3000000, so smaller or different caches load correctly as well.
        embed_dim = 300
        bytes_per_row = embed_dim * np.dtype(np.float64).itemsize
        n_rows = os.path.getsize(w2v_dat) // bytes_per_row

        # create word embeddings and mapping of vocabulary item to index
        self.embeddings = np.memmap(w2v_dat, dtype=np.float64,
                                    mode="r", shape=(n_rows, embed_dim))
        with open(w2v_vocab) as f:
            vocab_list = [line.strip() for line in f]
        self.vocab_dict = {w: i for i, w in enumerate(vocab_list)}

        # mean of 20 rarest words, used as a stand-in for pairwise distances
        # if a word is out-of-vocabulary
        self.avg_rare_word = np.asarray(self.embeddings[-20:]).mean(axis=0)

    def _data_generator(self,string):
        """
        Transform string into array of embeddings.

        Punctuation listed in self.puncs is removed, the string is split on
        whitespace, and every word is replaced by its embedding
        (self.avg_rare_word for out-of-vocabulary words).

        Returns:
            Array with one row per word; a string with no words yields an
            empty (0, embedding width) array.
        """
        # use the instance's pattern rather than a notebook-level global
        wordlist = re.sub(self.puncs,'',string).split()
        if not wordlist:
            return np.zeros((0, self.embeddings.shape[1]))

        X = np.asarray([self.embeddings[self.vocab_dict[word]]
                        if word in self.vocab_dict
                        else self.avg_rare_word
                        for word in wordlist])

        return X

    def _train_data_generator(self,seed=42,shuffle=True):
        """
        Read self.training_set and turn it into model inputs and targets.

        Args:
            seed: seed for numpy's global random state, set before shuffling.
            shuffle: when False the rows keep their file order.

        Returns: X, Y, shuffled indices, original X (the raw sentences)
            X is an object array holding one embedding matrix per sentence,
            Y the matching one-hot targets ([1,0] for label 1, [0,1]
            otherwise). X, Y and the indices are reordered together, so
            entry k of each refers to row indices[k] of the file. The raw
            sentences are NOT shuffled; together with the indices they are
            included purely for examining performance.

        Rows whose sentence could not be embedded are skipped and their row
        numbers are stored in self.bad.
        """
        # atleast_1d: a single-row file would otherwise give a 0-d array
        X_in = np.atleast_1d(np.genfromtxt(self.training_set,
                      delimiter=',',usecols=(1),dtype=str))
        Y_in = np.genfromtxt(self.training_set,
                      delimiter=',',usecols=(0)).reshape((-1,1))

        X = []
        Y = []
        indices = []
        bad = []

        for i,string in enumerate(X_in):
            try:
                features = self._data_generator(string)
            except Exception:
                # best effort: remember the row and carry on
                bad.append(i)
                continue
            X.append(features)
            # target
            Y.append(Y_in[i])
            indices.append(i)
        self.bad = bad

        # sentences differ in length, so keep the matrices in an object array
        X_arr = np.empty(len(X), dtype=object)
        for j, features in enumerate(X):
            X_arr[j] = features
        Y = np.asarray(Y).reshape((-1,1))

        # Shuffle positions (not original row numbers) and apply the same
        # order to X, Y and the indices so that they stay aligned even when
        # some rows were skipped.
        np.random.seed(seed)
        if shuffle:
            order = np.random.permutation(len(indices))
        else:
            order = np.arange(len(indices))
        X_arr = X_arr[order]
        Y = Y[order]
        indices = [indices[j] for j in order]

        # transform Y from either 1 or 2 to a one-hot vector ([1,0] or [0,1])
        Y = np.asarray([[1,0] if target == 1 else [0,1]
                        for target in Y[:,0]])

        # X_in is NOT shuffled for the purpose of testing later
        return X_arr,Y,indices,X_in

    def _build_graph(self,training):
        """
        Build the feed-forward classification graph.

        Three matmul -> batch norm -> relu layers followed by a softmax
        layer, trained with Adagrad on the mean softmax cross-entropy.

        Args:
            training: forwarded to self._batch_norm_wrapper.

        Returns:
            ((X, Y), cost, train_op, accuracy, probs_x, lr, saver)

        NOTE(review): self._batch_norm_wrapper is not defined in this class,
        and the 12-wide input does not match the per-sentence embedding
        matrices produced by _train_data_generator.
        """

        # inputs and outputs (latter are one-hot vectors)
        X = tf.placeholder(tf.float32, shape=[None,12])
        Y = tf.placeholder(tf.float32, shape=[None,2])
        lr = tf.placeholder(tf.float32)
        glob_step = tf.Variable(0,dtype=tf.float32,trainable=False)

        weight_shape1 = [12,256]
        weight_shape2 = [256,128]
        weight_shape3 = [128,16]
        weight_shape4 = [16,2]

        n_inputs1,n_outputs1 = weight_shape1
        n_inputs3,n_outputs3 = weight_shape3
        n_outputs_final = weight_shape4[1]

        # uniform initialisation ranges, sqrt(6 / (fan_in + fan_out))
        init_range1 = tf.sqrt(6.0/(n_inputs1+n_outputs1))
        init_range2 = tf.sqrt(6.0/(n_outputs1+n_inputs3))
        init_range3 = tf.sqrt(6.0/(n_inputs3+n_outputs3))
        init_range4 = tf.sqrt(6.0/(n_outputs3+n_outputs_final))
        w1 = tf.Variable(tf.random_uniform(weight_shape1,
                                           -init_range1,init_range1),name='w1')
        w2 = tf.Variable(tf.random_uniform(weight_shape2,
                                           -init_range2,init_range2),name='w2')
        w3 = tf.Variable(tf.random_uniform(weight_shape3,
                                           -init_range3,init_range3),name='w3')
        w4 = tf.Variable(tf.random_uniform(weight_shape4,
                                           -init_range4,init_range4),name='w4')
        b = tf.Variable(tf.constant(.1,shape=[n_outputs_final]))


        # network - batch normalization in training, relu activations
        dot1 = tf.matmul(X,w1)
        batch_normed1 = self._batch_norm_wrapper(dot1,training)
        rel1 = tf.nn.relu(batch_normed1)

        dot2 = tf.matmul(rel1,w2)
        batch_normed2 = self._batch_norm_wrapper(dot2,training)
        rel2 = tf.nn.relu(batch_normed2)

        dot3 = tf.matmul(rel2,w3)
        batch_normed3 = self._batch_norm_wrapper(dot3,training)
        rel3 = tf.nn.relu(batch_normed3)

        # softmax layer
        logits = tf.matmul(rel3,w4)+b
        probs_x = tf.nn.softmax(logits)

        # cost:
        #    per pair
        rows_of_cost = tf.nn.softmax_cross_entropy_with_logits(
            logits,Y,name='rows_of_cost')
        #    average over all pairs
        cost = tf.reduce_mean(rows_of_cost,reduction_indices=None,
                              keep_dims=False,name='cost')

        # gradients and training
        opt = tf.train.AdagradOptimizer(learning_rate=lr)
        train_op = opt.minimize(cost,global_step=glob_step,
                                var_list=[w1,w2,w3,w4,b])

        # predictions and accuracy
        correct_prediction = tf.equal(tf.arg_max(probs_x,1),tf.arg_max(Y,1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))

        return (X,Y),cost,train_op,accuracy,probs_x,lr,tf.train.Saver()

    def train(self,shuffle=True,seed=42,validation_size=.2,test_size=.1):
        """
        Generate the data, split it, train the network and save it.

        Args:
            shuffle: whether the data is shuffled before splitting.
            seed: random seed used for the shuffle.
            validation_size: fraction of the data used for validation.
            test_size: fraction of the data used for testing.

        The first validation_size of the rows form the validation set, the
        next test_size the test set, and the remainder the training set.
        Validation accuracy (every fifth pass) is collected in
        self.accuracy, the per-batch costs in self._cost_list, the test-set
        predictions in self._result_list, and the model is saved to
        self._save_path.
        """

        print("Generating and splitting data...")
        X_data,Y_data,self.shuffled_idx,self.raw_X = \
            self._train_data_generator(seed=seed,shuffle=shuffle)

        # create split indices for validation, test, and train sets
        self._validation_test_split_idx = int(len(Y_data)*validation_size)
        self._train_test_split_idx = \
            int(len(Y_data)*test_size)+self._validation_test_split_idx

        # split data
        self.x_validation = X_data[:self._validation_test_split_idx]
        self.x_test = X_data[self._validation_test_split_idx:
                             self._train_test_split_idx]
        self.x_train = X_data[self._train_test_split_idx:]
        self.y_validation = Y_data[:self._validation_test_split_idx]
        self.y_test = Y_data[self._validation_test_split_idx:
                             self._train_test_split_idx]
        self.y_train = Y_data[self._train_test_split_idx:]

        print("Training model...")
        # build and run network in training mode
        tf.reset_default_graph()
        (X,Y),cost,train_op,accuracy,probs_x,lr,saver = \
            self._build_graph(training=True)

        self.accuracy = []
        with tf.Session() as sess:
            sess.run(tf.initialize_all_variables())
            mini_batch_size = 64
            # a list, so the batch boundaries can be reused on every pass
            # (a bare zip is exhausted after the first pass on Python 3)
            start_end = list(zip(range(0,len(self.x_train),mini_batch_size),
                                 range(mini_batch_size,len(self.x_train)+1,
                                       mini_batch_size)))
            cost_list = []

            # number of training epochs
            num_passes = 101
            for pass_i in range(num_passes):
                # learning rate scheduling
                learning_rate = .9 if pass_i < 20 else .0005
                for (s,e) in start_end:
                    # fetch the cost and apply the update in a single run
                    # instead of doing the forward pass twice
                    batch_cost,_ = sess.run(
                        [cost,train_op],
                        feed_dict={X:self.x_train[s:e,],
                                   Y:self.y_train[s:e],
                                   lr:learning_rate})
                    cost_list.append([batch_cost])
                # show current accuracy
                if pass_i % 5 == 0:
                    result = sess.run([accuracy],
                                      feed_dict={X:self.x_validation,
                                                 Y:self.y_validation})
                    self.accuracy.append(result[0])
                    print('Pass number: ',pass_i,
                          ' -- validation set accuracy: ',result[0])
            # save cost and result lists for examining model performance
            self._cost_list = cost_list
            self._result_list = sess.run([tf.arg_max(probs_x,1)],
                                         feed_dict={X:self.x_test,
                                                    Y:self.y_test})
            # save model in self._save_path
            save_path = saver.save(sess,self._save_path)
            print("Model saved in file: {}".format(save_path))

    def check_results(self):
        """
        Prints a confusion matrix of performance on the test set,
        and instantiates lists of True Positive, True Negative,
        False Positive, and False Negative for inspection as
        self._TP, self._TN, self._FP, self._FN.

        Must be called after train(), which provides the test split, the
        predictions and the shuffled indices.
        """

        # print confusion matrix (labels fixed so it is always 2x2, even if
        # one class is absent from the test set)
        true_y_labels = np.array(self.y_test[:,1])
        predicted_labels = self._result_list[0]
        matrix = confusion_matrix(true_y_labels,predicted_labels,
                                  labels=[0,1])
        print('\t\tPredicted:')
        print('\t\tmin. maj.')
        print('Actual:\t min.',matrix[0])
        print('    \t maj.',matrix[1])

        # identify predicted and true positives and negatives
        predicted_pos = np.where(predicted_labels==1)
        predicted_neg = np.where(predicted_labels==0)
        actual_pos = np.where(np.argmax(self.y_test,1)==1)
        actual_neg = np.where(np.argmax(self.y_test,1)==0)

        # indices of shuffled and split data (just y_test)
        true_pos = np.intersect1d(predicted_pos,actual_pos).tolist()
        true_neg = np.intersect1d(predicted_neg,actual_neg).tolist()
        false_pos = np.intersect1d(predicted_pos,actual_neg).tolist()
        false_neg = np.intersect1d(predicted_neg,actual_pos).tolist()
        y_indices = self.shuffled_idx[self._validation_test_split_idx:
                                      self._train_test_split_idx]

        # create lists of true and false positives and negatives
        # NOTE(review): raw_X now holds single sentences, so list(...) yields
        # their characters -- confirm whether the raw string is wanted here.
        self._TP = [list(self.raw_X[y_indices[i]]) for i in true_pos]
        self._TN = [list(self.raw_X[y_indices[i]]) for i in true_neg]
        self._FP = [list(self.raw_X[y_indices[i]]) for i in false_pos]
        self._FN = [list(self.raw_X[y_indices[i]]) for i in false_neg]

    def predict(self,csv_file):
        """
        Predicts the type of error between the two strings in each row of
        a CSV file, prints the predictions as one comma-separated line and
        returns them.

        Returns:
            A list with one string per row:
            '1' for minor, '2' for major (the predicted class index plus 1),
            'No error' for identical strings,
            and 'Unknown' if a prediction cannot be made.
        """
        predictions = []
        # build graph and initialize session
        tf.reset_default_graph()
        (X,_),_,_,_,pred_y,lr,saver = self._build_graph(training=False)
        # build the argmax op once instead of adding a new op per row
        predicted_class = tf.arg_max(pred_y,1)
        with tf.Session() as sess:
            sess.run(tf.initialize_all_variables())
            saver.restore(sess,self._save_path)

            # generate calculations from 2d array of input strings
            # (atleast_2d keeps a single-row file iterable row by row)
            rows = np.atleast_2d(
                np.genfromtxt(csv_file,dtype='str',delimiter=','))
            for row in rows:
                str_1,str_2 = row[0],row[1]

                # strings identical
                if str_1 == str_2:
                    predictions.append('No error')
                    continue

                # model prediction
                # NOTE(review): _data_generator accepts a single string, so
                # this call currently always falls through to 'Unknown';
                # the pairwise feature extraction needs to be restored.
                try:
                    pred = sess.run([predicted_class],
                                    feed_dict=\
                                    {X: self._data_generator(str_1,str_2)})
                    predictions.append(str(pred[0][0]+1))

                # can't predict
                except Exception:
                    predictions.append('Unknown')

        print(','.join(predictions))
        return predictions


In [144]:
# Data directory for the classifier. Set the EMBED_DATA_DIR environment
# variable to point somewhere else without editing the notebook; the
# previous hard-coded location stays the default.
classifier = error_checker(
    os.environ.get('EMBED_DATA_DIR', '/Users/Rutherford/Desktop/data'))

In [80]:
test_string2

"this  isn't  anything  to  aweome write  home  'bout but here goez nuttin'"

In [478]:
puncs2 = '[!@#$%^&*()_+=:;"/,`.]+'

In [479]:
# First 100 Brown sentences: strip the puncs2 characters, then any leftover
# '' pairs, and write them out one per line with label 1.
sent_list = [' '.join(tokens) for tokens in sentences[:100]]
clean_sents = []
for joined in sent_list:
    without_puncs = re.sub(puncs2,'',joined)
    clean_sents.append(re.sub("''",'',without_puncs))
with open('/Users/Rutherford/Desktop/sents.txt','w') as f:
    f.writelines('1,{}\n'.format(line) for line in clean_sents)

In [482]:
# Brown sentences 100-149: strip the puncs2 characters, then any leftover
# '' pairs, and write them out one per line with label 2.
bad_sent_list = [' '.join(tokens) for tokens in sentences[100:150]]
clean_bad_sents = []
for joined in bad_sent_list:
    without_puncs = re.sub(puncs2,'',joined)
    clean_bad_sents.append(re.sub("''",'',without_puncs))
with open('/Users/Rutherford/Desktop/bad_sents.txt','w') as f:
    f.writelines('2,{}\n'.format(line) for line in clean_bad_sents)

In [483]:
# Pass the shuffle seed explicitly: the method requires it, and a fixed seed
# keeps the shuffled order reproducible across runs.
X1,Y1,indices1,X_in1 = classifier._train_data_generator(42)

In [484]:
Y1

array([[0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [0,

In [485]:
indices1

[135,
 115,
 131,
 55,
 95,
 29,
 156,
 51,
 101,
 145,
 19,
 85,
 15,
 66,
 24,
 30,
 132,
 105,
 151,
 16,
 75,
 18,
 12,
 9,
 31,
 154,
 98,
 56,
 134,
 159,
 139,
 78,
 60,
 84,
 2,
 94,
 45,
 42,
 69,
 152,
 26,
 141,
 117,
 93,
 133,
 36,
 82,
 22,
 126,
 67,
 97,
 11,
 65,
 86,
 6,
 27,
 76,
 142,
 38,
 41,
 4,
 138,
 32,
 144,
 109,
 68,
 10,
 96,
 111,
 0,
 122,
 123,
 64,
 44,
 146,
 28,
 40,
 114,
 25,
 23,
 119,
 81,
 79,
 39,
 90,
 108,
 158,
 137,
 47,
 124,
 61,
 73,
 33,
 112,
 120,
 128,
 62,
 161,
 100,
 104,
 53,
 5,
 118,
 127,
 150,
 49,
 35,
 80,
 77,
 34,
 46,
 7,
 43,
 70,
 125,
 110,
 91,
 83,
 147,
 148,
 89,
 8,
 155,
 113,
 13,
 59,
 140,
 3,
 17,
 72,
 143,
 136,
 149,
 63,
 54,
 107,
 50,
 160,
 58,
 48,
 88,
 21,
 57,
 157,
 129,
 37,
 153,
 1,
 52,
 130,
 103,
 99,
 116,
 87,
 74,
 121,
 162,
 20,
 71,
 106,
 14,
 92,
 102]

In [486]:
# Hold out the first 30% of the rows for validation; train on the rest.
_validation_test_split_idx = int(len(Y1)*.3)
x_validation, x_train = (X1[:_validation_test_split_idx],
                         X1[_validation_test_split_idx:])
y_validation, y_train = (Y1[:_validation_test_split_idx],
                         Y1[_validation_test_split_idx:])

In [503]:
import functools


def lazy_property(function):
    """
    Decorator turning a method into a read-only property that is computed
    once per instance.

    The first access calls `function(self)` and stores the result on the
    instance under the method's name prefixed with an underscore; later
    accesses return the stored value without calling the method again.
    """
    cache_name = '_' + function.__name__

    def cached_getter(self):
        if hasattr(self, cache_name):
            return getattr(self, cache_name)
        setattr(self, cache_name, function(self))
        return getattr(self, cache_name)

    # keep the wrapped method's name and docstring on the property getter
    return property(functools.wraps(function)(cached_getter))


class VariableSequenceClassification:
    """
    GRU classifier for zero-padded, variable-length sequences.

    The graph pieces (length, prediction, cost, optimize, error) are lazy
    properties: each is built on first access and cached on the instance.

    Args:
        data: float tensor indexed as (batch, time step, feature); time
            steps whose features are all zero are treated as padding.
        target: tensor of one-hot targets, one row per sequence.
        num_hidden: number of units in the GRU cell.
        num_layers: stored but not used by the graph built here (a single
            GRU layer is created).
        learning_rate: learning rate handed to the Adagrad optimizer.
    """

    def __init__(self, data, target, num_hidden=200, num_layers=2,
                 learning_rate=30):
        self.data = data
        self.target = target
        self._num_hidden = num_hidden
        self._num_layers = num_layers
        self._learning_rate = learning_rate
        # touch the lazy properties so the whole graph is built up front
        self.prediction
        self.error
        self.optimize

    @lazy_property
    def length(self):
        """Number of non-padding time steps of each sequence (int32)."""
        # 1 for every time step with at least one non-zero feature, else 0
        used = tf.sign(tf.reduce_max(tf.abs(self.data), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length

    @lazy_property
    def prediction(self):
        """Softmax class probabilities computed from the last relevant GRU output."""
        # Recurrent network. Uses the tensor handed to the constructor
        # (self.data) rather than a notebook-level global named `data`.
        output, _ = tf.nn.dynamic_rnn(
            tf.nn.rnn_cell.GRUCell(self._num_hidden),
            self.data,
            dtype=tf.float32,
            sequence_length=self.length,
        )
        last = self._last_relevant(output, self.length)
        # Softmax layer.
        weight, bias = self._weight_and_bias(
            self._num_hidden, int(self.target.get_shape()[1]))
        prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
        return prediction

    @lazy_property
    def cost(self):
        """Summed cross-entropy between the targets and the predictions."""
        # clip before the log so a saturated softmax cannot produce
        # log(0) = -inf and poison the gradients with NaNs
        clipped = tf.clip_by_value(self.prediction, 1e-10, 1.0)
        cross_entropy = -tf.reduce_sum(self.target * tf.log(clipped))
        return cross_entropy

    @lazy_property
    def optimize(self):
        """Adagrad training op minimising `cost`."""
        optimizer = tf.train.AdagradOptimizer(self._learning_rate)
        return optimizer.minimize(self.cost)

    @lazy_property
    def error(self):
        """Fraction of sequences whose predicted class differs from the target."""
        mistakes = tf.not_equal(
            tf.argmax(self.target, 1), tf.argmax(self.prediction, 1))
        return tf.reduce_mean(tf.cast(mistakes, tf.float32))

    @staticmethod
    def _weight_and_bias(in_size, out_size):
        """Create the weight (truncated normal, stddev 0.01) and bias (0.1) variables."""
        weight = tf.truncated_normal([in_size, out_size], stddev=0.01)
        bias = tf.constant(0.1, shape=[out_size])
        return tf.Variable(weight), tf.Variable(bias)

    @staticmethod
    def _last_relevant(output, length):
        """Pick, for every sequence, the RNN output at its last non-padding step."""
        batch_size = tf.shape(output)[0]
        max_length = int(output.get_shape()[1])
        output_size = int(output.get_shape()[2])
        # position of each sequence's last step in the flattened output
        index = tf.range(0, batch_size) * max_length + (length - 1)
        flat = tf.reshape(output, [-1, output_size])
        relevant = tf.gather(flat, index)
        return relevant

In [488]:
#y_train

In [489]:
#y_validation

In [490]:
# Longest sentence (in whitespace-separated words) across the raw data.
most_words = max(len(sentence.split()) for sentence in X_in1)

# Zero-pad every training sentence to most_words rows of 300 features.
lis = [
    np.vstack((sample, np.zeros([most_words-sample.shape[0],300])))
    for sample in x_train
]
new_x_train = np.asarray(lis)

In [491]:
# Zero-pad every validation sentence to most_words rows of 300 features.
lis = [
    np.vstack((sample, np.zeros([most_words-sample.shape[0],300])))
    for sample in x_validation
]
new_x_validation = np.asarray(lis)

In [492]:
new_x_validation.shape

(48, 52, 300)

In [493]:
new_x_train.shape

(115, 52, 300)

In [502]:
sum(np.argmax(Y1,1))/float(len(Y1))

0.33128834355828218

In [504]:
tf.reset_default_graph()

# Placeholder shapes follow the padded training tensor:
# (batch, words per sentence, features per word).
rows, row_size = new_x_train.shape[1:3]
num_classes = 2
data = tf.placeholder(tf.float32, [None, rows, row_size])
target = tf.placeholder(tf.float32, [None, num_classes])
model = VariableSequenceClassification(data, target)
sess = tf.Session()
sess.run(tf.initialize_all_variables())
for epoch in range(10):
    for _ in range(100):
        # take 75 random train examples per run
        idxs = np.random.randint(0,new_x_train.shape[0],75)
        batch_x = new_x_train[idxs]
        batch_y = y_train[idxs]
        sess.run(model.optimize, {data: batch_x, target: batch_y})
    # report the validation error once per epoch
    error = sess.run(model.error, {data: new_x_validation, target: y_validation})
    print('Epoch {:2d} error {:3.1f}%'.format(epoch + 1, 100 * error))
    # NOTE: in the recorded run the validation error did not improve
    # from one epoch to the next.

Epoch  1 error 35.4%
Epoch  2 error 35.4%
Epoch  3 error 35.4%


KeyboardInterrupt: 

In [None]:
def fGRU(iSize, hSize, nameScope='Recursion'):
    '''
    Create the weights of a GRU layer and return its step function.

    :param iSize: Dimensionality of input sequence elements - not the sequence
        length but the length of each element in the sequence.
    :param hSize: Hidden layer size
    :param nameScope: name scope under which the variables are created
    :return: fR, the GRU recursion in the form expected by tf.scan: it takes
        the previous hidden state and the current input and returns the new
        hidden state.
    '''
    with tf.name_scope(nameScope):

        #Random Initialization
        sd = 0.01
        Wr = tf.Variable(tf.truncated_normal([iSize, hSize], stddev=sd), name="Wr")
        Ur = tf.Variable(tf.truncated_normal([hSize, hSize], stddev=sd), name='Ur')
        Wz = tf.Variable(tf.truncated_normal([iSize, hSize], stddev=sd), name="Wz")
        Uz = tf.Variable(tf.truncated_normal([hSize, hSize], stddev=sd), name='Uz')
        Wh = tf.Variable(tf.truncated_normal([iSize, hSize], stddev=sd), name="Wh")
        Uh = tf.Variable(tf.truncated_normal([hSize, hSize], stddev=sd), name='Uh')

        br = tf.Variable(tf.zeros([hSize]), name="br")
        bz = tf.Variable(tf.zeros([hSize]), name="bz")
        bh = tf.Variable(tf.zeros([hSize]), name="bh")

        def fR(htm1, x):
            # update gate: how much of the candidate replaces the old state
            zt = tf.sigmoid(tf.matmul(htm1,Uz) + tf.matmul(x, Wz) + bz)
            # reset gate: how much of the old state feeds the candidate
            rt = tf.sigmoid(tf.matmul(htm1,Ur) + tf.matmul(x, Wr) + br)
            # the reset gate scales the previous hidden state (not the
            # input projection) before it enters the candidate state
            hSquig = tf.tanh(tf.matmul(rt*htm1,Uh) + tf.matmul(x,Wh) + bh)
            current_hidden_state = (1 - zt) * htm1 + zt * hSquig
            return current_hidden_state
    return fR

In [None]:
tf.reset_default_graph()

#replace with placeholders
input_size = 300 #1
hidden_size = 10 # changed from 32
target_size = 2

# Weights for output layers
Wo = tf.Variable(tf.truncated_normal([hidden_size, target_size], mean=0, stddev=.01), name="Wo")
bo = tf.Variable(tf.truncated_normal([target_size], mean=0, stddev=.01), name="bo")


# inputs are fed as (batch, time, features); tf.scan iterates over the
# leading axis, so put time first
inputs = tf.placeholder(tf.float32, shape=[None, None, input_size], name='inputs')
inputs_ = tf.transpose(inputs, perm=[1,0,2])

y = tf.placeholder(tf.float32, shape=[None, target_size])

# all-zero initial hidden state with one row per sequence in the batch
# (built via a matmul with zeros so the batch size can stay dynamic)
initial_hidden = inputs[:, 0, :]
initial_hidden = tf.matmul(initial_hidden, tf.zeros([input_size, hidden_size]))

#define recursion for scan function
fRScan = fGRU(input_size, hidden_size)

#hidden state sequence
all_hidden_states = tf.scan(fRScan, inputs_, initializer=initial_hidden, name="RNNStates")

#extract last hidden state
last_hidden_state = tf.reverse(all_hidden_states, [True, False, False])[0, :, :]

#develop logit from last hidden state
last_output = tf.matmul(last_hidden_state, Wo) + bo


# cross-entropy cost; NOTE(review): defined but not used for training below,
# which minimises the squared error instead -- confirm which is intended
rows_of_cost = tf.nn.softmax_cross_entropy_with_logits(last_output,y,
                                            name='rows_of_cost')
#    average over all pairs
cost = tf.reduce_mean(rows_of_cost,reduction_indices=None,
                      keep_dims=False,name='cost')


#loss definition for gradients
loss = tf.reduce_mean(tf.square(y - last_output))

#rms error for printing
rms_error = tf.sqrt(loss)

#training function
train_step = tf.train.AdamOptimizer().minimize(loss)

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())

    # One update per training sentence. The bound comes from x_train itself:
    # it was previously derived from len(indices1)*.8, which is larger than
    # the training split and ran past its end.
    for epoch in range(len(x_train)):
        train_feed = {inputs: x_train[epoch].reshape([1,-1,300]),
                      y: y_train[epoch].reshape([1,2])}
        # cycle through the validation sentences
        val_i = epoch % len(x_validation)
        validation_feed = {inputs: x_validation[val_i].reshape([1,-1,300]),
                           y: y_validation[val_i].reshape([1,2])}

        ts, lhs = sess.run([train_step, last_hidden_state], train_feed)

        # loss and rms error after the update, fetched in a single run
        train_loss, train_rms = sess.run([loss, rms_error], train_feed)
        Loss = str(train_loss)
        Train_error = str(train_rms)
        Test_error = str(sess.run(rms_error, validation_feed))

        # end='' keeps the '\r' progress line updating in place
        print("\rIteration: %s Loss: %s Train Error: %s Test Error: %s" %
              (epoch, Loss, Train_error, Test_error), end='')

In [None]:
"""with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())

    for epoch in range(int(len(indices1)*.8)-1):  # 120 ... 4096
        #X_train, y_train = data_gen(100, 10)
        #X_test, y_test = data_gen(100, 10)
        
        
        
        ts, lhs = sess.run([train_step, last_hidden_state], 
                           {inputs: x_train[epoch].reshape([1,-1,300]), y: y_train[epoch].reshape([1,2])})


        Loss = str(sess.run(loss, {inputs: x_train[epoch].reshape([1,-1,300]), y: y_train[epoch].reshape([1,2])}))
        Train_error = str(sess.run(rms_error, {inputs: x_train[epoch].reshape([1,-1,300]), y: y_train[epoch].reshape([1,2])}))
        Test_error = str(sess.run(rms_error, {inputs: x_validation[epoch//int(len(indices1)*.2)-1].reshape([1,-1,300]), y: y_validation[epoch//int(len(indices1)*.2)-1].reshape([1,2])}))

        print("\rIteration: %s Loss: %s Train Error: %s Test Error: %s" %
              (epoch, Loss, Train_error, Test_error)),"""