In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import utls as utl
from collections import Counter
from sklearn.model_selection import LeaveOneOut
import collections

In [2]:
#Load GLOVE vectors
filepath_glove = '/home/avaratharaj/Glove/glove.6B/glove.6B.100d.txt'
glove_vocab = []
glove_embd=[]
embedding_dict = {}
 
file = open(filepath_glove,'r',encoding='UTF-8')
for line in file.readlines():
    row = line.strip().split(' ')
    vocab_word = row[0]
    glove_vocab.append(vocab_word)
    embed_vector = [float(i) for i in row[1:]] # convert to list of float
    embedding_dict[vocab_word]=embed_vector
file.close()
  
print('Loaded GLOVE')
 
glove_vocab_size = len(glove_vocab)
embedding_dim = len(embed_vector)

Loaded GLOVE


In [3]:
def build_dictionaries(words):
    count = collections.Counter(words).most_common() #creates list of word/count pairs;
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary) #len(dictionary) increases each iteration
        reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, reverse_dictionary

In [4]:
data = pd.read_csv('open_response_filter.csv')
#data= data.loc[data['problem_id']==1096508]
for index, row in data.iterrows():
    sentence=row['answer_text']
    sentence=sentence.replace("."," ")
    sentence=sentence.replace("+"," + ")
    sentence=sentence.replace("/"," / ")
    sentence=sentence.replace("*"," * ")
    sentence=sentence.replace("-"," - ")
    sentence=sentence.replace("("," ( ")
    sentence=sentence.replace(")"," ) ")
    sentence=sentence.replace("'"," ")
    sentence=sentence.replace("="," = ")
    sentence=sentence.replace(":"," : ")
    sentence=sentence.replace("<"," < ")
    sentence=sentence.replace(">"," > ")
    sentence=sentence.replace("^"," ^ ")
    sentence=sentence.replace(","," ")
    sentence=sentence.replace(u'\xa0', u' ')
    sentence=sentence.replace(u'\r\n', u'')
    sentence=sentence.replace('  ', ' ')
    sentence=sentence.replace('   ', ' ')
    sentence=sentence.replace('-', ' ')
    
    
    sentence=sentence.lower()
    sentence=sentence.strip()
    data.set_value(index,'answer_text',sentence)


In [8]:
#Converting floats to integers
list(set(data['correct']))
data.loc[data['correct']==0.0,'correct']=0
data.loc[data['correct']==0.25,'correct']=1
data.loc[data['correct']==0.5,'correct']=2
data.loc[data['correct']==0.75,'correct']=3



In [9]:
list(set(data['correct']))

[0.0, 1.0, 2.0, 3.0]

In [None]:
# get messages and sentiment labels
messages = data.answer_text.values
labels = data.correct.values

messages = np.array([utl.preprocess_ST_message(message) for message in messages])

messages=messages.tolist()
labels=labels.tolist()
for i in messages:
    if(len(i)>600):
        indx=messages.index(i)
        del messages[indx]
        del labels[indx]
full_lexicon = " ".join(messages).split()
dictionary, reverse_dictionary = build_dictionaries(full_lexicon)
messages_lens = Counter([len(x) for x in messages])
print("Zero-length messages: {}".format(messages_lens[0]))
print("Maximum message length: {}".format(max(messages_lens)))
print("Average message length: {}".format(np.mean([len(x) for x in messages])))
messages, labels = utl.drop_empty_messages(messages, labels)
messages = utl.encode_ST_messages(messages, dictionary)
labels = utl.encode_ST_labels(labels)

In [10]:
dictionary['rea']

1897

In [11]:
#Create embedding array
import random
from scipy import spatial
  
doc_vocab_size = len(dictionary)
dict_as_list = sorted(dictionary.items(), key = lambda x : x[1])
 
embeddings_tmp=[]
 
for i in range(doc_vocab_size):
    item = dict_as_list[i][0]
    if item in glove_vocab:
        embeddings_tmp.append(embedding_dict[item])
    else:
        rand_num = np.random.uniform(low=0.0, high=0.0,size=embedding_dim)
        embeddings_tmp.append(rand_num)
 
    # final embedding array corresponds to dictionary of words in the document
embedding = np.asarray(embeddings_tmp)
 
# create tree so that we can later search for closest vector to prediction
tree = spatial.KDTree(embedding)

In [12]:
def model_inputs():
    """
    Create the model inputs
    """
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.float32, [None, None], name='labels')
    keep_prob_ = tf.placeholder(tf.float32, name='keep_prob')
    
    return inputs_, labels_, keep_prob_

In [13]:
def build_embedding_layer(inputs_, vocab_size, embed_size):
    """
    Create the embedding layer
    """
    #embedding = tf.Variable(tf.random_uniform((vocab_size, embed_size), -1, 1))
    #embed = tf.nn.embedding_lookup(embedding, inputs_)
    with tf.name_scope("embedding"):
        W = tf.Variable(tf.constant(0.0, shape=[doc_vocab_size, embedding_dim]), trainable=True, name="W")
        embedding_placeholder = tf.placeholder(tf.float32, [doc_vocab_size, embedding_dim])
        embedding_init = W.assign(embedding_placeholder)
        embed = tf.nn.embedding_lookup(W,inputs_)
    
    
    
    
    return embed

In [14]:
def build_lstm_layers(lstm_sizes, embed, keep_prob_, batch_size):
    """
    Create the LSTM layers
    """
    lstms = [tf.contrib.rnn.BasicLSTMCell(size) for size in lstm_sizes]
    # Add dropout to the cell
    drops = [tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob_) for lstm in lstms]
    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell(drops)
    # Getting an initial state of all zeros
    initial_state = cell.zero_state(batch_size, tf.float32)
    
    lstm_outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)
    
    return initial_state, lstm_outputs, cell, final_state

In [15]:
def build_cost_fn_and_opt(lstm_outputs, labels_, learning_rate):
    """
    Create the Loss function and Optimizer
    """
    predictions = tf.contrib.layers.fully_connected(lstm_outputs[:, -1], 1, activation_fn=tf.sigmoid)
    loss = tf.losses.mean_squared_error(labels_, predictions)
    optimzer = tf.train.AdadeltaOptimizer(learning_rate).minimize(loss)
    
    return predictions, loss, optimzer

In [16]:
def build_accuracy(predictions, labels_):
    """
    Create accuracy
    """
    #print("INSIDE BUILD ACCURACY")
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.float32), labels_)
    #print("Predictions:",predictions)
    #print("Labels:",labels_)
    #accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    accuracy = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(predictions,labels_))))
    #print("ACC:",accuracy)
    return accuracy

In [17]:
def build_and_train_network(lstm_sizes, vocab_size, embed_size, epochs, batch_size,
                            learning_rate, keep_prob, train_x, val_x, train_y, val_y):
    
    inputs_, labels_, keep_prob_ = model_inputs()
    #embed = build_embedding_layer(inputs_, vocab_size, embed_size)
    with tf.name_scope("embedding"):
        W = tf.Variable(tf.constant(0.0, shape=[doc_vocab_size, embedding_dim]), trainable=True, name="W")
        embedding_placeholder = tf.placeholder(tf.float32, [doc_vocab_size, embedding_dim])
        embedding_init = W.assign(embedding_placeholder)
        embed = tf.nn.embedding_lookup(W,inputs_)
    
    initial_state, lstm_outputs, lstm_cell, final_state = build_lstm_layers(lstm_sizes, embed, keep_prob_, batch_size)
    predictions, loss, optimizer = build_cost_fn_and_opt(lstm_outputs, labels_, learning_rate)
    accuracy = build_accuracy(predictions, labels_)
    #print("Pred:",predictions)
    #print("lables:",labels_)
    #print("Embed:")
    #print(embed)
    saver = tf.train.Saver()
    
    with tf.Session() as sess:
        
        sess.run(tf.global_variables_initializer())
        sess.run(embedding_init, feed_dict={embedding_placeholder: embedding}) #added
        n_batches = len(train_x)//batch_size
        for e in range(epochs):
            state = sess.run(initial_state)
            
            train_acc = []
            for ii, (x, y) in enumerate(utl.get_batches(train_x, train_y, batch_size), 1):
                #print("Train_x")
                #print(x)
                #print("Shape:",x.shape)
                #print("Size of x before:",x.shape)
                x_lens = Counter([len(x_) for x_ in x])
                #print("X_lens:",x_lens)
                #print("max:",max(x_lens))
                x = utl.zero_pad_messages(x, seq_len=max(x_lens))
                
                #print("Size of x after:",x.shape)
                feed = {inputs_: x,
                        labels_: y[:, None],
                        keep_prob_: keep_prob,
                        initial_state: state}
                loss_, state, _,  batch_acc = sess.run([loss, final_state, optimizer, accuracy], feed_dict=feed)
                train_acc.append(batch_acc)
                """ 
                if (ii + 1) % n_batches == 0:
                     
                    val_acc = []
                    val_state = sess.run(lstm_cell.zero_state(batch_size, tf.float32))
                    for xx, yy in utl.get_batches(val_x, val_y, batch_size):
                        
                        xx_lens = Counter([len(x_) for x_ in xx])
                        xx = utl.zero_pad_messages(xx, seq_len=max(xx_lens))
                        feed = {inputs_: xx,
                                labels_: yy[:, None],
                                keep_prob_: 1,
                                initial_state: val_state}
                        val_batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                        val_acc.append(val_batch_acc)
                """        
        val_acc = []
        val_state = sess.run(lstm_cell.zero_state(batch_size, tf.float32))
        for xx, yy in utl.get_batches(val_x, val_y, batch_size):

            xx_lens = Counter([len(x_) for x_ in xx])
            xx = utl.zero_pad_messages(xx, seq_len=max(xx_lens))
            feed = {inputs_: xx,
                    labels_: yy[:, None],
                    keep_prob_: 1,
                    initial_state: val_state}
            val_batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
            val_acc.append(val_batch_acc)
        saver.save(sess, "checkpoints/sentiment.ckpt")
    print("Epoch: {}/{}...".format(e+1, epochs),
          "Batch: {}/{}...".format(ii+1, n_batches),
          "Train Loss: {:.3f}...".format(loss_),
          "Train Accruacy: {:.3f}...".format(np.mean(train_acc)),
          "Val Accuracy: {:.3f}".format(np.mean(val_acc)))
    return val_acc,np.mean(val_acc)
                        

In [18]:
# Define Inputs and Hyperparameters
lstm_sizes = [10]
vocab_size = doc_vocab_size #len(vocab_to_int) + 1 #add one for padding
embed_size = embedding_dim
epochs = 20
batch_size = 1
learning_rate = 0.1
keep_prob = 0.5

In [19]:
    # read data from csv file
data1 = pd.read_csv('open_response_filter.csv')
results=[]
problem_ids=list(set(data['problem_id']))
    for p in problem_ids:
        print("Prob id:",p)
        data= data1.loc[data1['problem_id']==p]
        for index, row in data.iterrows():
            sentence=row['answer_text']
            sentence=sentence.replace("."," ")
            sentence=sentence.replace("+"," + ")
            sentence=sentence.replace("/"," / ")
            sentence=sentence.replace("*"," * ")
            sentence=sentence.replace("-"," - ")
            sentence=sentence.replace("("," ( ")
            sentence=sentence.replace(")"," ) ")
            sentence=sentence.replace("'"," ")
            sentence=sentence.replace("="," = ")
            sentence=sentence.replace(":"," : ")
            sentence=sentence.replace("<"," < ")
            sentence=sentence.replace(">"," > ")
            sentence=sentence.replace("^"," ^ ")
            sentence=sentence.replace(","," ")
            sentence=sentence.replace(u'\xa0', u' ')
            sentence=sentence.replace(u'\r\n', u'')
            sentence=sentence.replace('  ', ' ')
            sentence=sentence.replace('   ', ' ')
            sentence=sentence.replace('-', ' ')


            sentence=sentence.lower()
            sentence=sentence.strip()
            data.set_value(index,'answer_text',sentence)
        # get messages and sentiment labels
        messages = data.answer_text.values
        labels = data.correct.values


        #messages=data['answer_text']
        #labels= data['correct']
        # View sample of messages with sentiment

        #for i in range(10):
            #print("Messages: {}...".format(messages[i]),
                  #"Correctnes: {}".format(labels[i]))
        messages = np.array([utl.preprocess_ST_message(message) for message in messages])
        messages=messages.tolist()
        labels=labels.tolist()
        for i in messages:
            if(len(i)>600):
                indx=messages.index(i)
                del messages[indx]
                del labels[indx]
        
        #Create dictionary and reverse dictionary with word ids

        full_lexicon = " ".join(messages).split()

        #messages = utl.encode_ST_messages(messages, dictionary)
        messages = utl.encode_ST_messages(messages, dictionary)
        labels = utl.encode_ST_labels(labels)
        #for i in range(10):
        #    print("Messages: {}...".format(messages[i]))
        
        
        loo = LeaveOneOut()


        #print(loo.get_n_splits(messages))


        counter = 0
        test_rmse=[]
        for train_index, test_index in loo.split(messages):
            #print("Counter=",counter)
            counter=counter+1
            train_x,test_x = messages[train_index], messages[test_index]
            train_y,test_y = labels[train_index], labels[test_index]

            with tf.Graph().as_default():
                test_rmse,mean_test_rmse=build_and_train_network(lstm_sizes, vocab_size, embed_size, epochs, batch_size,
                                    learning_rate, keep_prob, train_x, test_x, train_y, test_y)
                #print("Test rmse:",test_rmse,"Mean test rmse",mean_test_rmse)
                test_rmse.append(mean_test_rmse)
                #test_network('checkpoints', batch_size, test_x, test_y)
        print("Test Rmse:",np.mean(np.array(test_rmse)))
        results.append([p,len(messages),np.mean(np.array(test_rmse))])
result_df=pd.DataFrame(results)
result_df.to_csv('Results.csv')

Prob id: 1415683
Epoch: 20/20... Batch: 42/41... Train Loss: 0.028... Train Accruacy: 0.232... Val Accuracy: 0.245
Epoch: 20/20... Batch: 42/41... Train Loss: 0.034... Train Accruacy: 0.245... Val Accuracy: 0.238
Epoch: 20/20... Batch: 42/41... Train Loss: 0.014... Train Accruacy: 0.264... Val Accuracy: 0.239
Epoch: 20/20... Batch: 42/41... Train Loss: 0.012... Train Accruacy: 0.261... Val Accuracy: 0.306
Epoch: 20/20... Batch: 42/41... Train Loss: 0.026... Train Accruacy: 0.201... Val Accuracy: 0.154
Epoch: 20/20... Batch: 42/41... Train Loss: 0.064... Train Accruacy: 0.267... Val Accuracy: 0.231
Epoch: 20/20... Batch: 42/41... Train Loss: 0.069... Train Accruacy: 0.271... Val Accuracy: 0.270
Epoch: 20/20... Batch: 42/41... Train Loss: 0.172... Train Accruacy: 0.360... Val Accuracy: 0.418
Epoch: 20/20... Batch: 42/41... Train Loss: 0.211... Train Accruacy: 0.333... Val Accuracy: 0.261
Epoch: 20/20... Batch: 42/41... Train Loss: 0.069... Train Accruacy: 0.259... Val Accuracy: 0.251
Epo

InvalidArgumentError: slice index -1 of dimension 1 out of bounds.
	 [[Node: strided_slice = StridedSlice[Index=DT_INT32, T=DT_FLOAT, begin_mask=1, ellipsis_mask=0, end_mask=1, new_axis_mask=0, shrink_axis_mask=2, _device="/job:localhost/replica:0/task:0/device:GPU:0"](rnn/transpose, strided_slice/stack, strided_slice/stack_1, strided_slice/stack_2)]]
	 [[Node: rnn/while/Exit_2/_49 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_987_rnn/while/Exit_2", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'strided_slice', defined at:
  File "/home/avaratharaj/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/avaratharaj/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-ea3e3f1443ba>", line 82, in <module>
    learning_rate, keep_prob, train_x, test_x, train_y, test_y)
  File "<ipython-input-17-8f431cba5bd3>", line 13, in build_and_train_network
    predictions, loss, optimizer = build_cost_fn_and_opt(lstm_outputs, labels_, learning_rate)
  File "<ipython-input-15-7bafacdc2442>", line 5, in build_cost_fn_and_opt
    predictions = tf.contrib.layers.fully_connected(lstm_outputs[:, -1], 1, activation_fn=tf.sigmoid)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 538, in _SliceHelper
    name=name)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 706, in strided_slice
    shrink_axis_mask=shrink_axis_mask)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 5430, in strided_slice
    name=name)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/home/avaratharaj/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): slice index -1 of dimension 1 out of bounds.
	 [[Node: strided_slice = StridedSlice[Index=DT_INT32, T=DT_FLOAT, begin_mask=1, ellipsis_mask=0, end_mask=1, new_axis_mask=0, shrink_axis_mask=2, _device="/job:localhost/replica:0/task:0/device:GPU:0"](rnn/transpose, strided_slice/stack, strided_slice/stack_1, strided_slice/stack_2)]]
	 [[Node: rnn/while/Exit_2/_49 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_987_rnn/while/Exit_2", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
