In [0]:
# LSTM 10 FOLD CROSS VALIDATION ON UNIQUE DATASET (based on model 'lstm_with_unique_8'):
# copyright (c) ABDUL HASIB UDDIN <abdulhasibuddin@gmail.com>
# Courtesy: https://github.com/mchablani/deep-learning/blob/master/sentiment-rnn/Sentiment_RNN.ipynb

In [0]:
import numpy as np
import tensorflow as tf
from timeit import default_timer as timer
from collections import Counter
from string import punctuation
from google.colab import files

In [0]:
lstm_size = 128
lstm_layers = 5
k = 10
batch_size = 5
learning_rate = 0.0001
epochs = 3

In [4]:
fileName = "lstm_10_fold_cross_validation_12"
checkpointName = "checkpoints/"+fileName+".ckpt"
print(checkpointName)
print(type(checkpointName))

checkpoints/lstm_10_fold_cross_validation_12.ckpt
<class 'str'>


In [5]:
files.upload()
files.upload()

with open('data_all_unique_dnd_stratified_text.txt', 'r', encoding="utf8") as f:
    tweets = f.read()
with open('data_all_unique_dnd_stratified_labels.txt', 'r', encoding="utf8") as f:
    labels_org = f.read()
    
print('File upload done!')

Saving data_all_unique_dnd_stratified_text.txt to data_all_unique_dnd_stratified_text.txt


Saving data_all_unique_dnd_stratified_labels.txt to data_all_unique_dnd_stratified_labels.txt
File upload done!


In [0]:
# Data preprocessing::
#all_text = ''.join([c for c in tweets if c not in punctuation])
all_text = ''.join([c for c in tweets])
tweets = all_text.split('\n')

all_text = ' '.join(tweets)
words = all_text.split()

In [0]:
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

tweets_ints = []
for each in tweets:
    tweets_ints.append([vocab_to_int[word] for word in each.split()])

In [8]:
# Encoding the labels::
list_labels = []

for l in labels_org.split():
    if l == "depressive":
        list_labels.append(1)
    else:
        list_labels.append(0)
        
labels = np.array(list_labels)
print(len(labels))

1176


In [9]:
tweets_lens = Counter([len(x) for x in tweets_ints])
print("Zero-length tweets: {}".format(tweets_lens[0]))
print("Maximum tweets length: {}".format(max(tweets_lens)))

Zero-length tweets: 1
Maximum tweets length: 63


In [0]:
# Filter out that tweets with 0 length
tweets_ints = [r[0:200] for r in tweets_ints if len(r) > 0]

In [11]:
from collections import Counter
tweets_lens = Counter([len(x) for x in tweets_ints])
print("Zero-length tweets: {}".format(tweets_lens[0]))
print("Maximum tweet length: {}".format(max(tweets_lens)))

Zero-length tweets: 0
Maximum tweet length: 63


In [0]:
seq_len = 200
features = np.zeros((len(tweets_ints), seq_len), dtype=int)
# print(features[:10,:100])
for i, row in enumerate(tweets_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]
#features[:10,:100]

In [13]:
#split_frac = 0.8

#split_index = int(split_frac * len(features))

#training_validation_x, test_x = features[:split_index], features[split_index:] 
#training_validation_y, test_y = labels[:split_index], labels[split_index:]

training_validation_x = features
training_validation_y = labels

split_train_val = int(len(features)/k)
#split_index = int(split_frac * len(val_x))

#val_x, test_x = val_x[:split_index], val_x[split_index:]
#val_y, test_y = val_y[:split_index], val_y[split_index:]

print("\t\t\tFeature Shapes:")
print("Train & Validation data set: {}".format(training_validation_x.shape))
print("Train & Validation label set: {}".format(training_validation_y.shape))

############
'''
dataset_split_index = int(len(final_dataset)*0.1)

training_validation_dataset = final_dataset[dataset_split_index:]
testing_dataset = final_dataset[:dataset_split_index]

training_validation_labelset = final_labelset[dataset_split_index:]
testing_labelset = final_labelset[:dataset_split_index]

print(dataset_split_index)

print('len(training_validation_dataset) =',len(training_validation_dataset))
print('len(training_validation_labelset) =',len(training_validation_labelset))

print('len(testing_dataset) =',len(testing_dataset))
print('len(testing_labelset)',len(testing_labelset))
'''

			Feature Shapes:
Train & Validation data set: (1176, 200)
Train & Validation label set: (1176,)


"\ndataset_split_index = int(len(final_dataset)*0.1)\n\ntraining_validation_dataset = final_dataset[dataset_split_index:]\ntesting_dataset = final_dataset[:dataset_split_index]\n\ntraining_validation_labelset = final_labelset[dataset_split_index:]\ntesting_labelset = final_labelset[:dataset_split_index]\n\nprint(dataset_split_index)\n\nprint('len(training_validation_dataset) =',len(training_validation_dataset))\nprint('len(training_validation_labelset) =',len(training_validation_labelset))\n\nprint('len(testing_dataset) =',len(testing_dataset))\nprint('len(testing_labelset)',len(testing_labelset))\n"

In [0]:
n_words = len(vocab_to_int) + 1 # Add 1 for 0 added to vocab

# Create the graph object
tf.reset_default_graph()
with tf.name_scope('inputs'):
    inputs_ = tf.placeholder(tf.int32, [None, None], name="inputs")
    labels_ = tf.placeholder(tf.int32, [None, None], name="labels")
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")


In [0]:
# Size of the embedding vectors (number of units in the embedding layer)
embed_size = 300 

with tf.name_scope("Embeddings"):
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)

In [16]:
def lstm_cell():
    # Your basic LSTM cell
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size, reuse=tf.get_variable_scope().reuse)
    # Add dropout to the cell
    return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)

with tf.name_scope("RNN_layers"):
    # Stack up multiple LSTM layers, for deep learning
    cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])
    
    # Getting an initial state of all zeros
    initial_state = cell.zero_state(batch_size, tf.float32)

Instructions for updating:
This class is deprecated, please use tf.nn.rnn_cell.LSTMCell, which supports all the feature this cell currently has. Please replace the existing code with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').


In [0]:
with tf.name_scope("RNN_forward"):
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)

In [0]:
# Output::

with tf.name_scope('predictions'):
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    tf.summary.histogram('predictions', predictions)
with tf.name_scope('cost'):
    cost = tf.losses.mean_squared_error(labels_, predictions)
    tf.summary.scalar('cost', cost)

with tf.name_scope('train'):
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

merged = tf.summary.merge_all()

In [0]:
# Validation accuracy::

with tf.name_scope('validation'):
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [0]:
# Batching::

def get_batches(x, y, batch_size=100):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

In [21]:
# Training::
saver = tf.train.Saver()
start = timer()
folds_val_acc = []

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    train_writer = tf.summary.FileWriter('./logs/tb/train', sess.graph)
    test_writer = tf.summary.FileWriter('./logs/tb/test', sess.graph)
    
    for fold in range(1,k+1):
        print('Fold -',fold,'out of',k,'::')
        print('--------------')

        training_validation_x = training_validation_x.tolist()
        train_x = training_validation_x[:fold*split_train_val-split_train_val]
        train_x += training_validation_x[fold*split_train_val:]
        val_x = training_validation_x[fold*split_train_val-split_train_val:fold*split_train_val]
        
        training_validation_x = np.array(training_validation_x)
        train_x = np.array(train_x)
        val_x = np.array(val_x)
        
        training_validation_y = training_validation_y.tolist()
        train_y = training_validation_y[:fold*split_train_val-split_train_val]
        train_y += training_validation_y[fold*split_train_val:]
        val_y = training_validation_y[fold*split_train_val-split_train_val:fold*split_train_val]
        
        training_validation_y = np.array(training_validation_y)
        train_y = np.array(train_y)
        val_y = np.array(val_y)
        
        print('Training on',len(train_y),'samples & validating on',len(val_y),'samples with batch size',batch_size,'.')
        iteration = 1
        for e in range(1, epochs+1):
            state = sess.run(initial_state)
            for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
                feed = {inputs_: x,
                        labels_: y[:, None],
                        keep_prob: 0.5,
                        initial_state: state}
                summary, loss, state, _ = sess.run([merged, cost, final_state, optimizer], feed_dict=feed)

                train_writer.add_summary(summary, iteration)

                if iteration%5==0:
                    print("Epoch: {}/{}".format(e, epochs),
                          "Iteration: {}".format(iteration),
                          "Train loss: {:.3f}".format(loss))

                if iteration%25==0:
                    val_acc = []
                    val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                    for x, y in get_batches(val_x, val_y, batch_size):
                        feed = {inputs_: x,
                                labels_: y[:, None],
                                keep_prob: 1,
                                initial_state: val_state}
                        summary, batch_acc, val_state = sess.run([merged, accuracy, final_state], feed_dict=feed)
                        val_acc.append(batch_acc)
                    print("Val acc: {:.3f}".format(np.mean(val_acc)))
                iteration +=1
                test_writer.add_summary(summary, iteration)
                saver.save(sess, checkpointName)
        
            #saver.save(sess, checkpointName)
            # After an epoch is completed:
            val_acc = []
            val_state = sess.run(cell.zero_state(batch_size, tf.float32))
            for x, y in get_batches(val_x, val_y, batch_size):
                feed = {inputs_: x,
                        labels_: y[:, None],
                        keep_prob: 1,
                        initial_state: val_state}
                summary, batch_acc, val_state = sess.run([merged, accuracy, final_state], feed_dict=feed)
                val_acc.append(batch_acc)
            print("Val acc for epoch {:} = {:.4f}".format(e,np.mean(val_acc)))
            test_writer.add_summary(summary, iteration)
            saver.save(sess, checkpointName)
        
        saver.save(sess, checkpointName)        
        # After a fold is completed:
        val_acc = []
        val_state = sess.run(cell.zero_state(batch_size, tf.float32))
        for x, y in get_batches(val_x, val_y, batch_size):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 1,
                    initial_state: val_state}
            summary, batch_acc, val_state = sess.run([merged, accuracy, final_state], feed_dict=feed)
            val_acc.append(batch_acc)
            folds_val_acc.append(batch_acc)
        print("Val acc for fold {:} = {:.4f}".format(k,np.mean(val_acc)))
        test_writer.add_summary(summary, iteration)
        saver.save(sess, checkpointName)
        
        print('--------------\n')
        
duration = timer() - start
print('Time elasped =',duration,'sec(s)')

Fold - 1 out of 10 ::
--------------
Training on 1059 samples & validating on 117 samples with batch size 5 .
Epoch: 1/3 Iteration: 5 Train loss: 0.250
Epoch: 1/3 Iteration: 10 Train loss: 0.250
Epoch: 1/3 Iteration: 15 Train loss: 0.238
Epoch: 1/3 Iteration: 20 Train loss: 0.251
Epoch: 1/3 Iteration: 25 Train loss: 0.263
Val acc: 0.513
Epoch: 1/3 Iteration: 30 Train loss: 0.251
Epoch: 1/3 Iteration: 35 Train loss: 0.258
Epoch: 1/3 Iteration: 40 Train loss: 0.248
Epoch: 1/3 Iteration: 45 Train loss: 0.240
Epoch: 1/3 Iteration: 50 Train loss: 0.255
Val acc: 0.504
Epoch: 1/3 Iteration: 55 Train loss: 0.254
Epoch: 1/3 Iteration: 60 Train loss: 0.263
Epoch: 1/3 Iteration: 65 Train loss: 0.251
Epoch: 1/3 Iteration: 70 Train loss: 0.240
Epoch: 1/3 Iteration: 75 Train loss: 0.242
Val acc: 0.539
Epoch: 1/3 Iteration: 80 Train loss: 0.255
Epoch: 1/3 Iteration: 85 Train loss: 0.237
Epoch: 1/3 Iteration: 90 Train loss: 0.266
Epoch: 1/3 Iteration: 95 Train loss: 0.223
Epoch: 1/3 Iteration: 100 Tra

In [22]:
# Testing::
'''
test_acc = []
with tf.Session() as sess:
    saver.restore(sess, checkpointName)
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))
    
'''

'\ntest_acc = []\nwith tf.Session() as sess:\n    saver.restore(sess, checkpointName)\n    test_state = sess.run(cell.zero_state(batch_size, tf.float32))\n    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):\n        feed = {inputs_: x,\n                labels_: y[:, None],\n                keep_prob: 1,\n                initial_state: test_state}\n        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)\n        test_acc.append(batch_acc)\n    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))\n    \n'

In [24]:
print("Accuracy: {:.3f}".format(np.mean(folds_val_acc)))

Accuracy: 0.844


In [26]:
for acc in folds_val_acc:
  print("{:.3f}".format(acc))

0.200
0.400
0.000
0.400
0.400
0.400
0.400
0.400
0.400
0.600
0.600
0.800
0.400
1.000
0.600
0.800
0.800
0.600
0.400
0.600
0.200
0.200
0.600
0.200
0.200
0.600
0.200
0.600
0.800
0.800
0.400
0.600
0.600
0.400
0.000
0.200
0.400
0.200
0.800
0.600
0.200
0.400
0.600
0.600
0.800
0.600
0.600
1.000
0.800
0.600
0.800
0.800
0.800
1.000
0.600
0.600
1.000
1.000
0.800
0.800
0.600
1.000
1.000
0.400
0.600
0.800
1.000
0.800
0.800
0.600
1.000
0.800
0.600
0.800
0.800
1.000
1.000
1.000
0.800
0.800
0.800
0.600
1.000
0.800
0.800
1.000
0.800
1.000
0.800
0.800
0.800
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
0.800
0.800
1.000
1.000
1.000
1.000
1.000
0.800
0.800
1.000
0.800
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
0.800
0.800
1.000
1.000
0.600
0.800
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
0.800
1.000
1.000
0.800
1.000
0.800
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
1.000
0.800
1.000
1.000
1.000
1.000
1.00