In [1]:
# Courtesy: https://github.com/mchablani/deep-learning/blob/master/sentiment-rnn/Sentiment_RNN.ipynb
# Depression analysis of Bangla tweets with an LSTM-RNN.
# The dataset is balanced across two classes (1968 tweets in total):
#   1. Depressive (984 tweets)
#   2. Non-depressive (984 tweets)
# The samples are interleaved:
# 1 depressive -> 1 non-depressive -> 1 depressive -> 1 non-depressive ...

# The bare string below is never executed as SQL: as the last expression of the
# cell it is only echoed back as the cell output.
# NOTE(review): presumably this is the query that exported the tweet text from the
# database -- confirm; its output file name differs from the text file the notebook
# actually opens later.
'''
SELECT tweets_depressive_nondepressive.tweet
INTO OUTFILE "D:/EDUCATION/Machine & Deep Learning/Implementations/tweet_depressive_nondepressive_balanced_text.txt" 
FROM tweets_depressive_nondepressive;
'''

'\nSELECT tweets_depressive_nondepressive.tweet\nINTO OUTFILE "D:/EDUCATION/Machine & Deep Learning/Implementations/tweet_depressive_nondepressive_balanced_text.txt" \nFROM tweets_depressive_nondepressive;\n'

In [2]:
import numpy as np
import tensorflow as tf
from timeit import default_timer as timer
from collections import Counter
from string import punctuation

In [3]:
# Read the raw corpus: the tweet text comes from one UTF-8 file and the label
# tokens from another. Both are kept as single strings here and parsed later.
with open('tweets_depressive_nondepressive_balanced_rearranged_text.txt', mode='r', encoding="utf8") as text_file:
    tweets = text_file.read()

with open('tweets_depressive_nondepressive_balanced_rearranged_labels.txt', mode='r', encoding="utf8") as label_file:
    labels_org = label_file.read()

In [4]:
# Data preprocessing::
# Delete every character listed in string.punctuation (ASCII punctuation only).
# Newlines are not part of that set, so splitting on '\n' afterwards still yields
# one entry per line of the input file.
strip_punctuation = str.maketrans('', '', punctuation)
tweets = tweets.translate(strip_punctuation).split('\n')

# Flatten all tweets into one whitespace-separated stream of words; this is the
# input for the vocabulary count.
all_text = ' '.join(tweets)
words = all_text.split()

In [5]:
# Vocabulary ordered from most to least frequent word. Ids are handed out starting
# at 1, so id 0 is never assigned to a real word.
counts = Counter(words)
vocab = [word for word, _ in counts.most_common()]
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

# Encode every tweet as the list of ids of its words (an empty line becomes []).
tweets_ints = [[vocab_to_int[word] for word in tweet.split()] for tweet in tweets]

In [6]:
# Encoding the labels::
# "non_depressive" -> 0 and "depressive" -> 1. Any other whitespace-separated token
# found in the label text is skipped without an error.
label_codes = {"non_depressive": 0, "depressive": 1}
list_labels = [label_codes[token] for token in labels_org.split() if token in label_codes]

labels = np.array(list_labels)

In [7]:
# Histogram of encoded tweet lengths: key = number of word ids, value = tweet count.
tweets_lens = Counter(map(len, tweets_ints))
# A Counter returns 0 for a missing key, so this is safe when no tweet is empty.
print("Zero-length tweets: {}".format(tweets_lens[0]))
# max() over a Counter iterates its keys, i.e. the distinct lengths.
print("Maximum tweets length: {}".format(max(tweets_lens.keys())))

Zero-length tweets: 1
Maximum tweets length: 61


In [8]:
# Drop tweets that encoded to an empty list and cap every remaining tweet at its
# first 200 word ids.
tweets_ints = [encoded[:200] for encoded in tweets_ints if encoded]

In [9]:
from collections import Counter
# Recompute the length histogram after filtering to confirm that no empty tweet is left.
tweets_lens = Counter(len(encoded) for encoded in tweets_ints)
print("Zero-length tweets: {}".format(tweets_lens[0]))
print("Maximum tweet length: {}".format(max(tweets_lens.keys())))

Zero-length tweets: 0
Maximum tweet length: 61


In [10]:
seq_len = 200
# One row per tweet, pre-filled with zeros. Each tweet's ids are written into the
# right-most positions, so shorter tweets end up left-padded with 0.
features = np.zeros((len(tweets_ints), seq_len), dtype=int)
for row_idx, encoded in enumerate(tweets_ints):
    clipped = np.array(encoded)[:seq_len]
    features[row_idx, -len(encoded):] = clipped

In [11]:
split_frac = 0.8

# `features` and `labels` are produced by two independent pipelines (filtered tweet
# text vs. parsed label tokens) and are sliced below with the same indices. If their
# lengths differ, the slices would silently pair tweets with the wrong labels, so
# fail loudly here instead.
assert len(features) == len(labels), (
    "features/labels are misaligned: {} feature rows vs {} labels".format(len(features), len(labels)))

# First cut: the leading 80% is the training set, the remainder is held out.
split_index = int(split_frac * len(features))

train_x, val_x = features[:split_index], features[split_index:] 
train_y, val_y = labels[:split_index], labels[split_index:]

# Second cut: the held-out part is halved into validation and test sets.
split_frac = 0.5
split_index = int(split_frac * len(val_x))

val_x, test_x = val_x[:split_index], val_x[split_index:]
val_y, test_y = val_y[:split_index], val_y[split_index:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))
print("label set: \t\t{}".format(train_y.shape), 
      "\nValidation label set: \t{}".format(val_y.shape),
      "\nTest label set: \t\t{}".format(test_y.shape))

			Feature Shapes:
Train set: 		(1574, 200) 
Validation set: 	(197, 200) 
Test set: 		(197, 200)
label set: 		(1574,) 
Validation label set: 	(197,) 
Test label set: 		(197,)


In [12]:
# Build the graph::

# Optimisation settings
learning_rate = 0.001   # step size handed to the Adam optimizer
batch_size = 1          # samples per step; also fixes the batch dimension of the RNN zero state

# Recurrent stack
lstm_layers = 2         # number of stacked LSTM layers
lstm_size = 256         # units per LSTM layer

In [13]:
# Number of rows in the embedding table: word ids start at 1, so one extra row is
# needed for id 0.
n_words = 1 + len(vocab_to_int)

# Start from an empty default graph, then declare the feedable inputs.
tf.reset_default_graph()
with tf.name_scope('inputs'):
    inputs_ = tf.placeholder(dtype=tf.int32, shape=[None, None], name="inputs")
    labels_ = tf.placeholder(dtype=tf.int32, shape=[None, None], name="labels")
    keep_prob = tf.placeholder(dtype=tf.float32, name="keep_prob")

In [14]:
# Dimensionality of each word vector in the embedding layer.
embed_size = 300 

with tf.name_scope("Embeddings"):
    # Trainable lookup table, initialised uniformly in [-1, 1).
    initial_table = tf.random_uniform(shape=(n_words, embed_size), minval=-1, maxval=1)
    embedding = tf.Variable(initial_table)
    # Replace every word id in the input batch with its embedding vector.
    embed = tf.nn.embedding_lookup(params=embedding, ids=inputs_)

In [15]:
def lstm_cell():
    """Build one recurrent layer: a BasicLSTMCell wrapped in output dropout.

    The cell has ``lstm_size`` units and inherits the ``reuse`` flag of the current
    variable scope. Dropout is applied to the cell outputs with the keep
    probability fed through the ``keep_prob`` placeholder.
    """
    base_cell = tf.contrib.rnn.BasicLSTMCell(
        num_units=lstm_size,
        reuse=tf.get_variable_scope().reuse,
    )
    return tf.contrib.rnn.DropoutWrapper(cell=base_cell, output_keep_prob=keep_prob)

with tf.name_scope("RNN_layers"):
    # One independently constructed cell per layer, stacked into a single multi-layer cell.
    layer_cells = [lstm_cell() for _ in range(lstm_layers)]
    cell = tf.contrib.rnn.MultiRNNCell(layer_cells)

    # All-zero starting state, built for exactly `batch_size` sequences.
    initial_state = cell.zero_state(batch_size=batch_size, dtype=tf.float32)

In [16]:
with tf.name_scope("RNN_forward"):
    # Unroll the stacked cell over the embedded sequence, starting from `initial_state`;
    # `final_state` is the state after the last time step.
    outputs, final_state = tf.nn.dynamic_rnn(
        cell=cell,
        inputs=embed,
        initial_state=initial_state,
    )

In [17]:
# Output::

with tf.name_scope('predictions'):
    # Only the RNN output at the last time step feeds the single sigmoid unit.
    last_step = outputs[:, -1]
    predictions = tf.contrib.layers.fully_connected(
        inputs=last_step, num_outputs=1, activation_fn=tf.sigmoid)
    tf.summary.histogram(name='predictions', values=predictions)

with tf.name_scope('cost'):
    # Mean squared error between the sigmoid output and the 0/1 label.
    cost = tf.losses.mean_squared_error(labels=labels_, predictions=predictions)
    tf.summary.scalar(name='cost', tensor=cost)

with tf.name_scope('train'):
    adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    optimizer = adam.minimize(cost)

# Single op that evaluates every summary registered above.
merged = tf.summary.merge_all()

In [18]:
# Validation accuracy::

with tf.name_scope('validation'):
    # Round the sigmoid output to a hard 0/1 class and compare it with the label.
    predicted_class = tf.cast(tf.round(predictions), tf.int32)
    correct_pred = tf.equal(predicted_class, labels_)
    # Fraction of correct predictions in the batch.
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [19]:
# Batching::

def get_batches(x, y, batch_size=100):
    """Yield consecutive ``(x_batch, y_batch)`` slices of ``batch_size`` items.

    Only full batches are produced: trailing items that do not fill a complete
    batch are dropped. ``x`` and ``y`` are sliced with the same index range, so
    anything that supports ``len()`` and slicing works.
    """
    full_batches = len(x) // batch_size
    usable = full_batches * batch_size
    for start in range(0, usable, batch_size):
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

In [20]:
# Training::
#
# Runs `epochs` passes over the training set. Every step logs the merged training
# summary; every 5th step prints the training loss; every 25th step runs a full
# validation pass, logs a validation summary and writes a checkpoint.

epochs = 2
saver = tf.train.Saver()
checkpoint_path = "checkpoints/tweet_depressive_nondepressive_balanced_rearranged2.ckpt"
start = timer()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    train_writer = tf.summary.FileWriter('./logs/tb/train', sess.graph)
    test_writer = tf.summary.FileWriter('./logs/tb/test', sess.graph)
    try:
        iteration = 1
        for e in range(epochs):
            # Each epoch starts from the all-zero RNN state; within an epoch the final
            # state of one batch is fed back in as the initial state of the next.
            state = sess.run(initial_state)

            for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
                feed = {inputs_: x,
                        labels_: y[:, None],   # labels placeholder is 2-D: add a trailing axis
                        keep_prob: 0.5,
                        initial_state: state}
                summary, loss, state, _ = sess.run([merged, cost, final_state, optimizer], feed_dict=feed)

                train_writer.add_summary(summary, iteration)

                if iteration%5==0:
                    print("Epoch: {}/{}".format(e, epochs),
                          "Iteration: {}".format(iteration),
                          "Train loss: {:.3f}".format(loss))

                if iteration%25==0:
                    val_acc = []
                    # Reuse the existing zero-state tensors rather than calling
                    # cell.zero_state() again, which would add new ops to the graph
                    # on every validation pass.
                    val_state = sess.run(initial_state)
                    val_summary = None
                    for x, y in get_batches(val_x, val_y, batch_size):
                        feed = {inputs_: x,
                                labels_: y[:, None],
                                keep_prob: 1,   # no dropout while evaluating
                                initial_state: val_state}
                        val_summary, batch_acc, val_state = sess.run([merged, accuracy, final_state], feed_dict=feed)
                        val_acc.append(batch_acc)
                    print("Val acc: {:.3f}".format(np.mean(val_acc)))

                    # Only validation results go to the test log, tagged with the step
                    # they were measured at. The summary written is the one from the
                    # last validation batch.
                    if val_summary is not None:
                        test_writer.add_summary(val_summary, iteration)
                    # Checkpoint at validation time instead of after every single step.
                    saver.save(sess, checkpoint_path)
                iteration +=1
        saver.save(sess, checkpoint_path)
    finally:
        # Flush pending events and release the log files even if training is interrupted.
        train_writer.close()
        test_writer.close()
    
duration = timer() - start
print('Time elasped =',duration,'sec(s)')

Epoch: 0/2 Iteration: 5 Train loss: 0.323
Epoch: 0/2 Iteration: 10 Train loss: 0.444
Epoch: 0/2 Iteration: 15 Train loss: 0.464
Epoch: 0/2 Iteration: 20 Train loss: 0.004
Epoch: 0/2 Iteration: 25 Train loss: 0.200
Val acc: 0.528
Epoch: 0/2 Iteration: 30 Train loss: 0.446
Epoch: 0/2 Iteration: 35 Train loss: 0.547
Epoch: 0/2 Iteration: 40 Train loss: 0.177
Epoch: 0/2 Iteration: 45 Train loss: 0.647
Epoch: 0/2 Iteration: 50 Train loss: 0.304
Val acc: 0.528
Epoch: 0/2 Iteration: 55 Train loss: 0.530
Epoch: 0/2 Iteration: 60 Train loss: 0.197
Epoch: 0/2 Iteration: 65 Train loss: 0.210
Epoch: 0/2 Iteration: 70 Train loss: 0.056
Epoch: 0/2 Iteration: 75 Train loss: 0.368
Val acc: 0.533
Epoch: 0/2 Iteration: 80 Train loss: 0.235
Epoch: 0/2 Iteration: 85 Train loss: 0.131
Epoch: 0/2 Iteration: 90 Train loss: 0.775
Epoch: 0/2 Iteration: 95 Train loss: 0.337
Epoch: 0/2 Iteration: 100 Train loss: 0.057
Val acc: 0.518
Epoch: 0/2 Iteration: 105 Train loss: 0.414
Epoch: 0/2 Iteration: 110 Train loss

Epoch: 0/2 Iteration: 880 Train loss: 0.075
Epoch: 0/2 Iteration: 885 Train loss: 0.208
Epoch: 0/2 Iteration: 890 Train loss: 0.116
Epoch: 0/2 Iteration: 895 Train loss: 0.335
Epoch: 0/2 Iteration: 900 Train loss: 0.418
Val acc: 0.513
Epoch: 0/2 Iteration: 905 Train loss: 0.175
Epoch: 0/2 Iteration: 910 Train loss: 0.318
Epoch: 0/2 Iteration: 915 Train loss: 0.066
Epoch: 0/2 Iteration: 920 Train loss: 0.279
Epoch: 0/2 Iteration: 925 Train loss: 0.107
Val acc: 0.553
Epoch: 0/2 Iteration: 930 Train loss: 0.432
Epoch: 0/2 Iteration: 935 Train loss: 0.287
Epoch: 0/2 Iteration: 940 Train loss: 0.128
Epoch: 0/2 Iteration: 945 Train loss: 0.125
Epoch: 0/2 Iteration: 950 Train loss: 0.608
Val acc: 0.518
Epoch: 0/2 Iteration: 955 Train loss: 0.106
Epoch: 0/2 Iteration: 960 Train loss: 0.104
Epoch: 0/2 Iteration: 965 Train loss: 0.107
Epoch: 0/2 Iteration: 970 Train loss: 0.237
Epoch: 0/2 Iteration: 975 Train loss: 0.456
Val acc: 0.640
Epoch: 0/2 Iteration: 980 Train loss: 0.110
Epoch: 0/2 Itera

Epoch: 1/2 Iteration: 1740 Train loss: 0.003
Epoch: 1/2 Iteration: 1745 Train loss: 0.022
Epoch: 1/2 Iteration: 1750 Train loss: 0.144
Val acc: 0.645
Epoch: 1/2 Iteration: 1755 Train loss: 0.024
Epoch: 1/2 Iteration: 1760 Train loss: 0.524
Epoch: 1/2 Iteration: 1765 Train loss: 0.002
Epoch: 1/2 Iteration: 1770 Train loss: 0.013
Epoch: 1/2 Iteration: 1775 Train loss: 0.048
Val acc: 0.680
Epoch: 1/2 Iteration: 1780 Train loss: 0.020
Epoch: 1/2 Iteration: 1785 Train loss: 0.391
Epoch: 1/2 Iteration: 1790 Train loss: 0.006
Epoch: 1/2 Iteration: 1795 Train loss: 0.001
Epoch: 1/2 Iteration: 1800 Train loss: 0.476
Val acc: 0.650
Epoch: 1/2 Iteration: 1805 Train loss: 0.001
Epoch: 1/2 Iteration: 1810 Train loss: 0.044
Epoch: 1/2 Iteration: 1815 Train loss: 0.035
Epoch: 1/2 Iteration: 1820 Train loss: 0.015
Epoch: 1/2 Iteration: 1825 Train loss: 0.034
Val acc: 0.655
Epoch: 1/2 Iteration: 1830 Train loss: 0.330
Epoch: 1/2 Iteration: 1835 Train loss: 0.002
Epoch: 1/2 Iteration: 1840 Train loss: 0

Epoch: 1/2 Iteration: 2595 Train loss: 0.005
Epoch: 1/2 Iteration: 2600 Train loss: 0.884
Val acc: 0.655
Epoch: 1/2 Iteration: 2605 Train loss: 0.012
Epoch: 1/2 Iteration: 2610 Train loss: 0.105
Epoch: 1/2 Iteration: 2615 Train loss: 0.002
Epoch: 1/2 Iteration: 2620 Train loss: 0.006
Epoch: 1/2 Iteration: 2625 Train loss: 0.011
Val acc: 0.716
Epoch: 1/2 Iteration: 2630 Train loss: 0.140
Epoch: 1/2 Iteration: 2635 Train loss: 0.063
Epoch: 1/2 Iteration: 2640 Train loss: 0.027
Epoch: 1/2 Iteration: 2645 Train loss: 0.073
Epoch: 1/2 Iteration: 2650 Train loss: 0.258
Val acc: 0.782
Epoch: 1/2 Iteration: 2655 Train loss: 0.720
Epoch: 1/2 Iteration: 2660 Train loss: 0.148
Epoch: 1/2 Iteration: 2665 Train loss: 0.075
Epoch: 1/2 Iteration: 2670 Train loss: 0.081
Epoch: 1/2 Iteration: 2675 Train loss: 0.117
Val acc: 0.772
Epoch: 1/2 Iteration: 2680 Train loss: 0.029
Epoch: 1/2 Iteration: 2685 Train loss: 0.023
Epoch: 1/2 Iteration: 2690 Train loss: 0.134
Epoch: 1/2 Iteration: 2695 Train loss: 0

In [21]:
# Testing::
#
# Restore the trained weights from the checkpoint and report the mean per-batch
# accuracy over the held-out test set.

test_acc = []
with tf.Session() as sess:
    saver.restore(sess, "checkpoints/tweet_depressive_nondepressive_balanced_rearranged2.ckpt")
    # Evaluate the existing zero-state tensors (as the training loop does) instead of
    # building a fresh set of zero-state ops each time this cell is run.
    test_state = sess.run(initial_state)
    for x, y in get_batches(test_x, test_y, batch_size):
        feed = {inputs_: x,
                labels_: y[:, None],   # labels placeholder is 2-D: add a trailing axis
                keep_prob: 1,          # no dropout while evaluating
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints/tweet_depressive_nondepressive_balanced_rearranged2.ckpt
Test accuracy: 0.711
