## Packages and pathes

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import os

In [2]:
# main path for provided data
gloveFile = 'D:/SentimentAnalysis/glove.6B.300d.txt/glove.6B.300d.txt'
vocab_path = 'D:/SentimentAnalysis/glove.6B.300d.txt/vocab_glove.csv'
# processed data path
path = 'D:/SentimentAnalysis/stanfordSentimentTreebank/stanfordSentimentTreebank/'
train_data_path ='D:/SentimentAnalysis/Train/train.csv'
val_data_path ='D:/SentimentAnalysis/Train/val.csv'
test_data_path ='D:/SentimentAnalysis/Train/test.csv'


## Processing Data

In [3]:
# process the data with phrase and index separation
df_data_sentence = pd.read_table(path + 'dictionary.txt')
df_data_sentence_processed = df_data_sentence['p|id'].str.split('|', expand=True)
df_data_sentence_processed = df_data_sentence_processed.rename(columns={0: 'Phrase', 1: 'phrase_ids'})
df_data_sentence_processed

Unnamed: 0,Phrase,phrase_ids
0,!,0
1,! ',22935
2,! '',18235
3,! Alas,179257
4,! Brilliant,22936
...,...,...
239227,zoning ordinances to protect your community fr...,220441
239228,zzzzzzzzz,179256
239229,élan,220442
239230,É,220443


In [4]:
# read sentiment labels into df
df_data_sentiment = pd.read_table(path + 'sentiment_labels.txt')
df_data_sentiment_processed = df_data_sentiment['phrase ids|sentiment values'].str.split('|', expand=True)
df_data_sentiment_processed = df_data_sentiment_processed.rename(columns={0: 'phrase_ids', 1: 'sentiment_values'})
df_data_sentiment_processed

Unnamed: 0,phrase_ids,sentiment_values
0,0,0.5
1,1,0.5
2,2,0.44444
3,3,0.5
4,4,0.42708
...,...,...
239227,239227,0.36111
239228,239228,0.38889
239229,239229,0.33333
239230,239230,0.88889


In [5]:
# processed all data
df_processed_all = df_data_sentence_processed.merge(df_data_sentiment_processed, how='inner', on='phrase_ids')
df_processed_all

Unnamed: 0,Phrase,phrase_ids,sentiment_values
0,!,0,0.5
1,! ',22935,0.52778
2,! '',18235,0.5
3,! Alas,179257,0.44444
4,! Brilliant,22936,0.86111
...,...,...,...
239227,zoning ordinances to protect your community fr...,220441,0.13889
239228,zzzzzzzzz,179256,0.19444
239229,élan,220442,0.51389
239230,É,220443,0.5


In [6]:
# separate all the data into 3 parts, 80% train, 10% valid, 10% test
def data_split(all_data,splitPercent):

    m = np.random.rand(len(all_data)) < splitPercent
    train = all_data[m]
    test_and_dev = all_data[~m]


    m_test = np.random.rand(len(test_and_dev)) <0.5
    test = test_and_dev[m_test]
    dev = test_and_dev[~m_test]

    dev.to_csv(path+ 'Train/dev.csv')
    test.to_csv(path+  'Train/test.csv')
    train.to_csv(path + 'Train/train.csv' )

    return train, test, dev


In [7]:
all_data = df_processed_all
train_data, test_data, dev_data = data_split(all_data, 0.8)

In [8]:
train_data = train_data.reset_index()
dev_data = dev_data.reset_index()
test_data = test_data.reset_index()

In [117]:
train_data

Unnamed: 0,index,Phrase,phrase_ids,sentiment_values
0,0,!,0,0.5
1,1,! ',22935,0.52778
2,3,! Alas,179257,0.44444
3,4,! Brilliant,22936,0.86111
4,5,! Brilliant !,40532,0.93056
...,...,...,...,...
191167,239227,zoning ordinances to protect your community fr...,220441,0.13889
191168,239228,zzzzzzzzz,179256,0.19444
191169,239229,élan,220442,0.51389
191170,239230,É,220443,0.5


## Filter and load embedding

In [9]:
sentence_path = path + 'SOStr.txt'
filtered_glove_path = path +  'filtered_glove.txt'

In [10]:
# filter the glove pretrained embeddings
def filter_glove(full_glove_path):
    vocab = set()
    with open(sentence_path,'r', encoding='utf-8') as f:
        for line in f:
        # Drop the trailing newline and strip backslashes. Split into words.
            vocab.update(line.strip().replace('\\', '').split('|'))
    read = 0
    wrote = 0
    with open(full_glove_path, 'r', encoding='utf-8') as f:
        with open(filtered_glove_path, 'w', encoding='utf-8') as out:
            for line in f:
                read += 1
                line = line.strip()
                if not line: continue
                if line.split(u' ', 1)[0] in vocab:
                    out.write(line + '\n')
                    wrote += 1
    print('read %s lines, wrote %s' % (read, wrote))

In [11]:
filter_glove(gloveFile)

read 400001 lines, wrote 14941


In [12]:
# load embeddings from pretrained glove embeddings
def load_embeddings(embedding_path):
  #Loads embedings, returns weight matrix and dict from words to indices.
    print('loading word embeddings from %s' % embedding_path)
    weight_vectors = []
    word_idx = {}
    with open(embedding_path, 'r',encoding='utf-8') as f:
        for line in f:
            word, vec = line.split(u' ', 1)
            word_idx[word] = len(weight_vectors)
            weight_vectors.append(np.array(vec.split(), dtype=np.float32))
    # '(' and ')' are replaced by '-LRB-' and '-RRB-' respectively in the parse-trees.
    word_idx[u'-LRB-'] = word_idx.pop(u'(')
    word_idx[u'-RRB-'] = word_idx.pop(u')')
    # Random embedding vector for unknown words.
    weight_vectors.append(np.random.uniform(-0.05, 0.05, weight_vectors[0].shape).astype(np.float32))
    return np.stack(weight_vectors), word_idx

In [126]:
weight_matrix, word_idx = load_embeddings(filtered_glove_path)

loading word embeddings from D:/SentimentAnalysis/stanfordSentimentTreebank/stanfordSentimentTreebank/filtered_glove.txt


In [127]:
word_idx

{'the': 0,
 ',': 1,
 '.': 2,
 'of': 3,
 'to': 4,
 'and': 5,
 'in': 6,
 'a': 7,
 "'s": 8,
 'for': 9,
 '-': 10,
 'that': 11,
 'on': 12,
 'is': 13,
 'was': 14,
 'said': 15,
 'with': 16,
 'he': 17,
 'as': 18,
 'it': 19,
 'by': 20,
 'at': 21,
 'from': 24,
 'his': 25,
 "''": 26,
 '``': 27,
 'an': 28,
 'be': 29,
 'has': 30,
 'are': 31,
 'have': 32,
 'but': 33,
 'were': 34,
 'not': 35,
 'this': 36,
 'who': 37,
 'they': 38,
 'had': 39,
 'which': 40,
 'will': 41,
 'their': 42,
 ':': 43,
 'or': 44,
 'its': 45,
 'one': 46,
 'after': 47,
 'new': 48,
 'been': 49,
 'also': 50,
 'we': 51,
 'would': 52,
 'two': 53,
 'more': 54,
 "'": 55,
 'first': 56,
 'about': 57,
 'up': 58,
 'when': 59,
 'year': 60,
 'there': 61,
 'all': 62,
 '--': 63,
 'out': 64,
 'she': 65,
 'other': 66,
 'people': 67,
 "n't": 68,
 'her': 69,
 'than': 70,
 'over': 71,
 'into': 72,
 'last': 73,
 'some': 74,
 'government': 75,
 'time': 76,
 '$': 77,
 'you': 78,
 'years': 79,
 'if': 80,
 'no': 81,
 'world': 82,
 'can': 83,
 'three': 8

## Preprocess without embedding

In [132]:
# generate embedding for non-pretrained word embedding
def make_dict(data):
    word_idx = dict()
    index = 0
    for idx, row in data.iterrows():
        sentence = (row['Phrase'])
        #print (sentence)
        tokenizer = RegexpTokenizer(r'\w+')
        sentence_words = tokenizer.tokenize(sentence)
        #print (sentence_words)
        for word in sentence_words:
            #print(index)
            word_lwr = word.lower()
            if word_lwr not in word_idx:
                word_idx[word_lwr] = index
                index += 1
    idx_word = {i: w for i, w in enumerate(word_idx)}
    print('the number of words : ', len(word_idx))
    
    return word_idx, idx_word

In [133]:
word_idx, idx_word = make_dict(all_data)

the number of words :  18057


In [134]:
word_idx

{'alas': 0,
 'brilliant': 1,
 'c': 2,
 'mon': 3,
 'gollum': 4,
 's': 5,
 'performance': 6,
 'is': 7,
 'incredible': 8,
 'oh': 9,
 'look': 10,
 'at': 11,
 'that': 12,
 'clever': 13,
 'angle': 14,
 'wow': 15,
 'a': 16,
 'jump': 17,
 'cut': 18,
 'romething': 19,
 'run': 20,
 'the': 21,
 'movie': 22,
 'camera': 23,
 'twirls': 24,
 'true': 25,
 'hollywood': 26,
 'story': 27,
 'zoom': 28,
 '133': 29,
 '3': 30,
 '8217': 31,
 't': 32,
 '9': 33,
 '1': 34,
 '8': 35,
 'million': 36,
 'charmer': 37,
 '100': 38,
 'on': 39,
 'this': 40,
 '20': 41,
 'ticket': 42,
 'to': 43,
 'ride': 44,
 'russian': 45,
 'rocket': 46,
 '40': 47,
 'version': 48,
 '50': 49,
 'us': 50,
 'budget': 51,
 '7': 52,
 '00': 53,
 'and': 54,
 '93': 55,
 'minutes': 56,
 'of': 57,
 'unrecoverable': 58,
 'life': 59,
 '99': 60,
 'bargain': 61,
 'basement': 62,
 'special': 63,
 'white': 64,
 'freeze': 65,
 'frames': 66,
 'reminiscent': 67,
 'pseudo': 68,
 'hip': 69,
 'luxury': 70,
 'car': 71,
 'commercial': 72,
 'it': 73,
 'its': 74,


## Pipline for Tensorflow Model

In [135]:
# get the max length for each sentence in trained data
def maxSeqLen(training_data):

    total_words = 0
    sequence_length = []
    idx = 0
    for index, row in training_data.iterrows():

        sentence = (row['Phrase'])
        sentence_words = sentence.split(' ')
        len_sentence_words = len(sentence_words)
        total_words = total_words + len_sentence_words

        # get the length of the sequence of each training data
        sequence_length.append(len_sentence_words)

        if idx == 0:
            max_seq_len = len_sentence_words


        if len_sentence_words > max_seq_len:
            max_seq_len = len_sentence_words
        idx = idx + 1

    avg_words = total_words/index
    # convert to numpy array
    sequence_length_np = np.asarray(sequence_length)
    return max_seq_len, avg_words, sequence_length_np

In [136]:
# inputs from dl_sentiment that are hard coded but need to be automated
maxSeqLength, avg_words, sequence_length = maxSeqLen(all_data)
#numClasses = 10
numClasses = 10

In [137]:
from nltk.tokenize import RegexpTokenizer

In [138]:
# create labels matrix for the rnn
def tf_data_pipeline_nltk(data, word_idx, max_seq_len):

    maxSeqLength = max_seq_len #Maximum length of sentence
    no_rows = len(data)
    ids = np.zeros((no_rows, maxSeqLength), dtype='int32')
    # convert keys in dict to lower case
    word_idx_lwr =  {k.lower(): v for k, v in word_idx.items()}
    idx = 0

    for index, row in data.iterrows():


        sentence = (row['Phrase'])
        tokenizer = RegexpTokenizer(r'\w+')
        sentence_words = tokenizer.tokenize(sentence)
        i = 0
        for word in sentence_words:
            word_lwr = word.lower()
            try:
                ids[idx][i] =  word_idx_lwr[word_lwr]

            except Exception as e:
                if str(e) == word:
                    ids[idx][i] = 0
                continue
            i = i + 1
        idx = idx + 1

    return ids


In [139]:
# get the label matrix for training data
def labels_matrix(data):

    labels = data['sentiment_values']

    lables_float = labels.astype(float)

    cats = ['0','1','2','3','4','5','6','7','8','9']
    labels_mult = (lables_float * 10).astype(int)
    dummies = pd.get_dummies(labels_mult, prefix='', prefix_sep='')
    dummies = dummies.T.reindex(cats).T.fillna(0)
    labels_matrix = dummies.iloc[:,:].values
    return labels_matrix

In [140]:
train_data['sentiment_values']

0             0.5
1         0.52778
2         0.44444
3         0.86111
4         0.93056
           ...   
191167    0.13889
191168    0.19444
191169    0.51389
191170        0.5
191171        0.5
Name: sentiment_values, Length: 191172, dtype: object

In [141]:
# load Training data matrix

train_x = tf_data_pipeline_nltk(train_data, word_idx, maxSeqLength)
test_x = tf_data_pipeline_nltk(test_data, word_idx, maxSeqLength)
val_x = tf_data_pipeline_nltk(dev_data, word_idx, maxSeqLength)

    
# load labels data matrix
train_y = labels_matrix(train_data)
val_y = labels_matrix(dev_data)
test_y = labels_matrix(test_data)

In [149]:
# summarize size
print("Training data: ")
print(train_x.shape)
print(train_y.shape)

# Summarize number of classes
print("Classes: ")
print(np.unique(train_y.shape[1]))

Training data: 
(191172, 56)
(191172, 10)
Classes: 
[10]


## Start to build a RNN model

In [152]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Bidirectional
from keras.preprocessing import sequence
from keras.layers import Dropout
import h5py
from keras.models import model_from_json
from keras.models import load_model

In [153]:
def create_model_rnn(weight_matrix, max_words, EMBEDDING_DIM):
    # create the model
    model = Sequential()
    model.add(Embedding(len(word_idx), EMBEDDING_DIM, input_length=max_words, trainable=False))
    #model.add(Embedding(len(weight_matrix), EMBEDDING_DIM, weights=[weight_matrix], input_length=max_words, trainable=False))
    #model.add(Embedding(len(weight_matrix), EMBEDDING_DIM, weights=[weight_matrix], input_length=max_words, trainable=False))
    #model.add(Bidirectional(LSTM(256, dropout=0.2, recurrent_dropout=0.2)))
    #model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2,return_sequences=True))
    model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2))
    #model.add(LSTM(512, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.50))   
    model.add(Dense(10, activation='softmax'))
    # try using different optimizers and different optimizer configs
    model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    return model

In [154]:
def train_model(model,train_x, train_y, test_x, test_y, val_x, val_y, batch_size, path) :

    # save the best model and early stopping
    #saveBestModel = keras.callbacks.ModelCheckpoint(path+'Train' + '/model/best_model.hdf5', monitor='val_acc', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
    #earlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

    # Fit the model
    #model.fit(train_x, train_y, batch_size=batch_size, epochs=25,validation_data=(val_x, val_y), callbacks=[saveBestModel, earlyStopping])
    model.fit(train_x, train_y, batch_size=batch_size, epochs=15,validation_data=(val_x, val_y))
    # Final evaluation of the model
    score, acc = model.evaluate(test_x, test_y, batch_size=batch_size)

    print('Test score:', score)
    print('Test accuracy:', acc)

    return model

In [155]:
# make prediction of sentiment value for given sentence, top 3 average sentiment bands are taken
# reference: https://medium.com/analytics-vidhya/sentiment-analysis-for-text-with-deep-learning-2f0a0c6472b5
def live_test(trained_model, data, word_idx):

    live_list = []
    live_list_np = np.zeros((56,1))
    # split the sentence into its words and remove any punctuations.
    tokenizer = RegexpTokenizer(r'\w+')
    data_sample_list = tokenizer.tokenize(data)

    labels = np.array(['1','2','3','4','5','6','7','8','9','10'], dtype = "int")

    # get index for the live stage
    data_index = np.array([word_idx[word.lower()] if word.lower() in word_idx else 0 for word in data_sample_list])
    data_index_np = np.array(data_index)
    print(data_index_np)

    # padded with zeros of length 56 i.e maximum length
    padded_array = np.zeros(56) # use the def maxSeqLen(training_data) function to detemine the padding length for your data
    padded_array[:data_index_np.shape[0]] = data_index_np
    data_index_np_pad = padded_array.astype(int)
    live_list.append(data_index_np_pad)
    live_list_np = np.asarray(live_list)
    type(live_list_np)

    # get score from the model
    score = trained_model.predict(live_list_np, batch_size=1, verbose=0)

    single_score = np.round(np.argmax(score)/10, decimals=2) # maximum of the array i.e single band

    # weighted score of top 3 bands
    top_3_index = np.argsort(score)[0][-3:]
    top_3_scores = score[0][top_3_index]
    top_3_weights = top_3_scores/np.sum(top_3_scores)
    single_score_dot = np.round(np.dot(top_3_index, top_3_weights)/10, decimals = 2)

    #print (single_score)
    return single_score_dot

## start training

In [156]:
max_words = 56 # max no of words in your training data
batch_size = 2000 # batch size for training
EMBEDDING_DIM = 300 # size of the word embeddings
train_flag = True # set True if in training mode else False if in prediction mode

In [157]:
model = create_model_rnn(weight_matrix, max_words, EMBEDDING_DIM)

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 56, 300)           5417100   
_________________________________________________________________
lstm_18 (LSTM)               (None, 256)               570368    
_________________________________________________________________
dense_22 (Dense)             (None, 512)               131584    
_________________________________________________________________
dropout_11 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 10)                5130      
Total params: 6,124,182
Trainable params: 707,082
Non-trainable params: 5,417,100
_________________________________________________________________
None


In [158]:
trained_model =train_model(model,train_x, train_y, test_x, test_y, val_x, val_y, batch_size, path)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test score: 1.781495213508606
Test accuracy: 0.3724290132522583


In [102]:
# predict the sentiment value as a test of model
data_sample = "Great!! it is sunny today!!"
result = live_test(model,data_sample, word_idx)

[316  19  13   0 334]
