In [1]:
import os
import sys

sys.path.append(os.path.abspath(os.pardir))

import keras
import numpy as np

# Metrics
from sklearn.metrics import accuracy_score

# Models
from tdparse.models.tdlstm import TLSTM
# Tokenisers
from tdparse.tokenisers import whitespace, ark_twokenize
# Word Vectors
from tdparse.word_vectors import PreTrained, GloveTwitterVectors
# Get the data
from tdparse.helper import read_config, full_path
from tdparse.parsers import dong

Using TensorFlow backend.


In [2]:
# Parse the train and test splits; the file locations are looked up by key
# in the project config and expanded with full_path.
train_path = full_path(read_config('dong_twit_train_data'))
dong_train = dong(train_path)
test_path = full_path(read_config('dong_twit_test_data'))
dong_test = dong(test_path)

# Pre-trained SSWE embeddings: the 'vo_zhang' entry of the 'sswe_files'
# config section points at the vector file.
sswe_files = read_config('sswe_files')
sswe_path = full_path(sswe_files['vo_zhang'])
sswe = PreTrained(sswe_path, name='sswe')

# Alternative embeddings, currently disabled. Uncomment one to swap it in
# for `sswe` in the cells below.
#glove_50 = GloveTwitterVectors(50)
#glove_100 = GloveTwitterVectors(100)
#glove_200 = GloveTwitterVectors(200)

# LSTM
This is an implementation of the LSTM model described in the [Tang et al. paper](https://aclanthology.info/papers/C16-1311/c16-1311).

The LSTM is a single LSTM layer that outputs to a softmax function. The LSTM hidden layer dimension is the same as the embedding layer dimension. The optimiser is Stochastic Gradient Descent with a learning rate of 0.01.

The number of epochs to run the model is unknown and Early Stopping is not mentioned in the paper.

However, to estimate the unknown number of epochs, we use Early Stopping and try a range of patience values, from 1 to 5.

**Variables that can be changed through the constructor**
1. Tokeniser used
2. Pre-Trained Embedding e.g. Glove or SSWE (Sentiment Specific Word Embedding)
3. Pad Size of the sentences
4. Whether to lower-case the words. This should always be enabled, as all the pre-trained word embeddings are trained on lower-cased words
5. Dimension of the LSTM hidden layer
6. Optimiser to use. To change this, provide a valid Keras [optimiser](https://keras.io/optimizers/)
7. Early stopping patience value

Below we perform 5-fold cross-validation on five LSTM models that differ only in their patience value.

In [None]:
# Build one LSTM per Early Stopping patience value (1 to 5). The tokeniser
# and the embedding are the same for all five, so patience is the only
# constructor argument that differs between the models.
(lstm_model_1, lstm_model_2, lstm_model_3,
 lstm_model_4, lstm_model_5) = [TLSTM(whitespace, sswe, patience=patience_value)
                                for patience_value in range(1, 6)]

# Label -> model lookup; the cross-validation cell iterates over this dict.
all_models = {
    'patience 1': lstm_model_1,
    'patience 2': lstm_model_2,
    'patience 3': lstm_model_3,
    'patience 4': lstm_model_4,
    'patience 5': lstm_model_5,
}

In [None]:
# Cross-validate every model in `all_models` on the training split, scoring
# with accuracy. Results are stored as label -> (predictions, scores).
model_results = {}
for model_name in all_models:
    model = all_models[model_name]
    # The training inputs and labels are fetched again on every iteration.
    # NOTE(review): this looks loop-invariant, but hoisting is only safe if
    # cross_val does not mutate its inputs — confirm before changing.
    train_data, train_y = dong_train.data_dict(), dong_train.sentiment_data()
    cv_output = TLSTM.cross_val(train_data, train_y, model,
                                scorer=accuracy_score)
    predictions, scores = cv_output
    model_results[model_name] = (predictions, scores)
    

Train on 3997 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Train on 3998 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

## Example of how to fit, predict, and score a model

In [6]:
# Train a single LSTM on the whole training split, leaving every optional
# constructor argument at its default.
lstm_model = TLSTM(whitespace, sswe)
fit_inputs = dong_train.data_dict()
fit_targets = dong_train.sentiment_data()
lstm_model.fit(fit_inputs, fit_targets)

Train on 4998 samples, validate on 1250 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


In [7]:
# Predicted class index per test sample: the position of the largest value
# along axis 1 of the model output.
# NOTE(review): fit() was given dong_train.data_dict() whereas predict() is
# given dong_test.data() — confirm that both forms are accepted by the model.
test_output = lstm_model.predict(dong_test.data())
predicted_values = np.argmax(test_output, axis=1)

# Gold labels are one-hot encoded over 3 classes and collapsed back with
# argmax so they are class indices as well.
# NOTE(review): presumably this mirrors the label encoding used inside the
# model (a negative label would wrap to the last index) — confirm.
test_one_hot = keras.utils.to_categorical(dong_test.sentiment_data(),
                                          num_classes=3)
test_res = np.argmax(test_one_hot, axis=1)

# Accuracy on the held-out test split; as the last expression it becomes the
# cell's displayed output.
accuracy_score(test_res, predicted_values)

0.55780346820809246