In [1]:
import os
import sys

# Put the parent of the current working directory on the import path.
# NOTE(review): presumably this is what makes the `tdparse` package imported
# below resolvable when the notebook is run from its own sub-directory -- confirm.
sys.path.append(os.path.abspath(os.pardir))

import keras
import numpy as np

# Metrics
from sklearn.metrics import accuracy_score

# Models
from tdparse.models.tdlstm import TLSTM
# Tokenisers
from tdparse.tokenisers import whitespace, ark_twokenize
# Word Vectors
from tdparse.word_vectors import PreTrained, GloveTwitterVectors
# Get the data
from tdparse.helper import read_config, full_path
from tdparse.parsers import dong

Using TensorFlow backend.


In [2]:
# Datasets: look up each configured location, resolve it to a full path,
# then parse it with the `dong` parser.
train_path = full_path(read_config('dong_twit_train_data'))
dong_train = dong(train_path)
test_path = full_path(read_config('dong_twit_test_data'))
dong_test = dong(test_path)

# Word vectors: the 'vo_zhang' entry of the configured SSWE
# (Sentiment Specific Word Embedding) files.
sswe_files = read_config('sswe_files')
sswe_path = full_path(sswe_files['vo_zhang'])
sswe = PreTrained(sswe_path, name='sswe')

# Alternative Glove Twitter embeddings, currently disabled
# (the argument is presumably the vector dimension).
#glove_50 = GloveTwitterVectors(50)
#glove_100 = GloveTwitterVectors(100)
#glove_200 = GloveTwitterVectors(200)

# LSTM
This is an implementation of the LSTM model described in the [Tang et al. paper](https://aclanthology.info/papers/C16-1311/c16-1311).

The LSTM is a single LSTM layer that outputs to a softmax function. The LSTM hidden layer dimension is the same as the embedding layer dimension. The optimiser is Stochastic Gradient Descent with a learning rate of 0.01.

The paper does not state how many epochs the model was trained for, and it does not mention Early Stopping.

**Variables that can be changed through the constructor**
1. Tokeniser used
2. Pre-Trained Embedding e.g. Glove or SSWE (Sentiment Specific Word Embedding)
3. Pad Size of the sentences
4. Whether to lower case the words. This should always be used, as all the pre-trained word embeddings are trained on lower cased words
5. Dimension of the LSTM hidden layer - The default for this is the size of the word embeddings e.g. If the word embedding is 50 dimensions then the output of the LSTM is 50 dimensions.
6. Optimiser to use. To change this provide a valid Keras [optimiser](https://keras.io/optimizers/)
7. patience - Whether to use Early Stopping. The default is None, which means Early Stopping is not used. I would recommend using it with a value of 10
8. batch-size - default 32
9. epochs - Number of epochs to train for, default 100


### Below is an example of how to perform Cross validation

In [3]:
# Build the model, then hand it to the class-level cross_val helper, which
# runs 3-fold cross validation and scores every fold with accuracy_score.
lstm_model = TLSTM(whitespace, sswe, lower=True, optimiser='adam', epochs=5)
train_data = dong_train.data_dict()
train_sentiments = dong_train.sentiment_data()
predictions, scores = TLSTM.cross_val(
    train_data, train_sentiments, lstm_model,
    cv=3, scorer=accuracy_score, reproducible=True,
)

Train on 3331 samples, validate on 833 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 3332 samples, validate on 834 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 3332 samples, validate on 834 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [4]:
# One accuracy score per cross-validation fold (cv=3, so three values)
scores

[0.55950095969289826, 0.57925072046109505, 0.56964457252641687]

In [5]:
# Raw predicted values, one array per cross-validation fold
predictions

[array([2, 1, 0, ..., 2, 2, 1]),
 array([2, 1, 0, ..., 0, 0, 0]),
 array([2, 1, 0, ..., 1, 0, 0])]

The scores are the accuracy scores per fold in the cross validation and the predictions are the raw prediction values per fold.

### Below is an example of how to fit and test a model

In [6]:
# Fit on the training data, holding out 20% of it for validation. With
# patience set, Early Stopping is enabled, so training may finish well before
# the 200 epoch limit.
# lower=True is passed explicitly, as in the cross validation example: the
# pre-trained word embeddings are trained on lower cased words, so the input
# text should be lower cased to match them.
lstm_model = TLSTM(whitespace, sswe, patience=2, epochs=200, lower=True,
                   optimiser='adam')
lstm_model.fit(dong_train.data_dict(), dong_train.sentiment_data(), reproducible=True,
               validation_size=0.2)

# The model returns one score per class for each sample; argmax picks the
# predicted class index.
predicted_values = np.argmax(lstm_model.predict(dong_test.data()), axis=1)
# Push the gold labels through the same one-hot -> argmax mapping so that they
# are class indices comparable with the predictions.
# NOTE(review): assumes every sentiment label is a valid index for 3 classes.
test_res = np.argmax(keras.utils.to_categorical(dong_test.sentiment_data(), num_classes=3), axis=1)
acc_score = accuracy_score(test_res, predicted_values)
acc_score

Train on 4998 samples, validate on 1250 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200


0.57947976878612717