In [1]:
import math
from multiprocessing import Pool
import os
import sys
import itertools
import time
import tempfile
import pickle

sys.path.append(os.path.abspath(os.pardir))

%matplotlib inline

import keras
import numpy as np
import pandas as pd
import seaborn as sns

# Metrics
from sklearn.metrics import accuracy_score

# Notebook helper methods
from tdparse import notebook_helper
# Models
from tdparse.models.tdlstm import LSTM, TDLSTM, TCLSTM
# Tokenisers
from tdparse.tokenisers import ark_twokenize
# Word Vectors
from tdparse.word_vectors import PreTrained, GloveCommonCrawl
# Get the data
from tdparse.parsers import semeval_14, dong, election
from tdparse.data_types import TargetCollection
from tdparse.helper import read_config, full_path
from tdparse.evaluation import evaluation_results
from tdparse.notebook_helper import get_json_data, write_json_data

Using TensorFlow backend.


In [2]:
# Load every dataset from the locations named in the config file
def _config_path(config_key):
    # Looks the key up in the config and expands the value to a full path
    return full_path(read_config(config_key))

# Datasets read with the SemEval 2014 parser
semeval_14_rest_train = semeval_14(_config_path('semeval_2014_rest_train'))
semeval_14_rest_test = semeval_14(_config_path('semeval_2014_rest_test'))
semeval_14_lap_train = semeval_14(_config_path('semeval_2014_lap_train'))
semeval_14_lap_test = semeval_14(_config_path('semeval_2014_lap_test'))
youtubean_train = semeval_14(_config_path('youtubean_train'))
youtubean_test = semeval_14(_config_path('youtubean_test'))
mitchel_train = semeval_14(_config_path('mitchel_train'))
mitchel_test = semeval_14(_config_path('mitchel_test'))

# Datasets that have their own parsers
dong_train = dong(_config_path('dong_twit_train_data'))
dong_test = dong(_config_path('dong_twit_test_data'))
election_train, election_test = election(_config_path('election_folder_dir'))


# Dataset name -> (train, test). Only the un-commented entries are evaluated;
# un-comment an entry to include that dataset in the runs below.
dataset_train_test = {
    #'SemEval 14 Laptop' : (semeval_14_lap_train, semeval_14_lap_test),
    #'SemEval 14 Restaurant' : (semeval_14_rest_train, semeval_14_rest_test),
    #'Dong Twitter' : (dong_train, dong_test),
    'Election Twitter' : (election_train, election_test),
    #'YouTuBean' : (youtubean_train, youtubean_test),
    #'Mitchel' : (mitchel_train, mitchel_test)
}

In [3]:
# Word vectors: SSWE comes from a file named in the config, GloVe from its own class
sswe_path = full_path(read_config('sswe_files')['vo_zhang'])
sswe = PreTrained(sswe_path, name='sswe')
glove_300 = GloveCommonCrawl(version=42)
# The candidates that each model is fitted with when searching for the best one
word_vectors = [sswe, glove_300]


# There are 3 classes and one of them is -1. Once one hot encoded, the -1
# class sits at index 2, so a prediction of index 2 has to be mapped back
# to the label -1. The other two indexes already equal their labels.
sentiment_mapper = {0 : 0, 1 : 1, 2 : -1}

# Directory one level above the current working directory
_parent_dir = os.path.join(os.getcwd(), os.pardir)
# Folder holding one result sub folder per model
result_folder = os.path.abspath(os.path.join(_parent_dir, 'results', 'TDLstm'))
# Folder holding all of the saved models (model zoo folder)
model_dir = os.path.abspath(os.path.join(_parent_dir, 'model zoo'))
os.makedirs(model_dir, exist_ok=True)

In [4]:
def dataset_predictions(train, test, dataset_name, model_class, 
                        word_vector_file_path, result_file_path,
                        model_folder_path, model_params):
    '''
    Finds the best word vector for a model on one dataset, saves the model
    fitted with that word vector, and evaluates it on the test data.

    Each entry of the module level `word_vectors` list is scored by the
    maximum `val_acc` reported while fitting. Scores are cached through
    `get_json_data` / `write_json_data`, so a word vector that already has a
    stored score for `dataset_name` is not fitted again during the search.
    The module level `sentiment_mapper` is used to convert predictions back
    to category values.

    :param train: Training data; must expose `data_dict()` and `sentiment_data()`
    :param test: Test data; must expose `data_dict()` and `sentiment_data()`
    :param dataset_name: Name used in the score cache, saved file names and results
    :param model_class: Class instantiated as `model_class(**model_params)` with \
    an extra `embeddings` key
    :param word_vector_file_path: File the word vector validation scores are \
    read from and written to
    :param result_file_path: Passed to `evaluation_results` as `file_name`
    :param model_folder_path: Folder the model architecture and weights are saved in
    :param model_params: Keyword arguments for `model_class`. This dict is \
    copied and never modified.
    :returns: Whatever `evaluation_results` returns for the test predictions
    :raises ValueError: If no word vector achieved a score greater than 0
    '''
    
    print('{} {}'.format(dataset_name, model_params))

    # Work on a copy so that adding `embeddings` does not leak into the
    # caller's dictionary.
    model_params = dict(model_params)
    # Every fit in this function uses the same training settings
    fit_kwargs = dict(validation_size=0.3, verbose=1, reproducible=True,
                      patience=10, epochs=300, org_initialisers=True)

    data_train = train.data_dict()
    y_train = train.sentiment_data()
    data_test = test.data_dict()
    y_test = test.sentiment_data()
    
    # Search over the word vectors, re-using cached validation scores
    word_vector_data = get_json_data(word_vector_file_path, dataset_name)
    best_score = 0
    best_word_vector = None
    best_model = None
    for word_vector in word_vectors:
        print(word_vector)
        word_vector_name = '{}'.format(word_vector)
        if word_vector_name in word_vector_data:
            word_vec_val_score = word_vector_data[word_vector_name]
            if word_vec_val_score > best_score:
                best_score = word_vec_val_score
                best_word_vector = word_vector
                # A cached score has no fitted model attached. Drop any model
                # fitted earlier in this loop, since it belongs to a worse
                # word vector; the best one is re-fitted after the loop.
                best_model = None
            continue
        model_params['embeddings'] = word_vector
        model = model_class(**model_params)
        print('{} {}'.format(model_params, word_vector))
        history = model.fit(data_train, y_train, **fit_kwargs)
        word_vec_val_score = max(history.history['val_acc'])
        word_vector_data[word_vector_name] = word_vec_val_score
        if word_vec_val_score > best_score:
            best_score = word_vec_val_score
            best_word_vector = word_vector
            best_model = model
                
        # Save word vector validation score result
        write_json_data(word_vector_file_path, dataset_name, word_vector_data)
    if best_word_vector is None:
        raise ValueError('best word vector should not be None')
    if best_model is None:
        # The best score came from the cache, so fit a model for it now
        model_params['embeddings'] = best_word_vector
        best_model = model_class(**model_params)
        print('{} {}'.format(model_params, best_word_vector))
        best_model.fit(data_train, y_train, **fit_kwargs)
    # Saves the best model to the model zoo. This must be `best_model` and not
    # the last model fitted in the loop, which may belong to a worse word vector.
    model_arch_fp = os.path.join(model_folder_path, 
                                 '{} {} architecture'.format(best_model, dataset_name))
    model_weights_fp = os.path.join(model_folder_path, 
                                    '{} {} weights'.format(best_model, dataset_name))
    best_model.save_model(model_arch_fp, model_weights_fp, verbose=1)
    
    # Predicts on the test data
    predicted_values = best_model.predict(data_test)
    # Convert prediction from one hot encoded to category value e.g. -1, 0, 1
    predicted_values_cats = best_model.prediction_to_cats(y_test, predicted_values, 
                                                          mapper=sentiment_mapper)
    # Evaluates the predictions and save the results
    return evaluation_results(predicted_values_cats, test, dataset_name, 
                              file_name=result_file_path, 
                              save_raw_data=True, re_write=True)

# Mass Evaluation of the LSTM model

In [5]:
# Folder that holds the LSTM results
lstm_folder = os.path.join(result_folder, 'lstm')
os.makedirs(lstm_folder, exist_ok=True)

# Files for the word vector validation scores and the test results
word_vector_file = os.path.join(lstm_folder, 'word vector results.json')
result_file = os.path.join(lstm_folder, 'results file.tsv')

# Fit, save and evaluate an LSTM on every selected dataset
for dataset_name, (train, test) in dataset_train_test.items():
    # A fresh parameter dict is created for each dataset
    model_params = dict(tokeniser=ark_twokenize, lower=True, pad_size=-1)
    dataset_predictions(train, test, dataset_name, LSTM,
                        word_vector_file, result_file,
                        model_dir, model_params)

Election Twitter {'tokeniser': <function ark_twokenize at 0x7fd7b4194950>, 'lower': True, 'pad_size': -1}
sswe
glove 300d 42b common crawl
{'tokeniser': <function ark_twokenize at 0x7fd7b4194950>, 'lower': True, 'pad_size': -1, 'embeddings': glove 300d 42b common crawl} glove 300d 42b common crawl
Train on 6550 samples, validate on 2808 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300

Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300


Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300
Epoch 151/300
Epoch 152/300
Epoch 153/300
Epoch 154/300
Model architecture saved to: /mnt/silo/users/moorea/tdparse/model zoo/LSTM Election Twitter architecture.yaml
Model weights saved to /mnt/silo/users/moorea/tdparse/model zoo/LSTM Election Twitter weights.h5
Save time 11.47
saving raw data


# Mass Evaluation of the TDLSTM model

In [5]:
# Folder that holds the TDLSTM results
tdlstm_folder = os.path.join(result_folder, 'tdlstm')
os.makedirs(tdlstm_folder, exist_ok=True)

# Files for the word vector validation scores and the test results
word_vector_file = os.path.join(tdlstm_folder, 'word vector results.json')
result_file = os.path.join(tdlstm_folder, 'results file.tsv')

# Fit, save and evaluate a TDLSTM on every selected dataset
for dataset_name, (train, test) in dataset_train_test.items():
    # A fresh parameter dict is created for each dataset
    model_params = dict(tokeniser=ark_twokenize, lower=True, pad_size=-1)
    dataset_predictions(train, test, dataset_name, TDLSTM,
                        word_vector_file, result_file,
                        model_dir, model_params)

Mitchel {'tokeniser': <function ark_twokenize at 0x7f2c83bdd950>, 'lower': True, 'pad_size': -1}
sswe
glove 300d 42b common crawl
{'tokeniser': <function ark_twokenize at 0x7f2c83bdd950>, 'lower': True, 'pad_size': -1, 'embeddings': glove 300d 42b common crawl} glove 300d 42b common crawl
Train on 1610 samples, validate on 691 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
{'tokeniser': <function ark_twokenize at 0x7f2c83bdd950>, 'lower': True, 'pad_size': -1, 'embeddings': sswe} sswe
Train on 1610 samples, validate on 691 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

KeyboardInterrupt: 

# Mass Evaluation of the TCLSTM model

In [None]:
# Folder that holds the TCLSTM results
tclstm_folder = os.path.join(result_folder, 'tclstm')
os.makedirs(tclstm_folder, exist_ok=True)

# Files for the word vector validation scores and the test results
word_vector_file = os.path.join(tclstm_folder, 'word vector results.json')
result_file = os.path.join(tclstm_folder, 'results file.tsv')

# Fit, save and evaluate a TCLSTM on every selected dataset
for dataset_name, (train, test) in dataset_train_test.items():
    # A fresh parameter dict is created for each dataset
    model_params = dict(tokeniser=ark_twokenize, lower=True, pad_size=-1)
    dataset_predictions(train, test, dataset_name, TCLSTM,
                        word_vector_file, result_file,
                        model_dir, model_params)

Mitchel {'tokeniser': <function ark_twokenize at 0x7fbd07a14950>, 'lower': True, 'pad_size': -1}
sswe
{'tokeniser': <function ark_twokenize at 0x7fbd07a14950>, 'lower': True, 'pad_size': -1, 'embeddings': sswe} sswe
Train on 1610 samples, validate on 691 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch

Epoch 25/300
Epoch 26/300