In [1]:
import os
import time
import json
from pathlib import Path
from multiprocessing.pool import Pool

import numpy as np
import pandas as pd
# Metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from keras import initializers, optimizers

from bella.data_types import TargetCollection, Target
# Models
from bella.models.tdlstm import LSTM, TDLSTM, TCLSTM
# Tokenisers
from bella.tokenisers import ark_twokenize
# Word Vectors
from bella.word_vectors import SSWE, GloveCommonCrawl
# Get the data
from bella.parsers import semeval_14, dong, election
from bella.helper import read_config
from bella.evaluation import evaluation_results

Using TensorFlow backend.


In [2]:
def train_val_split(train, split_size=0.2, seed=42):
    """Split a training collection into stratified train and validation parts.

    A single ``StratifiedShuffleSplit`` is drawn, stratified on the values
    returned by ``train.sentiment_data()``.

    :param train: collection exposing ``data_dict()`` and ``sentiment_data()``.
    :param split_size: passed to the splitter as ``test_size``.
    :param seed: passed to the splitter as ``random_state``.
    :returns: ``((X_train, y_train), (X_val, y_val))`` where each ``X`` is the
              ``data_dict()`` and each ``y`` the ``sentiment_data()`` of the
              corresponding ``TargetCollection``, wrapped in numpy arrays.
    """
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=split_size,
                                      random_state=seed)
    all_targets = np.asarray(train.data_dict())
    all_labels = np.asarray(train.sentiment_data())
    # n_splits=1, so the splitter yields exactly one pair of index arrays.
    train_idx, val_idx = next(splitter.split(all_targets, all_labels))
    train_dicts = all_targets[train_idx]
    val_dicts = all_targets[val_idx]

    def to_collection(target_dicts):
        # Each entry is unpacked as keyword arguments into a Target.
        return TargetCollection([Target(**fields) for fields in target_dicts])

    train_collection = to_collection(train_dicts)
    val_collection = to_collection(val_dicts)

    train_part = (np.array(train_collection.data_dict()),
                  np.array(train_collection.sentiment_data()))
    val_part = (np.array(val_collection.data_dict()),
                np.array(val_collection.sentiment_data()))
    return train_part, val_part

def class_mapper(y, inverse=False):
    """Translate sentiment labels to class indices, or back again.

    Forward mapping: ``-1 -> 0``, ``0 -> 1``, ``1 -> 2``.  With
    ``inverse=True`` the mapping is reversed (``0 -> -1``, ``1 -> 0``,
    ``2 -> 1``).

    :param y: iterable of labels (forward) or class indices (inverse).
    :param inverse: when True apply the reverse mapping.
    :returns: numpy array of the mapped values.
    :raises KeyError: if a value is not part of the selected mapping.
    """
    label_to_index = {-1: 0, 0: 1, 1: 2}
    if inverse:
        lookup = {index: label for label, index in label_to_index.items()}
    else:
        lookup = label_to_index
    return np.array([lookup[value] for value in y])

# One uniform initialiser on [-0.003, 0.003], shared by every layer below.
uniform_init = initializers.RandomUniform(minval=-0.003, maxval=0.003)

# Per-layer keyword arguments, all pointing at the same initialiser instance.
lstm_layer_kwargs = dict(kernel_initializer=uniform_init,
                         recurrent_initializer=uniform_init,
                         bias_initializer=uniform_init)
dense_layer_kwargs = dict(kernel_initializer=uniform_init,
                          bias_initializer=uniform_init)
embedding_layer_kwargs = dict(embeddings_initializer=uniform_init)

# Keyword arguments handed to every model constructor in this notebook.
# NOTE(review): 'reproducible' is presumably a random seed -- confirm against
# the bella model classes.
model_kwargs = dict(lstm_layer_kwargs=lstm_layer_kwargs,
                    dense_layer_kwargs=dense_layer_kwargs,
                    embedding_layer_kwargs=embedding_layer_kwargs,
                    optimiser=optimizers.SGD,
                    optimiser_params=dict(lr=0.01),
                    reproducible=42)

In [3]:
def best_word_vector(word_vectors, dataset_train_test, model, model_result_folder):
    """Score each word vector on a validation split and record the results.

    The model is fitted once per word vector on the train part of
    ``train_val_split``; the score kept is the maximum of the ``'val_acc'``
    entries in the returned training history.  The scores for this dataset
    are then stored under the dataset name in ``'word vectors.json'`` inside
    ``model_result_folder``, keeping entries already present in that file.

    :param word_vectors: iterable of word vector objects to try.
    :param dataset_train_test: ``(dataset_name, train_data, test_data)``;
                               the test data is not used here.
    :param model: model whose ``embeddings`` attribute is set to each word
                  vector in turn before fitting.
    :param model_result_folder: ``Path`` of the folder holding the JSON file.
    :returns: dict mapping each word vector object to its best score.
    """
    dataset_name, train_data, _ = dataset_train_test
    (X_train, y_train), (X_val, y_val) = train_val_split(train_data)
    y_train = class_mapper(y_train)
    y_val = class_mapper(y_val)

    word_vector_scores = {}
    for embedding in word_vectors:
        model.embeddings = embedding
        history = model.fit(X_train, y_train, (X_val, y_val))
        word_vector_scores[embedding] = max(history.history['val_acc'])

    # Saving results
    # NOTE(review): the file is read, updated and rewritten without any
    # locking; if several processes call this with the same
    # model_result_folder at once, entries may be lost -- confirm.
    scores_fp = model_result_folder.joinpath('word vectors.json')
    if scores_fp.is_file():
        with scores_fp.open('r') as scores_file:
            all_dataset_scores = json.load(scores_file)
    else:
        all_dataset_scores = {}

    # JSON keys must be strings, so the vector objects are stringified.
    named_scores = {}
    for embedding, score in word_vector_scores.items():
        named_scores[str(embedding)] = score
    all_dataset_scores[dataset_name] = named_scores

    with scores_fp.open('w') as scores_file:
        json.dump(all_dataset_scores, scores_file)

    return word_vector_scores

def dataset_predictions(dataset_train_test, model, model_result_folder):
    """Fit the model on a dataset, predict its test set and store the result.

    The predictions are mapped back to the original class labels, converted
    to a plain list and stored under the dataset name in
    ``'dataset predictions.json'`` inside ``model_result_folder``, keeping
    entries already present in that file.

    :param dataset_train_test: ``(dataset_name, train_data, test_data)``.
    :param model: model exposing ``fit`` and ``predict``.
    :param model_result_folder: ``Path`` of the folder holding the JSON file.
    :returns: list of predicted labels for the test data.
    """
    # Preparing the data
    dataset_name, train_data, test_data = dataset_train_test
    (X_train, y_train), (X_val, y_val) = train_val_split(train_data)
    y_train = class_mapper(y_train)
    y_val = class_mapper(y_val)
    X_test = test_data.data_dict()

    model.fit(X_train, y_train, (X_val, y_val))
    # Undo the class-index mapping so the stored labels are the original ones.
    predictions = class_mapper(model.predict(X_test), inverse=True).tolist()

    # Saving results
    # NOTE(review): the file is read, updated and rewritten without any
    # locking; if several processes call this with the same
    # model_result_folder at once, entries may be lost -- confirm.
    predictions_fp = model_result_folder.joinpath('dataset predictions.json')
    if predictions_fp.is_file():
        with predictions_fp.open('r') as predictions_file:
            stored_predictions = json.load(predictions_file)
    else:
        stored_predictions = {}
    stored_predictions[dataset_name] = predictions
    with predictions_fp.open('w') as predictions_file:
        json.dump(stored_predictions, predictions_file)
    return predictions

def model_evaluation(model_class, word_vectors, dataset_train_test,
                     result_folder, model_zoo_folder, model_kwargs=None):
    """Pick the best word vector for a model, predict the test set and save.

    A model is built with the first word vector, every word vector is scored
    with ``best_word_vector``, the highest scoring one is assigned to the
    model, and ``dataset_predictions`` is then run with it.  Results go in a
    sub-folder of ``result_folder`` named after ``str(model)``; the model is
    saved in ``model_zoo_folder``.

    :param model_class: class called as
                        ``model_class(ark_twokenize, word_vector, **kwargs)``.
    :param word_vectors: non-empty sequence of word vector objects.
    :param dataset_train_test: ``(dataset_name, train_data, test_data)``.
    :param result_folder: ``Path`` under which the result folder is created.
    :param model_zoo_folder: ``Path`` of the folder the model is saved in.
    :param model_kwargs: optional keyword arguments for ``model_class``.
    :returns: ``(dataset_name, predictions)``.
    """
    constructor_kwargs = {} if model_kwargs is None else model_kwargs
    dataset_name = dataset_train_test[0]

    model = model_class(ark_twokenize, word_vectors[0], **constructor_kwargs)
    model_result_folder = result_folder.joinpath(str(model))
    model_result_folder.mkdir(parents=True, exist_ok=True)

    vector_scores = best_word_vector(word_vectors, dataset_train_test,
                                     model, model_result_folder)
    # On ties the first word vector in iteration order wins.
    model.embeddings = max(vector_scores, key=vector_scores.get)
    best_predictions = dataset_predictions(dataset_train_test, model,
                                           model_result_folder)

    # Save model; str(model) is evaluated again here, after the embeddings
    # have been replaced and the model has been refitted.
    model_zoo_file = model_zoo_folder.joinpath(f'{str(model)} {dataset_name}')
    model.save(model, model_zoo_file)
    return dataset_name, best_predictions

def load_and_run(model_class, dataset_train_test,
                 results_folder, model_zoo_folder, model_kwargs=None):
    """Build the word vectors for one dataset and evaluate a model on it.

    The SSWE and Glove Common Crawl vectors are both created with
    ``filter_words`` set to the distinct words returned by
    ``word_list(ark_twokenize)`` for the train and test data combined, and
    are then handed to ``model_evaluation``.

    :param model_class: model class forwarded to ``model_evaluation``.
    :param dataset_train_test: ``(dataset_name, train_data, test_data)``.
    :param results_folder: ``Path`` forwarded to ``model_evaluation``.
    :param model_zoo_folder: ``Path`` forwarded to ``model_evaluation``.
    :param model_kwargs: optional keyword arguments for the model class.
    :returns: whatever ``model_evaluation`` returns.
    """
    constructor_kwargs = {} if model_kwargs is None else model_kwargs

    # Setting the word vectors up for each dataset
    train_words = dataset_train_test[1].word_list(ark_twokenize)
    test_words = dataset_train_test[2].word_list(ark_twokenize)
    vocabulary = list(set(train_words + test_words))
    candidate_vectors = [SSWE(filter_words=vocabulary),
                         GloveCommonCrawl(version=42, filter_words=vocabulary)]

    return model_evaluation(model_class, candidate_vectors, dataset_train_test,
                            results_folder, model_zoo_folder,
                            constructor_kwargs)

def model_evaluation_args(model_classes, dataset_train_test, 
                          result_folder, model_zoo_folder, model_kwargs=None):
    """Yield one argument tuple per (model class, dataset) combination.

    Each tuple is
    ``(model_class, dataset, result_folder, model_zoo_folder, model_kwargs)``.
    Model classes form the outer loop and datasets the inner loop.

    :param model_classes: iterable of model classes.
    :param dataset_train_test: iterable of datasets; each item is passed
                               through unchanged.
    :param result_folder: value placed third in every tuple.
    :param model_zoo_folder: value placed fourth in every tuple.
    :param model_kwargs: value placed last in every tuple (may be None).
    """
    # Materialise once so a one-shot iterator is not exhausted after the
    # first model class.
    datasets = tuple(dataset_train_test)
    for model_class in model_classes:
        for dataset_name_train_test in datasets:
            # Use the `result_folder` parameter; the previous code read a
            # `results_folder` global instead and ignored the argument.
            yield (model_class, dataset_name_train_test,
                   result_folder, model_zoo_folder, model_kwargs)

In [None]:
##
#  ADD YOUR CONFIG FILE PATH HERE
##
CONFIG_FP = Path('..', 'config.yaml')

# Load all of the datasets. Every config key is looked up in CONFIG_FP and the
# value is handed to the matching parser; keys are read in the order listed.
youtubean_train, youtubean_test = (
    semeval_14(read_config(config_key, CONFIG_FP))
    for config_key in ('youtubean_train', 'youtubean_test'))
(semeval_14_rest_train, semeval_14_lap_train,
 semeval_14_rest_test, semeval_14_lap_test) = (
    semeval_14(read_config(config_key, CONFIG_FP))
    for config_key in ('semeval_2014_rest_train', 'semeval_2014_lap_train',
                       'semeval_2014_rest_test', 'semeval_2014_lap_test'))
dong_train, dong_test = (
    dong(read_config(config_key, CONFIG_FP))
    for config_key in ('dong_twit_train_data', 'dong_twit_test_data'))
# The election parser returns the train and test data together.
election_train, election_test = election(read_config('election_folder_dir', CONFIG_FP))
mitchel_train, mitchel_test = (
    semeval_14(read_config(config_key, CONFIG_FP))
    for config_key in ('mitchel_train', 'mitchel_test'))

# (dataset name, train data, test data) triples consumed by the evaluation.
dataset_train_test = [
    ('SemEval 14 Laptop', semeval_14_lap_train, semeval_14_lap_test),
    ('SemEval 14 Restaurant', semeval_14_rest_train, semeval_14_rest_test),
    ('Dong Twitter', dong_train, dong_test),
    ('Election Twitter', election_train, election_test),
    ('YouTuBean', youtubean_train, youtubean_test),
    ('Mitchel', mitchel_train, mitchel_test),
]

# Results go in a 'TDLstm' sub-folder of the configured results folder.
results_folder = Path(read_config('results_folder', CONFIG_FP)).joinpath('TDLstm')
# The model zoo folder is created up front if it does not exist yet.
model_zoo_folder = Path(read_config('model_zoo_folder', CONFIG_FP))
model_zoo_folder.mkdir(parents=True, exist_ok=True)

# Mass evaluation of the LSTM, TDLSTM, and TCLSTM models

In [None]:
#
# Number of worker processes in the pool
#
n_cpus = 15

# One argument tuple per (model class, dataset) pair. `args` and
# `model_eval_args` are two names for the same generator object.
args = model_evaluation_args(
    [LSTM, TDLSTM, TCLSTM], dataset_train_test,
    results_folder, model_zoo_folder, model_kwargs)
model_eval_args = args

# Each tuple is unpacked into a load_and_run call in a worker process; the
# results come back as a list in the order the tuples were generated.
dataset_name_predictions = []
with Pool(n_cpus) as pool:
    dataset_name_predictions = pool.starmap(load_and_run, model_eval_args)

Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
Loading glove 300d 42b common crawl from file
