In [1]:
def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments and ignores them."""
    return None
import warnings
warnings.warn = warn
import copy
import json
from pathlib import Path
import os

import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

from bella.helper import read_config
from bella.parsers import semeval_14, dong, election

Using TensorFlow backend.


In [2]:
##
#  ADD YOUR CONFIG FILE PATH HERE
##
CONFIG_FP = Path('..', 'config.yaml')
#
# Load the test data
#
# Load all of the datasets: each value returned by `read_config` for the given
# key is handed to the matching `bella.parsers` function.
youtubean_test = semeval_14(read_config('youtubean_test', CONFIG_FP))
semeval_14_rest_test = semeval_14(read_config('semeval_2014_rest_test', CONFIG_FP))
semeval_14_lap_test = semeval_14(read_config('semeval_2014_lap_test', CONFIG_FP))
dong_test = dong(read_config('dong_twit_test_data', CONFIG_FP))
# `election` returns a pair; only its second element is kept (as `election_test`).
_, election_test = election(read_config('election_folder_dir', CONFIG_FP))
# NOTE(review): the Mitchel data is parsed with the `semeval_14` parser -
# presumably it is stored in that format; confirm.
mitchel_test = semeval_14(read_config('mitchel_test', CONFIG_FP))
# Display name -> loaded test dataset. `score_per_dataset` looks up true values
# in this mapping using the dataset names found in each predictions file.
dataset_test = {'SemEval 14 Laptop': semeval_14_lap_test,
                'SemEval 14 Restaurant': semeval_14_rest_test,
                'Dong Twitter': dong_test,
                'Election Twitter': election_test,
                'YouTuBean': youtubean_test,
                'Mitchel': mitchel_test}
# Folder that holds one sub-folder of saved predictions per LSTM-style model.
results_folder = Path(read_config('results_folder', CONFIG_FP)).joinpath('TDLstm')

# (model name, results folder) pairs, in the form `evaluate_datasets` iterates over.
all_results = [(model_name, results_folder)
               for model_name in ('LSTM', 'TDLSTM', 'TCLSTM')]

# Parent of the current working directory; result files live under `<root>/results`.
root_folder = os.path.abspath(os.path.join(os.getcwd(), os.pardir))


def root_folder_join(paths):
    """Join the path components in ``paths`` onto ``<root_folder>/results``."""
    return os.path.join(root_folder, 'results', *paths)


# Result TSV files of the Target Dependent and TDParse models.
target_dep_results_fp = root_folder_join(['Target Dependent', 'Target Dependent.tsv'])
target_dep_plus_results_fp = root_folder_join(['Target Dependent', 'Target Dependent+.tsv'])
tdparse_results_fp = root_folder_join(['TDParse','TDParse.tsv'])
tdparse_plus_results_fp = root_folder_join(['TDParse','TDParsePlus.tsv'])

# Result TSV files of the LSTM-style models. These are plain string paths and
# are not part of `all_results` or `model_result_fp`.
lstm_results_fp = root_folder_join(['TDLstm', 'lstm', 'results file.tsv'])
tdlstm_results_fp = root_folder_join(['TDLstm', 'tdlstm', 'results file.tsv'])
tclstm_results_fp = root_folder_join(['TDLstm', 'tclstm', 'results file.tsv'])

# Model display name -> result TSV path; insertion order fixes the column order
# of the tables built by `extract_columns`.
model_result_fp = {
    'Target-Dep' : target_dep_results_fp,
    'Target-Dep+' : target_dep_plus_results_fp,
    'TDParse' : tdparse_results_fp,
    'TDParse+' : tdparse_plus_results_fp,
}

In [3]:
def load_predictions(model_name, results_folder):
    """Read the saved predictions of one model.

    :param model_name: Name of the model's sub-folder inside ``results_folder``.
    :param results_folder: Path object exposing ``joinpath``.
    :returns: The decoded JSON content of
              ``<results_folder>/<model_name>/dataset predictions.json``.
    """
    json_path = results_folder.joinpath(model_name, 'dataset predictions.json')
    with json_path.open('r') as json_file:
        loaded = json.load(json_file)
    return loaded
    
def score_per_dataset(metric, dataset_predictions, dataset_true_values, custom=False, **metric_kwargs):
    """Score the predictions of every dataset with ``metric``.

    :param metric: Callable invoked as ``metric(true, predictions, **metric_kwargs)``.
    :param dataset_predictions: Mapping of dataset name -> predictions.
    :param dataset_true_values: Mapping of dataset name -> dataset object.
    :param custom: When True the dataset object itself is passed to ``metric``;
                   otherwise the result of its ``sentiment_data()`` method is.
    :param metric_kwargs: Extra keyword arguments forwarded to ``metric``.
    :returns: Dict of dataset name -> metric result, one entry per key of
              ``dataset_predictions``.
    """
    def reference_for(name):
        gold = dataset_true_values[name]
        return gold if custom else gold.sentiment_data()

    return {name: metric(reference_for(name), predicted, **metric_kwargs)
            for name, predicted in dataset_predictions.items()}

def evaluate_datasets(metric, dataset_true_values, model_results, custom_metric=False, **metric_kwargs):
    """Score every model's saved predictions on every dataset.

    :param metric: Metric callable handed on to ``score_per_dataset``.
    :param dataset_true_values: Mapping of dataset name -> dataset object.
    :param model_results: Iterable of ``(model name, results folder)`` pairs;
                          each pair is passed to ``load_predictions``.
    :param custom_metric: Forwarded as the ``custom`` flag of ``score_per_dataset``.
    :param metric_kwargs: Extra keyword arguments forwarded to ``metric``.
    :returns: Dict of model name -> (dict of dataset name -> score).
    """
    return {
        model_name: score_per_dataset(metric,
                                      load_predictions(model_name, results_fp),
                                      dataset_true_values,
                                      custom_metric, **metric_kwargs)
        for model_name, results_fp in model_results
    }

def to_dataframe(evaluation_results, metric_name):
    """Turn per-model scores into a table with one column per model.

    :param evaluation_results: Mapping of model name -> (mapping of dataset
                               name -> score).
    :param metric_name: Appended to each model name to form the column name.
    :returns: DataFrame of the scores multiplied by 100, with an extra ``Mean``
              row holding the mean of each column.
    """
    model_columns = []
    for model_name, dataset_scores in evaluation_results.items():
        percentages = pd.Series(dataset_scores, name=f'{model_name} {metric_name}') * 100
        # The mean is taken before the 'Mean' entry itself is added.
        percentages['Mean'] = percentages.mean()
        model_columns.append(percentages)
    # Each Series becomes a row here, so transpose to get one column per model.
    return pd.DataFrame(model_columns).T

def distinct_sentiment_metrics(test_data, predictions, 
                               distinct_sentiments, metric_func, metric_kwargs):
    """Score predictions on the subset selected by ``subset_by_sentiment``.

    :param test_data: Dataset object; it is deep-copied, so the caller's
                      object does not receive the predictions.
    :param predictions: Predictions passed to ``add_pred_sentiment`` on the copy.
    :param distinct_sentiments: Argument passed to ``subset_by_sentiment``.
    :param metric_func: Callable invoked as
                        ``metric_func(true, predicted, **metric_kwargs)``.
    :param metric_kwargs: Dict of keyword arguments for ``metric_func``.
    :returns: Whatever ``metric_func`` returns for the subset.
    """
    working_copy = copy.deepcopy(test_data)
    working_copy.add_pred_sentiment(predictions)
    subset = working_copy.subset_by_sentiment(distinct_sentiments)
    gold_labels = subset.sentiment_data()
    predicted_labels = subset.sentiment_data(sentiment_field='predicted')
    return metric_func(gold_labels, predicted_labels, **metric_kwargs)
    
def extract_columns(model_result_fp, columns):
    """Collect the same result columns from every model's TSV file.

    :param model_result_fp: Mapping of model name -> path of a tab-separated
                            results file that has a ``dataset`` column.
    :param columns: Column names to keep. A single name given as a string is
                    treated as a one-element list.
    :returns: DataFrame indexed by ``dataset`` with one ``'<model> <column>'``
              column per model and requested column, in mapping order.
    """
    # A bare string would select a Series, which `rename(columns=...)` rejects.
    columns = [columns] if isinstance(columns, str) else list(columns)

    per_model_frames = []
    for model, result_fp in model_result_fp.items():
        model_frame = pd.read_csv(result_fp, sep='\t').set_index('dataset')[columns]
        # Prefix each column with the model name so the frames can sit side by side.
        prefixed_names = {column: f'{model} {column}' for column in columns}
        per_model_frames.append(model_frame.rename(columns=prefixed_names))
    return pd.concat(per_model_frames, axis=1)

def columns_avliable(model_result_fp):
    """List the column names of the first results file in ``model_result_fp``.

    Only the first entry of the mapping is read. Columns whose name contains
    ``'Unnamed'`` are left out.

    :param model_result_fp: Mapping of model name -> path of a tab-separated
                            results file.
    :returns: List of column names, or ``None`` when the mapping is empty.
    """
    entries = list(model_result_fp.items())
    if not entries:
        return None
    _, first_fp = entries[0]
    first_frame = pd.read_csv(first_fp, sep='\t')
    return list(filter(lambda name: 'Unnamed' not in name, first_frame.columns))
    

def add_mean(results_df, rounding=2):
    """Return a copy of ``results_df`` with a ``Mean`` row appended.

    :param results_df: DataFrame of numeric results, one column per model.
    :param rounding: Number of decimal places the returned frame is rounded to.
    :returns: New DataFrame with the column means as a final row labelled
              ``Mean``; ``results_df`` itself is not modified.
    """
    mean_results = results_df.mean()
    mean_results.name = 'Mean'
    # `DataFrame.append` was removed in pandas 2.0; concatenating the means as
    # a one-row frame produces the same table on old and new pandas versions.
    mean_row = mean_results.to_frame().T
    return pd.concat([results_df, mean_row]).round(rounding)
        
        



# Mass Evaluation Result analysis

Below we extract the 3-way Macro F1 scores for each method on all of the datasets and combine them into one table.

In [4]:
# Scores of the models in `model_result_fp` are read from their result files.
f1_results = extract_columns(model_result_fp, ['3 Class Macro F1'])
f1_results = add_mean(f1_results)

# Scores of the models in `all_results` are computed from their saved predictions.
lstm_f1_results = evaluate_datasets(f1_score, dataset_test, all_results, average='macro')
lstm_f1_results = to_dataframe(lstm_f1_results, '3 Class Macro F1')
# `axis` must be passed by keyword: pandas 2.0 no longer accepts it positionally.
pd.concat((f1_results, lstm_f1_results), axis=1).round(2)

Unnamed: 0,Target-Dep 3 Class Macro F1,Target-Dep+ 3 Class Macro F1,TDParse 3 Class Macro F1,TDParse+ 3 Class Macro F1,LSTM 3 Class Macro F1,TDLSTM 3 Class Macro F1,TCLSTM 3 Class Macro F1
Dong Twitter,65.7,65.7,66.0,68.1,63.6,66.09,67.14
Election Twitter,45.5,45.9,46.2,44.6,38.7,43.6,42.08
Mitchel,40.8,42.9,40.5,50.0,47.17,51.16,41.03
SemEval 14 Laptop,60.0,63.7,59.6,64.5,47.84,57.91,46.8
SemEval 14 Restaurant,56.2,57.7,59.4,61.0,46.36,57.68,55.38
YouTuBean,53.1,55.6,71.7,68.0,45.93,45.47,38.07
Mean,53.55,55.25,57.23,59.37,48.27,53.65,48.42


We do the same as above but for the Accuracy

In [5]:
# Scores of the models in `model_result_fp` are read from their result files.
acc_results = extract_columns(model_result_fp, ['Accuracy'])
acc_results = add_mean(acc_results)
# Scores of the models in `all_results` are computed from their saved predictions.
lstm_acc_results = evaluate_datasets(accuracy_score, dataset_test, all_results)
lstm_acc_results = to_dataframe(lstm_acc_results, 'Accuracy')
# `axis` must be passed by keyword: pandas 2.0 no longer accepts it positionally.
pd.concat((acc_results, lstm_acc_results), axis=1).round(2)

Unnamed: 0,Target-Dep Accuracy,Target-Dep+ Accuracy,TDParse Accuracy,TDParse+ Accuracy,LSTM Accuracy,TDLSTM Accuracy,TCLSTM Accuracy
Dong Twitter,67.3,67.8,68.6,70.1,66.47,68.79,69.51
Election Twitter,57.7,56.6,57.1,56.4,53.8,57.3,57.22
Mitchel,72.6,72.8,72.9,74.5,72.14,73.45,69.81
SemEval 14 Laptop,67.9,70.8,68.0,71.3,58.93,65.52,57.05
SemEval 14 Restaurant,73.8,74.6,73.3,76.2,71.07,74.38,72.41
YouTuBean,70.8,72.5,81.2,80.8,72.5,72.5,66.67
Mean,68.35,69.18,70.18,71.55,65.82,68.66,65.44


We can then look at the 2-way Macro F1 score, which evaluates how well the models can predict Positive or Negative sentiment while still taking into account predicting the neutral class.

In [6]:
# Scores of the models in `model_result_fp` are read from their result files.
two_way_f1_results = extract_columns(model_result_fp, ['2 Class F1'])
two_way_f1_results = add_mean(two_way_f1_results)
# `labels=[-1, 1]` restricts the macro average to the -1 and 1 labels.
lstm_two_way_f1_results = evaluate_datasets(f1_score, dataset_test, all_results, average='macro', labels=[-1, 1])
lstm_two_way_f1_results = to_dataframe(lstm_two_way_f1_results, '2 Class Macro F1')
# `axis` must be passed by keyword: pandas 2.0 no longer accepts it positionally.
pd.concat((two_way_f1_results, lstm_two_way_f1_results), axis=1).round(2)

Unnamed: 0,Target-Dep 2 Class F1,Target-Dep+ 2 Class F1,TDParse 2 Class F1,TDParse+ 2 Class F1,LSTM 2 Class Macro F1,TDLSTM 2 Class Macro F1,TCLSTM 2 Class Macro F1
Dong Twitter,62.4,61.9,61.5,64.4,59.19,61.59,62.94
Election Twitter,40.0,41.0,41.2,38.9,34.02,36.85,34.39
Mitchel,19.2,22.5,18.8,32.7,29.03,34.49,20.42
SemEval 14 Laptop,68.6,71.8,69.3,72.0,64.46,65.29,57.36
SemEval 14 Restaurant,73.6,73.8,70.9,76.4,69.54,71.08,68.35
YouTuBean,39.0,42.3,64.0,58.5,27.05,26.27,17.71
Mean,50.47,52.22,54.28,57.15,47.22,49.26,43.53


Now we look at the results of predicting the sentiment of all targets that appear only in sentences containing one distinct sentiment. This setting should favour the LSTM model, as it does not take any target information into account. However, as we can see below, the LSTM is one of the worst models.

In [7]:
# Scores of the models in `model_result_fp` are read from their result files.
one_dist_sent = extract_columns(model_result_fp, ['3 Class Macro F1 for text with 1 distinct sentiments'])
one_dist_sent = add_mean(one_dist_sent)
# `custom_metric=True` hands the dataset objects themselves to
# `distinct_sentiment_metrics`, which subsets them before scoring.
lstm_one_dist_sent = evaluate_datasets(distinct_sentiment_metrics, dataset_test, all_results, custom_metric=True, 
                                       distinct_sentiments=1, metric_func=f1_score,
                                       metric_kwargs={'average': 'macro'})
lstm_one_dist_sent = to_dataframe(lstm_one_dist_sent, '3 Class Macro F1 for text with 1 distinct sentiments')
# `axis` must be passed by keyword: pandas 2.0 no longer accepts it positionally.
pd.concat((one_dist_sent, lstm_one_dist_sent), axis=1).round(2)

Unnamed: 0,Target-Dep 3 Class Macro F1 for text with 1 distinct sentiments,Target-Dep+ 3 Class Macro F1 for text with 1 distinct sentiments,TDParse 3 Class Macro F1 for text with 1 distinct sentiments,TDParse+ 3 Class Macro F1 for text with 1 distinct sentiments,LSTM 3 Class Macro F1 for text with 1 distinct sentiments,TDLSTM 3 Class Macro F1 for text with 1 distinct sentiments,TCLSTM 3 Class Macro F1 for text with 1 distinct sentiments
Dong Twitter,65.7,65.7,66.0,68.1,63.6,66.09,67.14
Election Twitter,50.3,50.3,48.7,48.6,45.09,47.29,42.55
Mitchel,41.5,43.1,41.3,51.3,47.64,52.41,41.13
SemEval 14 Laptop,61.0,67.2,60.7,67.3,49.06,58.3,47.61
SemEval 14 Restaurant,57.2,60.5,57.7,62.7,48.19,59.89,54.73
YouTuBean,54.2,57.0,73.4,69.2,46.68,46.22,38.66
Mean,54.98,57.3,57.97,61.2,50.04,55.03,48.64


We do the same as above but for 2 distinct sentiments per sentence. We remove the Dong Twitter dataset from this experiment because it only contains Tweets with one distinct sentiment.

In [8]:
# Scores of the models in `model_result_fp` are read from their result files.
two_dist_sent = extract_columns(model_result_fp, ['3 Class Macro F1 for text with 2 distinct sentiments'])
two_dist_sent = add_mean(two_dist_sent)
# `custom_metric=True` hands the dataset objects themselves to
# `distinct_sentiment_metrics`, which subsets them before scoring.
lstm_two_dist_sent = evaluate_datasets(distinct_sentiment_metrics, dataset_test, all_results, custom_metric=True,
                                       distinct_sentiments=2, metric_func=f1_score, 
                                       metric_kwargs={'average': 'macro'})
lstm_two_dist_sent = to_dataframe(lstm_two_dist_sent, '3 Class Macro F1 for text with 2 distinct sentiments')
# `axis` must be passed by keyword: pandas 2.0 no longer accepts it positionally.
two_dist_sent = pd.concat((two_dist_sent, lstm_two_dist_sent), axis=1).round(2)
# Drop the Dong Twitter row and the old 'Mean' row (which still includes it),
# then recompute the mean over the remaining datasets.
two_dist_sent = two_dist_sent.drop('Dong Twitter')
two_dist_sent = two_dist_sent.drop('Mean')
two_dist_mean = pd.Series(two_dist_sent.mean(), name='Mean')
# `DataFrame.append` was removed in pandas 2.0; concatenate a one-row frame instead.
pd.concat((two_dist_sent, two_dist_mean.to_frame().T)).round(2)

  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,Target-Dep 3 Class Macro F1 for text with 2 distinct sentiments,Target-Dep+ 3 Class Macro F1 for text with 2 distinct sentiments,TDParse 3 Class Macro F1 for text with 2 distinct sentiments,TDParse+ 3 Class Macro F1 for text with 2 distinct sentiments,LSTM 3 Class Macro F1 for text with 2 distinct sentiments,TDLSTM 3 Class Macro F1 for text with 2 distinct sentiments,TCLSTM 3 Class Macro F1 for text with 2 distinct sentiments
Election Twitter,41.9,43.0,43.9,41.5,34.12,39.89,41.29
Mitchel,31.6,34.8,30.6,37.9,38.0,36.24,39.21
SemEval 14 Laptop,51.9,47.4,50.7,49.7,33.67,52.94,40.18
SemEval 14 Restaurant,48.5,48.6,58.0,53.8,37.97,48.6,51.46
YouTuBean,31.5,31.1,42.2,50.9,20.51,20.51,20.51
Mean,41.08,40.98,45.08,46.76,32.85,39.64,38.53


In [9]:
# Scores of the models in `model_result_fp` are read from their result files.
three_dist_sent = extract_columns(model_result_fp, ['3 Class Macro F1 for text with 3 distinct sentiments'])
# `custom_metric=True` hands the dataset objects themselves to
# `distinct_sentiment_metrics`, which subsets them before scoring.
lstm_3_dist_sent = evaluate_datasets(distinct_sentiment_metrics, dataset_test, all_results, custom_metric=True,
                                     distinct_sentiments=3, metric_func=f1_score, 
                                     metric_kwargs={'average': 'macro'})
lstm_3_dist_sent = to_dataframe(lstm_3_dist_sent, '3 Class Macro F1 for text with 3 distinct sentiments')
# `axis` must be passed by keyword: pandas 2.0 no longer accepts it positionally.
three_dist_sent = pd.concat((three_dist_sent, lstm_3_dist_sent), axis=1).round(2)
# Keep only the 'Election Twitter' row, shown as a single column sorted best-first.
three_dist_sent = pd.DataFrame(three_dist_sent.loc['Election Twitter'])
three_dist_sent.sort_values('Election Twitter', ascending=False)

  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,Election Twitter
TDParse 3 Class Macro F1 for text with 3 distinct sentiments,44.9
TDLSTM 3 Class Macro F1 for text with 3 distinct sentiments,42.75
TCLSTM 3 Class Macro F1 for text with 3 distinct sentiments,42.36
TDParse+ 3 Class Macro F1 for text with 3 distinct sentiments,42.0
Target-Dep 3 Class Macro F1 for text with 3 distinct sentiments,41.9
Target-Dep+ 3 Class Macro F1 for text with 3 distinct sentiments,40.9
LSTM 3 Class Macro F1 for text with 3 distinct sentiments,29.55


Lastly, we do the same as above but for 3 distinct sentiments. As the majority of the datasets contain very few sentences with 3 distinct sentiments, we only look at the Election Twitter dataset (8.78% of the dataset contains 3 distinct sentiments).