In [1]:
import os

import pandas as pd

In [2]:
def extract_columns(model_result_fp, columns):
    """Collect the given result columns from every model's results file.

    Each results file is read as a tab-separated table that must contain a
    'dataset' column, which becomes the index of the returned frame.

    :param model_result_fp: Mapping of model name -> path of that model's \
    tab-separated results file.
    :param columns: List of column names to pull out of every results file.
    :returns: One DataFrame indexed by dataset, holding one column per \
    (model, column) pair named '<model> <column>'.
    :raises ValueError: Raised by ``pd.concat`` when ``model_result_fp`` is \
    empty, as there is nothing to concatenate.
    """
    all_dfs = []
    for model, result_fp in model_result_fp.items():
        results_dataframe = pd.read_csv(result_fp, sep='\t').set_index('dataset')
        # Prefix each column with the model name so that the columns stay
        # distinguishable once all of the models are placed side by side.
        new_col_names = {column: '{} {}'.format(model, column)
                         for column in columns}
        all_dfs.append(results_dataframe[columns].rename(columns=new_col_names))
    return pd.concat(all_dfs, axis=1)

def columns_avliable(model_result_fp):
    """List the column names found in the first model's results file.

    Only the first entry of ``model_result_fp`` is inspected; the other
    results files are not read.

    :param model_result_fp: Mapping of model name -> path of that model's \
    tab-separated results file.
    :returns: Column names of the first results file, without the \
    'Unnamed...' columns pandas creates for header-less columns. An empty \
    list is returned when ``model_result_fp`` is empty.
    """
    first_result_fp = next(iter(model_result_fp.values()), None)
    if first_result_fp is None:
        return []
    # nrows=0 parses only the header line, which is all that is needed here.
    header_only = pd.read_csv(first_result_fp, sep='\t', nrows=0)
    return [col for col in header_only.columns if 'Unnamed' not in col]
    

def add_mean(results_df, rounding=2):
    """Return a copy of ``results_df`` with a final 'Mean' row added.

    The 'Mean' row holds the mean of each column. ``pd.concat`` is used
    rather than ``DataFrame.append`` as the latter was removed in pandas 2.0.

    :param results_df: DataFrame of numeric results, one row per dataset.
    :param rounding: Number of decimal places that every value in the \
    returned frame (the original rows and the mean row) is rounded to.
    :returns: A new DataFrame made up of the rows of ``results_df`` followed \
    by the 'Mean' row, all rounded to ``rounding`` decimal places.
    """
    mean_row = results_df.mean().to_frame(name='Mean').T
    # Keep the index name (e.g. 'dataset') on the combined frame, matching
    # what DataFrame.append used to produce.
    mean_row.index.name = results_df.index.name
    return pd.concat([results_df, mean_row]).round(rounding)
    

# Mass Evaluation Result analysis

Below we are matching the name of the model to the result file for that model

In [7]:
# All result files live under <parent of the current working directory>/results
root_folder = os.path.abspath(os.path.join(os.getcwd(), os.pardir))


def root_folder_join(paths):
    """Join the path parts in ``paths`` onto the 'results' folder that sits
    within ``root_folder``."""
    return os.path.join(root_folder, 'results', *paths)


# Target dependent models
target_dep_results_fp = root_folder_join(['Target Dependent', 'Target Dependent.tsv'])
target_dep_plus_results_fp = root_folder_join(['Target Dependent', 'Target Dependent+.tsv'])
# TDParse models
tdparse_results_fp = root_folder_join(['TDParse', 'TDParse.tsv'])
tdparse_plus_results_fp = root_folder_join(['TDParse', 'TDParsePlus.tsv'])
# LSTM based models
lstm_results_fp = root_folder_join(['TDLstm', 'lstm', 'results file.tsv'])
tdlstm_results_fp = root_folder_join(['TDLstm', 'tdlstm', 'results file.tsv'])
tclstm_results_fp = root_folder_join(['TDLstm', 'tclstm', 'results file.tsv'])

# Model name -> results file of that model
model_result_fp = {
    'Target-Dep': target_dep_results_fp,
    'Target-Dep+': target_dep_plus_results_fp,
    'TDParse': tdparse_results_fp,
    'TDParse+': tdparse_plus_results_fp,
    'LSTM': lstm_results_fp,
    'TDLSTM': tdlstm_results_fp,
    'TCLSTM': tclstm_results_fp,
}

Below we extract the 3 way Macro F1 scores for each method on all of the datasets and combine them together into one table.

In [16]:
# 3 class Macro F1 of every model per dataset, plus the mean over datasets
f1_results = add_mean(
    extract_columns(model_result_fp, ['3 Class Macro F1'])
)
f1_results

Unnamed: 0,Target-Dep 3 Class Macro F1,Target-Dep+ 3 Class Macro F1,TDParse 3 Class Macro F1,TDParse+ 3 Class Macro F1,LSTM 3 Class Macro F1,TDLSTM 3 Class Macro F1,TCLSTM 3 Class Macro F1
Dong Twitter,65.7,65.7,66.0,68.1,62.0,63.0,60.1
Election Twitter,45.5,45.9,46.2,44.6,39.2,43.1,41.9
Mitchel,40.8,42.9,40.5,50.0,39.9,41.6,35.5
SemEval 14 Laptop,60.0,63.7,59.6,64.5,58.2,56.1,47.3
SemEval 14 Restaurant,56.2,57.7,59.4,61.0,48.7,59.8,49.6
YouTuBean,53.1,55.6,71.7,68.0,45.2,43.9,34.7
Mean,53.55,55.25,57.23,59.37,48.87,51.25,44.85


We do the same as above but for the Accuracy

In [10]:
# Accuracy of every model per dataset, plus the mean over datasets
acc_results = add_mean(
    extract_columns(model_result_fp, ['Accuracy'])
)
acc_results

Unnamed: 0,Target-Dep Accuracy,Target-Dep+ Accuracy,TDParse Accuracy,TDParse+ Accuracy,LSTM Accuracy,TDLSTM Accuracy,TCLSTM Accuracy
Dong Twitter,67.3,67.8,68.6,70.1,65.3,65.6,63.7
Election Twitter,57.7,56.6,57.1,56.4,52.0,55.3,56.2
Mitchel,72.6,72.8,72.9,74.5,70.7,70.8,70.1
SemEval 14 Laptop,67.9,70.8,68.0,71.3,66.5,63.8,56.1
SemEval 14 Restaurant,73.8,74.6,73.3,76.2,73.3,74.5,68.4
YouTuBean,70.8,72.5,81.2,80.8,72.5,71.7,65.4
Mean,68.35,69.18,70.18,71.55,66.72,66.95,63.32


We can then look at the 2 way Macro F1 score, which evaluates how well the models can predict Positive or Negative sentiment while still taking into account predicting the neutral class.

In [11]:
# 2 class F1 of every model per dataset, plus the mean over datasets
two_way_f1_results = add_mean(
    extract_columns(model_result_fp, ['2 Class F1'])
)
two_way_f1_results

Unnamed: 0,Target-Dep 2 Class F1,Target-Dep+ 2 Class F1,TDParse 2 Class F1,TDParse+ 2 Class F1,LSTM 2 Class F1,TDLSTM 2 Class F1,TCLSTM 2 Class F1
Dong Twitter,62.4,61.9,61.5,64.4,57.3,58.3,54.3
Election Twitter,40.0,41.0,41.2,38.9,34.7,36.3,35.8
Mitchel,19.2,22.5,18.8,32.7,18.4,21.2,12.2
SemEval 14 Laptop,68.6,71.8,69.3,72.0,68.2,64.8,56.2
SemEval 14 Restaurant,73.6,73.8,70.9,76.4,73.1,69.2,59.4
YouTuBean,39.0,42.3,64.0,58.5,25.9,24.3,13.1
Mean,50.47,52.22,54.28,57.15,46.27,45.68,38.5


Now we look at the results of trying to predict the sentiment of all targets that are only in sentences that contain one distinct sentiment. This should be a result that is good for the LSTM model as it does not take into account any target information. However, as we can see below, the LSTM is one of the worst models.

In [12]:
# 3 class Macro F1 restricted to text with 1 distinct sentiment
one_dist_sent = add_mean(
    extract_columns(
        model_result_fp,
        ['3 Class Macro F1 for text with 1 distinct sentiments'],
    )
)
one_dist_sent

Unnamed: 0,Target-Dep 3 Class Macro F1 for text with 1 distinct sentiments,Target-Dep+ 3 Class Macro F1 for text with 1 distinct sentiments,TDParse 3 Class Macro F1 for text with 1 distinct sentiments,TDParse+ 3 Class Macro F1 for text with 1 distinct sentiments,LSTM 3 Class Macro F1 for text with 1 distinct sentiments,TDLSTM 3 Class Macro F1 for text with 1 distinct sentiments,TCLSTM 3 Class Macro F1 for text with 1 distinct sentiments
Dong Twitter,65.7,65.7,66.0,68.1,62.0,63.0,60.1
Election Twitter,50.3,50.3,48.7,48.6,46.5,46.0,41.7
Mitchel,41.5,43.1,41.3,51.3,40.9,41.8,36.2
SemEval 14 Laptop,61.0,67.2,60.7,67.3,61.6,57.7,47.2
SemEval 14 Restaurant,57.2,60.5,57.7,62.7,51.7,58.7,47.1
YouTuBean,54.2,57.0,73.4,69.2,46.0,44.6,35.2
Mean,54.98,57.3,57.97,61.2,51.45,51.97,44.58


We do the same as above but for 2 distinct sentiments per sentence. We remove the Dong twitter dataset from this experiment due to it only containing Tweets with one distinct sentiment.

In [13]:
# 3 class Macro F1 restricted to text with 2 distinct sentiments. The
# 'Dong Twitter' row is dropped before the mean is computed.
two_dist_sent = add_mean(
    extract_columns(
        model_result_fp,
        ['3 Class Macro F1 for text with 2 distinct sentiments'],
    ).drop('Dong Twitter')
)
two_dist_sent

Unnamed: 0,Target-Dep 3 Class Macro F1 for text with 2 distinct sentiments,Target-Dep+ 3 Class Macro F1 for text with 2 distinct sentiments,TDParse 3 Class Macro F1 for text with 2 distinct sentiments,TDParse+ 3 Class Macro F1 for text with 2 distinct sentiments,LSTM 3 Class Macro F1 for text with 2 distinct sentiments,TDLSTM 3 Class Macro F1 for text with 2 distinct sentiments,TCLSTM 3 Class Macro F1 for text with 2 distinct sentiments
Election Twitter,41.9,43.0,43.9,41.5,33.7,40.3,41.8
Mitchel,31.6,34.8,30.6,37.9,27.4,35.6,28.0
SemEval 14 Laptop,51.9,47.4,50.7,49.7,39.6,46.9,43.8
SemEval 14 Restaurant,48.5,48.6,58.0,53.8,37.0,55.8,50.2
YouTuBean,31.5,31.1,42.2,50.9,20.5,20.5,20.5
Mean,41.08,40.98,45.08,46.76,31.64,39.82,36.86


Lastly the same as above but for 3 distinct sentiments. As the majority of the datasets have very few sentences with 3 distinct sentiments, we only look at the Twitter Elections dataset (8.78% of the dataset contains 3 distinct sentiments).

In [14]:
# Keep only the 'Election Twitter' row, as a one column frame (one row per
# model), and show the models ordered from the highest to the lowest score.
three_dist_sent = (
    extract_columns(
        model_result_fp,
        ['3 Class Macro F1 for text with 3 distinct sentiments'],
    )
    .loc['Election Twitter']
    .to_frame()
)
three_dist_sent.sort_values('Election Twitter', ascending=False)

Unnamed: 0,Election Twitter
TDParse 3 Class Macro F1 for text with 3 distinct sentiments,44.9
TDLSTM 3 Class Macro F1 for text with 3 distinct sentiments,43.3
TCLSTM 3 Class Macro F1 for text with 3 distinct sentiments,42.7
TDParse+ 3 Class Macro F1 for text with 3 distinct sentiments,42.0
Target-Dep 3 Class Macro F1 for text with 3 distinct sentiments,41.9
Target-Dep+ 3 Class Macro F1 for text with 3 distinct sentiments,40.9
LSTM 3 Class Macro F1 for text with 3 distinct sentiments,29.5
