# NOTE
**Please ensure that you have run the *Mitchel and YouTuBean train test split* notebooks first so that all of the datasets are available.**

In [1]:
from collections import defaultdict
from pathlib import Path
import json
from typing import Callable, List, Union, Tuple, Dict, Any
import math

import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
# Models
from bella.models.target import TargetDep, TargetDepPlus
from bella.models.base import SKLearnModel
# Word Vector methods
from bella.word_vectors import GloveCommonCrawl, SSWE
from bella.helper import read_config
# Sentiment lexicons
from bella import lexicons
# Get the data
from bella.parsers import semeval_14, dong, election
from bella.data_types import TargetCollection
# Tokenisers
from bella.tokenisers import ark_twokenize
# Evaluation
from bella import evaluation

Using TensorFlow backend.


In [2]:
def all_words(tokeniser: Callable[[str], List[str]],
              *datasets) -> List[str]:
    """Return the unique words found across all of the given datasets.

    :param tokeniser: Function that splits a text into a list of tokens; it
                      is handed unchanged to each dataset's ``word_list``.
    :param datasets: Any number of objects exposing ``word_list(tokeniser)``.
    :returns: The de-duplicated words from every dataset. The order is
              whatever iterating a ``set`` yields, so it is not guaranteed.
    """
    collected = [word
                 for dataset in datasets
                 for word in dataset.word_list(tokeniser)]
    return list(set(collected))

In [3]:
##
#  ADD YOUR CONFIG FILE PATH HERE
##
# Every path used below (lexicons, datasets, output folders) is looked up by
# key in this YAML file through `read_config`.
CONFIG_FP = Path('..', 'config.yaml')

# Getting the sentiment lexicons
hu_liu_fp = Path(read_config('hu_liu_lexicon', CONFIG_FP))
mpqa_fp = Path(read_config('mpqa_lexicon', CONFIG_FP))
nrc_fp = Path(read_config('nrc_emotion_lexicon', CONFIG_FP))

# Only the positive and negative categories are requested from each lexicon.
# `lower=True` presumably lower-cases the lexicon words -- TODO confirm
# against the bella.lexicons documentation.
subset_cats = {'positive', 'negative'}
mpqa_low = lexicons.Mpqa(mpqa_fp, subset_cats=subset_cats, lower=True)
nrc_low = lexicons.NRC(nrc_fp, subset_cats=subset_cats, lower=True)
hu_liu_low = lexicons.HuLiu(hu_liu_fp, subset_cats=subset_cats, lower=True)
# Combine the lexicons pairwise: MPQA with Hu & Liu first, then that result
# with NRC. `all_three_low` is the lexicon given to TargetDepPlus later on.
mpqa_huliu_low = lexicons.Lexicon.combine_lexicons(mpqa_low, hu_liu_low)
all_three_low = lexicons.Lexicon.combine_lexicons(mpqa_huliu_low, nrc_low)


# Load all of the datasets
# The YouTuBean and Mitchel splits are read with the `semeval_14` parser, so
# they are presumably stored in the same format as the SemEval 2014 files by
# the train test split notebooks -- verify if a parse error occurs.
youtubean_train = semeval_14(read_config('youtubean_train', CONFIG_FP))
youtubean_test = semeval_14(read_config('youtubean_test', CONFIG_FP))
semeval_14_rest_train = semeval_14(read_config('semeval_2014_rest_train', CONFIG_FP))
semeval_14_lap_train = semeval_14(read_config('semeval_2014_lap_train', CONFIG_FP))
semeval_14_rest_test = semeval_14(read_config('semeval_2014_rest_test', CONFIG_FP))
semeval_14_lap_test = semeval_14(read_config('semeval_2014_lap_test', CONFIG_FP))
dong_train = dong(read_config('dong_twit_train_data', CONFIG_FP))
dong_test = dong(read_config('dong_twit_test_data', CONFIG_FP))
# The election parser takes a folder and returns the train and test splits
# together.
election_train, election_test = election(read_config('election_folder_dir', CONFIG_FP))
mitchel_train = semeval_14(read_config('mitchel_train', CONFIG_FP))
mitchel_test = semeval_14(read_config('mitchel_test', CONFIG_FP))


# (display name, train split, test split) for every dataset. The display name
# is also the key under which results are cached in the JSON result files, so
# renaming an entry invalidates its cached results.
dataset_train_test = [('SemEval 14 Laptop', semeval_14_lap_train, semeval_14_lap_test),
                      ('SemEval 14 Restaurant', semeval_14_rest_train, semeval_14_rest_test),
                      ('Dong Twitter', dong_train, dong_test),
                      ('Election Twitter', election_train, election_test),
                      ('YouTuBean', youtubean_train, youtubean_test),
                      ('Mitchel', mitchel_train, mitchel_test)]

# Folder for this notebook's results (created if missing).
results_folder = Path(read_config('results_folder', CONFIG_FP))
results_folder = results_folder.joinpath('Target Dependent')
results_folder.mkdir(parents=True, exist_ok=True)
# Folder where the fitted models are saved (created if missing).
model_zoo_folder = Path(read_config('model_zoo_folder', CONFIG_FP))
model_zoo_folder.mkdir(parents=True, exist_ok=True)

The cell above loads the datasets and the sentiment lexicons, and sets up the folders where the experiment results and the trained models are saved.

# Target Dependent methods applied across multiple datasets

In this notebook we are going to look at the two best TDParse methods:
1. Target Dependent
2. Target Dependent+

The first does not use a sentiment lexicon and the second does.

We are going to test them over all 6 datasets:
1. SemEval 2014 Laptop
2. SemEval 2014 Restaurant
3. Dong Twitter
4. Election Twitter
5. YouTuBean
6. Mitchel Twitter dataset

Each of these datasets is different, some more so than others; for full details on them see this [notebook](./datasets.ipynb). First, each of the models has to be fine tuned for each dataset, which involves:
1. Finding the Best C value for the SVM estimator for both methods.
2. We will find the best word embeddings to use for each method.

Once we have fine tuned our methods for each dataset on the training dataset using 5 fold cross validation we will predict on the test data and save the models for future use.


## Finding the Best C value

First we want to find the Best C value for each model for each dataset by performing 5 fold cross validation. 

We are first going to choose the best C value from a coarse grained set of C values and then create a more fine grained search around the best coarse grained C value.

In [4]:
# Coarse grid of candidate C values: one per power of ten from 1e-05 to 10.
# Each value is parsed from its decimal form instead of being produced by
# repeatedly multiplying a running float by 10, so rounding error can neither
# accumulate in the values nor decide whether the last point is included.
COARSE_MIN_EXPONENT = -5
COARSE_MAX_EXPONENT = 1
coarse_range = [float(f'1e{exponent}')
                for exponent in range(COARSE_MIN_EXPONENT, COARSE_MAX_EXPONENT + 1)]
coarse_range

[1e-05, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]

In [5]:
# The two TDParse variants that are tuned and evaluated in this notebook.
models = [TargetDep, TargetDepPlus]
# Degree of parallelism handed to the parameter search; lower it if the
# machine has fewer cores.
n_cpus = 7

# Best C value per dataset name and per model (keyed by `str(model)`).
# Results from a previous run are reloaded so finished datasets are skipped.
dataset_model_c = {}
best_c_file = results_folder.joinpath('Mass Evaluation Best C.json')
if best_c_file.is_file():
    with best_c_file.open('r') as best_c_json:
        dataset_model_c = json.load(best_c_json)

for dataset_name, train, test in dataset_train_test:
    if dataset_name in dataset_model_c:
        continue
    # C is tuned with the SSWE vectors for every model; the embedding itself
    # is chosen in a later step.
    word_embedding = SSWE(filter_words=train.word_list(ark_twokenize))
    model_kwargs = []
    for model in models:
        kwargs = {'word_vectors': [[word_embedding]]}
        if model == TargetDepPlus:
            # Only the lexicon based model takes a sentiment lexicon.
            kwargs['senti_lexicon'] = [all_three_low]
        model_kwargs.append((model, kwargs))
    X_train = train.data()
    y_train = train.sentiment_data()
    # Stage 1: pick the best C for each model from the coarse grid.
    model_c = SKLearnModel.models_best_parameter(model_kwargs, 'C', coarse_range,
                                                 X_train, y_train, n_cpus)
    # Stage 2: search a finer grid centred on each model's best coarse value.
    model_fine_c = {}
    for model_kwarg in model_kwargs:
        model = model_kwarg[0]
        best_coarse_c = float(model_c[model])
        fine_range = [(best_coarse_c / 10) * 3.5,
                      (best_coarse_c / 10) * 7, best_coarse_c,
                      best_coarse_c * 3.5, best_coarse_c * 7]
        temp_model_c = SKLearnModel.models_best_parameter([model_kwarg], 'C', fine_range,
                                                          X_train, y_train, n_cpus)
        model_fine_c[model] = temp_model_c[model]
    # Classes cannot be JSON keys, so each model is stored under `str(model)`.
    model_c = {str(model): c for model, c in model_fine_c.items()}
    dataset_model_c[dataset_name] = model_c
    # Checkpoint after every dataset so that an interrupted run keeps the
    # searches that have already finished.
    with best_c_file.open('w') as best_c_json:
        json.dump(dataset_model_c, best_c_json)

for dataset_name, model_c in dataset_model_c.items():
    print(f'Dataset: {dataset_name} Model and C Value {model_c}')

Dataset: SemEval 14 Laptop Model and C Value {"<class 'bella.models.target.TargetDep'>": '0.01', "<class 'bella.models.target.TargetDepPlus'>": '0.0035'}
Dataset: SemEval 14 Restaurant Model and C Value {"<class 'bella.models.target.TargetDep'>": '0.035', "<class 'bella.models.target.TargetDepPlus'>": '0.01'}
Dataset: Dong Twitter Model and C Value {"<class 'bella.models.target.TargetDep'>": '0.035', "<class 'bella.models.target.TargetDepPlus'>": '0.01'}
Dataset: Election Twitter Model and C Value {"<class 'bella.models.target.TargetDep'>": '0.0035', "<class 'bella.models.target.TargetDepPlus'>": '0.0035'}
Dataset: YouTuBean Model and C Value {"<class 'bella.models.target.TargetDep'>": '0.035', "<class 'bella.models.target.TargetDepPlus'>": '0.01'}
Dataset: Mitchel Model and C Value {"<class 'bella.models.target.TargetDep'>": '0.01', "<class 'bella.models.target.TargetDepPlus'>": '0.007'}


## Finding the best word embeddings

We are now going to perform 5 fold cross validation to find the best word embedding for each method on each dataset. The possible word embeddings are the following:
1. [Glove 42 Billion Common Crawl](https://nlp.stanford.edu/projects/glove/) - 300 dimension these were trained on web data.
2. [Sentiment Specific Word Embeddings (SSWE)](http://www.aclweb.org/anthology/P14-1146) - 50 dimension these were trained on Twitter data.

We are going to use the Best C values while performing cross validation to find the best word embeddings.

In [6]:
# Best word embedding per dataset name and per model (keyed by `str(model)`).
# Results from a previous run are reloaded so finished datasets are skipped.
dataset_model_embedding = {}
best_embedding_file = results_folder.joinpath('Mass Evaluation Best Embedding.json')
if best_embedding_file.is_file():
    with best_embedding_file.open('r') as best_embedding_json:
        dataset_model_embedding = json.load(best_embedding_json)

for dataset_name, train, test in dataset_train_test:
    if dataset_name in dataset_model_embedding:
        continue
    # the different embeddings, both given the training words as filter words
    filter_words = train.word_list(ark_twokenize)
    sswe_embedding = SSWE(filter_words=filter_words)
    glove_embedding = GloveCommonCrawl(42, filter_words=filter_words)
    all_embeddings = [[sswe_embedding], [glove_embedding]]
    model_kwargs = []
    for model in models:
        # Fix C to the value found for this dataset and model in the C search.
        best_c = dataset_model_c[dataset_name][str(model)]
        kwargs = {'C': [float(best_c)]}
        if model == TargetDepPlus:
            # Only the lexicon based model takes a sentiment lexicon.
            kwargs['senti_lexicon'] = [all_three_low]
        model_kwargs.append((model, kwargs))
    X_train = train.data()
    y_train = train.sentiment_data()
    model_embedding = SKLearnModel.models_best_parameter(model_kwargs, 'word_vectors',
                                                         all_embeddings,
                                                         X_train, y_train, n_cpus)
    # Classes cannot be JSON keys, so each model is stored under `str(model)`.
    model_embedding = {str(model): embedding for model, embedding in model_embedding.items()}
    dataset_model_embedding[dataset_name] = model_embedding
    # Checkpoint after every dataset so that an interrupted run keeps the
    # searches that have already finished.
    with best_embedding_file.open('w') as best_embedding_json:
        json.dump(dataset_model_embedding, best_embedding_json)

for dataset_name, model_embedding in dataset_model_embedding.items():
    print(f'Dataset: {dataset_name} Model and Embedding {model_embedding}')

Dataset: SemEval 14 Laptop Model and Embedding {"<class 'bella.models.target.TargetDep'>": '[glove 300d 42b common crawl]', "<class 'bella.models.target.TargetDepPlus'>": '[glove 300d 42b common crawl]'}
Dataset: SemEval 14 Restaurant Model and Embedding {"<class 'bella.models.target.TargetDep'>": '[sswe]', "<class 'bella.models.target.TargetDepPlus'>": '[sswe]'}
Dataset: Dong Twitter Model and Embedding {"<class 'bella.models.target.TargetDep'>": '[glove 300d 42b common crawl]', "<class 'bella.models.target.TargetDepPlus'>": '[glove 300d 42b common crawl]'}
Dataset: Election Twitter Model and Embedding {"<class 'bella.models.target.TargetDep'>": '[glove 300d 42b common crawl]', "<class 'bella.models.target.TargetDepPlus'>": '[glove 300d 42b common crawl]'}
Dataset: YouTuBean Model and Embedding {"<class 'bella.models.target.TargetDep'>": '[sswe]', "<class 'bella.models.target.TargetDepPlus'>": '[sswe]'}
Dataset: Mitchel Model and Embedding {"<class 'bella.models.target.TargetDep'>": '

## Predictions on the test data

Now that we have the best C value and embedding for each model on each dataset, we shall use them to make predictions on the test data of all the datasets. Once we have made these predictions we shall save the raw predictions and the machine learning models so that we can analyse and use them later.

In [7]:
# Test set predictions, keyed by model name and then by dataset name.
model_dataset_predictions = defaultdict(dict)

# Get the predictions data if it exists
for model in models:
    model_results_folder = results_folder.joinpath(model.name())
    dataset_predictions_fp = model_results_folder.joinpath('dataset predictions.json')
    if dataset_predictions_fp.is_file():
        with dataset_predictions_fp.open('r') as dataset_predictions_json:
            dataset_predictions = json.load(dataset_predictions_json)
            model_dataset_predictions[model.name()] = dataset_predictions

# Create the predictions for each dataset and for each model
for dataset_name, train, test in dataset_train_test:
    # Cached predictions are stored under `model.name()`, so that is the only
    # key checked here. Looking a model up under any other key would never
    # match and, on a defaultdict, would add an empty entry that later shows
    # up in the evaluation input.
    models_to_run = [model for model in models
                     if dataset_name not in model_dataset_predictions[model.name()]]
    if not models_to_run:
        # Nothing to predict, so skip loading the data and the vocabulary.
        continue

    model_c = dataset_model_c[dataset_name]
    model_embedding = dataset_model_embedding[dataset_name]

    X_train, y_train = train.data(), train.sentiment_data()
    X_test = test.data()
    # The final models are given the words of both the train and test splits
    # as the embedding filter words.
    dataset_words = all_words(ark_twokenize, train, test)

    for model in models_to_run:
        # The tuning step stored the chosen embedding by its printed name.
        embedding_name = model_embedding[str(model)]
        if embedding_name == '[glove 300d 42b common crawl]':
            embedding = [GloveCommonCrawl(42, filter_words=dataset_words)]
        elif embedding_name == '[sswe]':
            embedding = [SSWE(filter_words=dataset_words)]
        else:
            raise ValueError(f'Embeddings is not SSWE or Glove {embedding_name}')

        best_c = float(model_c[str(model)])
        if model == TargetDepPlus:
            # Only the lexicon based model takes a sentiment lexicon.
            model_instance = model(embedding, all_three_low, C=best_c)
        else:
            model_instance = model(embedding, C=best_c)
        model_instance.fit(X_train, y_train)
        # `.tolist()` turns the predictions into a plain list for JSON.
        predictions = model_instance.predict(X_test).tolist()
        model_dataset_predictions[model.name()][dataset_name] = predictions
        # Save the model to the model zoo
        model_fp = model_zoo_folder.joinpath(f'{model.name()} {dataset_name}')
        model.save(model_instance, model_fp)

# Save the results
for model in models:
    model_results_folder = results_folder.joinpath(model.name())
    model_results_folder.mkdir(parents=True, exist_ok=True)
    dataset_predictions_fp = model_results_folder.joinpath('dataset predictions.json')
    dataset_predictions = model_dataset_predictions[model.name()]
    with dataset_predictions_fp.open('w') as dataset_predictions_file:
        json.dump(dataset_predictions, dataset_predictions_file)

# Mass evaluation results

In [8]:
# Pair every dataset name with its test split; the training split (the middle
# element of each entry) is not needed for scoring.
dataset_test = dict((entry[0], entry[2]) for entry in dataset_train_test)

# Score the stored predictions of every model on every dataset. The extra
# `average='macro'` keyword is presumably forwarded to `f1_score` -- confirm
# against bella.evaluation.evaluate_models.
f1_results = evaluation.evaluate_models(
    f1_score, dataset_test, model_dataset_predictions,
    dataframe=True, average='macro')
acc_results = evaluation.evaluate_models(
    accuracy_score, dataset_test, model_dataset_predictions,
    dataframe=True)

## Accuracy

In [12]:
# Show the accuracy scores as percentages rounded to two decimal places.
acc_results.mul(100).round(2)

Unnamed: 0,Target Dependent,Target Dependent Plus
Dong Twitter,67.34,67.77
Election Twitter,57.65,56.63
Mitchel,72.64,72.85
SemEval 14 Laptop,67.87,70.85
SemEval 14 Restaurant,73.84,74.64
YouTuBean,70.83,72.5
Mean,68.36,69.21


## Macro F1

In [13]:
# Show the macro F1 scores as percentages rounded to two decimal places.
f1_results.mul(100).round(2)

Unnamed: 0,Target Dependent,Target Dependent Plus
Dong Twitter,65.66,65.67
Election Twitter,45.47,45.93
Mitchel,40.76,42.86
SemEval 14 Laptop,59.97,63.73
SemEval 14 Restaurant,56.16,57.71
YouTuBean,53.14,55.56
Mean,53.53,55.24
