# NOTE
**Please ensure that you have run the *Product and YouTuBean train test split* notebook first so that all of the datasets are available**

In [1]:
import warnings


def warn(*_args, **_kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Monkey-patch the warnings module so that every later ``warnings.warn(...)``
# call made through the module attribute is silently dropped.
warnings.warn = warn
import os
import sys

sys.path.append(os.path.abspath(os.pardir))

from collections import defaultdict

import pandas as pd
import numpy as np

# Helper functions
from tdparse.notebook_helper import write_json_data, get_json_data
# Models
from tdparse.models.target import TargetInd
from tdparse.models.target import TargetDepC
from tdparse.models.target import TargetDep
from tdparse.models.target import TargetDepSent
# Word Vector methods
from tdparse.word_vectors import GloveWikiGiga, GloveTwitterVectors, GensimVectors, GloveCommonCrawl
from tdparse.word_vectors import PreTrained
from tdparse.helper import read_config, full_path
# Sentiment lexicons
from tdparse import lexicons
# Get the data
from tdparse.parsers import semeval_14, semeval_15_16, dong, election
from tdparse.data_types import TargetCollection
# Evaluation methods
from tdparse.evaluation import evaluation_results, scores, get_results, \
                               save_results, combine_results, get_raw_data
# Tokenisers
from tdparse.tokenisers import ark_twokenize, whitespace, stanford
from tdparse.stanford_tools import constituency_parse

In [2]:
def fine_tune_values(c_values):
    '''
    Picks the fine-tuning C values out of a C-value search result.

    A key is treated as a fine-tuning value when its text contains `35` or
    `7`. The best coarse grain C value is recovered as the 4th smallest
    fine-tuning value divided by 7 and is appended to the end of the returned
    list.

    :param c_values: Collection whose items are C values stored as strings \
    (when iterated, a dict yields its keys).
    :returns: Tuple of (sorted fine-tuning values followed by the best coarse \
    value, best coarse value). All values are floats.
    :raises IndexError: If fewer than four fine-tuning values are found.
    '''
    # Materialise once so that the values can be scanned twice.
    stored_keys = list(c_values)
    fine_values = sorted(float(c_value) for c_value in stored_keys
                         if '35' in c_value or '7' in c_value)
    # The 4th smallest fine value is taken to be 7 x the best coarse value.
    estimate = fine_values[3] / 7
    # Float division is inexact (0.7 / 7 == 0.09999999999999999, not 0.1), so
    # snap the estimate to the stored C value it approximates. Otherwise
    # look-ups and membership tests keyed on the stored value would miss.
    closest = min((float(c_value) for c_value in stored_keys),
                  key=lambda stored: abs(stored - estimate))
    if abs(closest - estimate) <= 1e-9 * abs(estimate):
        best_coarse_c_value = closest
    else:
        # No stored value matches: fall back to the raw estimate.
        best_coarse_c_value = estimate
    fine_values.append(best_coarse_c_value)
    return fine_values, best_coarse_c_value
def coarse_tune_values(c_values):
    '''
    Returns the C values that are not fine-tuning values.

    :param c_values: Collection of C values stored as strings (when \
    iterated, a dict yields its keys).
    :returns: List of floats, in iteration order, of every C value that is \
    not in the fine-tuning list returned by `fine_tune_values`. That list \
    also holds the best coarse value, so it is excluded as well.
    '''
    # `fine_tune_values` returns (fine_values, best_coarse_c_value). Testing
    # membership against the tuple itself only ever matched the best coarse
    # value, so all fine-tuning values leaked into the coarse list.
    fine_values, _ = fine_tune_values(c_values)
    excluded = set(fine_values)
    return [value for value in map(float, c_values) if value not in excluded]
def best_c_value(c_values):
    '''
    Finds the C value that has the highest accuracy.

    :param c_values: Mapping of C value to accuracy.
    :returns: The key with the strictly greatest accuracy (the first one \
    wins ties). Returns 0 when the mapping is empty or no accuracy is above 0.
    '''
    winner, top_accuracy = 0, 0
    for candidate, accuracy in c_values.items():
        if accuracy > top_accuracy:
            winner, top_accuracy = candidate, accuracy
    return winner

In [3]:
# Load all of the datasets
def _load_dataset(parser, config_key, **parser_kwargs):
    '''
    Calls `parser` on `full_path(read_config(config_key))`, forwarding any
    extra keyword arguments to the parser.
    '''
    return parser(full_path(read_config(config_key)), **parser_kwargs)


youtubean_train = _load_dataset(semeval_14, 'youtubean_train')
youtubean_test = _load_dataset(semeval_14, 'youtubean_test')
semeval_14_rest_train = _load_dataset(semeval_14, 'semeval_2014_rest_train')
semeval_14_lap_train = _load_dataset(semeval_14, 'semeval_2014_lap_train')
semeval_14_rest_test = _load_dataset(semeval_14, 'semeval_2014_rest_test')
semeval_14_lap_test = _load_dataset(semeval_14, 'semeval_2014_lap_test')
semeval_15_rest_test = _load_dataset(semeval_15_16, 'semeval_2015_rest_test')
semeval_16_rest_test = _load_dataset(semeval_15_16, 'semeval_2016_rest_test',
                                     sep_16_from_15=True)
dong_train = _load_dataset(dong, 'dong_twit_train_data')
dong_test = _load_dataset(dong, 'dong_twit_test_data')
# The election parser returns the train and test collections together
election_train, election_test = _load_dataset(election, 'election_folder_dir')
product_reviews_train = _load_dataset(semeval_14, 'product_train')
product_reviews_test = _load_dataset(semeval_14, 'product_test')

# SemEval 14 restaurant train + test
semeval_14_rest_all = TargetCollection.combine_collections(
    semeval_14_rest_train, semeval_14_rest_test)
# SemEval 14 restaurant train + test, plus the SemEval 15 restaurant test
semeval_14_15 = TargetCollection.combine_collections(
    semeval_14_rest_all, semeval_15_rest_test)

# Dataset name -> (train collection, test collection)
dataset_train_test = {
    'SemEval 14 Laptop': (semeval_14_lap_train, semeval_14_lap_test),
    'SemEval 14 Restaurant': (semeval_14_rest_train, semeval_14_rest_test),
    'SemEval 16 Restaurant 14 Train': (semeval_14_rest_train, semeval_16_rest_test),
    'SemEval 16 Restaurant 14 All': (semeval_14_rest_all, semeval_16_rest_test),
    'SemEval 16 Restaurant 15&14': (semeval_14_15, semeval_16_rest_test),
    'Dong Twitter': (dong_train, dong_test),
    'Election Twitter': (election_train, election_test),
    'Product Reviews': (product_reviews_train, product_reviews_test),
    'YouTuBean': (youtubean_train, youtubean_test),
}

In [4]:
# Get word vectors
w2v_path = full_path(read_config('word2vec_files')['vo_zhang'])
w2v = GensimVectors(w2v_path, None, model='word2vec', name='w2v')
sswe_path = full_path(read_config('sswe_files')['vo_zhang'])
sswe = PreTrained(sswe_path, name='sswe')

# GloVe vectors, one instance per dimensionality
glove_twit_50, glove_twit_100, glove_twit_200 = (GloveTwitterVectors(dimension)
                                                 for dimension in (50, 100, 200))
glove_50, glove_100, glove_200 = (GloveWikiGiga(dimension)
                                  for dimension in (50, 100, 200))
glove_300 = GloveCommonCrawl(version=42)


# Load the sentiment lexicons keeping only the words that are associated
# to the Positive or Negative class.
subset_cats = {'positive', 'negative'}
mpqa_low, nrc_low, hu_liu_low = (lexicon_class(subset_cats=subset_cats, lower=True)
                                 for lexicon_class in (lexicons.Mpqa, lexicons.NRC,
                                                       lexicons.HuLiu))
# Combined lexicons: MPQA + Hu & Liu, and then all three together
mpqa_huliu_low = lexicons.Lexicon.combine_lexicons(mpqa_low, hu_liu_low)
all_three_low = lexicons.Lexicon.combine_lexicons(mpqa_huliu_low, nrc_low)
all_lexicons = [mpqa_low, nrc_low, hu_liu_low, mpqa_huliu_low, all_three_low]

In [12]:
def dataset_predictions(train, test, dataset_name, model,
                        word_vector, random_state,
                        c_file_path, tokeniser_file_path,
                        word_vector_file_path, senti_lexicon_file_path,
                        model_dir,
                        sentiment_lexicon=None, result_file_path=None,
                        re_write=True, save_raw_data=True):
    '''
    Tunes `model` on `train`, fits it with the best settings found, predicts
    `test`, saves the fitted model and returns the evaluation results.

    The tuning is sequential: C value, then tokeniser, then (only when
    `sentiment_lexicon` is given) sentiment lexicon, then word vectors. Each
    step keeps the best values found by the previous steps.

    :param train: Training collection; must expose `data()` and \
    `sentiment_data()`.
    :param test: Test collection; must expose `data()` and `sentiment_data()`.
    :param dataset_name: Name the search results, saved model and evaluation \
    results are stored under.
    :param model: Model exposing `find_best_c`, `save_grid_search`, \
    `get_params`, `fit`, `predict` and `save_model`.
    :param word_vector: Word vector setting used during the C value and \
    tokeniser searches.
    :param random_state: Random state passed to every search and to the final \
    model.
    :param c_file_path: File the C value search results are saved to.
    :param tokeniser_file_path: File the tokeniser search results are saved to.
    :param word_vector_file_path: File the word vector search results are \
    saved to.
    :param senti_lexicon_file_path: File the sentiment lexicon search results \
    are saved to. Only used when `sentiment_lexicon` is not None.
    :param model_dir: Directory the fitted model is saved into.
    :param sentiment_lexicon: Lexicon used during the C value and tokeniser \
    searches. When None no lexicon parameter is set or searched.
    :param result_file_path: File the evaluation results are written to. When \
    None the results are only returned.
    :param re_write: Currently unused; kept for interface compatibility. The \
    searches always run with `re_write=False` and the evaluation results are \
    always written with `re_write=True`.
    :param save_raw_data: Passed to `evaluation_results` when \
    `result_file_path` is given.
    :returns: The value returned by `evaluation_results`.
    '''
    import time

    # loading the data
    data_train = train.data()
    y_train = train.sentiment_data()
    data_test = test.data()

    # CV grid params
    c_grid_params = {'word_vectors' : [word_vector], 'random_state' : random_state}
    if sentiment_lexicon is not None:
        c_grid_params['senti_lexicons'] = [sentiment_lexicon]

    best_c, _ = model.find_best_c(data_train, y_train, c_grid_params,
                                  save_file=c_file_path, dataset_name=dataset_name,
                                  re_write=False, n_jobs=7, cv=5)
    # Search over the different tokenizers
    tokenisers = [ark_twokenize, whitespace, stanford]
    tok_grid_params = {**c_grid_params}
    tok_grid_params['tokenisers'] = tokenisers
    tok_grid_params['C'] = [best_c]
    best_tokeniser = model.save_grid_search(data_train, y_train, tok_grid_params,
                                            'tokenisers', dataset_name, tokeniser_file_path,
                                            re_write=False, n_jobs=5, cv=5)
    best_senti_lexicon = None
    if sentiment_lexicon is not None:
        # Search over the different lexicons given the best tokeniser
        senti_lexicon_grid_params = {**tok_grid_params}
        senti_lexicon_grid_params['tokenisers'] = [best_tokeniser]
        senti_lexicon_grid_params['senti_lexicons'] = all_lexicons
        best_senti_lexicon = model.save_grid_search(data_train, y_train, senti_lexicon_grid_params,
                                                    'senti_lexicons', dataset_name, senti_lexicon_file_path,
                                                    re_write=False, n_jobs=5, cv=5)
    # Search over the different word vectors given the best tokeniser
    # and sentiment lexicon
    word_vectors = [[glove_twit_50], [glove_50], [glove_200], [sswe], [w2v, sswe]]
    word_vector_grid_params = {**tok_grid_params}
    word_vector_grid_params['tokenisers'] = [best_tokeniser]
    word_vector_grid_params['word_vectors'] = word_vectors
    if sentiment_lexicon is not None:
        word_vector_grid_params['senti_lexicons'] = [best_senti_lexicon]
    t = time.time()
    best_word_vector = model.save_grid_search(data_train, y_train, word_vector_grid_params,
                                              'word_vectors', dataset_name, word_vector_file_path,
                                              re_write=False, n_jobs=5, cv=5)
    print('{} {}'.format(best_word_vector, time.time() - t))
    t = time.time()
    # Word Vector is too large to multi-process.
    # `word_vectors` is the same list object held by the grid params, so
    # extending it adds the large vectors to the second (single process) search.
    word_vectors.extend([[glove_twit_200], [glove_300]])
    best_word_vector = model.save_grid_search(data_train, y_train, word_vector_grid_params,
                                              'word_vectors', dataset_name, word_vector_file_path,
                                              re_write=False, n_jobs=1, cv=5)
    print('{} {}'.format(best_word_vector, time.time() - t))

    # Fit the final model with the tuned C value. It was previously fixed to
    # 0.01, which ignored the C search and did not match the C value the
    # tokeniser, lexicon and word vector searches were run with.
    parameters = {'word_vector' : best_word_vector, 'random_state' : random_state,
                  'C' : best_c, 'tokeniser' : best_tokeniser}
    if sentiment_lexicon is not None:
        parameters['senti_lexicon'] = best_senti_lexicon
    best_params = model.get_params(**parameters)
    model.fit(data_train, y_train, params=best_params)
    predicted_values = model.predict(data_test)
    # Save the model to the model zoo
    model_file_name = '{} {}'.format(model, dataset_name)
    model_file_path = os.path.join(model_dir, model_file_name)
    model.save_model(model_file_path, verbose=1)
    # Return the results
    if result_file_path is not None:
        return evaluation_results(predicted_values, test, dataset_name,
                                  file_name=result_file_path,
                                  save_raw_data=save_raw_data, re_write=True)
    return evaluation_results(predicted_values, test, dataset_name)
   

In [6]:
# Instances of the models: one without and one with sentiment lexicon support
target_dep, target_dep_plus = TargetDep(), TargetDepSent()
models = [target_dep, target_dep_plus]

# Target dependent model mass evaluation

The above code loads all of the data, models, and lexicons we are going to use in this notebook

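We are going to use two different models:
1. target_dep -- Target Dependent model that uses no sentiment lexicons
2. target_dep_plus -- Target Dependent model that also uses sentiment lexicons. It starts from the combination of all three lexicons from the original paper (MPQA, NRC, and Hu & Liu) and the best lexicon combination is then chosen by grid search.

A separate target_dep_plus_all model is not created in this notebook: the lexicon grid search for target_dep_plus covers both the Hu & Liu only lexicon and the combination of all three lexicons.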

Each model gets its own results file where it will store the results from each dataset.

In [7]:
# Creating the result folder and the model zoo folder
parent_dir = os.path.join(os.getcwd(), os.pardir)
result_folder = os.path.abspath(os.path.join(parent_dir, 'results', 'Target Dependent Models'))
model_dir = os.path.abspath(os.path.join(parent_dir, 'model zoo'))
for folder in (result_folder, model_dir):
    os.makedirs(folder, exist_ok=True)

# One file per model (Target Dependent, Target Dependent Plus) for each kind
# of result. All of them live in the result folder.
model_result_files = [os.path.join(result_folder, file_name)
                      for file_name in ('Target Dependent.tsv',
                                        'Target Dependent+.tsv')]
C_result_files = [os.path.join(result_folder, file_name)
                  for file_name in ('Target Dependent C.json',
                                    'Target Dependent+ All C.json')]
tokeniser_result_files = [os.path.join(result_folder, file_name)
                          for file_name in ('Target Dependent tokeniser.json',
                                            'Target Dependent+ All tokeniser.json')]
word_vector_result_files = [os.path.join(result_folder, file_name)
                            for file_name in ('Target Dependent word vector.json',
                                              'Target Dependent+ All word vector.json')]
senti_lexicon_result_files = [os.path.join(result_folder, file_name)
                              for file_name in ('Target Dependent senti lexicon.json',
                                                'Target Dependent+ All senti lexicon.json')]

# Parameters for each model
std_model_parameters = {'word_vector' : [sswe], 'random_state' : 42}
all_senti_model_parameters = dict(std_model_parameters, sentiment_lexicon=all_three_low)
model_parameters = [std_model_parameters, all_senti_model_parameters]

# Combining parameters and result files: one tuple per model laid out as
# (parameters, results file, C file, tokeniser file, word vector file,
#  sentiment lexicon file, model directory)
parameters_files = list(zip(model_parameters,
                            model_result_files,
                            C_result_files,
                            tokeniser_result_files,
                            word_vector_result_files,
                            senti_lexicon_result_files,
                            [model_dir]*2))

model_files = dict(zip(models, parameters_files))
model_files

{Target Dependent: ({'random_state': 42, 'word_vector': [sswe]},
  '/home/moorea/tdparse/results/Target Dependent Models/Target Dependent.tsv',
  '/home/moorea/tdparse/results/Target Dependent Models/Target Dependent C.json',
  '/home/moorea/tdparse/results/Target Dependent Models/Target Dependent tokeniser.json',
  '/home/moorea/tdparse/results/Target Dependent Models/Target Dependent word vector.json',
  '/home/moorea/tdparse/results/Target Dependent Models/Target Dependent senti lexicon.json',
  '/home/moorea/tdparse/model zoo'),
 Target Dependent Plus: ({'random_state': 42,
   'sentiment_lexicon': <tdparse.lexicons.Lexicon at 0x7fadec17a668>,
   'word_vector': [sswe]},
  '/home/moorea/tdparse/results/Target Dependent Models/Target Dependent+.tsv',
  '/home/moorea/tdparse/results/Target Dependent Models/Target Dependent+ All C.json',
  '/home/moorea/tdparse/results/Target Dependent Models/Target Dependent+ All tokeniser.json',
  '/home/moorea/tdparse/results/Target Dependent Models/

In [None]:
import time

# Runs every model over every dataset and records the total wall clock time
# in seconds.
time_to_process = time.time()
for dataset_name, (train, test) in dataset_train_test.items():
    print('Processing dataset {}'.format(dataset_name))
    for model, params_files in model_files.items():
        print('Processing model {}'.format(model))
        # Same layout as the tuples built for `model_files`
        (parameters, result_file_path, c_fp, tokeniser_fp,
         word_vectors_fp, senti_lexicons_fp, model_dir) = params_files
        dataset_predictions(train, test, dataset_name, model,
                            c_file_path=c_fp,
                            tokeniser_file_path=tokeniser_fp,
                            word_vector_file_path=word_vectors_fp,
                            senti_lexicon_file_path=senti_lexicons_fp,
                            model_dir=model_dir,
                            result_file_path=result_file_path,
                            re_write=True, save_raw_data=True,
                            **parameters)

time_to_process = time.time() - time_to_process

Processing dataset SemEval 14 Laptop
Processing model Target Dependent
yes ark_twokenize it has continued {'ark_twokenize': 0.6346735840899265, 'whitespace': 0.6221357544314743, 'stanford': 0.6238651102464332}
yes whitespace it has continued {'ark_twokenize': 0.6346735840899265, 'whitespace': 0.6221357544314743, 'stanford': 0.6238651102464332}
yes stanford it has continued {'ark_twokenize': 0.6346735840899265, 'whitespace': 0.6221357544314743, 'stanford': 0.6238651102464332}
best <function ark_twokenize at 0x7fad981d1620> list {'ark_twokenize': <function ark_twokenize at 0x7fad981d1620>, 'whitespace': <function whitespace at 0x7fad9e32fa60>, 'stanford': <function stanford at 0x7fad97aec840>} stored [('ark_twokenize', 0.6346735840899265), ('stanford', 0.6238651102464332), ('whitespace', 0.6221357544314743)]
yes glove twitter 50d it has continued {'glove twitter 50d': 0.6446173800259404, 'glove wiki giga 50d': 0.6610462602680501, 'glove wiki giga 200d': 0.6632079550367488, 'sswe': 0.6346

yes glove twitter 50d it has continued {'glove twitter 50d': 0.6869865974924341, 'glove wiki giga 50d': 0.6913099870298314, 'glove wiki giga 200d': 0.6964980544747081, 'sswe': 0.6817985300475573, 'glove twitter 200d': 0.7042801556420234, 'w2v sswe': 0.6969303934284479, 'glove 300d 42b common crawl': 0.7107652399481194}
yes glove wiki giga 50d it has continued {'glove twitter 50d': 0.6869865974924341, 'glove wiki giga 50d': 0.6913099870298314, 'glove wiki giga 200d': 0.6964980544747081, 'sswe': 0.6817985300475573, 'glove twitter 200d': 0.7042801556420234, 'w2v sswe': 0.6969303934284479, 'glove 300d 42b common crawl': 0.7107652399481194}
yes glove wiki giga 200d it has continued {'glove twitter 50d': 0.6869865974924341, 'glove wiki giga 50d': 0.6913099870298314, 'glove wiki giga 200d': 0.6964980544747081, 'sswe': 0.6817985300475573, 'glove twitter 200d': 0.7042801556420234, 'w2v sswe': 0.6969303934284479, 'glove 300d 42b common crawl': 0.7107652399481194}
yes sswe it has continued {'glov

[glove 300d 42b common crawl] 207.69602465629578
Model saved to /home/moorea/tdparse/model zoo/Target Dependent SemEval 14 Restaurant. Save time 376.72
saving raw data
Re-writing over previous results
Re-writing over previous RAW results
Processing model Target Dependent Plus
yes ark_twokenize it has continued {'ark_twokenize': 0.7093281510272071, 'whitespace': 0.6901721265963354, 'stanford': 0.7007218212104387}
yes whitespace it has continued {'ark_twokenize': 0.7093281510272071, 'whitespace': 0.6901721265963354, 'stanford': 0.7007218212104387}
yes stanford it has continued {'ark_twokenize': 0.7093281510272071, 'whitespace': 0.6901721265963354, 'stanford': 0.7007218212104387}
best <function ark_twokenize at 0x7fad981d1620> list {'ark_twokenize': <function ark_twokenize at 0x7fad981d1620>, 'whitespace': <function whitespace at 0x7fad9e32fa60>, 'stanford': <function stanford at 0x7fad97aec840>} stored [('ark_twokenize', 0.7093281510272071), ('stanford', 0.7007218212104387), ('whitespace

yes glove twitter 50d it has continued {'glove twitter 50d': 0.6690727373681288, 'glove wiki giga 50d': 0.6715713492504164, 'glove wiki giga 200d': 0.689894503053859, 'sswe': 0.6848972792892837, 'glove twitter 200d': 0.6907273736812882, 'w2v sswe': 0.6998889505830095, 'glove 300d 42b common crawl': 0.7134925041643532}
yes glove wiki giga 50d it has continued {'glove twitter 50d': 0.6690727373681288, 'glove wiki giga 50d': 0.6715713492504164, 'glove wiki giga 200d': 0.689894503053859, 'sswe': 0.6848972792892837, 'glove twitter 200d': 0.6907273736812882, 'w2v sswe': 0.6998889505830095, 'glove 300d 42b common crawl': 0.7134925041643532}
yes glove wiki giga 200d it has continued {'glove twitter 50d': 0.6690727373681288, 'glove wiki giga 50d': 0.6715713492504164, 'glove wiki giga 200d': 0.689894503053859, 'sswe': 0.6848972792892837, 'glove twitter 200d': 0.6907273736812882, 'w2v sswe': 0.6998889505830095, 'glove 300d 42b common crawl': 0.7134925041643532}
yes sswe it has continued {'glove t

Time it took to process all the datasets {{round(time_to_process / 3600, 2)}} hours

## The effect of the C value

We take a look at the effect of tuning the C-Value. We tune the C-Value in two steps:
1. Coarse grain search over the following values: from 0.00001 to 10 going up by a factor of 10. Once the best coarse grain value is found we fine tune it.
2. We multiply the best coarse grain value by each of the following values and search over the resulting values: 0.35, 0.7, 1, 3.5, and 7.

In [None]:
# C value search results stored in the second C results file
# (Target Dependent Plus) under the Dong Twitter dataset name
c_values_target_dep_plus = get_json_data(C_result_files[1], 'Dong Twitter')

In [None]:
# Dataset name -> {statistic name -> value}
dataset_stats = defaultdict(dict)


def mean_error(acc_values):
    '''
    Sum of the gaps between the best accuracy and every accuracy in
    `acc_values`, divided by the number of values minus one.
    '''
    return sum(max(acc_values) - acc_values) / (len(acc_values) - 1)


for dataset_name in dataset_train_test:
    # C value search results and results file of the second model
    # (Target Dependent Plus)
    c_values_target_dep_plus = get_json_data(C_result_files[1], dataset_name)
    model_results_file = model_result_files[1]

    fine_c_values, best_coarse_c_value = fine_tune_values(c_values_target_dep_plus)
    coarse_c_values = coarse_tune_values(c_values_target_dep_plus)

    # The stored keys are strings; index the accuracies by float C value
    all_c_values = {float(c_value) : acc
                    for c_value, acc in c_values_target_dep_plus.items()}
    fine_acc = np.array([all_c_values[c_value] for c_value in fine_c_values])
    coarse_acc = np.array([all_c_values[c_value] for c_value in coarse_c_values])
    all_acc = np.array(list(c_values_target_dep_plus.values()))

    best_c = best_c_value(all_c_values)
    test_accuracy = get_results(model_results_file, dataset_name)['Accuracy']

    dataset_stats[dataset_name].update({
        'Fine Tune MAE (Accuracy %)': mean_error(fine_acc) * 100,
        'Coarse Tune MAE (Accuracy %)': mean_error(coarse_acc) * 100,
        'Best Fine C value': best_c,
        'Best Coarse C value': best_coarse_c_value,
        'Test score Accuracy (%)': test_accuracy,
    })

In [None]:
# Table of the C value statistics: one row per dataset
index = list(dataset_train_test)
columns = ['Fine Tune MAE (Accuracy %)', 'Coarse Tune MAE (Accuracy %)', 'Best Fine C value',
           'Best Coarse C value', 'Dataset', 'Test score Accuracy (%)']
c_value_stats = pd.DataFrame(np.zeros((len(index), len(columns))), columns=columns)
c_value_stats['Dataset'] = index
c_value_stats = c_value_stats.set_index('Dataset')

# Add the data to the DataFrame.
# A single `.loc` call writes into the frame itself. The chained form
# `frame[column][row] = value` may write to a temporary copy and is a no-op
# when pandas Copy-on-Write is enabled.
for dataset_name, col_value in dataset_stats.items():
    for column, value in col_value.items():
        c_value_stats.loc[dataset_name, column] = value
# Round the MAE columns. The names must match existing columns exactly:
# `DataFrame.round` silently ignores names that are not columns, which is why
# the previous '... Accuracy std (%)' names never rounded anything.
c_value_stats = c_value_stats.round({'Fine Tune MAE (Accuracy %)' : 2,
                                     'Coarse Tune MAE (Accuracy %)' : 2})
c_value_stats