# NOTE
**Please ensure that you have ran the *Product and YouTuBean train test split* notebook first so that all of the datasets are avaliable**

In [1]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
import os
import sys

sys.path.append(os.path.abspath(os.pardir))

from collections import defaultdict

import pandas as pd
import numpy as np

# Helper functions
from tdparse.notebook_helper import write_json_data, get_json_data
# Models
from tdparse.models.target import TargetInd
from tdparse.models.target import TargetDepC
from tdparse.models.target import TargetDep
from tdparse.models.target import TargetDepSent
# Word Vector methods
from tdparse.word_vectors import GensimVectors
from tdparse.word_vectors import PreTrained
from tdparse.helper import read_config, full_path
# Sentiment lexicons
from tdparse import lexicons
# Get the data
from tdparse.parsers import semeval_14, semeval_15_16, dong, election
from tdparse.data_types import TargetCollection
# Evaluation methods
from tdparse.evaluation import evaluation_results, scores, get_results, \
                               save_results, combine_results, get_raw_data

In [2]:
# Load all of the datasets
youtubean_train = semeval_14(full_path(read_config('youtubean_train')))
youtubean_test = semeval_14(full_path(read_config('youtubean_test')))
semeval_14_rest_train = semeval_14(full_path(read_config('semeval_2014_rest_train')))
semeval_14_lap_train = semeval_14(full_path(read_config('semeval_2014_lap_train')))
semeval_14_rest_test = semeval_14(full_path(read_config('semeval_2014_rest_test')))
semeval_14_lap_test = semeval_14(full_path(read_config('semeval_2014_lap_test')))
semeval_15_rest_test = semeval_15_16(full_path(read_config('semeval_2015_rest_test')))
semeval_16_rest_test = semeval_15_16(full_path(read_config('semeval_2016_rest_test')),
                                     sep_16_from_15=True)
dong_train = dong(full_path(read_config('dong_twit_train_data')))
dong_test = dong(full_path(read_config('dong_twit_test_data')))
election_train, election_test = election(full_path(read_config('election_folder_dir')))
product_reviews_train = semeval_14(full_path(read_config('product_train')))
product_reviews_test = semeval_14(full_path(read_config('product_test')))
# Combine semeval 14 resturant train and test
semeval_14_rest_all = TargetCollection.combine_collections(semeval_14_rest_train,
                                                           semeval_14_rest_test)
# Combine semeval 14 resturant all with 15 test
semeval_14_15 = TargetCollection.combine_collections(semeval_14_rest_all,
                                                     semeval_15_rest_test)

dataset_train_test = {'SemEval 14 Laptop' : (semeval_14_lap_train, semeval_14_lap_test),
                      'SemEval 14 Restaurant' : (semeval_14_rest_train, semeval_14_rest_test),
                      'SemEval 16 Restaurant 14 Train' : (semeval_14_rest_train, semeval_16_rest_test),
                      'SemEval 16 Restaurant 14 All' : (semeval_14_rest_all, semeval_16_rest_test),
                      'SemEval 16 Restaurant 15&14' : (semeval_14_15, semeval_16_rest_test),
                      'Dong Twitter' : (dong_train, dong_test),
                      'Election Twitter' : (election_train, election_test),
                      'Product Reviews' : (product_reviews_train, product_reviews_test),
                      'YouTuBean' : (youtubean_train, youtubean_test)
                     }

In [3]:
# Get word vectors
w2v_path = full_path(read_config('word2vec_files')['vo_zhang'])
w2v = GensimVectors(w2v_path, None, model='word2vec', name='w2v')
sswe_path = full_path(read_config('sswe_files')['vo_zhang'])
sswe = PreTrained(sswe_path, name='sswe')

# Load the sentiment lexicons and remove all words that are not associated
# to the Positive or Negative class.
subset_cats = {'positive', 'negative'}
mpqa_low = lexicons.Mpqa(subset_cats=subset_cats, lower=True)
nrc_low = lexicons.NRC(subset_cats=subset_cats, lower=True)
hu_liu_low = lexicons.HuLiu(subset_cats=subset_cats, lower=True)
mpqa_huliu_low = lexicons.Lexicon.combine_lexicons(mpqa_low, hu_liu_low)
all_three_low = lexicons.Lexicon.combine_lexicons(mpqa_huliu_low, nrc_low)

In [4]:
def dataset_predictions(train, test, name, model, word_vector, random_state, 
                        sentiment_lexicon=None, result_file_path=None,
                        c_file_path=None, re_write=True, save_raw_data=True):
    if not re_write and result_file_path is not None:
        results_df = get_results(result_file_path, name)
        if save_raw_data and results_df is not None:
            if get_raw_data(result_file_path, name, test):
                return results_df
        elif results_df is not None:
            return results_df
    # loading the data
    data_train = train.data()
    y_train = train.sentiment_data()
    data_test = test.data()
    y_test = test.sentiment_data()

    # Finding the best C value for the model on this dataset
    c_grid_params = {'word_vectors' : [word_vector], 'random_state' : random_state}
    if sentiment_lexicon is not None:
        c_grid_params['senti_lexicons'] = [sentiment_lexicon]
    best_c, c_scores = model.find_best_c(data_train, y_train, 
                                         grid_params=c_grid_params, cv=5, n_jobs=6)
    if c_file_path is not None:
        write_json_data(c_file_path, name, c_scores)
    if sentiment_lexicon is not None:
        print('The best C value for {} model with sentiment lexicon {}: {}'\
              .format(model, sentiment_lexicon, best_c))
    else:
        print('The best C value for {} model: {}'.format(model, best_c))
    
    # Fitting and getting predictions from the model.
    parameters = {'word_vector' : word_vector, 'random_state' : random_state, 'C' : best_c}
    if sentiment_lexicon is not None:
        parameters['senti_lexicon'] = sentiment_lexicon
    best_params = model.get_params(**parameters)
    model.fit(data_train, y_train, params=best_params)
    predicted_values = model.predict(data_test)
    # Return the results
    if result_file_path is not None:
        return evaluation_results(predicted_values, test, name, 
                                  file_name=result_file_path, 
                                  save_raw_data=save_raw_data, re_write=re_write)
    else:
        return evaluation_results(predicted_values, test, name)
   

In [5]:
# Instances of the models
target_dep = TargetDep()
target_dep_plus = TargetDepSent()
target_dep_plus_all = TargetDepSent()
models = [target_dep, target_dep_plus, target_dep_plus_all]

# Target dependent model mass evaluation

The above code loads all of the data, models, and lexicons we are going to use in this notebook

We are going to use three different models.
1. target_dep -- Target Dependent model that uses no sentiment lexicons
2. target_dep_plus -- Target Dependent model that uses only the Hu & Liu lexicon
3. target_dep_plus_all -- Target Dependent model that uses all three lexicons from the original paper

Each model gets it's own results file where it will store the results from each dataset.

In [None]:
# Creating the result files
result_folder = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'results', 'Target Dependent Models'))
os.makedirs(result_folder, exist_ok=True)
model_result_files = ['Target Dependent.tsv', 'Target Dependent+ Hu&Liu.tsv',
                      'Target Dependent+ All.tsv']
model_result_files = [os.path.join(result_folder, result_file) for result_file in model_result_files]
C_result_files = ['Target Dependent C.json', 'Target Dependent+ Hu&Liu C.json',
                      'Target Dependent+ All C.json']
C_result_files = [os.path.join(result_folder, result_file) for result_file in C_result_files]
# Parameters for each model
std_model_parameters = {'word_vector' : [w2v, sswe], 'random_state' : 42}
hu_liu_model_parameters = {**std_model_parameters, 'sentiment_lexicon' : hu_liu_low}
all_senti_model_parameters = {**std_model_parameters, 'sentiment_lexicon' : all_three_low}
model_parameters = [std_model_parameters, hu_liu_model_parameters, all_senti_model_parameters]
# Combining parameters and result files
parameters_files = list(zip(model_parameters, model_result_files, C_result_files))

model_files = dict(zip(models, parameters_files))
model_files

{Target Dependent: ({'random_state': 42, 'word_vector': [w2v, sswe]},
  '/home/moorea/tdparse/results/Target Dependent Models/Target Dependent.tsv',
  '/home/moorea/tdparse/results/Target Dependent Models/Target Dependent C.json'),
 Target Dependent Plus: ({'random_state': 42,
   'sentiment_lexicon': <tdparse.lexicons.HuLiu at 0x7f7fa51d4da0>,
   'word_vector': [w2v, sswe]},
  '/home/moorea/tdparse/results/Target Dependent Models/Target Dependent+ Hu&Liu.tsv',
  '/home/moorea/tdparse/results/Target Dependent Models/Target Dependent+ Hu&Liu C.json'),
 Target Dependent Plus: ({'random_state': 42,
   'sentiment_lexicon': <tdparse.lexicons.Lexicon at 0x7f7fa3233128>,
   'word_vector': [w2v, sswe]},
  '/home/moorea/tdparse/results/Target Dependent Models/Target Dependent+ All.tsv',
  '/home/moorea/tdparse/results/Target Dependent Models/Target Dependent+ All C.json')}

In [14]:
import time
time_to_process = time.time()
for dataset_name, train_test in dataset_train_test.items():
    print('Processing dataset {}'.format(dataset_name))
    train, test = train_test
    for model, parameter_file_paths in model_files.items():
        print('Processing model {}'.format(model))
        parameters, result_file_path, c_file_path = parameter_file_paths
        dataset_predictions(train, test, dataset_name, model, 
                            result_file_path=result_file_path,
                            c_file_path=c_file_path,
                            re_write=False, save_raw_data=True,
                            **parameters)

time_to_process = time.time() - time_to_process

Processing dataset SemEval 14 Laptop
Processing model Target Dependent
Processing model Target Dependent Plus
Processing model Target Dependent Plus
Processing dataset SemEval 14 Restaurant
Processing model Target Dependent
Processing model Target Dependent Plus
Processing model Target Dependent Plus
Processing dataset SemEval 16 Restaurant 14 Train
Processing model Target Dependent
Processing model Target Dependent Plus
Processing model Target Dependent Plus
Processing dataset SemEval 16 Restaurant 14 All
Processing model Target Dependent
Processing model Target Dependent Plus
Processing model Target Dependent Plus
Processing dataset SemEval 16 Restaurant 15&14
Processing model Target Dependent
Processing model Target Dependent Plus
Processing model Target Dependent Plus
Processing dataset Dong Twitter
Processing model Target Dependent
Processing model Target Dependent Plus
Processing model Target Dependent Plus
Processing dataset Election Twitter
Processing model Target Dependent
Pro

Time it took to process all the datasets {{round(time_to_process / 3600, 2)}} hours

## The affect of the C value

We take a look at the affect of tunning for the C-Value. We tune the C-Value in two steps:
1. Coarse grain search over the following values: from 0.00001 to 10 going up by a factor of 10. Once the best best coarse grain value is found we fine tune it.
2. We multiple the best coarse grain value by the following value and search over these values: 0.35, 0.7, 1, 3.5, and 7.

In [15]:
c_values_target_dep_plus = C_result_files[1]
c_values_target_dep_plus = get_json_data(c_values_target_dep_plus, 'Dong Twitter')

In [16]:
def fine_tune_values(c_values):
    fine_values = []
    for c_value in c_values:
        if '35' in c_value or '7' in c_value:
            fine_values.append(float(c_value))
    fine_values = sorted(fine_values)
    best_coarse_c_value = fine_values[3] / 7
    fine_values.append(best_coarse_c_value)
    return fine_values, best_coarse_c_value
def coarse_tune_values(c_values):
    coarse_values = []
    fine_values = fine_tune_values(c_values)
    for c_value in c_values:
        c_value = float(c_value)
        if c_value not in fine_values:
            coarse_values.append(c_value)
    return coarse_values
def best_c_value(c_values):
    best = 0
    best_c = 0
    for c_value, acc in c_values.items():
        if acc > best:
            best_c = c_value
            best = acc
    return best_c

dataset_stats = defaultdict(lambda: dict())

mean_error = lambda acc_values: sum(max(acc_values) - acc_values) / (len(acc_values) - 1)
for dataset_name in dataset_train_test:
    # Target Dependent Plus using the Hu and Liu Lexicon C file
    c_values_target_dep_plus = C_result_files[1]
    c_values_target_dep_plus = get_json_data(c_values_target_dep_plus, dataset_name)
    model_results_file = model_result_files[1]
    fine_c_values, best_coarse_c_value = fine_tune_values(c_values_target_dep_plus)
    coarse_c_values = coarse_tune_values(c_values_target_dep_plus)
    all_c_values = {float(c_value) : acc for c_value, acc in c_values_target_dep_plus.items()}
    fine_acc = np.array([all_c_values[c_value] for c_value in fine_c_values])
    coarse_acc = np.array([all_c_values[c_value] for c_value in coarse_c_values])
    all_acc = np.array(list(c_values_target_dep_plus.values()))
    best_c = best_c_value(all_c_values)
    test_accuracy = get_results(model_results_file, dataset_name)['Accuracy']
    dataset_stats[dataset_name]['Fine Tune MAE (Accuracy %)'] = mean_error(fine_acc) * 100
    dataset_stats[dataset_name]['Coarse Tune MAE (Accuracy %)'] = mean_error(coarse_acc) * 100
    dataset_stats[dataset_name]['Best Fine C value'] = best_c
    dataset_stats[dataset_name]['Best Coarse C value'] = best_coarse_c_value
    dataset_stats[dataset_name]['Test score Accuracy (%)'] = test_accuracy

In [17]:
index = list(dataset_train_test)
columns = ['Fine Tune MAE (Accuracy %)', 'Coarse Tune MAE (Accuracy %)', 'Best Fine C value', 
           'Best Coarse C value', 'Dataset', 'Test score Accuracy (%)']
c_value_stats = pd.DataFrame(np.zeros((len(dataset_train_test), 6)), columns=columns)
c_value_stats['Dataset'] = index
c_value_stats = c_value_stats.set_index('Dataset')

# Add the data to the DataFrame
for dataset_name, col_value in dataset_stats.items():
    for column, value in col_value.items():
        c_value_stats[column][dataset_name] = value
c_value_stats = c_value_stats.round({'Fine Tune Accuracy std (%)' : 2, 
                                     'Coarse Tune Accuracy std (%)' : 2})
c_value_stats

Unnamed: 0_level_0,Fine Tune MAE (Accuracy %),Coarse Tune MAE (Accuracy %),Best Fine C value,Best Coarse C value,Test score Accuracy (%)
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SemEval 14 Laptop,1.491569,6.168036,0.0035,0.001,70.4
SemEval 14 Restaurant,0.506663,3.846628,0.007,0.01,76.1
SemEval 16 Restaurant 14 Train,0.506663,3.846628,0.007,0.01,81.4
SemEval 16 Restaurant 14 All,0.481787,3.924891,0.0035,0.01,82.3
SemEval 16 Restaurant 15&14,0.672119,4.092248,0.007,0.01,82.6
Dong Twitter,1.216389,5.96991,0.0035,0.01,70.7
Election Twitter,0.569032,3.763862,0.0007,0.001,54.9
Product Reviews,0.747724,3.828926,0.007,0.01,83.1
YouTuBean,1.120072,2.787734,0.001,0.001,76.2
