In [1]:
import warnings


def warn(*args, **kwargs):
    """Stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Route every call made through ``warnings.warn`` to the no-op defined above.
warnings.warn = warn
import os
import sys

sys.path.append(os.path.abspath(os.pardir))

from collections import defaultdict

import pandas as pd
import numpy as np

# Helper functions
from tdparse.notebook_helper import write_json_data
# Models
from tdparse.models.tdparse import TDParse, TDParsePlus
# Word Vector methods
from tdparse.word_vectors import GensimVectors
from tdparse.word_vectors import PreTrained
# Dependency Parser
from tdparse.dependency_parsers import tweebo, stanford
# Sentiment lexicons
from tdparse import lexicons
# Get the data
from tdparse.parsers import semeval_14, semeval_15_16, dong, election
from tdparse.data_types import TargetCollection
from tdparse.helper import read_config, full_path
# Evaluation methods
from tdparse.evaluation import evaluation_results, scores, get_results, \
                               save_results, combine_results, get_raw_data

In [2]:
# Load all of the datasets
def _load_dataset(parser, config_key, **parser_kwargs):
    """Run `parser` on ``full_path(read_config(config_key))``.

    Any extra keyword arguments are forwarded to `parser` unchanged.
    """
    return parser(full_path(read_config(config_key)), **parser_kwargs)


youtubean_train = _load_dataset(semeval_14, 'youtubean_train')
youtubean_test = _load_dataset(semeval_14, 'youtubean_test')
semeval_14_rest_train = _load_dataset(semeval_14, 'semeval_2014_rest_train')
semeval_14_lap_train = _load_dataset(semeval_14, 'semeval_2014_lap_train')
semeval_14_rest_test = _load_dataset(semeval_14, 'semeval_2014_rest_test')
semeval_14_lap_test = _load_dataset(semeval_14, 'semeval_2014_lap_test')
semeval_15_rest_test = _load_dataset(semeval_15_16, 'semeval_2015_rest_test')
semeval_16_rest_test = _load_dataset(semeval_15_16, 'semeval_2016_rest_test',
                                     sep_16_from_15=True)
dong_train = _load_dataset(dong, 'dong_twit_train_data')
dong_test = _load_dataset(dong, 'dong_twit_test_data')
# The election parser returns the train and test collections together.
election_train, election_test = _load_dataset(election, 'election_folder_dir')
product_reviews_train = _load_dataset(semeval_14, 'product_train')
product_reviews_test = _load_dataset(semeval_14, 'product_test')

# SemEval 14 restaurant train + test merged into one collection
semeval_14_rest_all = TargetCollection.combine_collections(
    semeval_14_rest_train, semeval_14_rest_test)
# The merged SemEval 14 restaurant data plus the SemEval 15 test data
semeval_14_15 = TargetCollection.combine_collections(
    semeval_14_rest_all, semeval_15_rest_test)

# Dataset name -> (train collection, test collection).
# Insertion order is the order in which the datasets are later processed.
train_test = {}
train_test['SemEval 14 Laptop'] = (semeval_14_lap_train, semeval_14_lap_test)
train_test['SemEval 14 Restaurant'] = (semeval_14_rest_train, semeval_14_rest_test)
train_test['SemEval 16 Restaurant 14 Train'] = (semeval_14_rest_train, semeval_16_rest_test)
train_test['SemEval 16 Restaurant 14 All'] = (semeval_14_rest_all, semeval_16_rest_test)
train_test['SemEval 16 Restaurant 15&14'] = (semeval_14_15, semeval_16_rest_test)
train_test['Dong Twitter'] = (dong_train, dong_test)
train_test['Election Twitter'] = (election_train, election_test)
train_test['Product Reviews'] = (product_reviews_train, product_reviews_test)
train_test['YouTuBean'] = (youtubean_train, youtubean_test)

In [None]:
# Get word vectors: resolve both file paths first, then load the vectors
w2v_path = full_path(read_config('word2vec_files')['vo_zhang'])
sswe_path = full_path(read_config('sswe_files')['vo_zhang'])
w2v = GensimVectors(w2v_path, None, model='word2vec', name='w2v')
sswe = PreTrained(sswe_path, name='sswe')

# Load the sentiment lexicons, keeping only the words associated with the
# Positive or Negative class. Every lexicon is built with the same options.
subset_cats = {'positive', 'negative'}
lexicon_options = dict(subset_cats=subset_cats, lower=True)
mpqa_low = lexicons.Mpqa(**lexicon_options)
nrc_low = lexicons.NRC(**lexicon_options)
hu_liu_low = lexicons.HuLiu(**lexicon_options)
# Combine pairwise: MPQA + Hu & Liu first, then add NRC
mpqa_huliu_low = lexicons.Lexicon.combine_lexicons(mpqa_low, hu_liu_low)
all_three_low = lexicons.Lexicon.combine_lexicons(mpqa_huliu_low, nrc_low)

In [None]:
from sklearn.metrics import f1_score
# Exceptions raised while handling a dataset are collected here so that one
# failing dataset does not stop the loop.
got_wrong = []
# (dataset name, macro F1 score) for every dataset that ran to completion.
res = []

for data_name, traintest in train_test.items():
    try:
        train, test = traintest
        tdparse = TDParse()
        std_model_parameters = dict(word_vector=[w2v, sswe], random_state=42,
                                    parser=stanford, C=0.007, scale=True)
        params = tdparse.get_params(**std_model_parameters)
        tdparse.fit(train.data(), train.sentiment_data(), params=params)
        predicted_values = tdparse.predict(test.data())
        score = f1_score(test.sentiment_data(), predicted_values,
                         average='macro')
    except Exception as e:
        got_wrong.append(e)
    else:
        # Only reached when every step above succeeded.
        res.append((data_name, score))

In [6]:
res

[('Dong Twitter', 0.65976246243958037)]

In [5]:
got_wrong

[ValueError("The number of identified targets `[]` not equal to the number of targets in the data `Target({'spans': [(63, 74)], 'target_id': 'laptop_train19280', 'target': 'motherboard', 'text': 'After about a week I finally got it back and was told that the motherboard had failed and so they installed a new motherboard.', 'sentiment': -1, 'sentence_id': 'laptop_train1928'})` norm target $motherboard$"),
 ValueError("The number of identified targets `[]` not equal to the number of targets in the data `Target({'spans': [(4, 9)], 'target_id': 'restaurants_train33590', 'target': 'pizza', 'text': 'The pizza is the best if you like thin crusted pizza.', 'sentiment': 1, 'sentence_id': 'restaurants_train3359'})` norm target $pizza$"),
 ValueError("The number of identified targets `[]` not equal to the number of targets in the data `Target({'spans': [(4, 9)], 'target_id': 'restaurants_train33590', 'target': 'pizza', 'text': 'The pizza is the best if you like thin crusted pizza.', 'sentiment': 

In [4]:
def dataset_predictions(train, test, name, model, word_vector, random_state,
                        sentiment_lexicon=None, result_file_path=None,
                        c_file_path=None, re_write=True, save_raw_data=True,
                        cv=5, n_jobs=7):
    """Tune C for `model` on `train`, refit with the best C and evaluate on `test`.

    Parameters
    ----------
    train, test
        Collections exposing ``data()`` and ``sentiment_data()``
        (presumably ``TargetCollection`` instances -- confirm against callers).
    name
        Dataset identifier; passed to ``get_results``, ``get_raw_data``,
        ``write_json_data`` and ``evaluation_results``.
    model
        Object providing ``find_best_c``, ``get_params``, ``fit`` and
        ``predict``.
    word_vector
        Used as the single candidate under ``'word_vectors'`` in the C search
        grid and as the ``word_vector`` parameter of the final model.
    random_state
        Forwarded to both the C search and the final model parameters.
    sentiment_lexicon
        Optional. When given it is added to the C search grid
        (``'senti_lexicons'``) and to the final model parameters
        (``'senti_lexicon'``).
    result_file_path
        Optional. When given, results are looked up in / saved to this file.
    c_file_path
        Optional. When given, the C scores from the search are written there
        with ``write_json_data``.
    re_write
        When False and `result_file_path` is given, previously stored results
        are returned instead of recomputing. Also forwarded to
        ``evaluation_results``.
    save_raw_data
        When True, stored results are only reused if
        ``get_raw_data(result_file_path, name, test)`` is truthy. Also
        forwarded to ``evaluation_results``.
    cv, n_jobs
        Forwarded to ``model.find_best_c``. The defaults (5 and 7) are the
        values that used to be hard-coded, so existing callers are unaffected;
        lower `n_jobs` on machines with fewer cores.

    Returns
    -------
    The stored results from ``get_results`` when they are reused, otherwise
    whatever ``evaluation_results`` returns for the new predictions.
    """
    # Reuse results from an earlier run when the caller allows it.
    if not re_write and result_file_path is not None:
        results_df = get_results(result_file_path, name)
        if results_df is not None:
            # The raw-data check is only made when raw data is requested.
            if not save_raw_data or get_raw_data(result_file_path, name, test):
                return results_df

    # Loading the data
    data_train = train.data()
    y_train = train.sentiment_data()
    data_test = test.data()
    y_test = test.sentiment_data()

    # Finding the best C value for the model on this dataset. The tweebo
    # parser is used for both the search and the final fit.
    c_grid_params = {'word_vectors' : [word_vector],
                     'random_state' : random_state,
                     'parsers' : [tweebo]}
    if sentiment_lexicon is not None:
        c_grid_params['senti_lexicons'] = [sentiment_lexicon]
    best_c, c_scores = model.find_best_c(data_train, y_train,
                                         grid_params=c_grid_params,
                                         cv=cv, n_jobs=n_jobs)
    if c_file_path is not None:
        write_json_data(c_file_path, name, c_scores)
    if sentiment_lexicon is not None:
        print('The best C value for {} model with sentiment lexicon {}: {}'\
              .format(model, sentiment_lexicon, best_c))
    else:
        print('The best C value for {} model: {}'.format(model, best_c))

    # Fitting and getting predictions from the model.
    parameters = {'word_vector' : word_vector, 'random_state' : random_state,
                  'C' : best_c, 'parser' : tweebo}
    if sentiment_lexicon is not None:
        parameters['senti_lexicon'] = sentiment_lexicon
    best_params = model.get_params(**parameters)
    model.fit(data_train, y_train, params=best_params)
    predicted_values = model.predict(data_test)

    # Return the results. The file-related keyword arguments are only passed
    # when a result file was requested, so ``evaluation_results`` falls back to
    # its own defaults otherwise.
    if result_file_path is not None:
        return evaluation_results(predicted_values, test, name,
                                  file_name=result_file_path,
                                  save_raw_data=save_raw_data,
                                  re_write=re_write)
    return evaluation_results(predicted_values, test, name)
   

In [5]:
# One instance of each model; `models` fixes the order in which they are
# paired with their parameters and result files.
tdparse, tdparse_plus = TDParse(), TDParsePlus()
models = [tdparse, tdparse_plus]

In [6]:
# Folder that holds one results file and one C-scores file per model
result_folder = os.path.abspath(os.path.join(os.getcwd(), os.pardir,
                                             'results', 'TDParse Models'))
os.makedirs(result_folder, exist_ok=True)
model_result_files = [os.path.join(result_folder, file_name)
                      for file_name in ('TDParse.tsv', 'TDParsePlus.tsv')]
C_result_files = [os.path.join(result_folder, file_name)
                  for file_name in ('TDParse C.json', 'TDParsePlus C.json')]

# Parameters for each model: the lexicon model uses the standard parameters
# plus the combined sentiment lexicon.
std_model_parameters = dict(word_vector=[w2v, sswe], random_state=42)
senti_model_parameters = dict(std_model_parameters,
                              sentiment_lexicon=all_three_low)
model_parameters = [std_model_parameters, senti_model_parameters]

# (parameters, results file, C-scores file) per model, in the order of `models`
parameters_files = list(zip(model_parameters, model_result_files,
                            C_result_files))

# From here on `model_result_files` maps each model to its triple above.
model_result_files = {model: files
                      for model, files in zip(models, parameters_files)}
model_result_files

{TDParse: ({'random_state': 42, 'word_vector': [w2v, sswe]},
  '/home/moorea/tdparse/results/TDParse Models/TDParse.tsv',
  '/home/moorea/tdparse/results/TDParse Models/TDParse C.json'),
 TDParse Plus: ({'random_state': 42,
   'sentiment_lexicon': <tdparse.lexicons.Lexicon at 0x7f35f4faf160>,
   'word_vector': [w2v, sswe]},
  '/home/moorea/tdparse/results/TDParse Models/TDParsePlus.tsv',
  '/home/moorea/tdparse/results/TDParse Models/TDParsePlus C.json')}

In [7]:
import time
# Run every model on every dataset and record how long the whole run takes.
start_time = time.time()
# The (train, test) pair is unpacked straight into its own names. Using
# `train_test` as the loop target would rebind the `train_test` dictionary to
# a tuple, so this cell (and any other cell calling `train_test.items()`)
# would fail when run again.
for dataset_name, (train, test) in train_test.items():
    print('Processing dataset {}'.format(dataset_name))
    for model, (parameters, result_file_path, c_file_path) in model_result_files.items():
        print('Processing model {}'.format(model))
        # re_write=False lets dataset_predictions return results that are
        # already stored in the result file instead of recomputing them.
        dataset_predictions(train, test, dataset_name, model,
                            result_file_path=result_file_path,
                            c_file_path=c_file_path,
                            re_write=False, save_raw_data=True,
                            **parameters)

# Elapsed wall-clock time in seconds
time_to_process = time.time() - start_time

Processing dataset SemEval 14 Laptop
Processing model TDParse
Processing model TDParse Plus
Processing dataset SemEval 14 Restaurant
Processing model TDParse
Processing model TDParse Plus
Processing dataset SemEval 16 Restaurant 14 Train
Processing model TDParse
Processing model TDParse Plus
Processing dataset SemEval 16 Restaurant 14 All
Processing model TDParse
Processing model TDParse Plus
Processing dataset SemEval 16 Restaurant 15&14
Processing model TDParse
Processing model TDParse Plus
Processing dataset Dong Twitter
Processing model TDParse
Processing model TDParse Plus
Processing dataset Election Twitter
Processing model TDParse
Processing model TDParse Plus
Processing dataset Product Reviews
Processing model TDParse
Processing model TDParse Plus
Processing dataset YouTuBean
Processing model TDParse
Processing model TDParse Plus
The best C value for TDParse Plus model with sentiment lexicon Mpqa HuLiu NRC: 0.01
in here /home/moorea/tdparse/results/TDParse Models/TDParsePlus.ts

  ret = ret.dtype.type(ret / rcount)


Time taken to process all the datasets: {{round(time_to_process / 3600, 2)}} hours

In [8]:
time_to_process/ 3600

0.45778418130344817