In [1]:
from collections import defaultdict
from typing import Callable, Dict, List, Any, Tuple
from pathlib import Path
import math
import json

from bella import tokenisers
from bella import lexicons
from bella import parsers
from bella.models.tdparse import TDParse, TDParsePlus, TDParseMinus
from bella.models.target import TargetDep, TargetDepPlus
from bella.data_types import TargetCollection, Target
from bella.models.base import SKLearnModel
from bella.word_vectors import GloveCommonCrawl
from bella.dependency_parsers import tweebo
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score

import config

Using TensorFlow backend.


In [2]:
def load_C_cross_val_predictions(datasets: List[TargetCollection],
                                 models: List[SKLearnModel],
                                 metric: str = 'accuracy') -> Dict[str, TargetCollection]:
    '''
    Returns a dictionary of `{dataset_name} {model_name} {fold_number}` as 
    keys and TargetCollection as values where the TargetCollections store the 
    test data and the predictions made for that cross validation fold on the 
    dataset for that specific model.
    '''
    collection_name_index:  Dict[str, int] = {}
    for dataset in datasets:
        dataset_name = dataset.name
        for model in models:
            model_name = model.name()
            model_dir = config.RESULTS_DIR / 'C Value' / model_name
            all_scores = []
            for fold_number in range(5):
                fold_name = f'{dataset_name} {model_name} {fold_number}'
                label_file_name = f'{dataset_name} {fold_number} labels.npy'
                label_fp = str((model_dir / label_file_name).resolve())
                test_collection = np.load(label_fp, allow_pickle=True)
                test_collection = TargetCollection([Target(**data) 
                                                    for data in test_collection])
                if metric == 'accuracy':
                    scores = test_collection.dataset_metric_scores(accuracy_score)
                else:
                    scores = test_collection.dataset_metric_scores(f1_score, average='macro')
                all_scores.append(np.expand_dims(scores, 1))
            all_scores = np.concatenate(all_scores, 1)
            mean_scores = all_scores.mean(axis=1)
            best_mean_index = np.argmax(mean_scores)
            collection_name_index[f'{dataset_name} {model_name}'] = best_mean_index
    return collection_name_index

def get_specific_params(best_param_dicts: List[Dict[str, int]],
                        param_names: List[str],
                        param_values: List[List[Any]]
                        ) -> Dict[str, Dict[str, Dict[str, Any]]]:
    '''
    Returns a dictionary that can be used as the specific_params argument in the
    default_params function. The return finds the best parameter value for a 
    model on a specific dataset and returns this as a dictionary of dataset names
    which contains a dictionary of model names which contains a dictionary of 
    parameter names and their optimal/best parameter value for the model on 
    the specific dataset.
    
    At the moment all dataset names can be only one word long or else 
    this function will fail.
    
    :param best_param_dicts: A list of the return of the best_param_index 
                             function for each parameter name
    :param param_name: The list of parameter names
    :param param_values: A List of possible parameter values for each
                         parameter name.
    :returns: A dictionary of model names which contains a dictionary of 
              parameter names and their optimal/best parameter value for 
              the model on the specific dataset.
    '''
    if (len(best_param_dicts) != len(param_names) and
        len(param_names) != len(param_values)):
        raise ValueError('The length of all three arguments must be the same')
    
    optimal_param = defaultdict(lambda: defaultdict(lambda: dict()))
    for index, best_param_dict in enumerate(best_param_dicts):
        for dataset_model_name, best_index in best_param_dict.items():
            split_names = dataset_model_name.split()
            dataset_name = split_names[0]
            model_name = ' '.join(split_names[1:])
            best_param_value = param_values[index][best_index]
            param_name = param_names[index]
            optimal_param[dataset_name][model_name][param_name] = best_param_value
    return optimal_param

def load_convert_results(result_data: Tuple[List[str], List[List[str]]],
                         mapper: Dict[str, int]) -> Tuple[np.ndarray, np.ndarray]:
    '''
    Returns a tuple of true values and predictions where the true values is
    of shape (n_sample) and the predictions are of shape (n_samples, num_runs)
    '''
    true_values, run_prediction_values = result_data
    true_values = np.array([mapper[value] for value in true_values])
    run_prediction_values = [[mapper[prediction] for prediction in predictions] 
                              for predictions in run_prediction_values]
    return true_values, np.array(run_prediction_values).T

def get_score(true_values: np.ndarray, run_predictions: np.ndarray, 
              metric, **metric_kwargs) -> np.ndarray:
    num_runs = run_predictions.shape[1]
    scores = []
    for run in range(num_runs):
        scores.append(metric(true_values, run_predictions[:, run], 
                      **metric_kwargs))
    return np.array(scores)

label_mapper = {'positive': 2, 'negative': 1, 'neutral': 3}

# Mass Evaluation of [Vo et al. 2015](https://ijcai.org/Proceedings/15/Papers/194.pdf) and [Wang et al. 2017](https://www.aclweb.org/anthology/E17-1046.pdf) models

In this notebook we train and evaluate the two best performing models from [Vo et al. 2015](https://ijcai.org/Proceedings/15/Papers/194.pdf) and [Wang et al. 2017](https://www.aclweb.org/anthology/E17-1046.pdf) which are:

1. Target Dependent (Vo)
2. Target Dependent Plus (Vo)
3. TDParse (Wang) 
4. TDParse Plus (Wang)

Of which the methods with `Plus` in their name use sentiment lexicons. Based on the development results from [./large_scale_feature_settings.ipynb](./large_scale_feature_settings.ipynb) we use the best performing `C value` for each method on each dataset. We also scale the features using MinMax scaling as that was also found to be generally significantly better on all methods and datasets. Finally for the sentiment lexicons as none of the lexicons that contain the Hu and Liu (HL) were ever significantly worse than the best performing sentiment lexicon for each method on each dataset we use a combination of all lexcions the relevant methods on all datasets. The use of all sentiment lexicons rather than just the best performing was due to the three lexcions originally coming from different types MPQA (news), HL (reviews), and NRC (Twitter/social media). The best C value was chosen based on the accuracy metric and not the F1. Further we note that for the F1 metric that using all three lexicons was significantly worse for 1 fold out of the 5 folds for two datasets but generally using all three lexicons is significantly no worse than the best performing.

The tokeniser used is the Spacy tokeniser due to it's speed and wide use in the NLP field. We use the [Tweebo dependency parser](http://www.cs.cmu.edu/~nasmith/papers/kong+schneider+swayamdipta+bhatia+dyer+smith.emnlp14.pdf) as it is the only parser that creates multiple roots for a given sentence. Finally we lower case all words and use the [GloVe 300 dimension 840 billion token word embedding](https://nlp.stanford.edu/projects/glove/).

Below we load all of the six datasets, lexicons, and word embedding:

In [3]:
# Load the sentiment lexicons
subset_cats = {'positive', 'negative'}
mpqa = lexicons.Mpqa(config.MPQA, 
                     subset_cats=subset_cats, lower=True)
hu_liu = lexicons.HuLiu(config.HL, 
                        subset_cats=subset_cats, lower=True)
nrc = lexicons.NRC(config.NRC, subset_cats=subset_cats, lower=True)
mpqa_huliu = lexicons.Lexicon.combine_lexicons(mpqa, hu_liu)
mpqa_nrc = lexicons.Lexicon.combine_lexicons(mpqa, nrc)
huliu_nrc = lexicons.Lexicon.combine_lexicons(hu_liu, nrc)
all_lexicons = lexicons.Lexicon.combine_lexicons(mpqa_huliu, nrc)

tokeniser = tokenisers.spacy_tokeniser
default_random_state = 42

# Results directorys
result_dir = config.RESULTS_DIR / 'Mass Evaluation'
result_dir.mkdir(exist_ok=True, parents=True)
small_result_dir = config.RESULTS_DIR / 'Mass Evaluation Small Dataset'
small_result_dir.mkdir(exist_ok=True, parents=True)

# Image directory
image_dir = config.IMAGES_DIR
image_dir = image_dir / 'Mass Evaluation'

# Datasets 
dong_train = parsers.dong(config.DONG_TRAIN, name='Dong Train')
dong_test = parsers.dong(config.DONG_TEST, name='Dong')
laptop_train = parsers.semeval_14(config.laptop_train, name='Laptop Train')
laptop_test = parsers.semeval_14(config.laptop_test, name='Laptop')
restaurant_train = parsers.semeval_14(config.restaurant_train, name='Restaurant Train')
restaurant_test = parsers.semeval_14(config.restaurant_test, name='Restaurant')
election_train = parsers.election_train(config.ELECTION, name='Election Train')
election_test = parsers.election_test(config.ELECTION, name='Election')
mitchell_train = parsers.semeval_14(config.mitchell_train, name='Mitchell Train')
mitchell_test = parsers.semeval_14(config.mitchell_test, name='Mitchell')
youtubean_train = parsers.semeval_14(config.youtubean_train, name='YouTuBean Train')
youtubean_test = parsers.semeval_14(config.youtubean_test, name='YouTuBean')
train_datasets = [dong_train, laptop_train, restaurant_train, election_train, mitchell_train, youtubean_train]
test_datasets = [dong_test, laptop_test, restaurant_test, election_test, mitchell_test, youtubean_test]
all_datasets = train_datasets + test_datasets

all_datasets_lower_words = [dataset.word_list(tokeniser, lower=True) 
                            for dataset in all_datasets]
all_datasets_lower_words = [word for words in all_datasets_lower_words 
                                 for word in words]
default_word_vector = GloveCommonCrawl(version=840, 
                                       filter_words=all_datasets_lower_words)

# Models
models = [TargetDep, TargetDepPlus, TDParse, TDParsePlus]
# Range of possible C values
c_params = [1] # include the default sklearn C Value
for c in range(-15, 3, 2):
    c_params.append(math.pow(2, c))
# Load data to find best C values
best_c_index = load_C_cross_val_predictions(test_datasets, models)
# Load best C for each dataset for each model
best_c_dataset_model = get_specific_params([best_c_index], ['C'], [c_params])

Loading glove 300d 840b common crawl from file


For the interested reader the best C values for the Macro F1 metric are shown below. However only the C parameters used will be those that performed best on the Accuracy metric. The C value is not used within the LSTM based methods.

In [4]:
import pandas as pd

macro_best_c_index = load_C_cross_val_predictions(test_datasets, models + [TDParseMinus], 'macro')
# Load best C for each dataset for each model
macro_best_c_dataset_model = get_specific_params([macro_best_c_index], ['C'], [c_params])
temp_macro_best_c_dataset_model = {}
for d_name, m_name_value in macro_best_c_dataset_model.items():
    d_name_dict = {}
    for m_name, value in m_name_value.items():
        d_name_dict[m_name] = value['C']
    temp_macro_best_c_dataset_model[d_name] = d_name_dict
pd.DataFrame(temp_macro_best_c_dataset_model)

Unnamed: 0,Dong,Laptop,Restaurant,Election,Mitchell,YouTuBean
Target Dependent,0.007812,0.03125,0.007812,0.125,2.0,1.0
Target Dependent Plus,0.001953,0.007812,0.007812,0.03125,1.0,1.0
TDParse,0.007812,0.03125,0.007812,0.125,0.125,2.0
TDParsePlus,0.001953,0.007812,0.007812,0.03125,2.0,2.0
TDParse Minus,0.03125,0.125,0.125,0.5,0.125,2.0


The best C values for the accuracy metric:

In [5]:
temp_best_c_index = load_C_cross_val_predictions(test_datasets, models + [TDParseMinus])
# Load best C for each dataset for each model
temp_best_c_dataset_model = get_specific_params([temp_best_c_index], ['C'], [c_params])

temp_accuracy_best_c_dataset_model = {}
for d_name, m_name_value in temp_best_c_dataset_model.items():
    d_name_dict = {}
    for m_name, value in m_name_value.items():
        d_name_dict[m_name] = value['C']
    temp_accuracy_best_c_dataset_model[d_name] = d_name_dict
pd.DataFrame(temp_accuracy_best_c_dataset_model)

Unnamed: 0,Dong,Laptop,Restaurant,Election,Mitchell,YouTuBean
Target Dependent,0.007812,0.007812,0.007812,0.007812,0.007812,0.03125
Target Dependent Plus,0.001953,0.007812,0.007812,0.03125,0.001953,0.03125
TDParse,0.007812,0.007812,0.007812,0.007812,0.007812,2.0
TDParsePlus,0.001953,0.007812,0.007812,0.007812,0.007812,0.007812
TDParse Minus,0.007812,0.125,0.03125,0.125,0.03125,0.03125


Below we train and evaluate the models across all six datasets and save the predictions. We also show the results.

In [6]:
def run_save_predictions(train: TargetCollection, test: TargetCollection, 
                         model: SKLearnModel, save_dir: Path,
                         save_name_addon: str = '') -> np.ndarray:
    '''
    Given a training and test dataset, it will run train the model and 
    make predictions on the test datasets, of wich those results will be 
    saved. The results from the test predictions will be returned. 
    If the results already exist the model will not be re-trained 
    but rather the saved predictions will be returned (caching). 
    '''
    dataset_name = train.name.split()[0]
    save_dir = Path(save_dir, model.name())
    save_dir.mkdir(parents=True, exist_ok=True)
    if save_name_addon != '':
        save_file = Path(save_dir, f'{save_name_addon} {dataset_name}.npy')
    else:
        save_file = Path(save_dir, f'{dataset_name}.npy')
    if save_file.exists():
        return np.load(save_file)
    print(save_file)
    
    X_train = train.data_dict()
    X_test = test.data_dict()
    y_train = train.sentiment_data()
    
    model.fit(X_train, y_train)
    raw_predictions = model.predict(X_test)
    np.save(save_file, raw_predictions)
    return raw_predictions

#results_datasets = []
#results_models = []
#results_accuracy = []
#results_f1 = []
for train_dataset, test_dataset in zip(train_datasets, test_datasets):
    dataset_name = test_dataset.name
    #print(dataset_name)
    true_values = test_dataset.sentiment_data()
    for model in models:
        model_name = model.name()
        best_c = best_c_dataset_model[dataset_name][model_name]['C']
        a_model_params = {}
        if 'TDParse' in model_name:
            a_model_params['parser'] = tweebo
        if 'Plus' in model_name:
            a_model_params['senti_lexicon'] = all_lexicons
        a_model_params['word_vectors'] = [default_word_vector]
        a_model_params['tokeniser'] = tokeniser
        a_model_params['scale'] = MinMaxScaler()
        a_model_params['C'] = best_c
        a_model_params['lower'] = True
        a_model_params['random_state'] = default_random_state
        a_model = model(**a_model_params)
        predictions = run_save_predictions(train_dataset, test_dataset, 
                                           a_model, result_dir)
        #accuracy = accuracy_score(true_values, predictions)
        #results_accuracy.append(accuracy)
        #f1 = f1_score(true_values, predictions, average='macro')
        #results_f1.append(f1)
        #results_datasets.append(dataset_name)
        #results_models.append(model_name)
# Load the LSTM methods
#dataset_names = ['YouTuBean', 'Dong', 'Election', 'Laptop', 'Mitchell', 'Restaurant']
#for dataset_name in dataset_names:
#    for model_name in ['LSTM', 'TDLSTM', 'TCLSTM']:
#        lstm_result_dir = config.RESULTS_DIR / 'Mass Evaluation' / 'patience 10' / f"{model_name}"
#        test_result_fp = lstm_result_dir / f'{dataset_name} test.json'
#        with test_result_fp.open('r') as result_file:
#            a_result = json.load(result_file)
#            a_result = load_convert_results(a_result, label_mapper)
#            acc_scores = get_score(a_result[0], a_result[1], accuracy_score)
#            f1_scores = get_score(a_result[0], a_result[1], f1_score, average='macro')
#            for i in range(len(acc_scores)):
#                results_datasets.append(dataset_name)
#                results_models.append(model_name)
#                results_accuracy.append(acc_scores[i])
#                results_f1.append(f1_scores[i])
#results_df = {'F1': results_f1, 'Accuracy': results_accuracy, 
#              'Dataset': results_datasets, 'Model': results_models}
#results_df = pd.DataFrame(results_df)
#pd.pivot_table(data=results_df, values=['F1', 'Accuracy'], index='Dataset', columns='Model')

0.4578557014465332


In the cell below creates the prediction for all datasets trained on the much smaller sized training dataset. This smaller sized training set is the same size as the YouTuBean training dataset size.

In [9]:
dong_small_train = parsers.semeval_14(config.small_training_dataset_dir / 'Dong train.xml', 
                                      name='Dong Train')
laptop_small_train = parsers.semeval_14(config.small_training_dataset_dir / 'Laptop train.xml', 
                                        name='Laptop Train')
restaurant_small_train = parsers.semeval_14(config.small_training_dataset_dir / 'Restaurant train.xml', 
                                            name='Restaurant Train')
election_small_train = parsers.semeval_14(config.small_training_dataset_dir / 'Election train.xml', 
                                          name='Election Train')
mitchell_small_train = parsers.semeval_14(config.small_training_dataset_dir / 'Mitchell train.xml', 
                                          name='Mitchell Train')
small_training_datasets = [dong_small_train, laptop_small_train, restaurant_small_train, 
                           election_small_train, mitchell_small_train]
small_test_datasets = [dong_test, laptop_test, restaurant_test, election_test, mitchell_test]

for train_dataset, test_dataset in zip(small_training_datasets, small_test_datasets):
    dataset_name = test_dataset.name
    print(dataset_name)
    true_values = test_dataset.sentiment_data()
    for model in models:
        model_name = model.name()
        best_c = best_c_dataset_model[dataset_name][model_name]['C']
        a_model_params = {}
        if 'TDParse' in model_name:
            a_model_params['parser'] = tweebo
        if 'Plus' in model_name:
            a_model_params['senti_lexicon'] = all_lexicons
        a_model_params['word_vectors'] = [default_word_vector]
        a_model_params['tokeniser'] = tokeniser
        a_model_params['scale'] = MinMaxScaler()
        a_model_params['C'] = best_c
        a_model_params['lower'] = True
        a_model_params['random_state'] = default_random_state
        a_model = model(**a_model_params)
        predictions = run_save_predictions(train_dataset, test_dataset, 
                                           a_model, small_result_dir)

Dong
Laptop
Restaurant
Election
Mitchell
0.5857939720153809


In [38]:
def mean_std(data: pd.Series) -> str:
   to_percentage = data * 100
   return f'{np.mean(to_percentage):.2f} ({np.std(to_percentage):.2f})'
pd.pivot_table(data=results_df, values=['Accuracy'], index='Dataset', columns='Model', aggfunc=mean_std).T

Unnamed: 0_level_0,Dataset,Dong,Election,Laptop,Mitchell,Restaurant,YouTuBean
Unnamed: 0_level_1,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Accuracy,LSTM,64.57 (0.99),47.85 (0.54),59.90 (2.94),71.04 (0.60),68.63 (3.67),63.33 (0.00)
Accuracy,TCLSTM,68.98 (0.58),57.40 (0.22),56.77 (3.34),70.77 (0.37),71.85 (0.71),66.81 (1.19)
Accuracy,TDLSTM,71.12 (0.50),57.50 (0.80),61.76 (0.66),70.52 (0.60),73.56 (0.47),64.17 (0.96)
Accuracy,TDParse,67.77 (0.00),57.46 (0.00),67.08 (0.00),73.96 (0.00),77.95 (0.00),79.58 (0.00)
Accuracy,TDParsePlus,69.36 (0.00),56.12 (0.00),68.50 (0.00),73.35 (0.00),78.30 (0.00),83.33 (0.00)
Accuracy,Target Dependent,68.50 (0.00),57.22 (0.00),66.14 (0.00),73.45 (0.00),77.32 (0.00),82.50 (0.00)
Accuracy,Target Dependent Plus,70.23 (0.00),53.21 (0.00),68.97 (0.00),74.37 (0.00),78.04 (0.00),81.67 (0.00)


In [34]:
def mean_std(data: pd.Series) -> str:
   to_percentage = data * 100
   return f'{np.mean(to_percentage):.2f} ({np.std(to_percentage):.2f})'
pd.pivot_table(data=results_df, values=['Accuracy'], index='Dataset', columns='Model', aggfunc=mean_std).T

Unnamed: 0_level_0,Dataset,Dong,Election,Laptop,Mitchell,Restaurant,YouTuBean
Unnamed: 0_level_1,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Accuracy,LSTM,60.79 (4.99),47.84 (0.32),53.19 (0.61),70.13 (0.04),66.10 (2.46),63.33 (0.00)
Accuracy,TCLSTM,66.71 (2.87),56.34 (0.55),52.30 (0.75),70.21 (0.14),66.65 (2.08),63.19 (0.39)
Accuracy,TDLSTM,69.77 (1.67),56.31 (0.56),54.36 (0.57),70.11 (0.00),68.04 (3.06),63.33 (0.00)
Accuracy,TDParse,67.77 (0.00),57.46 (0.00),67.08 (0.00),73.96 (0.00),77.95 (0.00),79.58 (0.00)
Accuracy,TDParsePlus,69.36 (0.00),56.12 (0.00),68.50 (0.00),73.35 (0.00),78.30 (0.00),83.33 (0.00)
Accuracy,Target Dependent,68.50 (0.00),57.22 (0.00),66.14 (0.00),73.45 (0.00),77.32 (0.00),82.50 (0.00)
Accuracy,Target Dependent Plus,70.23 (0.00),53.21 (0.00),68.97 (0.00),74.37 (0.00),78.04 (0.00),81.67 (0.00)


In [39]:
pd.pivot_table(data=results_df, values=['F1'], index='Dataset', columns='Model', aggfunc=mean_std).T

Unnamed: 0_level_0,Dataset,Dong,Election,Laptop,Mitchell,Restaurant,YouTuBean
Unnamed: 0_level_1,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F1,LSTM,61.58 (1.29),30.65 (1.09),41.91 (5.54),35.51 (4.16),37.15 (10.98),25.85 (0.00)
F1,TCLSTM,65.92 (0.97),43.57 (0.71),45.04 (5.15),38.42 (6.70),53.54 (1.79),36.83 (3.05)
F1,TDLSTM,68.54 (0.60),42.54 (2.25),49.82 (1.61),29.67 (3.30),56.66 (1.13),28.98 (3.05)
F1,TDParse,64.33 (0.00),46.64 (0.00),59.23 (0.00),48.58 (0.00),64.66 (0.00),70.05 (0.00)
F1,TDParsePlus,66.36 (0.00),46.30 (0.00),61.89 (0.00),51.17 (0.00),65.26 (0.00),74.46 (0.00)
F1,Target Dependent,65.27 (0.00),46.60 (0.00),57.86 (0.00),48.98 (0.00),63.17 (0.00),74.80 (0.00)
F1,Target Dependent Plus,67.36 (0.00),44.52 (0.00),62.33 (0.00),48.08 (0.00),64.44 (0.00),72.90 (0.00)


In [31]:
pd.pivot_table(data=results_df, values=['F1'], index='Dataset', columns='Model', aggfunc=mean_std).T

Unnamed: 0_level_0,Dataset,Dong,Election,Laptop,Mitchell,Restaurant,YouTuBean
Unnamed: 0_level_1,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F1,LSTM,52.99 (13.04),28.97 (1.81),26.03 (4.08),27.63 (0.35),29.65 (7.57),25.85 (0.00)
F1,TCLSTM,62.87 (4.34),40.25 (0.84),37.17 (1.00),27.80 (0.46),34.44 (9.09),27.43 (1.23)
F1,TDLSTM,66.71 (2.23),39.93 (0.63),37.46 (1.00),27.48 (0.00),38.17 (11.10),25.85 (0.00)
F1,TDParse,64.33 (0.00),46.64 (0.00),59.23 (0.00),48.58 (0.00),64.66 (0.00),70.05 (0.00)
F1,TDParsePlus,66.36 (0.00),46.30 (0.00),61.89 (0.00),51.17 (0.00),65.26 (0.00),74.46 (0.00)
F1,Target Dependent,65.27 (0.00),46.60 (0.00),57.86 (0.00),48.98 (0.00),63.17 (0.00),74.80 (0.00)
F1,Target Dependent Plus,67.36 (0.00),44.52 (0.00),62.33 (0.00),48.08 (0.00),64.44 (0.00),72.90 (0.00)


Below we show the C-Values that were used for each method for each dataset. This can be seen in [./large_scale_feature_settings.ipynb](./large_scale_feature_settings.ipynb) notebook through the graphs, but the table below is easier to read:  

In [12]:
import pandas as pd

c_dataset_names = []
c_model_names = []
c_model_dataset_values = []
for c_dataset_name, model_c_value in best_c_dataset_model.items():
    for model_c, values in model_c_value.items():
        value = values['C']
        c_dataset_names.append(c_dataset_name)
        c_model_names.append(model_c)
        c_model_dataset_values.append(value)
c_values_df = {'Dataset': c_dataset_names, 'Model': c_model_names, 
               'C': c_model_dataset_values}
c_values_df = pd.DataFrame(c_values_df)
pd.pivot_table(data=c_values_df, values='C', index='Dataset', columns='Model')

Model,TDParse,TDParsePlus,Target Dependent,Target Dependent Plus
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dong,0.007812,0.001953,0.007812,0.001953
Election,0.007812,0.007812,0.007812,0.03125
Laptop,0.007812,0.007812,0.007812,0.007812
Mitchell,0.007812,0.007812,0.007812,0.001953
Restaurant,0.007812,0.007812,0.007812,0.007812
YouTuBean,2.0,0.007812,0.03125,0.03125
