<a href="https://colab.research.google.com/github/apmoore1/target-extraction/blob/master/tutorials/TDSA_Error_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%%capture
!pip install git+git://github.com/apmoore1/target-extraction.git@master#egg=target-extraction
!pip install altair

In [2]:
from collections import defaultdict
from typing import List, Tuple, Callable

import altair as alt
import pandas as pd
from target_extraction.dataset_parsers import semeval_2014, wang_2017_election_twitter_test, wang_2017_election_twitter_train
from target_extraction.data_types import TargetTextCollection
from target_extraction.error_analysis import n_shot_targets, count_error_key_occurrence

# Select Altair's 'colab' renderer for every chart displayed in this notebook.
alt.renderers.enable('colab')

RendererRegistry.enable('colab')

In [0]:
def num_targets_per_samples(train_test_name_datasets: List[Tuple[TargetTextCollection,TargetTextCollection,str]]
                            ) -> pd.DataFrame:
  '''
  :param train_test_name_datasets: A list of tuples containing
                                   (train dataset, test dataset, dataset name).
                                   Only the test dataset and the name are used.
  :returns: A DataFrame with one row per (dataset, number of targets in a
            text) containing the columns: 1. `Dataset`, 2. `Targets` the
            number of targets within a text, 3. `Samples` the number of
            targets that come from texts containing that many targets,
            4. `Samples Norm` the `Samples` value as a percentage of all
            targets in the test dataset.
  '''
  frame_columns = {'Dataset': [], 'Targets': [], 'Samples': [],
                   'Samples Norm': []}
  for _, test_collection, dataset_name in train_test_name_datasets:
    test_collection = test_collection.samples_with_targets()
    overall_target_count = test_collection.number_targets()
    # Maps the number of targets in a text to the total number of targets
    # (samples) contributed by all texts of that size.
    samples_by_text_size = {}
    for target_text in test_collection.values():
      text_size = len(target_text['targets'])
      previous_total = samples_by_text_size.get(text_size, 0)
      samples_by_text_size[text_size] = previous_total + text_size
    for text_size, size_total in samples_by_text_size.items():
      frame_columns['Dataset'].append(dataset_name)
      frame_columns['Targets'].append(text_size)
      frame_columns['Samples'].append(size_total)
      frame_columns['Samples Norm'].append((size_total/overall_target_count) * 100)
  return pd.DataFrame(frame_columns)

def subset_dist_chart(subset_frame: pd.DataFrame, title: str,
                      no_labels: bool = False) -> alt.Chart:
  '''
  :param subset_frame: A DataFrame containing the following columns: 
                       1. Data Subset, 2. Number of Samples (%),
                       3. Number Samples, 4. Dataset
  :param title: A title to give to the generated plot
  :param no_labels: If True the Y-axis is drawn without a title and without 
                    tick marks (the tick labels are kept). If False the 
                    default Y-axis is used.
  :returns: A plot that displays for each of the datasets the percentage of the 
            subsets in that dataset as stacked coloured columns.
  '''
  # The return annotation is the top-level `alt.Chart` rather than a class 
  # inside a versioned sub-package (`alt.vegalite.v3`), as annotations are 
  # evaluated when the function is defined and the versioned path does not 
  # exist in every Altair release.
  percentage_scale = alt.Scale(domain=[0, 100])
  if no_labels:
    y_axis = alt.Axis(title=None, labels=True, ticks=False)
    y_col = alt.Y('Number of Samples (%):Q', axis=y_axis,
                  scale=percentage_scale)
  else:
    y_col = alt.Y('Number of Samples (%)', scale=percentage_scale)

  subset_color = alt.Color('Data Subset:O', 
                           scale=alt.Scale(scheme='redyellowblue'))
  tooltip = ['Number Samples', 'Data Subset',
             'Number of Samples (%)']
  chart = alt.Chart(subset_frame).mark_bar()\
                                 .encode(x='Dataset', y=y_col, 
                                         color=subset_color, tooltip=tooltip)\
                                 .properties(width=100)
  chart.title = title 
  return chart

def get_subset_data(train_test_name_datasets: List[Tuple[TargetTextCollection,TargetTextCollection,str]],
                    subset_functions: List[Callable[[TargetTextCollection, TargetTextCollection, bool], TargetTextCollection]],
                    lower_target: bool) -> pd.DataFrame:
  '''
  :param train_test_name_datasets: A list of tuples containing
                                   (train dataset, test dataset, dataset name).
  :param subset_functions: The functions to apply to each test dataset. Each 
                           is called as `func(test, train, lower_target)` and 
                           the collection it returns is given to the next 
                           function. The name of the function (`__name__`) 
                           is the error key that is counted in the returned 
                           collection.
  :param lower_target: Passed as the third argument to each of the 
                       `subset_functions`.
  :returns: A DataFrame with one row per (dataset, subset function) 
            containing the columns: 1. `Dataset`, 2. `Number Samples`, 
            3. `Number of Samples (%)`, 4. `Data Subset` the function name 
            with underscores replaced by spaces and each word capitalised.
  '''
  data_subset_names = []
  number_samples = []
  number_samples_percentage = []
  dataset_names = []

  for train, test, name in train_test_name_datasets:
    # The size is taken before any of the subset functions are applied.
    test_size = test.number_targets()
    for subset_func in subset_functions:
      test = subset_func(test, train, lower_target)
      error_key = subset_func.__name__
      num_in_subset = count_error_key_occurrence(test, error_key)
      percentage_subset = (num_in_subset / test_size) * 100

      display_name = ' '.join(word.capitalize() 
                              for word in error_key.split('_'))
      data_subset_names.append(display_name)
      number_samples.append(num_in_subset)
      number_samples_percentage.append(percentage_subset)
      dataset_names.append(name)
  return pd.DataFrame({'Dataset': dataset_names, 
                       'Number Samples': number_samples, 
                       'Number of Samples (%)': number_samples_percentage, 
                       'Data Subset': data_subset_names})

def get_error_data(dataset_name: List[Tuple[TargetTextCollection,str]],
                   error_keys: List[str], error_func, **error_kwargs
                   ) -> pd.DataFrame:
  '''
  :param dataset_name: A list of tuples containing (dataset, dataset name).
  :param error_keys: The error keys to count within each dataset after 
                     `error_func` has been applied to it.
  :param error_func: Called as `error_func(dataset, **error_kwargs)`, the 
                     collection it returns is the one the `error_keys` are 
                     counted in.
  :param error_kwargs: Keyword arguments to give to `error_func`.
  :returns: A DataFrame with one row per (dataset, error key) containing the 
            columns: 1. `Dataset`, 2. `Number Samples`, 
            3. `Number of Samples (%)`, 4. `Data Subset` the error key.
  '''
  frame_columns = {'Dataset': [], 'Number Samples': [],
                   'Number of Samples (%)': [], 'Data Subset': []}

  for collection, collection_name in dataset_name:
    # The size is taken before `error_func` is applied.
    total_targets = collection.number_targets()
    collection = error_func(collection, **error_kwargs)
    for error_key in error_keys:
      key_count = count_error_key_occurrence(collection, error_key)
      key_percentage = (key_count / total_targets) * 100

      frame_columns['Dataset'].append(collection_name)
      frame_columns['Number Samples'].append(key_count)
      frame_columns['Number of Samples (%)'].append(key_percentage)
      frame_columns['Data Subset'].append(error_key)
  return pd.DataFrame(frame_columns)

def get_target_distribution_data(sample_distribution: bool, 
                                 dataset_name: List[Tuple[TargetTextCollection, str]]
                                 ) -> pd.DataFrame:
  '''
  :param sample_distribution: If true will return the Target count with respect
                              to the dataset sample distribution. Else it will 
                              return the Target Count with respect to the 
                              target distribution
  :param dataset_name: A list of tuples containing (TargetTextCollection, 
                       dataset name) where each datasets statistics will be 
                       created and returned together in the Pandas DataFrame.
  :return: Pandas DataFrame containing statistics to create graphs that can 
           plot both the sample and target distribution with respect to Target 
           count. The rows are sorted by (`Target Count`, `Dataset`) and the 
           columns are: 1. `Target Count`, 2. `Dataset`, 
           3. `Number Targets`/`Number Samples`, 4. `index` the row number, 
           5. `Targets (%)`/`Samples (%)` the percentage within the dataset,
           6. `Targets Cumulative (%)`/`Samples Cumulative (%)` the running 
           total of the percentage within the dataset rounded to 3 decimal 
           places.
  '''
  distribution_name = 'Samples' if sample_distribution else 'Targets'
  column_name = f'Number {distribution_name}'
  column_percentage = f'{distribution_name} (%)'
  column_cumulative = f'{distribution_name} Cumulative (%)'

  target_counts = []
  dataset = []
  for target_dataset, name in dataset_name:
    counts = list(target_dataset.target_count(lower=True).values())
    target_counts.extend(counts)
    dataset.extend([name] * len(counts))

  target_count_df = pd.DataFrame({'Target Count': target_counts,
                                  'Dataset': dataset})
  # Number of distinct targets that occur `Target Count` times in a dataset.
  target_count_df = target_count_df.groupby(['Target Count', 'Dataset'])\
                                   .size()\
                                   .reset_index(name=column_name)
  if sample_distribution:
    # Each of those targets accounts for `Target Count` samples.
    target_count_df[column_name] = (target_count_df[column_name] 
                                    * target_count_df['Target Count'])
  target_count_df['index'] = target_count_df.index

  # `transform` returns the dataset total aligned to every row, so the 
  # percentage never depends on how two differently indexed Series would be 
  # joined and re-ordered.
  dataset_totals = target_count_df.groupby('Dataset')[column_name]\
                                  .transform('sum')
  target_count_df[column_percentage] = (target_count_df[column_name] 
                                        / dataset_totals) * 100

  cumulative = target_count_df.groupby('Dataset')[column_percentage].cumsum()
  target_count_df[column_cumulative] = cumulative.round(3)
  return target_count_df

def n_shot_time_series_df(train_test_dataset: List[Tuple[TargetTextCollection, TargetTextCollection, str]],
                          norm: bool = False) -> pd.DataFrame:
  '''
  :param train_test_dataset: A list of Tuples containing the following: 
                             1. The train dataset, 2. Test dataset, 
                             3. Name of the dataset e.g. Laptop
  :param norm: Whether or not the number of samples should be expressed as a 
               percentage of the total number of samples in that test dataset.
  :returns: A DataFrame containing the following columns 1. `N-Shot` the 
            number of times the (lower cased) target occurs in the train 
            dataset, 2. `Dataset`, 3. `Number Samples` the number of test 
            samples whose target has that `N-Shot` value, 
            4. `Number Samples (Cumulative)` the running total of 
            `Number Samples` within a dataset over increasing `N-Shot`.
  '''
  n_shot_col: List[int] = []
  samples_col: List[int] = []
  name_col: List[str] = []

  for train, test, dataset_name in train_test_dataset:
    test_sentiments = test.target_sentiments(lower=True)
    train_occurrences = {target: len(sentiments) for target, sentiments in 
                         train.target_sentiments(lower=True).items()}
    # Maps an n-shot value to the number of test samples whose target has 
    # been seen that many times in the train dataset (0 when never seen).
    samples_per_shot = {}
    for target, sentiments in test_sentiments.items():
      n_shot = train_occurrences.get(target, 0)
      samples_per_shot[n_shot] = samples_per_shot.get(n_shot, 0) + len(sentiments)

    shot_samples = list(samples_per_shot.values())
    total_samples = sum(shot_samples)
    assert test.number_targets() == total_samples
    if norm:
      shot_samples = [(shot_sample/total_samples) * 100 
                      for shot_sample in shot_samples]

    samples_col.extend(shot_samples)
    n_shot_col.extend(samples_per_shot.keys())
    name_col.extend([dataset_name] * len(shot_samples))

  n_shot_df = pd.DataFrame({'N-Shot': n_shot_col, 'Dataset': name_col,
                            'Number Samples': samples_col})
  n_shot_df = n_shot_df.set_index(['N-Shot', 'Dataset'])
  # The running total has to be accumulated over increasing N-Shot, it is 
  # then aligned back on to the unsorted frame through the shared index.
  running_total = n_shot_df.sort_index()\
                           .groupby('Dataset')['Number Samples']\
                           .cumsum()
  n_shot_df['Number Samples (Cumulative)'] = running_total
  return n_shot_df.reset_index()

# The different sentiment specific data splits within Target/Aspect Dependent Sentiment Analysis (TDSA)
In this notebook we will explore the different ways the TDSA datasets can be broken down via their sentiment and target specific properties. These splits will come from related work in the field as well as new splits.

Before exploring any of these splits I think we should first explore the distribution of Targets within the Training datasets and then the distribution of these targets with respect to the number of samples.

## Target Distributions within the Training datasets
Before any of this can happen we must first upload the 3 different datasets that we are going to explore. To save room in this notebook, see the following notebook for details on how to run these cells: [NOTEBOOK](https://github.com/apmoore1/target-extraction/blob/master/tutorials/Load_and_Explore_Target_Extraction.ipynb)

In [5]:
from pathlib import Path
from google.colab import files

# One upload prompt per SemEval file (4 in total), everything that is uploaded
# is merged into a single mapping keyed by the uploaded file name.
uploaded = {}
for _ in range(4):
  uploaded.update(files.upload())
# File name -> resolved path of each uploaded file
semeval_fps = {Path(key).resolve().name: Path(key).resolve()
               for key in uploaded}
del uploaded
# Paths to the files uploaded
for file_name, fp in semeval_fps.items():
  print(f'SemEval file name: {file_name}. File Path {fp}')

laptop_train = semeval_2014(semeval_fps['Laptop_Train_v2.xml'], conflict=False)
laptop_test = semeval_2014(semeval_fps['Laptops_Test_Gold.xml'], conflict=False)
rest_train = semeval_2014(semeval_fps['Restaurants_Train_v2.xml'], conflict=False)
rest_test = semeval_2014(semeval_fps['Restaurants_Test_Gold.xml'], conflict=False)
# The Election dataset is not uploaded, the parsers are given a directory.
elec_dir = Path('/tmp', 'elec_dir')
elec_train = wang_2017_election_twitter_train(elec_dir)
elec_test = wang_2017_election_twitter_test(elec_dir)

# Only want the samples in the dataset that would be used for target sentiment 
# prediction, therefore only the samples that contain targets are kept.
train_test_datasets = [
    (train.samples_with_targets(), test.samples_with_targets(), name)
    for train, test, name in [(laptop_train, laptop_test, 'Laptop'),
                              (rest_train, rest_test, 'Restaurant'),
                              (elec_train, elec_test, 'Election')]]

Saving Restaurants_Test_Gold.xml to Restaurants_Test_Gold.xml


Saving Laptops_Test_Gold.xml to Laptops_Test_Gold.xml


Saving Laptop_Train_v2.xml to Laptop_Train_v2.xml


Saving Restaurants_Train_v2.xml to Restaurants_Train_v2.xml
SemEval file name: Restaurants_Test_Gold.xml. File Path /content/Restaurants_Test_Gold.xml
SemEval file name: Laptops_Test_Gold.xml. File Path /content/Laptops_Test_Gold.xml
SemEval file name: Laptop_Train_v2.xml. File Path /content/Laptop_Train_v2.xml
SemEval file name: Restaurants_Train_v2.xml. File Path /content/Restaurants_Train_v2.xml


Now that we have the data we are going to plot the target distribution for each of the training datasets. To do this first we are going to manipulate the data within the TargetTextCollections to create a pandas dataframe in long form where we have the number of times a target has occurred (Target Count) and the number of targets that have occurred that frequently as a percentage of all targets (Targets (%)). This data can be seen below where we have printed the first 5 rows of the DataFrame.

In [6]:
# Pair each training collection with the name of its dataset.
train_dataset_name = [(train, name) for train, _, name in train_test_datasets]
target_count_df = get_target_distribution_data(dataset_name=train_dataset_name,
                                               sample_distribution=False)
target_count_df.head(5)

Unnamed: 0,Target Count,Dataset,Number Targets,index,Targets (%),Targets Cumulative (%)
0,1,Election,1276,0,69.010276,69.01
1,1,Laptop,692,1,73.227513,73.228
2,1,Restaurant,895,2,74.895397,74.895
3,2,Election,176,3,9.518659,78.529
4,2,Laptop,116,4,12.275132,85.503


Below we are going to plot the Cumulative Target frequency:

In [7]:
# Cumulative percentage of targets against the number of times they occur.
target_dist_encoding = {
    'x': alt.X("Target Count:Q"),
    'y': alt.Y('Targets Cumulative (%)', scale=alt.Scale(domain=(65, 100))),
    'color': 'Dataset',
    'tooltip': ['Target Count', 'Targets Cumulative (%)', 'Dataset']}
target_dist_chart = alt.Chart(target_count_df).mark_line(point=True)\
                                              .encode(**target_dist_encoding)\
                                              .properties(title='Target distribution')

# The same chart restricted to the targets that occur at most 60 times.
up_to_60 = alt.datum['Target Count'] <= 60
target_dist_chart_60 = target_dist_chart.transform_filter(up_to_60)\
                                        .properties(title='Target distribution (Up to count 60)')

# Size of each training dataset measured in number of targets.
dataset_size = {'Dataset': [name for _, name in train_dataset_name],
                'Number of Samples': [dataset.number_targets() 
                                      for dataset, _ in train_dataset_name]}
dataset_size_df = pd.DataFrame(dataset_size)
dataset_size_chart = alt.Chart(dataset_size_df).mark_bar()\
                                               .encode(x='Dataset', 
                                                       y='Number of Samples', 
                                                       color='Dataset', 
                                                       tooltip=['Dataset', 'Number of Samples'])

target_dist_chart | target_dist_chart_60 | dataset_size_chart

As we can see from above at least 69% of all Targets occur only once in each Dataset, showing that the datasets have a rather large number of targets where we only have a One-Shot learning setup to understand the potentially non-trivial target specific sentiment e.g. `a long battery` is a good sentiment for battery but if the target was `movie` then this would be negative.

On the opposite end of the scale we see the differences in the datasets where the Laptop dataset has more targets occurring less often, where the most frequent targets occur only 60 times in comparison to the Restaurant and Election datasets which have targets occurring up to 360 and 433 times.

These differences in frequencies can be due to the differences in dataset sizes as shown by the far right bar plot.

## Target Distributions with respect to the number of samples within the Training datasets
Again like before we first need to create the data to plot, which is done below. The function creates a dataframe that contains the number of times a target occurs (Target Count) as well as the number of samples that contain those targets that occur that number of times (Number Samples). For example, below for the Election dataset for the Target Count of 1 there are 1276 samples in the dataset that contain targets that have only occurred once; furthermore, for the Target Count of 2 there are 352 samples that contain targets that have occurred twice.

In [0]:
# Same statistics as before but counted in samples rather than in targets.
target_sample_df = get_target_distribution_data(dataset_name=train_dataset_name,
                                                sample_distribution=True)
target_sample_df.head(5)

Unnamed: 0,Target Count,Dataset,Number Samples,index,Samples (%),Samples Cumulative (%)
0,1,Election,1276,0,13.635392,13.635
1,1,Laptop,692,1,29.917856,29.918
2,1,Restaurant,895,2,24.847307,24.847
3,2,Election,352,3,3.761487,17.397
4,2,Laptop,232,4,10.030264,39.948


Below we plot the Target count against the cumulative number of samples.

In [0]:
# Left plot: the cumulative percentage of samples over the whole range of 
# Target Count values.
chart = alt.Chart(target_sample_df).mark_line(point=True).encode(
    x=alt.X("Target Count:Q"),
    y=alt.Y('Samples Cumulative (%)'),
    color='Dataset',
    tooltip=['Target Count', 'Samples Cumulative (%)', 'Dataset']
).properties(title='Data distribution through target count')

# Right plot (base layer): the rows where Samples Cumulative (%) is at most 51
below_51 = target_sample_df['Samples Cumulative (%)'] <= 51
cumulative_data_51 = target_sample_df.loc[below_51]
chart_50 = alt.Chart(cumulative_data_51).mark_area(opacity=0.4).encode(
    x=alt.X("Target Count:Q"),
    y=alt.Y('Samples Cumulative (%)'),
    color='Dataset'
)

# Plotting the points for the Target Count where the Samples Cumulative (%) 
# is at least 50%, this is the first such row of each dataset.
at_least_50 = target_sample_df['Samples Cumulative (%)'] >= 50
data_50_points = target_sample_df.loc[at_least_50]\
                                 .groupby('Dataset')\
                                 .first()\
                                 .reset_index()
chart_50_point = alt.Chart(data_50_points).mark_circle(color='black').encode(
    x=alt.X("Target Count:Q"),
    y=alt.Y('Samples Cumulative (%)')
)
text = chart_50_point.mark_text(align='left', baseline='middle', dx=10)\
                     .encode(text='Target Count:Q')

# Plotting the points where Target Count is 1 and displaying the text
# for the Samples Cumulative (%) score.
occurs_once = target_sample_df['Target Count'] == 1
target_1_points = target_sample_df.loc[occurs_once]
chart_1_point = alt.Chart(target_1_points).mark_circle(color='gray').encode(
    x=alt.X("Target Count:Q"),
    y=alt.Y('Samples Cumulative (%)')
)
text_1_point = chart_1_point.mark_text(align='left', baseline='middle', dx=10)\
                            .encode(text='Samples Cumulative (%)')

chart_50.title = 'Data distribution through target count (Samples Cumulative <=51)'
# `+` binds tighter than `|`: the layers are combined first and the layered 
# chart is then placed to the right of `chart`.
chart | (chart_50 + chart_50_point + text + chart_1_point + text_1_point)

As we can see from the plots above there is a large difference again between the three datasets. The Laptop dataset is made up of mainly Samples that have Targets that occur very infrequently compared to the Election and Restaurant datasets. The right plot shows this best where 50% of the dataset for the Laptop, Restaurant, and Election datasets are made up of samples with targets that have only occurred at most 5, 11, and 36 times respectively.

This is not surprising given the Target Distribution we saw before, where the Election and Restaurant datasets had some targets that occur very frequently.

This shows another large difference in the datasets where in the case for the Laptop dataset we have very few samples to understand the sentiment of a specific target word in comparison to the Election and Restaurant datasets where for the most frequent targets we can have over 100 samples.

Lastly we can again see in the right hand plot a massive difference between the datasets in the number of samples that only contain targets that occur once: 13%, 24%, and 29% for the Election, Restaurant, and Laptop datasets respectively.

Having seen a target in very few contexts can make it very hard for a model to learn the complex relationship between the Target and the associated sentiment. Furthermore it could lead to generalisation problems where the model learns only that a Target occurs with one sentiment, as it has only ever seen that target associated with that one sentiment.

### Splits that are "local" where they only take the test dataset into account

Local splits are those that only use the data within a dataset and do not require another to create a split. Here we are going to show two local data splits; 1. Distinct Sentiment (*DS*), and 2. Number Targets (*NT*).

The *DS* split by [Wang et al, 2017](https://aclweb.org/anthology/E17-1046) is based on the number of distinct sentiments that are within a text for example a sample would be $DS_1$ if the sample's text only contains one sentiment within it or $DS_2$ if it contains two sentiments.

The premise of this is that the more distinct sentiments in a text the more difficult it is for the classifier, as it has to better match the sentiments to the correct target(s). Furthermore if the whole dataset is $DS_1$ then a sentence based sentiment classifier would probably work quite well on this dataset, as it would not have to learn the target sentiment relation explicitly.

Below we can see the different subsets within the *DS* split for: 
1. Test dataset.
2. Train dataset.
3. The combination of Train and Test datasets.

In [0]:
from target_extraction.error_analysis import distinct_sentiment

test_dataset_name = [(test, name) for _, test, name in train_test_datasets]
train_and_test_name = [(TargetTextCollection.combine(train, test), name) 
                       for train, test, name in train_test_datasets]
ds_error_keys = [f'distinct_sentiment_{i}' for i in (1, 2, 3)]
# The same DS statistics for the train, the test, and the combined datasets 
# (computed in that order).
train_ds_df, test_ds_df, train_test_ds_df = [
    get_error_data(named_datasets, ds_error_keys, distinct_sentiment, 
                   separate_labels=True)
    for named_datasets in (train_dataset_name, test_dataset_name, 
                           train_and_test_name)]

# Only the first (left most) chart uses the default Y-axis.
train_ds_chart = subset_dist_chart(train_ds_df, 'DS Train data')
test_ds_chart = subset_dist_chart(test_ds_df, 'DS Test data', no_labels=True)
train_test_title = 'DS combination of Train and Test data'
train_test_ds_chart = subset_dist_chart(train_test_ds_df, train_test_title,
                                        no_labels=True)

train_ds_chart | test_ds_chart | train_test_ds_chart

In [0]:
# One row per dataset and one column per DS subset, for the test datasets.
ds_table = test_ds_df.pivot_table(index='Dataset', columns='Data Subset',
                                  values='Number Samples')

ds_table

Data Subset,distinct_sentiment_1,distinct_sentiment_2,distinct_sentiment_3
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Election,1164,1182,195
Laptop,535,94,9
Restaurant,892,225,3


As we can see, only the Election dataset has an almost equal split between $DS_1$ and $DS_2$. None of the three datasets have many of the very difficult $DS_3$ samples; the Election dataset has the most (195), while the Restaurant test dataset has almost none (3). However, the representation of the *DS* subsets seems to be comparable in the train and test datasets for all of the datasets.

The $NT_i$ split creates subsets based on the number of targets ($i$) within a text e.g. $NT_1$ would be the subset of samples that only have 1 target in a text, of which this is always the lower bound for this split. Furthermore unlike the *DS* split the $NT_i$ does not have a fixed number of subsets as $i$ is determined by the dataset. To demonstrate this best below shows the number of samples per values of $i$ for each dataset.

In [0]:
num_targets_df = num_targets_per_samples(train_test_datasets)
# Percentage of samples against the number of targets within the text.
num_targets_encoding = {
    'x': alt.X("Targets:O"),
    'y': alt.Y('Samples Norm', scale=alt.Scale(domain=(0, 45)), 
               title='Number of Samples (%)'),
    'color': 'Dataset', 
    'tooltip': ['Dataset', 'Samples', 'Samples Norm', 'Targets']}
alt.Chart(num_targets_df).mark_line(point=True).encode(**num_targets_encoding)

As we can see the Restaurant and Laptop datasets are fairly similar but the Election dataset has far fewer samples with 1 and 2 targets than the other datasets. Furthermore the Election dataset's peak number of samples, when binned by the number of targets in the text, is at 4 targets compared to 1 and 2 for the Laptop and Restaurant datasets, which is at least double the number of targets per text. This shows that the Election dataset is most likely a more difficult dataset, as more targets per text would more likely require better modelling of inter target sentiment relations.

Finally this graphs shows the problem described earlier where the value of $i$ for $NT_i$ is determined by the dataset. Therefore we have modified this split so that it only returns 4 subsets:
1. 1-target -- Only contains samples that have only 1 target in the text.
2. low-targets -- Contains 2-$n$ targets in the text where $n$ is determined by the first $1/3$ of samples that are between 2-$n$.
3. med-targets -- Contains $n$-$m$ targets in the text where $m$ is determined by the second $1/3$ of samples that are between $n$-$m$ where $n$ < $m$.
4. high-targets -- Contains $m$-$i$ targets in the text where $i$ is the largest number of targets per text.

These subsets and the values of $n$, $m$, and $i$ are shown below for each of the datasets.

In [0]:
from target_extraction.error_analysis import num_targets_subset

test_dataset_name = [(test, name) for _, test, name in train_test_datasets]

num_targets_error_keys = ['1-target', 'low-targets', 'med-targets', 'high-targets']
num_target_samples, norm_num_target_samples = [], []
dataset_names, data_subset_names, all_n_values = [], [], []

for dataset, name in test_dataset_name:
  dataset, n_values = num_targets_subset(dataset, True)
  dataset_size = dataset.number_targets()
  # The i'th returned value is paired with the i'th subset name.
  for index, n_value in enumerate(n_values):
    error_key = num_targets_error_keys[index]
    num_samples = count_error_key_occurrence(dataset, error_key)
    norm_num_samples = (num_samples / dataset_size) * 100

    dataset_names.append(name)
    data_subset_names.append(error_key)
    all_n_values.append(n_value)
    num_target_samples.append(num_samples)
    norm_num_target_samples.append(norm_num_samples)

test_num_targets_df = pd.DataFrame({'Target Range': all_n_values, 
                                    'Dataset': dataset_names, 
                                    'Data Subset': data_subset_names, 
                                    'Number Samples': num_target_samples, 
                                    'Number of Samples (%)': norm_num_target_samples})
test_num_targets_chart = subset_dist_chart(test_num_targets_df, 'NT Subsets')
# Replace the default tooltip so that it also shows the Target Range.
num_targets_tooltip = ['Number Samples', 'Data Subset', 'Number of Samples (%)', 
                       'Target Range']
test_num_targets_chart.encoding.tooltip = alt.Tooltip(num_targets_tooltip)
test_num_targets_chart

In [0]:
# Number of samples in each NT subset, one row per dataset.
test_num_targets_df.pivot_table(index='Dataset', columns='Data Subset',
                                values='Number Samples')

Data Subset,1-target,high-targets,low-targets,med-targets
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Election,102,495,1216,728
Laptop,259,76,204,99
Restaurant,285,108,384,343


In [0]:
# The Target Range of each NT subset, the values are collected into a list 
# rather than aggregated numerically.
test_num_targets_df.pivot_table(index='Dataset', columns='Data Subset',
                                values='Target Range', 
                                aggfunc=lambda x: list(x))

Data Subset,1-target,high-targets,low-targets,med-targets
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Election,"[(1, 1)]","[(5, 9)]","[(2, 3)]","[(4, 4)]"
Laptop,"[(1, 1)]","[(4, 6)]","[(2, 2)]","[(3, 3)]"
Restaurant,"[(1, 1)]","[(5, 13)]","[(2, 2)]","[(3, 4)]"


As we can see above once we remove the 1-target subset all the other subsets contain at least 9% of the data, which can better inform us on which methods perform best with an increasing number of targets within a text. Furthermore the last table shows us the differences in the *n*, *m*, and *i* values for each of the datasets. As we can see they are all very similar apart from the *high-targets* subset where the *i* value can be between 6 and 13.

## Splits that take into account global information
Global information splits, unlike their local counterparts, use another dataset to create the splits. In all of these cases the other dataset used is the training dataset. There are three splits we are going to look at: *n-shot*, *Target Sentiment Relation (TSR)*, and *Fine Grained Target Sentiment Subsets (FGTSS)*.

The first split we are going to look at is the *n-shot* split which has been used/created in Multi-Entity sentiment analysis by [Yang et al. 2018](https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/viewFile/17036/16171), where Multi-Entity sentiment analysis is the task of predicting the sentiment with respect to the target **and** the latent aspect category. In this split it creates *n* subsets where *n* represents the number of times a target has been seen in the training dataset. For instance in the test dataset if there are 2 samples where the first sample contains a target that has occurred 5 times in the training dataset and the second sample contains a target that has occurred 0 times in the training dataset then they will be binned in the *5* and *0* *shot* subsets respectively.

This split is similar to the *NT* split as depending on the dataset the value of *n* can vary therefore below we show for each of the datasets the number of samples in each *n-shot* subset


In [0]:
norm_n_shot_time_df = n_shot_time_series_df(train_test_datasets, norm=True)

# The three plots share the same data and mark, they differ only in the range
# of N-Shot values that is kept and in which axis titles are displayed.
n_shot_lines = alt.Chart(norm_n_shot_time_df).mark_line(point=True)
color = 'Dataset'

# N-Shot values 0 to 5
start_filter = (alt.datum['N-Shot'] > -1) & (alt.datum['N-Shot'] < 6)
n_shot_low_chart = n_shot_lines.encode(x=alt.X('N-Shot:O', title=None), 
                                       y=alt.Y('Number Samples', 
                                               title='Number Samples (%)'), 
                                       color=color)\
                               .transform_filter(start_filter)

# N-Shot values 6 to 55
middle_filter = (alt.datum['N-Shot'] > 5) & (alt.datum['N-Shot'] < 56)
n_shot_middle_chart = n_shot_lines.encode(x=alt.X('N-Shot:O'), 
                                          y=alt.Y('Number Samples', title=None), 
                                          color=color)\
                                  .transform_filter(middle_filter)

# N-Shot values above 55
end_filter = alt.datum['N-Shot'] > 55
x = alt.X('N-Shot:O', title=None)
y = alt.Y('Number Samples', title=None)
n_shot_high_chart = n_shot_lines.encode(x=x, y=y, color=color)\
                                .transform_filter(end_filter)
n_shot_low_chart | n_shot_middle_chart | n_shot_high_chart

As we can see the subset that has the largest number of samples is the *zero-shot* or *Unknown Target* subset, which is much larger for the smaller datasets. Interestingly though there are targets that occur very frequently in both the train and test data for both the Election and the Restaurant datasets, as shown by the upward tail in the far right plot. This split is mainly to focus on the effect of the number of times a target needs to be seen for the method to correctly represent it. Thus a general method should be able to handle the *zero-shot* cases well, as it could rely on other similar targets to help represent it.

To better see how much of these datasets are made up of zero to very highly frequently seen targets we plot the cumulative sample counts with respect to increasing values of *n*

In [0]:
n_shot_time_df = n_shot_time_series_df(train_test_datasets, norm=False)
x = 'N-Shot'
color = 'Dataset'

# Left: cumulative percentage (from the normalised frame created earlier)
norm_y = alt.Y('Number Samples (Cumulative)', scale=alt.Scale(domain=(0,100)), 
                title='Number Samples (Cumulative) %')
norm_cuml_line = alt.Chart(norm_n_shot_time_df).mark_line(point=True)\
                                               .encode(x=x, y=norm_y, color=color)

# Right: cumulative raw sample counts
y = alt.Y('Number Samples (Cumulative)')
cuml_line = alt.Chart(n_shot_time_df).mark_line(point=True)\
                                     .encode(x=x, y=y, color=color)
norm_cuml_line | cuml_line

As we can see and expect the smaller datasets have very sharp curves suggesting that they have a lot of sparsely seen targets, whereas the larger datasets do have a sharp curve to start but then flatten, suggesting that they have a few targets that have been seen a lot in training and represent a large portion of the test data. We would expect that for larger values of *n* a method should perform very well and to some extent set the upper limit on performance for all values of *n*.

Just like with the *NT* split, as these subsets are continuous and different for each of the datasets, we have split them into 4 different subsets, which are:
1. *zero-shot* -- Targets that only appear in the test and never in the train
2. *low-shot* -- After removing all the *zero-shot* samples the first 1/3 samples that are within the lowest *n* values
3. *med-shot* -- The next 1/3 on from the *low-shot*
4. *high-shot* -- The last 1/3 that contain the largest values of *n*

The breakdown with their associated values of *n* can be seen below


In [0]:
from target_extraction.error_analysis import n_shot_subsets

# Names of the n-shot subsets; the position in this list is used to pair a
# name with the n range found at the same position in `n_values` below.
n_shot_error_keys = ['zero-shot', 'low-shot', 'med-shot', 'high-shot']

# Parallel columns, one entry per (dataset, n-shot subset) pair.
dataset_names = []
data_subset_names = []
all_n_values = []
num_n_shot_samples = []
norm_n_shot_samples = []

for train_dataset, test_dataset, name in train_test_datasets:
  # Targets are compared lower cased (lower=True); the n range of each
  # subset is returned alongside the annotated test dataset.
  test_dataset, n_values = n_shot_subsets(test_dataset, train_dataset,
                                          lower=True, return_n_values=True)
  dataset_size = test_dataset.number_targets()
  for index, n_value in enumerate(n_values):
    dataset_names.append(name)
    error_key = n_shot_error_keys[index]
    data_subset_names.append(error_key)
    all_n_values.append(n_value)
    num_samples = count_error_key_occurrence(test_dataset, error_key)
    # Share of this subset as a percentage of all targets in the test data.
    norm_num_samples = (num_samples / dataset_size) * 100
    num_n_shot_samples.append(num_samples)
    norm_n_shot_samples.append(norm_num_samples)

n_shot_columns = {'N Range': all_n_values,
                  'Dataset': dataset_names,
                  'Data Subset': data_subset_names,
                  'Number Samples': num_n_shot_samples,
                  'Number of Samples (%)': norm_n_shot_samples}
test_n_shot_df = pd.DataFrame(n_shot_columns)

# Chart of the subset distribution, with the n range added to the tooltip.
n_shot_tooltip = ['Number Samples', 'Data Subset', 'Number of Samples (%)',
                  'N Range']
test_n_shot_chart = subset_dist_chart(test_n_shot_df, 'N-Shot Subsets')
test_n_shot_chart.encoding.tooltip = alt.Tooltip(n_shot_tooltip)
test_n_shot_chart

In [0]:
# Number of samples per n-shot subset (columns) for each dataset (rows).
test_n_shot_df.pivot_table(values='Number Samples',
                           columns='Data Subset',
                           index='Dataset')

Data Subset,high-shot,low-shot,med-shot,zero-shot
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Election,711,740,736,354
Laptop,121,125,123,269
Restaurant,243,267,258,352


In [0]:
# The n range of each n-shot subset (columns) for each dataset (rows). The
# values are not numeric, so each group is gathered into a list rather than
# aggregated with the default function.
test_n_shot_df.pivot_table(values='N Range',
                           columns='Data Subset',
                           index='Dataset',
                           aggfunc=lambda n_ranges: list(n_ranges))

Data Subset,high-shot,low-shot,med-shot,zero-shot
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Election,"[(109, 433)]","[(1, 23)]","[(24, 100)]","[(0, 0)]"
Laptop,"[(17, 60)]","[(1, 3)]","[(4, 14)]","[(0, 0)]"
Restaurant,"[(56, 360)]","[(1, 11)]","[(12, 55)]","[(0, 0)]"


As we can see, once broken down into these 4 subsets the values of *n* vary a lot depending on the dataset, where it would appear that the larger the dataset the more spread out the *n* values are in the subsets. This shows that with more data you are more likely to have more samples to represent a target. Therefore in the low resource setting one has to think about the problem of generalising to unseen or low sampled targets.


The *TRS* split explores the idea of generalisation to unseen targets, like the *zero-shot* setting, as well as to seen targets with unseen sentiment relations. These are then compared to the last subset in this split, which is the "typical" and full data case of a seen target with a seen sentiment relation. These three subsets are called:
1. Unknown Target (UT)
2. Unknown Sentiment Known Target (USKT)
3. Known Sentiment Known Target (KSKT)

In [0]:
from target_extraction.error_analysis import unknown_targets, unknown_sentiment_known_target, known_sentiment_known_target

# One subset function per TRS subset.
subset_functions = [
  unknown_targets,
  unknown_sentiment_known_target,
  known_sentiment_known_target,
]
# Subset statistics for every dataset, computed with lower_target=True.
known_unknown_df = get_subset_data(
  train_test_datasets,
  subset_functions=subset_functions,
  lower_target=True,
)
subset_dist_chart(known_unknown_df, 'TRS')

In [0]:
# Number of samples per TRS subset (columns) for each dataset (rows).
trs_table = known_unknown_df.pivot_table(values='Number Samples',
                                         index='Dataset',
                                         columns='Data Subset')
trs_table

Data Subset,Known Sentiment Known Target,Unknown Sentiment Known Target,Unknown Targets
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Election,2093,94,354
Laptop,302,67,269
Restaurant,708,60,352


Again, like in the *n-shot* setting, the smaller datasets, and thus the low resource setting, contain more *UT* but also more *USKT* samples, showing that the lower resource datasets require methods that are better at generalisation.

The last split is the *FGTSS*. This is similar to the *TRS* split, but we break the *KSKT* and *USKT* subsets down into more fine grained subsets while keeping the *UT* subset. The descriptions of these fine grained sentiment subsets are:
1. Different Sentiment (*DS*) - The sentiment label(s) assigned to the target word is different in the train compared to the test.
2. Same Once (*SO*) - The single sentiment label assigned to the target word in the train is the same in the test.
3. Same Multi (*SM*) - The multiple sentiment labels assigned to the target in the train are the same in the test.
4. Similar (*S*) - At least one, but not all, of the multiple sentiment labels assigned to the target word in the train is the same as in the test.

The premise of these subsets is the following:
1. *DS* - Is a more difficult subset as it would require the classifier to make a prediction for a sentiment class never seen for that target.
2. *SO* - Can be seen as a metric for Target overfitting if the classifier performs really well on this subset, more so than on the others.
3. *SM* and *S* - Are the more general and common cases, but the Similar subset is expected to be more difficult as it can have cases of Targets with sentiments never seen before.

These subsets relate a lot to the *KSKT* and *USKT* subsets. The *USKT* subset is a mix of the *S* and *DS* subsets, and the *KSKT* subset is a mix of the *S*, *SM* and *SO* subsets.

In [0]:
from target_extraction.error_analysis import same_one_sentiment, same_multi_sentiment
from target_extraction.error_analysis import similar_sentiment, different_sentiment

# One subset function per FGTSS subset, followed by the unknown targets
# subset function.
subset_functions = [
  same_one_sentiment,
  same_multi_sentiment,
  similar_sentiment,
  different_sentiment,
  unknown_targets,
]
# Subset statistics for every dataset, computed with lower_target=True.
fgtss_df = get_subset_data(
  train_test_datasets,
  subset_functions=subset_functions,
  lower_target=True,
)
subset_dist_chart(fgtss_df, 'FGTSS in the Test data')

Again we can see that the low resource dataset (laptop) contains the most samples, with respect to itself, in the more difficult class of *DS* and has the most samples to show overfitting (*SO* subset).

## Summarise
Below we show for each of the datasets all of the splits stated above apart from the *FGTSS*

In [0]:
# Short labels used in the summary table for the TRS and DS subsets.
# NOTE(review): 'KTKS' differs from the 'KSKT' abbreviation used in the text
# of this notebook -- confirm which one is intended.
known_unknown_mapper = {'Unknown Targets': 'UT',
                        'Unknown Sentiment Known Target': 'USKT',
                        'Known Sentiment Known Target': 'KTKS'}
ds_mapper = {'distinct_sentiment_1': 'DS1', 'distinct_sentiment_2': 'DS2',
             'distinct_sentiment_3': 'DS3'}

# The relabelling is skipped when a short label is already present,
# presumably so that this cell can be run more than once.
trs_subsets = known_unknown_df['Data Subset']
if 'UT' not in trs_subsets.unique():
  known_unknown_df['Data Subset'] = trs_subsets.map(known_unknown_mapper)
ds_subsets = test_ds_df['Data Subset']
if 'DS3' not in ds_subsets.unique():
  test_ds_df['Data Subset'] = ds_subsets.map(ds_mapper)

# Tag every frame with the name of the data split it describes.
for frame, split_label in [(known_unknown_df, 'TRS'),
                           (test_n_shot_df, 'n-shot'),
                           (test_num_targets_df, 'NT'),
                           (test_ds_df, 'DS'),
                           (dataset_size_df, 'Total Samples')]:
  frame['Data Split'] = split_label
dataset_size_df['Data Subset'] = ''
dataset_size_df['Number of Samples (%)'] = dataset_size_df['Number of Samples']

# dataset_size_df is not part of the combined frame.
all_dfs = pd.concat([test_ds_df, test_n_shot_df, test_num_targets_df,
                     known_unknown_df],
                    ignore_index=True, sort=False)
all_dfs.pivot_table(values='Number of Samples (%)',
                    columns=['Data Split', 'Data Subset'],
                    index='Dataset').T.round(1)

Unnamed: 0_level_0,Dataset,Election,Laptop,Restaurant
Data Split,Data Subset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DS,DS1,45.8,83.9,79.6
DS,DS2,46.5,14.7,20.1
DS,DS3,7.7,1.4,0.3
NT,1-target,4.0,40.6,25.4
NT,high-targets,19.5,11.9,9.6
NT,low-targets,47.9,32.0,34.3
NT,med-targets,28.7,15.5,30.6
TRS,KTKS,82.4,47.3,63.2
TRS,USKT,3.7,10.5,5.4
TRS,UT,13.9,42.2,31.4


In [0]:
# Number of samples of each dataset, shown with the datasets as columns.
dataset_size_df.pivot_table(values='Number of Samples', index='Dataset').T

Dataset,Election,Laptop,Restaurant
Number of Samples,9358,2313,3602


# Extra, differences in standard train, test split and when you split the train into train and validation

This part of the notebook looks at the differences in error splits between the official train and test splits compared to when you split the train into a new train and validation split.

Below we upload the split train, validation and the standard test dataset from JSON files created using this [script](https://github.com/apmoore1/target-extraction/blob/master/create_splits.py) for the Laptop, Restaurant and Election datasets.

In [0]:
# Ask for the train, validation and test JSON file of each dataset and store
# them as (train, validation, test, dataset name) tuples.
train_val_test_datasets = []
for dataset_name in ['Laptop', 'Restaurant', 'Election']:
  dataset_info = []
  for split_name in ['train', 'validation', 'test']:
    print(f'Upload the {dataset_name} {split_name} dataset')
    temp_uploaded = files.upload()
    # Only the first uploaded value is used; it is decoded as UTF-8.
    uploaded_contents = list(temp_uploaded.values())
    dataset_str = uploaded_contents[0].decode('utf-8')
    dataset = TargetTextCollection.from_json(dataset_str)
    dataset_info.append(dataset)
  dataset_info = (*dataset_info, dataset_name)
  train_val_test_datasets.append(dataset_info)

# (new train, standard test, name) and (new train, validation, name) views.
alt_train_test_datasets = [(splits[0], splits[2], splits[3])
                           for splits in train_val_test_datasets]
train_val_datasets = [(splits[0], splits[1], splits[3])
                      for splits in train_val_test_datasets]

Upload the Laptop train dataset


Saving train.json to train.json
Upload the Laptop validation dataset


Saving val.json to val.json
Upload the Laptop test dataset


Saving test.json to test.json
Upload the Restaurant train dataset


Saving train.json to train (1).json
Upload the Restaurant validation dataset


Saving val.json to val (1).json
Upload the Restaurant test dataset


Saving test.json to test (1).json
Upload the Election train dataset


Saving train.json to train (2).json
Upload the Election validation dataset


Saving val.json to val (2).json
Upload the Election test dataset


Saving test.json to test (2).json


Below we compare the standard train and test split to that of the new train and validation split as well as the new train and standard test split. The comparison is only done here for the global error analysis splits.

In [0]:
# Build the n-shot and TRS statistics for two pairings of the uploaded data:
#   'ST' -- new train with the standard test (alt_train_test_datasets)
#   'V'  -- new train with the validation (train_val_datasets)
# and combine them with the frames created earlier in the notebook, which are
# tagged 'OT' (presumably the original train and test split).
n_shot_error_keys = ['zero-shot', 'low-shot', 'med-shot', 'high-shot']
subset_functions = [unknown_targets, unknown_sentiment_known_target,
                    known_sentiment_known_target]

all_dfs = []
for datasets_data, split_name in [(alt_train_test_datasets, 'ST'),
                                  (train_val_datasets, 'V')]:
  # Parallel columns, one entry per (dataset, n-shot subset) pair.
  dataset_names = []
  data_subset_names = []
  all_n_values = []
  num_n_shot_samples = []
  norm_n_shot_samples = []

  for train_dataset, test_dataset, name in datasets_data:
    test_dataset, n_values = n_shot_subsets(test_dataset, train_dataset,
                                            lower=True, return_n_values=True)
    dataset_size = test_dataset.number_targets()
    for index, n_value in enumerate(n_values):
      dataset_names.append(name)
      # The position of the n range selects the name of the subset.
      error_key = n_shot_error_keys[index]
      data_subset_names.append(error_key)
      all_n_values.append(n_value)
      num_samples = count_error_key_occurrence(test_dataset, error_key)
      # Percentage of all targets in the evaluated dataset.
      norm_num_samples = (num_samples / dataset_size) * 100
      num_n_shot_samples.append(num_samples)
      norm_n_shot_samples.append(norm_num_samples)

  n_shot_columns = {'N Range': all_n_values,
                    'Dataset': dataset_names,
                    'Data Subset': data_subset_names,
                    'Number Samples': num_n_shot_samples,
                    'Number of Samples (%)': norm_n_shot_samples}
  df = pd.DataFrame(n_shot_columns)
  df['Split'] = split_name
  df['Data Split'] = 'n-shot'
  all_dfs.append(df)

  # TRS statistics for the same pairing, relabelled with the short names.
  alt_known_unknown_df = get_subset_data(datasets_data,
                                         subset_functions=subset_functions,
                                         lower_target=True)
  trs_subsets = alt_known_unknown_df['Data Subset']
  alt_known_unknown_df['Data Subset'] = trs_subsets.map(known_unknown_mapper)
  alt_known_unknown_df['Split'] = split_name
  alt_known_unknown_df['Data Split'] = 'TRS'
  all_dfs.append(alt_known_unknown_df)

# Add the frames from the earlier cells so that all three can be compared.
for original_df in (known_unknown_df, test_n_shot_df):
  original_df['Split'] = 'OT'
  all_dfs.append(original_df)
all_dfs = pd.concat(all_dfs, sort=False)

In [0]:
# Percentage of samples per (data split, subset) row for every
# (dataset, split) column, rounded to one decimal place.
all_dfs.pivot_table(values='Number of Samples (%)',
                    columns=['Data Split', 'Data Subset'],
                    index=['Dataset', 'Split']).T.round(1)

Unnamed: 0_level_0,Dataset,Election,Election,Election,Laptop,Laptop,Laptop,Restaurant,Restaurant,Restaurant
Unnamed: 0_level_1,Split,OT,ST,V,OT,ST,V,OT,ST,V
Data Split,Data Subset,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
TRS,KTKS,82.4,80.8,80.9,47.3,42.5,58.6,63.2,60.4,67.9
TRS,USKT,3.7,4.1,4.4,10.5,10.8,8.4,5.4,6.2,5.8
TRS,UT,13.9,15.1,14.7,42.2,46.7,33.0,31.4,33.4,26.3
n-shot,high-shot,28.0,28.0,27.6,19.0,15.7,19.9,21.7,21.1,18.3
n-shot,low-shot,29.1,28.3,28.9,19.6,19.9,23.6,23.8,23.1,25.9
n-shot,med-shot,29.0,28.7,28.8,19.3,17.7,23.5,23.0,22.4,29.5
n-shot,zero-shot,13.9,15.1,14.7,42.2,46.7,33.0,31.4,33.4,26.3


Below we compare the validation and standard test split with respect to the local error analysis splits.

In [0]:
local_val_dfs = []
# Only the validation dataset and the dataset name are needed here.
val_name = [(val, name) for _, val, name in train_val_datasets]

# DS statistics on the validation data, relabelled with the short names.
val_ds_df = get_error_data(val_name, ds_error_keys, distinct_sentiment,
                           separate_labels=True)
val_ds_df['Data Subset'] = val_ds_df['Data Subset'].map(ds_mapper)
val_ds_df['Data Split'] = 'DS'
val_ds_df['Split'] = 'V'

# NT statistics on the validation data.
val_nt_df = get_error_data(val_name, num_targets_error_keys,
                           num_targets_subset)
val_nt_df['Data Split'] = 'NT'
val_nt_df['Split'] = 'V'

# The frames created earlier in the notebook are tagged as 'OT'.
test_ds_df['Split'] = 'OT'
test_num_targets_df['Split'] = 'OT'

local_val_dfs.extend([val_ds_df, val_nt_df, test_num_targets_df, test_ds_df])
local_val_df = pd.concat(local_val_dfs, sort=False)

In [0]:
# Percentage of samples per (data split, subset) row for every
# (dataset, split) column, rounded to one decimal place.
local_val_df.pivot_table(values='Number of Samples (%)',
                         columns=['Data Split', 'Data Subset'],
                         index=['Dataset', 'Split']).T.round(1)

Unnamed: 0_level_0,Dataset,Election,Election,Laptop,Laptop,Restaurant,Restaurant
Unnamed: 0_level_1,Split,OT,V,OT,V,OT,V
Data Split,Data Subset,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
DS,DS1,45.8,44.6,83.9,80.4,79.6,72.0
DS,DS2,46.5,47.2,14.7,19.2,20.1,25.3
DS,DS3,7.7,8.2,1.4,0.5,0.3,2.7
NT,1-target,4.0,4.3,40.6,38.0,25.4,26.4
NT,high-targets,19.5,23.5,11.9,11.2,9.6,19.4
NT,low-targets,47.9,46.8,32.0,33.7,34.3,31.5
NT,med-targets,28.7,25.4,15.5,17.0,30.6,22.7


All of this analysis shows that even when splitting the training dataset into new train and validation splits, the data distribution is kept relatively similar to that of the original train and standard test split.