<a href="https://colab.research.google.com/github/apmoore1/target-extraction/blob/master/tutorials/TDSA_Error_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%%capture
!pip install git+git://github.com/apmoore1/target-extraction.git@master#egg=target-extraction
!pip install altair

In [2]:
from typing import List, Tuple, Callable

import altair as alt
import pandas as pd
from target_extraction.dataset_parsers import semeval_2014, wang_2017_election_twitter_test, wang_2017_election_twitter_train
from target_extraction.data_types import TargetTextCollection

alt.renderers.enable('colab')



RendererRegistry.enable('colab')

In [0]:
def subset_dist_chart(subset_frame: pd.DataFrame, title: str,
                      no_labels: bool = False) -> alt.Chart:
  '''
  :param subset_frame: A DataFrame containing the following columns: 
                       1. Data Subset, 2. Number of Samples (%),
                       3. Number Samples, 4. Dataset
  :param title: A title to give to the generated plot
  :param no_labels: If True the Y-axis is drawn without a title and without 
                    tick marks (the tick labels themselves are still shown). 
                    If False the default Y-axis is used.
  :returns: A plot that displays for each of the datasets the percentage of the 
            subsets in that dataset as stacked coloured columns.
  '''
  # The return type is annotated with the top level `alt.Chart` rather than a 
  # version specific module path (e.g. `alt.vegalite.v3`), as the annotation 
  # is evaluated when the function is defined and the version specific path 
  # does not exist in every Altair release.
  percentage_scale = alt.Scale(domain=[0, 100])
  if no_labels:
    y_axis = alt.Axis(title=None, labels=True, ticks=False)
    y_col = alt.Y('Number of Samples (%)', axis=y_axis,
                  scale=percentage_scale)
  else:
    y_col = alt.Y('Number of Samples (%)', scale=percentage_scale)

  subset_color = alt.Color('Data Subset', 
                           scale=alt.Scale(scheme='redyellowblue'))
  tooltip = ['Number Samples', 'Data Subset', 'Number of Samples (%)']
  chart = alt.Chart(subset_frame).mark_bar()\
                                 .encode(x='Dataset', y=y_col, 
                                         color=subset_color, tooltip=tooltip)\
                                 .properties(width=100)
  chart.title = title
  return chart

def get_subset_data(train_test_name_datasets: List[Tuple[TargetTextCollection,TargetTextCollection,str]],
                    subset_functions: List[Callable[[TargetTextCollection, TargetTextCollection, bool], TargetTextCollection]],
                    lower_target: bool) -> pd.DataFrame:
  '''
  :param train_test_name_datasets: A list of tuples containing (train 
                                   collection, test collection, dataset name).
  :param subset_functions: The subset functions to apply. Each is called as 
                           `subset_func(test, train, lower_target)` and its 
                           `__name__` is used both as the key that is counted 
                           in the returned test collection and (capitalised, 
                           with underscores replaced by spaces) as the 
                           `Data Subset` name.
  :param lower_target: Passed unchanged as the third argument to each of the 
                       subset functions.
  :returns: A long form DataFrame with one row per (dataset, subset function) 
            containing the columns: 1. Dataset, 2. Number Samples, 
            3. Number of Samples (%), 4. Data Subset. The percentage is with 
            respect to the number of targets in the test collection, and is 
            0 when the test collection has no targets.
  '''
  data_subset_names = []
  number_samples = []
  number_samples_percentage = []
  dataset_names = []

  for train, test, name in train_test_name_datasets:
    # Taken once per dataset, before any of the subset functions are applied
    test_size = test.number_targets()
    for subset_func in subset_functions:
      test = subset_func(test, train, lower_target)
      subset_name = subset_func.__name__
      num_in_subset = count_error_key_occurence(test, subset_name)
      # Guard against an empty test collection rather than raising a 
      # ZeroDivisionError
      percentage_subset = 0.0
      if test_size:
        percentage_subset = (num_in_subset / test_size) * 100

      subset_name = ' '.join([word.capitalize() for word in subset_name.split('_')])
      data_subset_names.append(subset_name)
      number_samples.append(num_in_subset)
      number_samples_percentage.append(percentage_subset)
      dataset_names.append(name)
  return pd.DataFrame({'Dataset': dataset_names, 
                       'Number Samples': number_samples, 
                       'Number of Samples (%)': number_samples_percentage, 
                       'Data Subset': data_subset_names})

def get_distinct_data(dataset_name: List[Tuple[TargetTextCollection,str]],
                      max_distinct_sentiments: int = 3
                      ) -> pd.DataFrame:
  '''
  :param dataset_name: A list of tuples containing (TargetTextCollection, 
                       dataset name).
  :param max_distinct_sentiments: The largest Distinct Sentiment (DS) subset 
                                  to report; subsets `DS 1` up to and 
                                  including `DS max_distinct_sentiments` are 
                                  created for each dataset.
  :returns: A long form DataFrame with one row per (dataset, DS subset) 
            containing the columns: 1. Dataset, 2. Number Samples, 
            3. Number of Samples (%), 4. Data Subset. The percentage is with 
            respect to the number of targets in the dataset, and is 0 when 
            the dataset has no targets.
  '''
  data_subset_names = []
  number_samples = []
  number_samples_percentage = []
  dataset_names = []
  
  for dataset, name in dataset_name:
    dataset_size = dataset.number_targets()
    dataset = distinct_sentiment(dataset)
    for i in range(1, max_distinct_sentiments + 1):
      data_subset_name = f'DS {i}'
      num_in_subset = 0
      for target_object in dataset.values():
        distinct_sentiment_number = target_object['distinct_sentiment']
        # Every entry of a text whose `distinct_sentiment` values contain `i` 
        # is counted towards the `DS i` subset.
        if i in distinct_sentiment_number:
          num_in_subset += len(distinct_sentiment_number)
      # Guard against an empty dataset rather than raising a 
      # ZeroDivisionError
      percentage_subset = 0.0
      if dataset_size:
        percentage_subset = (num_in_subset / dataset_size) * 100

      data_subset_names.append(data_subset_name)
      number_samples.append(num_in_subset)
      number_samples_percentage.append(percentage_subset)
      dataset_names.append(name)
  return pd.DataFrame({'Dataset': dataset_names, 
                       'Number Samples': number_samples, 
                       'Number of Samples (%)': number_samples_percentage, 
                       'Data Subset': data_subset_names})

def get_target_distribution_data(sample_distribution: bool, 
                                 dataset_name: List[Tuple[TargetTextCollection, str]]
                                 ) -> pd.DataFrame:
  '''
  :param sample_distribution: If true will return the Target count with respect
                              to the dataset sample distribution. Else it will 
                              return the Target Count with respect to the 
                              target distribution
  :param dataset_name: A list of tuples containing (TargetTextCollection, 
                       dataset name) where each datasets statistics will be 
                       created and returned together in the Pandas DataFrame.
  :return: Pandas DataFrame containing statistics to create graphs that can 
           plot both the sample and target distribution with respect to Target 
           count. It has one row per (Target Count, Dataset) pair, sorted by 
           Target Count and then Dataset, with the columns: 1. Target Count, 
           2. Dataset, 3. `Number Samples` or `Number Targets`, 4. index, 
           5. `Samples (%)` or `Targets (%)`, and 
           6. `Samples Cumulative (%)` or `Targets Cumulative (%)` (rounded 
           to 3 decimal places). Percentages are with respect to each 
           dataset's own total.
  '''
  target_counts = []
  dataset = []
  for target_dataset, name in dataset_name:
    target_count = list(target_dataset.target_count(lower=True).values())
    target_counts.extend(target_count)
    dataset.extend([name] * len(target_count))

  distribution_name = 'Samples' if sample_distribution else 'Targets'
  column_name = f'Number {distribution_name}'
  column_percentage = f'{distribution_name} (%)'
  column_cumulative = f'{distribution_name} Cumulative (%)'

  target_count_df = pd.DataFrame({'Target Count': target_counts,
                                  'Dataset': dataset})
  grouped_counts = target_count_df.groupby(['Target Count', 'Dataset'])['Target Count']
  # Summing the counts gives the number of samples that contain targets 
  # occurring that many times, whereas counting the rows gives the number of 
  # distinct targets occurring that many times.
  if sample_distribution:
    aggregated_counts = grouped_counts.sum()
  else:
    aggregated_counts = grouped_counts.count()
  # The Series has to be renamed before the index is reset, otherwise it would 
  # clash with the `Target Count` index level.
  target_count_df = aggregated_counts.rename(column_name).reset_index()
  target_count_df['index'] = target_count_df.index

  # `transform` returns the per dataset total aligned row for row with 
  # `target_count_df`, so the percentage never relies on the row order 
  # surviving an index alignment between differently indexed Series.
  dataset_totals = target_count_df.groupby('Dataset')[column_name].transform('sum')
  target_count_df[column_percentage] = (target_count_df[column_name] / dataset_totals) * 100

  cumulative_percentage = target_count_df.groupby('Dataset')[column_percentage].cumsum()
  target_count_df[column_cumulative] = cumulative_percentage.round(3)
  return target_count_df

# The different sentiment specific data subsets within Target/Aspect Dependent Sentiment Analysis (TDSA)
In this notebook we will explore the different ways the TDSA datasets can be broken down via their sentiment and target specific properties. These subsets will come from related work in the field as well as new subsets.

Before exploring any of these subsets I think we should first explore the distribution of Targets within the Training datasets and then the distribution of these targets with respect to the number of samples.

## Target Distributions within the Training datasets
Before any of this can happen we must first upload the 3 different datasets that we are going to explore. To save room in this notebook, see the following notebook for details on how to run these cells: [NOTEBOOK](https://github.com/apmoore1/target-extraction/blob/master/tutorials/Load_and_Explore_Target_Extraction.ipynb)

In [5]:
from pathlib import Path
from google.colab import files

# Four separate upload prompts; everything uploaded is merged into one
# mapping keyed by the uploaded file name.
uploaded = {}
for _ in range(4):
  uploaded.update(files.upload())

# Map each uploaded file's base name to its resolved path.
semeval_fps = {}
for uploaded_name in uploaded:
  resolved_fp = Path(uploaded_name).resolve()
  semeval_fps[resolved_fp.name] = resolved_fp
del uploaded

# Paths to the files uploaded
for file_name, fp in semeval_fps.items():
  print(f'SemEval file name: {file_name}. File Path {fp}')

# SemEval 2014 Laptop and Restaurant collections, parsed from the uploads.
laptop_train = semeval_2014(semeval_fps['Laptop_Train_v2.xml'], conflict=False)
laptop_test = semeval_2014(semeval_fps['Laptops_Test_Gold.xml'], conflict=False)
rest_train = semeval_2014(semeval_fps['Restaurants_Train_v2.xml'], conflict=False)
rest_test = semeval_2014(semeval_fps['Restaurants_Test_Gold.xml'], conflict=False)

# The Election Twitter parsers are handed a directory rather than a file.
elec_dir = Path('/tmp', 'elec_dir')
elec_train = wang_2017_election_twitter_train(elec_dir)
elec_test = wang_2017_election_twitter_test(elec_dir)

# (train, test, dataset name) triples used throughout the notebook.
train_test_datasets = [
    (laptop_train, laptop_test, 'Laptop'),
    (rest_train, rest_test, 'Restaurant'),
    (elec_train, elec_test, 'Election'),
]

Saving Laptops_Test_Gold.xml to Laptops_Test_Gold.xml


Saving Restaurants_Test_Gold.xml to Restaurants_Test_Gold.xml


Saving Laptop_Train_v2.xml to Laptop_Train_v2.xml


Saving Restaurants_Train_v2.xml to Restaurants_Train_v2.xml
SemEval file name: Laptops_Test_Gold.xml. File Path /content/Laptops_Test_Gold.xml
SemEval file name: Restaurants_Test_Gold.xml. File Path /content/Restaurants_Test_Gold.xml
SemEval file name: Laptop_Train_v2.xml. File Path /content/Laptop_Train_v2.xml
SemEval file name: Restaurants_Train_v2.xml. File Path /content/Restaurants_Train_v2.xml


Now that we have the data we are going to plot the target distribution for each of the training datasets. To do this we are first going to manipulate the data within the TargetTextCollections to create a pandas DataFrame in long form, where we have the number of times a target has occurred (Target Count) and the number of targets that have occurred that frequently as a percentage of all targets (Targets (%)). This data can be seen below where we have printed the first 5 rows of the DataFrame.

In [6]:
# Keep only the training collection and the dataset name from each triple.
train_dataset_name = [(train_data, dataset_label)
                      for train_data, _, dataset_label in train_test_datasets]
# Target distribution: counts how many targets occur at each Target Count.
target_count_df = get_target_distribution_data(
    sample_distribution=False,
    dataset_name=train_dataset_name,
)
target_count_df.head(5)

Unnamed: 0,Target Count,Dataset,Number Targets,index,Targets (%),Targets Cumulative (%)
0,1,Election,1276,0,69.010276,69.01
1,1,Laptop,692,1,73.227513,73.228
2,1,Restaurant,895,2,74.895397,74.895
3,2,Election,176,3,9.518659,78.529
4,2,Laptop,116,4,12.275132,85.503


Below we are going to plot the Cumulative Target frequency:

In [0]:
# Cumulative percentage of targets against how often the targets occur.
# The Y domain starts at 65 rather than 0.
cumulative_targets_y = alt.Y('Targets Cumulative (%)',
                             scale=alt.Scale(domain=(65, 100)))
target_dist_chart = alt.Chart(target_count_df).mark_line(point=True).encode(
    x=alt.X("Target Count:Q"),
    y=cumulative_targets_y,
    color='Dataset',
    tooltip=['Target Count', 'Targets Cumulative (%)', 'Dataset'],
)
target_dist_chart.title = 'Target distribution'

# The same chart restricted to targets that occur at most 60 times.
at_most_60 = alt.datum['Target Count'] <= 60
target_dist_chart_60 = target_dist_chart.transform_filter(at_most_60)
target_dist_chart_60.title = 'Target distribution (Up to count 60)'

# Bar chart of the size (number of targets) of each training dataset.
dataset_size = {
    'Dataset': [name for _, name in train_dataset_name],
    'Number of Samples': [dataset.number_targets()
                          for dataset, _ in train_dataset_name],
}
dataset_size_df = pd.DataFrame(dataset_size)
dataset_size_chart = alt.Chart(dataset_size_df).mark_bar().encode(
    x='Dataset',
    y='Number of Samples',
    color='Dataset',
    tooltip=['Dataset', 'Number of Samples'],
)

target_dist_chart | target_dist_chart_60 | dataset_size_chart

As we can see from above, at least 69% of all Targets occur only once in the Dataset, showing that the dataset has a rather large number of targets where we only have a One-Shot learning setup to understand the potentially non-trivial target specific sentiment, e.g. `a long battery` is a good sentiment for battery but if the target was `movie` then this would be negative.

On the opposite end of the scale we see the differences in the datasets, where the Laptop dataset has more targets occurring less often: its most frequent targets occur only 60 times, in comparison to the Restaurant and Election datasets which have targets occurring up to 360 and 433 times.

These differences in frequencies can be due to the differences in dataset sizes as shown by the far right bar plot.

## Target Distributions with respect to the number of samples within the Training datasets
Again like before we first need to create the data to plot, which is done below. The function creates a DataFrame that contains the number of times a target occurs (Target Count) as well as the number of samples that contain those targets that occur that number of times (Number Samples). For example, below for the Election dataset with a Target Count of 1 there are 1276 samples in the dataset that contain targets that have only occurred once; furthermore, for a Target Count of 2 there are 352 samples that contain targets that have occurred twice.

In [0]:
# Same helper as for the target distribution, but with
# sample_distribution=True the Target Count values in each group are summed
# rather than counted.
target_sample_df = get_target_distribution_data(sample_distribution=True,
                                                dataset_name=train_dataset_name)
# Display the first 5 rows.
target_sample_df.head(5)

Unnamed: 0,Target Count,Dataset,Number Samples,index,Samples (%),Samples Cumulative (%)
0,1,Election,1276,0,13.635392,13.635
1,1,Laptop,692,1,29.917856,29.918
2,1,Restaurant,895,2,24.847307,24.847
3,2,Election,352,3,3.761487,17.397
4,2,Laptop,232,4,10.030264,39.948


Below we plot the Target count against the cumulative number of samples.

In [0]:
cumulative_col = 'Samples Cumulative (%)'
# Shared styling for the text labels drawn next to the highlighted points.
label_style = dict(align='left', baseline='middle', dx=10)

# Line chart of the full cumulative sample distribution.
chart = alt.Chart(target_sample_df).mark_line(point=True).encode(
    x=alt.X("Target Count:Q"),
    y=alt.Y(cumulative_col),
    color='Dataset',
    tooltip=['Target Count', cumulative_col, 'Dataset'],
)
chart.title = 'Data distribution through target count'

# Area chart of only the rows whose cumulative percentage is at most 51.
at_most_51 = target_sample_df[cumulative_col] <= 51
cumulative_data_51 = target_sample_df[at_most_51]
chart_50 = alt.Chart(cumulative_data_51).mark_area(opacity=0.4).encode(
    x=alt.X("Target Count:Q"),
    y=alt.Y(cumulative_col),
    color='Dataset',
)

# Plotting the points for the Target Count where the Samples Cumulative (%) 
# is at least 50%: the first such row of each dataset.
at_least_50 = target_sample_df[cumulative_col] >= 50
data_50_points = (target_sample_df[at_least_50]
                  .groupby('Dataset')
                  .first()
                  .reset_index())
chart_50_point = alt.Chart(data_50_points).mark_circle(color='black').encode(
    x=alt.X("Target Count:Q"),
    y=alt.Y(cumulative_col),
)
text = chart_50_point.mark_text(**label_style).encode(text='Target Count:Q')

# Plotting the points where Target Count is 1 and displaying the text
# for the Samples Cumulative (%) score.
occurs_once = target_sample_df['Target Count'] == 1
target_1_points = target_sample_df[occurs_once]
chart_1_point = alt.Chart(target_1_points).mark_circle(color='gray').encode(
    x=alt.X("Target Count:Q"),
    y=alt.Y(cumulative_col),
)
text_1_point = chart_1_point.mark_text(**label_style).encode(
    text=cumulative_col
)

chart_50.title = 'Data distribution through target count (Samples Cumulative <=51)'
# Left: the full line chart. Right: the area chart layered with both sets of
# points and their labels.
chart | (chart_50 + chart_50_point + text + chart_1_point + text_1_point)

As we can see from the plots above there is again a large difference between the three datasets. The Laptop dataset is made up mainly of Samples that have Targets that occur very infrequently compared to the Election and Restaurant datasets. The right-hand plot shows this best, where 50% of the dataset for the Laptop, Restaurant, and Election datasets is made up of samples with targets that have occurred at most 5, 11, and 36 times respectively.

This is not surprising given the Target Distribution we saw before, where the Election and Restaurant datasets had some targets that occur very frequently.

This shows another large difference in the datasets where in the case for the Laptop dataset we have very few samples to understand the sentiment of a specific target word in comparison to the Election and Restaurant datasets where for the most frequent targets we can have over 100 samples.

Lastly, as we can again see in the right hand plot, there is a massive difference between the datasets in the number of samples that only contain targets that occur once: 13%, 24%, and 29% for the Election, Restaurant, and Laptop datasets respectively.

Having seen a target in very few contexts can make it very hard for a model to learn the complex relationship between the Target and the associated sentiment. Furthermore it could lead to generalisation problems where the model learns only that a Target occurs with one sentiment, as it has only ever seen that target associated with that one sentiment.

## Subsets that look at both the Target and Sentiment in the test datasets with respect to the training datasets
Now that we know what the Target distribution is with respect to the targets and the data (samples) in the training data, we are going to explore subsets that look at the test datasets with respect to their associated training datasets, to see how often targets occur in the training and then the test data, as well as whether a target that occurs in the training data occurs in the test data with the same sentiment.

These subsets are to probe further into the datasets to see if there could be sub-samples of data that are more challenging or if a classifier performs poorly on them could explain how to improve the classifier.

### Unknown Targets, Known Targets and their known or unknown associated sentiment
**(acronym for this subset is UT&KTS, which stands for Unknown Target and Known Target Sentiments)**

These subsets break the data down into 3 sub-samples:
1. Unknown - Samples in the test data where the target has only ever appeared in the test and not the train.
2. Unknown Sentiment Known Target - Samples in the test data where the target has appeared in both train and test but the sentiment associated to the target in the test sample has never been seen before in the training data for that target.
3. Known Sentiment Known Target - Same as 2 but the sentiment associated to the target in the test samples has been seen before in the training data for that target.

The premise of these subsets are the following:
1. Unknown Targets (UT) - If a classifier performs well on this subset it must mean that it can generalise well to new targets; furthermore it shows it performs well in the Zero shot case for TDSA.
2. Unknown Sentiment Known Target (USKT) - Performing well here shows that a classifier can generalise to new sentiments; on the flip side, performing badly here can show that the classifier is overfitting to the data. A classifier that performs well here but not in the unknown case can show that it can generalise to new sentiments but not new targets.
3. Known Sentiment Known Target (KSKT) - This is the ordinary case to some degree and the easiest.

Below we extract this data out of the training and test datasets that we have and display it in Pandas DataFrame, we then further plot the data. Just for clarification number of samples are in the **test data**.

In [12]:
from target_extraction.error_analysis import (count_error_key_occurence,
                                              known_sentiment_known_target,
                                              unknown_sentiment_known_target,
                                              unknown_targets)

# The three UT&KTS subset functions, in the order their rows appear in the
# resulting DataFrame.
subset_functions = [
    unknown_targets,
    unknown_sentiment_known_target,
    known_sentiment_known_target,
]
known_unknown_df = get_subset_data(
    train_test_datasets,
    subset_functions=subset_functions,
    lower_target=True,
)
known_unknown_df

Unnamed: 0,Dataset,Number Samples,Number of Samples (%),Data Subset
0,Laptop,269,42.163009,Unknown Targets
1,Laptop,67,10.501567,Unknown Sentiment Known Target
2,Laptop,302,47.335423,Known Sentiment Known Target
3,Restaurant,352,31.428571,Unknown Targets
4,Restaurant,60,5.357143,Unknown Sentiment Known Target
5,Restaurant,708,63.214286,Known Sentiment Known Target
6,Election,354,13.931523,Unknown Targets
7,Election,94,3.699331,Unknown Sentiment Known Target
8,Election,2093,82.369146,Known Sentiment Known Target


All of the data has now been extracted, so below we plot it.

In [0]:
# Bar chart of the UT&KTS subset percentages, one bar per dataset coloured by
# subset.
subset_dist_chart(known_unknown_df, 'UT&KTS in the Test data')

As we can see, generally the larger datasets have more KSKT due to the training datasets covering more targets; this is best shown by the Election dataset having very few Unknown Targets (~13%). Furthermore it shows that the smaller datasets can be more challenging as more of the samples have UT, like the Laptop dataset containing 42% of its samples as UT.

### Fine Grained Target Sentiment Subsets (FGTSS)
This is similar to the UT&KTS subsets but we break the KSKT and USKT subsets down into more fine grained subsets while keeping the UT subset. The description of these fine grained sentiment subsets:
1. Different Sentiment (DS) - The sentiment label(s) assigned to the target word is different in the train compared to the test.
2. Same Once (SO) - The single sentiment label assigned to the target word in the train is the same in the test.
3. Same Multi (SM) - The multiple sentiment labels assigned to the target in the train are the same in the test.
4. Similar (S) - At least one of the multiple sentiment labels assigned to the target word in the train are the same as in the test but not all.

The premise of these subsets are the following:
1. DS - Is a more difficult subset as it would require the classifier to make a prediction for a sentiment class never seen for that target.
2. SO - Can be seen as a metric for Target overfitting if the classifier performs really well on this subset more so than the other.
3. SM and S - Are the more general and common cases, but the Similar subset is expected to be more difficult as it can have cases of Targets with sentiments never seen before.

These subsets relate a lot to the KSKT and USKT, where the USKT are a mix of the Similar and Different Sentiment subsets and the KSKT are a mix of the Similar, Same Multi and Same Once subsets.

In [13]:
from target_extraction.error_analysis import (different_sentiment,
                                              same_multi_sentiment,
                                              same_one_sentiment,
                                              similar_sentiment)

# The four fine grained subset functions plus the Unknown Targets subset, in
# the order their rows appear in the resulting DataFrame.
subset_functions = [
    same_one_sentiment,
    same_multi_sentiment,
    similar_sentiment,
    different_sentiment,
    unknown_targets,
]
fgtss_df = get_subset_data(
    train_test_datasets,
    subset_functions=subset_functions,
    lower_target=True,
)
fgtss_df

Unnamed: 0,Dataset,Number Samples,Number of Samples (%),Data Subset
0,Laptop,50,7.836991,Same One Sentiment
1,Laptop,84,13.166144,Same Multi Sentiment
2,Laptop,189,29.623824,Similar Sentiment
3,Laptop,46,7.210031,Different Sentiment
4,Laptop,269,42.163009,Unknown Targets
5,Restaurant,45,4.017857,Same One Sentiment
6,Restaurant,333,29.732143,Same Multi Sentiment
7,Restaurant,346,30.892857,Similar Sentiment
8,Restaurant,44,3.928571,Different Sentiment
9,Restaurant,352,31.428571,Unknown Targets


In [0]:
# Bar chart of the FGTSS subset percentages, one bar per dataset coloured by
# subset.
subset_dist_chart(fgtss_df, 'FGTSS in the Test data')

## Subsets based on the number of distinct sentiments in the sentence
This is the only subset of data we are going to look at that relates to previous work. These subsets by [Wang et al, 2017](https://aclweb.org/anthology/E17-1046), called Distinct Sentiments (DS), are based on the number of distinct sentiments that are within a sentence. For example, a sample would be DS 1 if the sample's text only contains one sentiment within it, or DS 2 if it contains two sentiments.

The premise of this is that the more distinct sentiments there are in the sentence the more difficult it is for the classifier, as it has to disentangle the different sentiments that are associated to the different targets. Furthermore if the whole dataset is DS 1 then a sentence based sentiment classifier would probably work quite well on this dataset, as it would not have to learn the target sentiment relation explicitly.

Below like the other subsets we retrieve the data and plot it for the: 
1. Test dataset.
2. Train dataset.
3. The combination of Train and Test datasets.

In [35]:
from target_extraction.error_analysis import distinct_sentiment

# (collection, dataset name) pairs for the test splits and for the train and
# test splits combined.
test_dataset_name = [(test_data, dataset_label)
                     for _, test_data, dataset_label in train_test_datasets]
train_and_test_name = [
    (TargetTextCollection.combine(train_data, test_data), dataset_label)
    for train_data, test_data, dataset_label in train_test_datasets
]

# Distinct Sentiment statistics for each of the three views of the data.
train_ds_df = get_distinct_data(train_dataset_name)
test_ds_df = get_distinct_data(test_dataset_name)
train_test_ds_df = get_distinct_data(train_and_test_name)

# Only the left-most (train) chart uses the default Y-axis; the other two are
# created with no_labels=True.
train_ds_chart = subset_dist_chart(
    train_ds_df, 'Distinct Sentiment in the Train data')
test_ds_chart = subset_dist_chart(
    test_ds_df, 'Distinct Sentiment in the Test data', True)
train_test_title = ('Distinct Sentiment in the combination of Train and'
                    ' Test data')
train_test_ds_chart = subset_dist_chart(
    train_test_ds_df, train_test_title, True)

train_ds_chart | test_ds_chart | train_test_ds_chart

As we can see, only the Election dataset appears to have an almost equal split between DS 1 and DS 2. None of the three datasets has many of the very difficult DS 3 samples, and again only the Election dataset has a noticeable number of them. Furthermore the Restaurant dataset has almost no DS 3 samples in the test dataset. However the representation of DS samples seems to be comparable in the train and test datasets for all of the datasets.