In [1]:
from collections import defaultdict

from bella.data_types import TargetCollection
from bella import parsers
import pandas as pd

import config


Using TensorFlow backend.


# Dataset Statistics

In this notebook we show various dataset statistics for all six datasets used.

First we load the datasets:

In [2]:
# Parse the train and test split of every corpus.  The Dong and Election
# corpora have dedicated parsers; every other corpus is read with the
# SemEval 2014 parser.  Both Election splits are read from `config.ELECTION`.
semeval = parsers.semeval_14
dong_train = parsers.dong(config.DONG_TRAIN, name='Dong Train')
dong_test = parsers.dong(config.DONG_TEST, name='Dong')
laptop_train = semeval(config.laptop_train, name='Laptop Train')
laptop_test = semeval(config.laptop_test, name='Laptop')
restaurant_train = semeval(config.restaurant_train, name='Restaurant Train')
restaurant_test = semeval(config.restaurant_test, name='Restaurant')
election_train = parsers.election_train(config.ELECTION, name='Election Train')
election_test = parsers.election_test(config.ELECTION, name='Election')
mitchell_train = semeval(config.mitchell_train, name='Mitchell Train')
mitchell_test = semeval(config.mitchell_test, name='Mitchell')
youtubean_train = semeval(config.youtubean_train, name='YouTuBean Train')
youtubean_test = semeval(config.youtubean_test, name='YouTuBean')

# Training splits keyed by display name; the insertion order here is the
# row order of every table built from this mapping.
training_sets = {
    'Laptop': laptop_train,
    'Restaurant': restaurant_train,
    'Mitchell': mitchell_train,
    'Dong Twitter': dong_train,
    'Election Twitter': election_train,
    'YouTuBean': youtubean_train,
}

# Merge each training split with its test split so statistics can be
# reported over the whole corpus.
combine = TargetCollection.combine_collections
youtubean = combine(youtubean_train, youtubean_test)
restaurant = combine(restaurant_train, restaurant_test)
laptop = combine(laptop_train, laptop_test)
dong = combine(dong_train, dong_test)
election = combine(election_train, election_test)
mitchell = combine(mitchell_train, mitchell_test)

# Combined (train + test) collections, same keys and order as `training_sets`.
datasets = {
    'Laptop': laptop,
    'Restaurant': restaurant,
    'Mitchell': mitchell,
    'Dong Twitter': dong,
    'Election Twitter': election,
    'YouTuBean': youtubean,
}

We create statistics on domain, type, medium, number of targets, sentence length, and the number of unique sentiments per sentence.

In [3]:
# Summary table with one row per combined (train + test) dataset.
senti_columns = [f'% Targets with {n} Unique Sentiment per Sentence'
                 for n in range(1, 4)]
columns = ['Domain', 'Type', 'Medium', 'No. Targets (Dataset Size)',
           'No. Senti Labels', 'Mean Targets per Sent', 'No Unique Targets',
           *senti_columns, 'Avg sentence length per target']

# Hand-written descriptive metadata, keyed by the names used in `datasets`.
name_domain = {
    'Laptop': 'Laptop',
    'Restaurant': 'Restaurant',
    'Mitchell': 'General',
    'Dong Twitter': 'General',
    'Election Twitter': 'Politics',
    'YouTuBean': 'Mobile Phones',
}
name_type = {
    'Laptop': 'Review',
    'Restaurant': 'Review',
    'Mitchell': 'Social Media',
    'Dong Twitter': 'Social Media',
    'Election Twitter': 'Social Media',
    'YouTuBean': 'Review',
}
name_medium = {
    'Laptop': 'Written',
    'Restaurant': 'Written',
    'Mitchell': 'Written',
    'Dong Twitter': 'Written',
    'Election Twitter': 'Written',
    'YouTuBean': 'Spoken',
}

dataset_dict = defaultdict(list)
index = []
for name, dataset in datasets.items():
    index.append(name)
    num_targets = len(dataset)
    num_sentiment_labels = len(dataset.stored_sentiments())
    avg_sent_length = dataset.avg_sentence_length_per_target()
    # Share of targets returned by `subset_by_sentiment(n)` for n = 1..3
    # (presumably targets whose sentence holds n distinct sentiments, per the
    # column titles -- confirm against bella).  A plain 0 is recorded when n
    # exceeds the number of labels stored in the dataset.
    targets_i_senti = [
        0 if n > num_sentiment_labels
        else (len(dataset.subset_by_sentiment(n)) / num_targets) * 100
        for n in range(1, 4)
    ]
    row = {
        'Domain': name_domain[name],
        'Type': name_type[name],
        'Medium': name_medium[name],
        'No. Targets (Dataset Size)': num_targets,
        'No. Senti Labels': num_sentiment_labels,
        'Mean Targets per Sent': dataset.avg_targets_per_sentence(),
        'No Unique Targets': dataset.number_unique_targets(),
        **dict(zip(senti_columns, targets_i_senti)),
        'Avg sentence length per target': avg_sent_length,
    }
    for column, value in row.items():
        dataset_dict[column].append(value)

dataset_stats = pd.DataFrame(dataset_dict, index=index, columns=columns)
dataset_stats.round(2)

Unnamed: 0,Domain,Type,Medium,No. Targets (Dataset Size),No. Senti Labels,Mean Targets per Sent,No Unique Targets,% Targets with 1 Unique Sentiment per Sentence,% Targets with 2 Unique Sentiment per Sentence,% Targets with 3 Unique Sentiment per Sentence,Avg sentence length per target
Laptop,Laptop,Review,Written,2951,3,1.58,1295,81.09,17.62,1.29,18.57
Restaurant,Restaurant,Review,Written,4722,3,1.83,1630,75.26,22.94,1.8,17.25
Mitchell,General,Social Media,Written,3288,3,1.22,2507,90.48,9.43,0.09,18.02
Dong Twitter,General,Social Media,Written,6940,3,1.0,145,100.0,0.0,0.0,17.37
Election Twitter,Politics,Social Media,Written,11899,3,2.94,2190,44.5,46.72,8.78,21.68
YouTuBean,Mobile Phones,Review,Spoken,798,3,1.56,522,88.85,11.15,0.0,22.53


Sentiment distribution of all datasets:

In [4]:
# Sentiment distribution table: one row per combined dataset and one
# "count (percent)" cell per sentiment class.
dataset_dict = defaultdict(list)
index = []
# NOTE(review): 'Poisitve' is a misspelling of 'Positive'; the header is left
# unchanged so that it keeps matching the recorded cell output.
columns = ['No. Poisitve (%)', 'No. Neutral (%)', 'No. Negative (%)']
# Sentiment label values, listed in the same order as `columns`.
sentiment_values = [1, 0, -1]
sentiment_mapper = dict(zip(sentiment_values, columns))

for name, dataset in datasets.items():
    index.append(name)
    num_targets = len(dataset)
    sentiment_freq = defaultdict(int)
    for data in dataset.data():
        sentiment_freq[data['sentiment']] += 1

    # Fail with a readable message instead of a bare KeyError when a dataset
    # carries a label that has no column.
    unexpected = set(sentiment_freq) - set(sentiment_values)
    if unexpected:
        raise ValueError(f'Dataset {name!r} contains sentiment labels without '
                         f'a column: {sorted(unexpected, key=repr)}')

    # Walk the fixed label list rather than only the labels that occur, so
    # every dataset contributes exactly one cell to every column.  Otherwise a
    # dataset missing a class would leave the column lists with unequal
    # lengths and misalign (or break) the DataFrame below.
    for sentiment_value in sentiment_values:
        freq = sentiment_freq.get(sentiment_value, 0)
        # An empty dataset is reported as 0.00% instead of dividing by zero.
        senti_percentage = (freq / num_targets) * 100 if num_targets else 0.0
        freq_percentage = f'{freq} ({senti_percentage:.2f})'
        dataset_dict[sentiment_mapper[sentiment_value]].append(freq_percentage)

sentiment_freq_stats = pd.DataFrame(dataset_dict, index=index, columns=columns)
sentiment_freq_stats

Unnamed: 0,No. Poisitve (%),No. Neutral (%),No. Negative (%)
Laptop,1328 (45.00),629 (21.31),994 (33.68)
Restaurant,2892 (61.25),829 (17.56),1001 (21.20)
Mitchell,707 (21.50),2306 (70.13),275 (8.36)
Dong Twitter,1734 (24.99),3473 (50.04),1733 (24.97)
Election Twitter,1744 (14.66),4572 (38.42),5583 (46.92)
YouTuBean,224 (28.07),504 (63.16),70 (8.77)


Statistics for the training sets only (the same columns as the combined train + test table above):

In [5]:
# Summary table with one row per training split.
senti_columns = [f'% Targets with {n} Unique Sentiment per Sentence'
                 for n in range(1, 4)]
columns = ['Domain', 'Type', 'Medium', 'No. Targets (Dataset Size)',
           'No. Senti Labels', 'Mean Targets per Sent', 'No Unique Targets',
           *senti_columns, 'Avg sentence length per target']

# Hand-written descriptive metadata, keyed by the names used in `training_sets`.
name_domain = {
    'Laptop': 'Laptop',
    'Restaurant': 'Restaurant',
    'Mitchell': 'General',
    'Dong Twitter': 'General',
    'Election Twitter': 'Politics',
    'YouTuBean': 'Mobile Phones',
}
name_type = {
    'Laptop': 'Review',
    'Restaurant': 'Review',
    'Mitchell': 'Social Media',
    'Dong Twitter': 'Social Media',
    'Election Twitter': 'Social Media',
    'YouTuBean': 'Review',
}
name_medium = {
    'Laptop': 'Written',
    'Restaurant': 'Written',
    'Mitchell': 'Written',
    'Dong Twitter': 'Written',
    'Election Twitter': 'Written',
    'YouTuBean': 'Spoken',
}

train_dataset_dict = defaultdict(list)
index = []
for name, dataset in training_sets.items():
    index.append(name)
    num_targets = len(dataset)
    num_sentiment_labels = len(dataset.stored_sentiments())
    avg_sent_length = dataset.avg_sentence_length_per_target()
    # Share of targets returned by `subset_by_sentiment(n)` for n = 1..3
    # (presumably targets whose sentence holds n distinct sentiments, per the
    # column titles -- confirm against bella).  A plain 0 is recorded when n
    # exceeds the number of labels stored in the dataset.
    targets_i_senti = [
        0 if n > num_sentiment_labels
        else (len(dataset.subset_by_sentiment(n)) / num_targets) * 100
        for n in range(1, 4)
    ]
    row = {
        'Domain': name_domain[name],
        'Type': name_type[name],
        'Medium': name_medium[name],
        'No. Targets (Dataset Size)': num_targets,
        'No. Senti Labels': num_sentiment_labels,
        'Mean Targets per Sent': dataset.avg_targets_per_sentence(),
        'No Unique Targets': dataset.number_unique_targets(),
        **dict(zip(senti_columns, targets_i_senti)),
        'Avg sentence length per target': avg_sent_length,
    }
    for column, value in row.items():
        train_dataset_dict[column].append(value)

train_dataset_stats = pd.DataFrame(train_dataset_dict, index=index, columns=columns)
train_dataset_stats.round(2)

Unnamed: 0,Domain,Type,Medium,No. Targets (Dataset Size),No. Senti Labels,Mean Targets per Sent,No Unique Targets,% Targets with 1 Unique Sentiment per Sentence,% Targets with 2 Unique Sentiment per Sentence,% Targets with 3 Unique Sentiment per Sentence,Avg sentence length per target
Laptop,Laptop,Review,Written,2313,3,1.58,1031,80.33,18.42,1.25,19.28
Restaurant,Restaurant,Review,Written,3602,3,1.82,1268,73.9,23.82,2.28,17.55
Mitchell,General,Social Media,Written,2301,3,1.27,1824,89.14,10.73,0.13,18.05
Dong Twitter,General,Social Media,Written,6248,3,1.0,140,100.0,0.0,0.0,17.36
Election Twitter,Politics,Social Media,Written,9358,3,2.94,1855,44.14,46.77,9.08,21.67
YouTuBean,Mobile Phones,Review,Spoken,558,3,1.74,394,85.66,14.34,0.0,22.75


In [7]:
# Sentiment distribution of the training splits: one row per dataset and one
# "count (percent)" cell per sentiment class.
train_dataset_dict = defaultdict(list)
index = []
# NOTE(review): 'Poisitve' is a misspelling of 'Positive'; the header is left
# unchanged so that it keeps matching the recorded cell output.
columns = ['No. Poisitve (%)', 'No. Neutral (%)', 'No. Negative (%)']
# Sentiment label values, listed in the same order as `columns`.
sentiment_values = [1, 0, -1]
sentiment_mapper = dict(zip(sentiment_values, columns))

for name, dataset in training_sets.items():
    index.append(name)
    num_targets = len(dataset)
    sentiment_freq = defaultdict(int)
    for data in dataset.data():
        sentiment_freq[data['sentiment']] += 1

    # Fail with a readable message instead of a bare KeyError when a dataset
    # carries a label that has no column.
    unexpected = set(sentiment_freq) - set(sentiment_values)
    if unexpected:
        raise ValueError(f'Dataset {name!r} contains sentiment labels without '
                         f'a column: {sorted(unexpected, key=repr)}')

    # Walk the fixed label list rather than only the labels that occur, so
    # every dataset contributes exactly one cell to every column.  Otherwise a
    # dataset missing a class would leave the column lists with unequal
    # lengths and misalign (or break) the DataFrame below.
    for sentiment_value in sentiment_values:
        freq = sentiment_freq.get(sentiment_value, 0)
        # An empty dataset is reported as 0.00% instead of dividing by zero.
        senti_percentage = (freq / num_targets) * 100 if num_targets else 0.0
        freq_percentage = f'{freq} ({senti_percentage:.2f})'
        train_dataset_dict[sentiment_mapper[sentiment_value]].append(freq_percentage)

train_sentiment_freq_stats = pd.DataFrame(train_dataset_dict, index=index, columns=columns)
train_sentiment_freq_stats

Unnamed: 0,No. Poisitve (%),No. Neutral (%),No. Negative (%)
Laptop,987 (42.67),460 (19.89),866 (37.44)
Restaurant,2164 (60.08),633 (17.57),805 (22.35)
Mitchell,495 (21.51),1614 (70.14),192 (8.34)
Dong Twitter,1561 (24.98),3127 (50.05),1560 (24.97)
Election Twitter,1366 (14.60),3615 (38.63),4377 (46.77)
YouTuBean,157 (28.14),352 (63.08),49 (8.78)


Statistics for the small training sets (each contains 558 targets, as the table below shows):

In [9]:
# Load the small training splits.  Every file lives in
# `config.small_training_dataset_dir`, is named '<corpus> train.xml', and is
# read with the SemEval 2014 parser (including the Dong and Election sets).
small_training_dataset_dir = config.small_training_dataset_dir

# Display name -> parsed collection, in the row order used by the tables.
small_training_sets = {
    display_name: parsers.semeval_14(
        small_training_dataset_dir / f'{file_stem} train.xml')
    for display_name, file_stem in [('Laptop', 'Laptop'),
                                    ('Restaurant', 'Restaurant'),
                                    ('Mitchell', 'Mitchell'),
                                    ('Dong Twitter', 'Dong'),
                                    ('Election Twitter', 'Election')]
}

# Keep the individual collections available under their own names as well.
small_laptop_train = small_training_sets['Laptop']
small_restaurant_train = small_training_sets['Restaurant']
small_mitchell_train = small_training_sets['Mitchell']
small_dong_train = small_training_sets['Dong Twitter']
small_election_train = small_training_sets['Election Twitter']


# Summary table with one row per small training split.
senti_columns = [f'% Targets with {n} Unique Sentiment per Sentence'
                 for n in range(1, 4)]
columns = ['Domain', 'Type', 'Medium', 'No. Targets (Dataset Size)',
           'No. Senti Labels', 'Mean Targets per Sent', 'No Unique Targets',
           *senti_columns, 'Avg sentence length per target']

# Hand-written descriptive metadata, keyed by the names used in
# `small_training_sets`.
name_domain = {
    'Laptop': 'Laptop',
    'Restaurant': 'Restaurant',
    'Mitchell': 'General',
    'Dong Twitter': 'General',
    'Election Twitter': 'Politics',
}
name_type = {
    'Laptop': 'Review',
    'Restaurant': 'Review',
    'Mitchell': 'Social Media',
    'Dong Twitter': 'Social Media',
    'Election Twitter': 'Social Media',
}
name_medium = {
    'Laptop': 'Written',
    'Restaurant': 'Written',
    'Mitchell': 'Written',
    'Dong Twitter': 'Written',
    'Election Twitter': 'Written',
}

small_train_dataset_dict = defaultdict(list)
index = []
for name, dataset in small_training_sets.items():
    index.append(name)
    num_targets = len(dataset)
    num_sentiment_labels = len(dataset.stored_sentiments())
    avg_sent_length = dataset.avg_sentence_length_per_target()
    # Share of targets returned by `subset_by_sentiment(n)` for n = 1..3
    # (presumably targets whose sentence holds n distinct sentiments, per the
    # column titles -- confirm against bella).  A plain 0 is recorded when n
    # exceeds the number of labels stored in the dataset.
    targets_i_senti = [
        0 if n > num_sentiment_labels
        else (len(dataset.subset_by_sentiment(n)) / num_targets) * 100
        for n in range(1, 4)
    ]
    row = {
        'Domain': name_domain[name],
        'Type': name_type[name],
        'Medium': name_medium[name],
        'No. Targets (Dataset Size)': num_targets,
        'No. Senti Labels': num_sentiment_labels,
        'Mean Targets per Sent': dataset.avg_targets_per_sentence(),
        'No Unique Targets': dataset.number_unique_targets(),
        **dict(zip(senti_columns, targets_i_senti)),
        'Avg sentence length per target': avg_sent_length,
    }
    for column, value in row.items():
        small_train_dataset_dict[column].append(value)

small_train_dataset_stats = pd.DataFrame(small_train_dataset_dict, index=index, columns=columns)
small_train_dataset_stats.round(2)

Unnamed: 0,Domain,Type,Medium,No. Targets (Dataset Size),No. Senti Labels,Mean Targets per Sent,No Unique Targets,% Targets with 1 Unique Sentiment per Sentence,% Targets with 2 Unique Sentiment per Sentence,% Targets with 3 Unique Sentiment per Sentence,Avg sentence length per target
Laptop,Laptop,Review,Written,558,3,1.62,359,78.32,21.68,0.0,19.34
Restaurant,Restaurant,Review,Written,558,3,1.77,300,74.73,23.84,1.43,17.17
Mitchell,General,Social Media,Written,558,3,1.49,471,83.15,16.31,0.54,18.75
Dong Twitter,General,Social Media,Written,558,3,1.0,104,100.0,0.0,0.0,17.39
Election Twitter,Politics,Social Media,Written,558,3,3.0,290,42.11,47.67,10.22,21.31


In [10]:
# Sentiment distribution of the small training splits: one row per dataset
# and one "count (percent)" cell per sentiment class.
small_train_dataset_dict = defaultdict(list)
index = []
# NOTE(review): 'Poisitve' is a misspelling of 'Positive'; the header is left
# unchanged so that it keeps matching the recorded cell output.
columns = ['No. Poisitve (%)', 'No. Neutral (%)', 'No. Negative (%)']
# Sentiment label values, listed in the same order as `columns`.
sentiment_values = [1, 0, -1]
sentiment_mapper = dict(zip(sentiment_values, columns))

for name, dataset in small_training_sets.items():
    index.append(name)
    num_targets = len(dataset)
    sentiment_freq = defaultdict(int)
    for data in dataset.data():
        sentiment_freq[data['sentiment']] += 1

    # Fail with a readable message instead of a bare KeyError when a dataset
    # carries a label that has no column.
    unexpected = set(sentiment_freq) - set(sentiment_values)
    if unexpected:
        raise ValueError(f'Dataset {name!r} contains sentiment labels without '
                         f'a column: {sorted(unexpected, key=repr)}')

    # Walk the fixed label list rather than only the labels that occur, so
    # every dataset contributes exactly one cell to every column.  Otherwise a
    # dataset missing a class would leave the column lists with unequal
    # lengths and misalign (or break) the DataFrame below.
    for sentiment_value in sentiment_values:
        freq = sentiment_freq.get(sentiment_value, 0)
        # An empty dataset is reported as 0.00% instead of dividing by zero.
        senti_percentage = (freq / num_targets) * 100 if num_targets else 0.0
        freq_percentage = f'{freq} ({senti_percentage:.2f})'
        small_train_dataset_dict[sentiment_mapper[sentiment_value]].append(freq_percentage)

small_train_sentiment_freq_stats = pd.DataFrame(small_train_dataset_dict, index=index, columns=columns)
small_train_sentiment_freq_stats

Unnamed: 0,No. Poisitve (%),No. Neutral (%),No. Negative (%)
Laptop,236 (42.29),111 (19.89),211 (37.81)
Restaurant,348 (62.37),98 (17.56),112 (20.07)
Mitchell,110 (19.71),399 (71.51),49 (8.78)
Dong Twitter,140 (25.09),279 (50.00),139 (24.91)
Election Twitter,74 (13.26),232 (41.58),252 (45.16)
