In [2]:
def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and ignores them."""
    return None
import warnings
# Replace the module-level ``warnings.warn`` with the no-op above so that every
# warning raised through it after this point is silently discarded.
warnings.warn = warn
from collections import defaultdict
import os
import sys

sys.path.append(os.path.abspath(os.pardir))

import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

from tdparse.helper import read_config, full_path
from tdparse.parsers import semeval_14, semeval_15_16, dong, election
from tdparse.data_types import TargetCollection

In [3]:
def _load_from_config(parser, config_key, **parser_kwargs):
    """Run `parser` on the path stored under `config_key` in the project config.

    Any extra keyword arguments are forwarded unchanged to `parser`.
    """
    dataset_path = full_path(read_config(config_key))
    return parser(dataset_path, **parser_kwargs)


# Load all of the datasets
youtubean = _load_from_config(semeval_14, 'youtubean')
semeval_14_rest_train = _load_from_config(semeval_14, 'semeval_2014_rest_train')
semeval_14_lap_train = _load_from_config(semeval_14, 'semeval_2014_lap_train')
semeval_14_rest_test = _load_from_config(semeval_14, 'semeval_2014_rest_test')
semeval_14_lap_test = _load_from_config(semeval_14, 'semeval_2014_lap_test')
semeval_15_rest_test = _load_from_config(semeval_15_16, 'semeval_2015_rest_test')
semeval_16_rest_test = _load_from_config(semeval_15_16, 'semeval_2016_rest_test',
                                         sep_16_from_15=True)
dong_train = _load_from_config(dong, 'dong_twit_train_data')
dong_test = _load_from_config(dong, 'dong_twit_test_data')
# The election parser hands back the train and test collections together.
election_train, election_test = _load_from_config(election, 'election_folder_dir')

# Product reviews are made up of three different products:
# 1. Speaker, 2. Computer, and 3. Router -- one XML file each.
product_reviews_folder = full_path(read_config('product_reviews_dir'))
speaker_reviews, computer_reviews, router_reviews = [
    semeval_14(os.path.join(product_reviews_folder, review_file))
    for review_file in ('Speaker.xml', 'Computer.xml', 'Router.xml')
]

# Combine all of the product reviews
product_reviews = TargetCollection.combine_collections(
    speaker_reviews, computer_reviews, router_reviews)
# Combine SemEval 14 restaurant train and test
semeval_14_rest_all = TargetCollection.combine_collections(
    semeval_14_rest_train, semeval_14_rest_test)
# Combine SemEval 14 restaurant (train + test) with the SemEval 15 test set
semeval_14_15 = TargetCollection.combine_collections(
    semeval_14_rest_all, semeval_15_rest_test)

# Display name -> collection; insertion order sets the order of later reports.
datasets = {
    'Product Reviews': product_reviews,
    'Sem 14 Laptop Train': semeval_14_lap_train,
    'Sem 14 Laptop Test': semeval_14_lap_test,
    'Sem 14 Rest Train': semeval_14_rest_train,
    'Sem 14 Rest All': semeval_14_rest_all,
    'Sem 14 & 15 Rest': semeval_14_15,
    'Sem 14 Rest Test': semeval_14_rest_test,
    'Sem 15 Rest Test': semeval_15_rest_test,
    'Sem 16 Rest Test': semeval_16_rest_test,
    'Youtubean': youtubean,
    'Dong Twit Train': dong_train,
    'Dong Twit Test': dong_test,
    'Election Train': election_train,
    'Election Test': election_test,
}

In [4]:
# Report the set of sentiment label values stored in each dataset.
for name, dataset in datasets.items():
    stored_labels = dataset.stored_sentiments()
    print('{} lables: {}'.format(name, stored_labels))

Product Reviews lables: {1, -1}
Sem 14 Laptop Train lables: {0, 1, -1}
Sem 14 Laptop Test lables: {0, 1, -1}
Sem 14 Rest Train lables: {0, 1, -1}
Sem 14 Rest All lables: {0, 1, -1}
Sem 14 & 15 Rest lables: {0, 1, -1}
Sem 14 Rest Test lables: {0, 1, -1}
Sem 15 Rest Test lables: {0, 1, -1}
Sem 16 Rest Test lables: {0, 1, -1}
Youtubean lables: {0, 1, -1}
Dong Twit Train lables: {0, 1, -1}
Dong Twit Test lables: {0, 1, -1}
Election Train lables: {'negative', 'positive', 'neutral'}
Election Test lables: {'negative', 'positive', 'neutral'}


# Datasets
This notebook will describe the different datasets that have been used as well as the statistics of these datasets. The datasets used are the following:
1. [Dong et al.](https://aclanthology.coli.uni-saarland.de/papers/P14-2009/p14-2009) [Twitter dataset](https://github.com/bluemonk482/tdparse/tree/master/data/lidong). NOTE: the dataset link does not point to the version released with the paper, because that version has already been pre-processed whereas this one has not.
2. [SemEval 2014 Restaurant dataset](http://alt.qcri.org/semeval2014/task4/index.php?id=data-and-tools). We used Train dataset version 2 and the test dataset. This dataset contains 4 sentiment values: 1. Positive, 2. Neutral, 3. Negative, and 4. Conflict. We only use the first 3, both to make it comparable to the other datasets and because the conflict label only has 91 instances in the training set and 14 in the test set.
3. [SemEval 2014 Laptop dataset](http://alt.qcri.org/semeval2014/task4/index.php?id=data-and-tools). We used Train dataset version 2 and the test dataset. This dataset contains 4 sentiment values: 1. Positive, 2. Neutral, 3. Negative, and 4. Conflict. We only use the first 3, both to make it comparable to the other datasets and because the conflict label only has 45 instances in the training set and 16 in the test set.
4. [SemEval 2015 Restaurant dataset](http://alt.qcri.org/semeval2015/task12/index.php?id=data-and-tools). We only used the test dataset as the training set has overlap with the original 2014 version.
5. [SemEval 2016 Restaurant dataset](http://alt.qcri.org/semeval2016/task5/index.php?id=data-and-tools). We only used the test dataset as the training set has overlap with the original 2014 version.
6. [Election dataset](https://figshare.com/articles/EACL_2017_-_Multi-target_UK_election_Twitter_sentiment_corpus/4479563/1)
7. [Product review by Liu et al, IJCAI-2015](https://www.aaai.org/ocs/index.php/IJCAI/IJCAI15/paper/view/10766) [dataset](https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#datasets)
8. [Youtubean dataset](https://github.com/epochx/opinatt/blob/master/samsung_galaxy_s5.xml) [by Marrese-Taylor et al.](https://www.aclweb.org/anthology/W17-5213) - Dataset of 7 YouTube reviews of the Samsung Galaxy S5. The texts are the closed captions of the videos, where the captions were provided by the video authors and not automatically generated.

In [32]:
# Build a table with one row of summary statistics per dataset.
# `columns` is the single source of truth for the column names: the per-dataset
# values are stored under exactly these names, because pandas selects dict
# entries by column name and fills any name it cannot find with NaN.
columns = ['No. Targets (Dataset Size)', 'No. Senti Labels',
           'Mean Targets per Sent', 'No Unique Targets',
           '% Targets with 1 Senti', '% Targets with 2 Senti',
           '% Targets with 3 Senti']
dataset_dict = defaultdict(list)
index = []
for name, dataset in datasets.items():
    index.append(name)
    num_targets = len(dataset)
    # Computed here for every dataset so the cell does not depend on a value
    # left behind in the kernel by some other (possibly deleted) cell.
    num_sentiment_labels = len(dataset.stored_sentiments())

    # Percentage of targets returned by `subset_by_sentiment(i)` for i = 1..3
    # (presumably targets in sentences with i distinct sentiments -- confirm
    # against TargetCollection.subset_by_sentiment).
    targets_i_senti = []
    for i in range(1, 4):
        if i > num_sentiment_labels:
            # The dataset has fewer than `i` label values, so record 0
            # without querying the collection.
            targets_i_senti.append(0)
        else:
            i_senti_targets = len(dataset.subset_by_sentiment(i))
            targets_i_senti.append((i_senti_targets / num_targets) * 100)

    # Values listed in the same order as `columns`.
    row_values = [num_targets,
                  num_sentiment_labels,
                  dataset.avg_targets_per_sentence(),
                  dataset.number_unique_targets()] + targets_i_senti
    for column_name, value in zip(columns, row_values):
        dataset_dict[column_name].append(value)


dataset_stats = pd.DataFrame(dataset_dict, index=index, columns=columns)
dataset_stats.round(2)

Unnamed: 0,No. Targets (Dataset Size),No. Senti Labels,Mean Targets per Sent,No Unique Targets,% Targets with 1 Senti,% Targets with 2 Senti,% Targets with 3 Senti
Product Reviews,1101,2,1.4,468,94.37,5.63,0.0
Sem 14 Laptop Train,2313,3,1.58,1031,80.33,18.42,1.25
Sem 14 Laptop Test,638,3,1.55,415,83.86,14.73,1.41
Sem 14 Rest Train,3602,3,1.82,1268,73.9,23.82,2.28
Sem 14 Rest All,4722,3,1.83,1630,75.26,22.94,1.8
Sem 14 & 15 Rest,5319,3,1.79,1805,75.78,22.56,1.65
Sem 14 Rest Test,1120,3,1.87,553,79.64,20.09,0.27
Sem 15 Rest Test,597,3,1.49,269,79.9,19.6,0.5
Sem 16 Rest Test,649,3,1.55,312,89.83,9.71,0.46
Youtubean,798,3,2.07,522,81.45,18.17,0.38


As we can see from the table, the two smallest datasets are the Youtubean and Product Reviews datasets, and neither currently has a train/test split. We therefore create the train/test split below.

## Youtubean train test splitting

In [None]:
# One stratified shuffle split that holds out 20% of the data as the test set;
# the fixed random_state makes the split reproducible across runs.
splitter = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=0,
)