In [2]:
import random as rn
# Seed Python's built-in global RNG with a fixed value.
rn.seed(42)
from pathlib import Path

import numpy as np
# Seed NumPy's legacy global RNG with the same fixed value.
np.random.seed(42)
from sklearn.model_selection import StratifiedShuffleSplit
# bella exposes the parser under the name `mitchel`; it is aliased to `mitchell` here.
from bella.parsers import mitchel as mitchell
from bella.data_types import TargetCollection, Target
from bella import write_data

# Project-local module providing the `mitchell_*` settings read by later cells
# (presumably dataset file paths -- TODO confirm).
import config

# Creating Training and Test sets for the [Mitchell et al. 2013](https://www.aclweb.org/anthology/D13-1171) Dataset
We show how we created the Training and Test sets for this dataset.

The original dataset can be downloaded from [here](http://www.m-mitchell.com/code/MitchellEtAl-13-OpenSentiment.tgz) and the accompanying paper can be found [here](https://www.aclweb.org/anthology/D13-1171). As Mitchell et al. evaluated their models using 10-fold cross-validation, they do not provide a single train/test split. We therefore take one of their train/test folds, combine the two parts, and split the result into 70% train and 30% test. We then save the new train and test datasets in XML, using the same format as the [SemEval 2014](http://alt.qcri.org/semeval2014/task4/) datasets (we chose this format as we found it the easiest to parse, use, and understand, both programmatically and visually).

The original dataset contains 3288 targets, as stated in the paper. We show in this notebook that we obtain the same number of targets, which indicates that we have parsed the dataset correctly.

In [3]:
# NOTE(review): CONFIG_FP is not referenced by any cell visible here; kept in
# case it is used elsewhere.
CONFIG_FP = Path('..', 'config.yaml')

# Mitchell dataset: parse both halves of the chosen fold, then merge them back
# into a single collection so that it can be re-split below.
mitchell_original_train = mitchell(config.mitchell_original_train)
mitchell_original_test = mitchell(config.mitchell_original_test)
mitchell_combined = TargetCollection.combine_collections(
    mitchell_original_train,
    mitchell_original_test,
)

# Report the total number of parsed targets.
print(f'Parsed dataset size {len(mitchell_combined)}')

Parsed dataset size 3288


In [4]:
# A single stratified shuffle split holding out 30% of the data for testing;
# the fixed random_state makes the split reproducible.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

mitchell_data = np.asarray(mitchell_combined.data_dict())
mitchell_sentiment = np.asarray(mitchell_combined.sentiment_data())
# n_splits=1, so the splitter yields exactly one (train, test) pair of index
# arrays; take it directly rather than looping over a one-element generator.
train_indexs, test_indexs = next(splitter.split(mitchell_data, mitchell_sentiment))
train_data = mitchell_data[train_indexs]
test_data = mitchell_data[test_indexs]


def convert_to_targets(data):
    """Return a list with one `Target` per item of `data`.

    Each item is unpacked as keyword arguments to the `Target` constructor.
    """
    return [Target(**target) for target in data]


mitchell_train = TargetCollection(convert_to_targets(train_data))
mitchell_test = TargetCollection(convert_to_targets(test_data))

# Sanity check: the two splits together must account for every combined target.
assert len(mitchell_train) + len(mitchell_test) == len(mitchell_combined), \
    'Train and test splits do not add up to the combined dataset size'

print(f'''
The dataset has now been split with respect to the class labels so each class label is represented equally in the train and test splits which can be shown here:

Train Data ratio: {mitchell_train.ratio_targets_sentiment()}
Train Data raw values: {mitchell_train.no_targets_sentiment()}

Test Data ratio: {mitchell_test.ratio_targets_sentiment()}
Test Data raw values: {mitchell_test.no_targets_sentiment()}

Original Data ratio: {mitchell_combined.ratio_targets_sentiment()}  
Original Data raw values: {mitchell_combined.no_targets_sentiment()}

We now save the data to XML file format which is the same as the SemEval data format.
''')


The dataset has now been split with respect to the class labels so each class label is represented equally in the train and test splits which can be shown here:

Train Data ratio: {0: 0.7, 1: 0.22, -1: 0.08}
Train Data raw values: {0: 1614, 1: 495, -1: 192}

Test Data ratio: {0: 0.7, -1: 0.08, 1: 0.21}
Test Data raw values: {0: 692, -1: 83, 1: 212}

Original Data ratio: {-1: 0.08, 1: 0.22, 0: 0.7}  
Original Data raw values: {-1: 275, 1: 707, 0: 2306}

We now save the data to XML file format which is the same as the SemEval data format.



In [5]:
# Write each split with the SemEval 2014 writer to its destination from config.
for save_path, collection in (
    (config.mitchell_train, mitchell_train),
    (config.mitchell_test, mitchell_test),
):
    write_data.semeval_14(save_path, collection)