In [1]:
import logging
from typing import Dict, List, Iterable

from overrides import overrides

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import Field, TextField, SequenceLabelField, MetadataField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.dataset_readers.dataset_utils import Ontonotes, OntonotesSentence

import random
import pickle as pkl


# Module-level logger; `SrlReader._read` uses it to report which path is being
# read and whether a domain filter is active.
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


class SrlReader(DatasetReader):
    """
    This DatasetReader is designed to read in the English OntoNotes v5.0 data
    for semantic role labelling. It returns a dataset of instances with the
    following fields:

    tokens : ``TextField``
        The tokens in the sentence.
    verb_indicator : ``SequenceLabelField``
        A sequence of binary indicators for whether the word is the verb for this frame.
    tags : ``SequenceLabelField``
        A sequence of Propbank tags for the given verb in a BIO format.
    metadata : ``MetadataField``
        A dictionary with the keys ``"words"`` (the token strings of the sentence) and
        ``"verb"`` (the text of the token marked in ``verb_indicator``, or ``None`` when
        no token is marked).

    Parameters
    ----------
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        The indexers handed to the ``TextField`` holding the sentence tokens.
        See :class:`TokenIndexer`.
        Default is ``{"tokens": SingleIdTokenIndexer()}``.
    domain_identifier: ``str``, (default = None)
        A string denoting a sub-domain of the Ontonotes 5.0 dataset to use. If present, only
        conll files under paths containing ``/<domain_identifier>/`` will be processed.
    lazy : ``bool``, (default = False)
        Forwarded unchanged to the ``DatasetReader`` constructor.

    Returns
    -------
    A ``Dataset`` of ``Instances`` for Semantic Role Labelling.

    """
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 domain_identifier: str = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        # A falsy value (None or an empty dict) selects the single-id default.
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._domain_identifier = domain_identifier

    @overrides
    def _read(self, file_path: str):
        """
        Yield one ``Instance`` per SRL frame of every sentence found under ``file_path``.

        A sentence without any SRL frame still yields exactly one instance, whose
        tags are all ``"O"`` and whose verb indicator is all zeros.
        """
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info("Reading SRL instances from dataset files at: %s", file_path)
        if self._domain_identifier is not None:
            logger.info("Filtering to only include file paths containing the %s domain", self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = ["O"] * len(tokens)
                verb_label = [0] * len(tokens)
                yield self.text_to_instance(tokens, verb_label, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    # The predicate position is the one whose tag ends in "-V".
                    verb_indicator = [1 if label.endswith("-V") else 0 for label in tags]
                    yield self.text_to_instance(tokens, verb_indicator, tags)

    @staticmethod
    def _ontonotes_subset(ontonotes_reader: Ontonotes,
                          file_path: str,
                          domain_identifier: str) -> Iterable[OntonotesSentence]:
        """
        Iterates over the Ontonotes 5.0 dataset using an optional domain identifier.
        If the domain identifier is present, only examples which contain the domain
        identifier in the file path are yielded.
        """
        for conll_file in ontonotes_reader.dataset_path_iterator(file_path):
            # The identifier must appear as a complete "/"-delimited path component.
            if domain_identifier is None or f"/{domain_identifier}/" in conll_file:
                yield from ontonotes_reader.sentence_iterator(conll_file)

    def text_to_instance(self,  # type: ignore
                         tokens: List[Token],
                         verb_label: List[int],
                         tags: List[str] = None) -> Instance:
        """
        We take `pre-tokenized` input here, along with a verb label.  The verb label should be a
        one-hot binary vector, the same length as the tokens, indicating the position of the verb
        to find arguments for.

        Parameters
        ----------
        tokens : ``List[Token]``
            The tokens of the sentence.
        verb_label : ``List[int]``
            The indicator sequence described above.
        tags : ``List[str]``, optional
            The tag for each token. When omitted (or empty) the returned instance
            has no ``tags`` field.

        Returns
        -------
        An ``Instance`` with the fields ``tokens``, ``verb_indicator``, ``metadata`` and,
        when tags were given, ``tags``.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        text_field = TextField(tokens, token_indexers=self._token_indexers)
        fields['tokens'] = text_field
        fields['verb_indicator'] = SequenceLabelField(verb_label, text_field)
        if tags:
            fields['tags'] = SequenceLabelField(tags, text_field)

        # The verb is the token at the first position marked with 1. Guarding on
        # membership means an indicator without any 1 gives ``verb = None``
        # instead of a ValueError from ``list.index``.
        if 1 in verb_label:
            verb = tokens[verb_label.index(1)].text
        else:
            verb = None
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                            "verb": verb})
        return Instance(fields)


In [2]:
def pack_data_into_pkl(domain='bn', quantity=1000, method = 'iid',
                       data_path='datasets/conll-formatted-ontonotes-5.0/data/train/'):
    """
    Draw a reproducible random subset of SRL instances for one domain.

    Every instance that ``SrlReader`` produces for ``domain`` under ``data_path``
    is loaded, the list is shuffled with a generator seeded by ``quantity``, and
    the first ``quantity`` instances are returned. Despite its name, this
    function does not write a pickle file; the caller serialises the result.

    Parameters
    ----------
    domain : str
        Passed to ``SrlReader`` as ``domain_identifier``.
    quantity : int
        Number of instances to keep. It is also the shuffle seed, so each
        subset size gets its own fixed permutation. If fewer instances are
        available, all of them are returned.
    method : str
        Currently unused; kept so existing calls keep working.
    data_path : str
        Directory handed to the reader. Defaults to the training split
        location that was previously hard-coded.

    Returns
    -------
    list
        The selected instances, in shuffled order.
    """
    reader = SrlReader(token_indexers=None,
                       domain_identifier=domain)
    # `_read` is a generator; materialise it so the instances can be shuffled.
    train_data = list(reader._read(data_path))
    # A private generator yields the same permutation as seeding the module-level
    # one with the same value, without resetting the RNG state that the rest of
    # the notebook shares.
    rng = random.Random(quantity)
    rng.shuffle(train_data)
    return train_data[:quantity]

In [3]:
# Write the 1k / 5k / 10k subsets for the 'bn' domain. The `with` block closes
# (and therefore flushes) every pickle file before the next subset is built.
for quantity, tag in ((1000, '1k'), (5000, '5k'), (10000, '10k')):
    train_data = pack_data_into_pkl(domain='bn', quantity=quantity, method='iid')
    with open(f'datasets/cross-domain-subsets/bn_{tag}.pkl', 'wb') as pkl_file:
        pkl.dump(train_data, pkl_file)

In [4]:
# Write the 1k / 5k / 10k subsets for the 'nw' domain. The `with` block closes
# (and therefore flushes) every pickle file before the next subset is built.
for quantity, tag in ((1000, '1k'), (5000, '5k'), (10000, '10k')):
    train_data = pack_data_into_pkl(domain='nw', quantity=quantity, method='iid')
    with open(f'datasets/cross-domain-subsets/nw_{tag}.pkl', 'wb') as pkl_file:
        pkl.dump(train_data, pkl_file)

In [5]:
# Write the 1k / 5k / 10k subsets for the 'bc' domain. The `with` block closes
# (and therefore flushes) every pickle file before the next subset is built.
for quantity, tag in ((1000, '1k'), (5000, '5k'), (10000, '10k')):
    train_data = pack_data_into_pkl(domain='bc', quantity=quantity, method='iid')
    with open(f'datasets/cross-domain-subsets/bc_{tag}.pkl', 'wb') as pkl_file:
        pkl.dump(train_data, pkl_file)

In [6]:
# Write the 1k / 5k / 10k subsets for the 'tc' domain. The `with` block closes
# (and therefore flushes) every pickle file before the next subset is built.
for quantity, tag in ((1000, '1k'), (5000, '5k'), (10000, '10k')):
    train_data = pack_data_into_pkl(domain='tc', quantity=quantity, method='iid')
    with open(f'datasets/cross-domain-subsets/tc_{tag}.pkl', 'wb') as pkl_file:
        pkl.dump(train_data, pkl_file)

In [7]:
# Write the 1k / 5k / 10k subsets for the 'pt' domain. The `with` block closes
# (and therefore flushes) every pickle file before the next subset is built.
for quantity, tag in ((1000, '1k'), (5000, '5k'), (10000, '10k')):
    train_data = pack_data_into_pkl(domain='pt', quantity=quantity, method='iid')
    with open(f'datasets/cross-domain-subsets/pt_{tag}.pkl', 'wb') as pkl_file:
        pkl.dump(train_data, pkl_file)

In [8]:
# Write the 1k / 5k / 10k subsets for the 'mz' domain. The `with` block closes
# (and therefore flushes) every pickle file before the next subset is built.
for quantity, tag in ((1000, '1k'), (5000, '5k'), (10000, '10k')):
    train_data = pack_data_into_pkl(domain='mz', quantity=quantity, method='iid')
    with open(f'datasets/cross-domain-subsets/mz_{tag}.pkl', 'wb') as pkl_file:
        pkl.dump(train_data, pkl_file)

In [9]:
# Write the 1k / 5k / 10k subsets for the 'wb' domain. The `with` block closes
# (and therefore flushes) every pickle file before the next subset is built.
for quantity, tag in ((1000, '1k'), (5000, '5k'), (10000, '10k')):
    train_data = pack_data_into_pkl(domain='wb', quantity=quantity, method='iid')
    with open(f'datasets/cross-domain-subsets/wb_{tag}.pkl', 'wb') as pkl_file:
        pkl.dump(train_data, pkl_file)