In [1]:
from pathlib import Path
# Location of the event-pair data file (tab-separated text, one pair per line).
# Kept as a `Path` so it matches the `fpath: Path` annotation of `load_data`;
# `open()` accepts path-like objects directly.
DATA_PATH = Path('../data/event_pairs.dev')

In [12]:
# Peek at the first line only, so the raw format can be inspected without
# reading the whole file. The encoding is given explicitly because the data
# contains non-ASCII characters and the platform default may not be UTF-8.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    first_line = f.readline()

# readline() returns '' at end of file, so an empty file gives an empty sample
sample = [first_line] if first_line else []

In [14]:
# Split the first raw line into its tab-separated fields; the bare `parts`
# expression on the last line makes the notebook display the resulting list.
parts = sample[0].strip().split('\t')
parts

['Here is what we know about the fighting at and around Al - Shifa medical center that the Israeli army says has killed more than 140 Palestinian militants .',
 '7',
 '7',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 'Global calls for a humanitarian ceasefire have mounted in recent days as Al - Shifa became the focus of Israel ’s war on Hamas , and fears grew for the thousands of people trapped there , on the frontline of the conflict .',
 '41',
 '41',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '1']

In [3]:
from typing import List, Tuple, Dict
from enum import Enum

In [4]:
# Number of raw fields describing one event: the sentence plus a
# (start, end) token-index pair for each member of `TokensMap`.
DEFAULT_PARTS = 11
# Placeholder stored for an ingredient the data has no information about.
NULL_VAL = '<*>'


class TokensMap(Enum):
    """
    A map between ingredients of an event and their
    corresponding indices in raw data.

    Each value is a (start, stop) pair used as a half-open slice into the
    raw fields of one event; the two fields it selects are the start and
    end token indices of that ingredient inside the sentence.
    """
    trigger = (1, 3)
    pp_1 = (3, 5)
    pp_2 = (5, 7)
    time = (7, 9)
    loc = (9, 11)


class Event:
    """
    One event: a sentence plus the token spans of its ingredients.

    Every ingredient listed in `TokensMap` is stored on a private attribute
    named after the enum member (e.g. `_trigger`) and holds one of:

    * `NULL_VAL` when the raw data marks the ingredient as absent (-1),
    * a single token (`str`) when start and end index are equal,
    * a list of tokens when the span covers several tokens.
    """

    def __init__(self, parts: List[str]):
        """
        Build an event from the raw fields of one half of a data line.

        :param parts: `DEFAULT_PARTS` strings, the sentence first, followed by
                      the start/end token indices of every `TokensMap` member
        :raises AssertionError: if `parts` does not hold `DEFAULT_PARTS` items
        """
        assert len(parts) == DEFAULT_PARTS
        self._parts = parts

        self._sentence = parts[0]
        # the sentence is whitespace-tokenised; span indices refer to these tokens
        self._tokens = self._sentence.strip().split()

        for part in TokensMap:
            self._add_attrs(part)

    def _add_attrs(self, part: TokensMap):
        """
        Extract the tokens of one ingredient and store them on `self`.

        The start and end indices are both inclusive, e.g. for the sentence
        'I love pizza .' and indices (1, 3) the stored value is
        ['love', 'pizza', '.'].

        :param part: the `TokensMap` member to extract; its name decides the
                     attribute (`_<name>`), its value the raw fields to read
        :return: None, the result is stored as an attribute
        """
        # first, retrieve indices of tokens consisting of a part (e.g. trigger)
        name_in_part, indices_in_part = part.name, part.value
        s, e = (int(i) for i in self._parts[slice(*indices_in_part)])
        attr_name = f"_{name_in_part}"

        # case: -1 denotes the data contains no information about current part
        #       e.g. no trigger (verb phrase) in the sentence
        if -1 in (s, e):
            setattr(self, attr_name, NULL_VAL)
            # stop here: falling through would index the tokens with -1 and
            # overwrite the placeholder with the last token of the sentence
            return

        if s == e:
            # single-token span: keep the bare token rather than a 1-item list
            setattr(self, attr_name, self._tokens[s])
            return

        # the end index is inclusive, hence `e + 1` for the slice
        setattr(self, attr_name, self._tokens[s : e + 1])

    @property
    def sentence(self):
        """The raw sentence string of the event."""
        return self._sentence

    @property
    def trigger(self):
        """Token(s) of the trigger, or `NULL_VAL` if absent."""
        return self._trigger

    @property
    def pp1(self):
        """Token(s) of the `pp_1` ingredient, or `NULL_VAL` if absent."""
        return self._pp_1

    @property
    def pp2(self):
        """Token(s) of the `pp_2` ingredient, or `NULL_VAL` if absent."""
        return self._pp_2

    @property
    def time(self):
        """Token(s) of the time ingredient, or `NULL_VAL` if absent."""
        return self._time

    @property
    def location(self):
        """Token(s) of the location ingredient, or `NULL_VAL` if absent."""
        return self._loc




In [5]:
class EventPair:
    """Two `Event` objects parsed from one raw data line, plus their label."""

    def __init__(self, raw_data: str):
        """
        Parse one tab-separated line into two events and a label.

        :param raw_data: a raw line; the last field is the label, the fields
                         before it are split between the first and the
                         second event
        """
        # peel the label off the end, everything before it describes the events
        *event_fields, label = raw_data.strip().split('\t')
        # same split point as halving the full field list (label included)
        half = (len(event_fields) + 1) // 2

        self._label = label
        self._event_1 = Event(event_fields[:half])
        self._event_2 = Event(event_fields[half:])

    @property
    def label(self):
        """The label field of the line, kept as the raw string."""
        return self._label

    @property
    def events(self):
        """A `(first_event, second_event)` tuple."""
        return (self._event_1, self._event_2)


In [6]:
def load_data(fpath: Path):
    """
    Lazily parse a data file into `EventPair` objects.

    :param fpath: path of a UTF-8 text file with one tab-separated event
                  pair per line (anything `open()` accepts works)
    :return: a generator yielding one `EventPair` per non-blank line
    """
    with open(fpath, 'r', encoding='utf-8') as f:
        for line in f:
            # a blank line (e.g. a trailing newline at the end of the file)
            # carries no pair and would otherwise fail inside `Event`
            if not line.strip():
                continue
            yield EventPair(line)

In [7]:
# `load_data` is a generator; materialise it so the pairs can be counted and indexed.
event_pairs = list(load_data(DATA_PATH))

In [8]:
# Number of event pairs parsed from the data file.
len(event_pairs)

36438

In [9]:
# Take the first pair as a worked example for the cells below.
pair = event_pairs[0]

In [11]:
# Show the label, both sentences and both triggers of the example pair,
# one item per line.
e1, e2 = pair.events
print(
    pair.label,
    e1.sentence,
    e2.sentence,
    f"{e1.trigger} | {e2.trigger}",
    sep='\n',
)

1
Here is what we know about the fighting at and around Al - Shifa medical center that the Israeli army says has killed more than 140 Palestinian militants .
Global calls for a humanitarian ceasefire have mounted in recent days as Al - Shifa became the focus of Israel ’s war on Hamas , and fears grew for the thousands of people trapped there , on the frontline of the conflict .
fighting | conflict
