diff --git a/biodatasets/aimed_interactions/aimed_interactions.py b/biodatasets/aimed_interactions/aimed_interactions.py
new file mode 100644
index 000000000..ec13320e7
--- /dev/null
+++ b/biodatasets/aimed_interactions/aimed_interactions.py
@@ -0,0 +1,432 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This dataset contains 225 annotated article titles/abstracts taken from PubMed.
+Each entry is annotated in XML format with tagged protein entities and protein interactions,
+and as such the dataset is suitable for the tasks of Named Entity Recognition and
+Relation Extraction.
+
+The dataset was downloaded from https://www.cs.utexas.edu/ftp/mooney/bio-data/interactions.tar.gz
+and parsed into the bigbio_kb schema format. Because the original paper doesn't
+share analysis code and doesn't list an explicit schema, the "source" schema
+is also parsed into bigbio_kb.
+
+The dataset was originally collated and analyzed in the following paper:
+Comparative experiments on learning information extractors for proteins and their interactions.
+Bunescu, Razvan et al. Artificial intelligence in medicine. 33, 2 139-155. 2005.
+"""
+
+import re
+from collections import defaultdict
+from glob import glob
+from typing import Dict, List, Tuple
+from xml.etree import ElementTree
+
+import datasets
+
+from utils import schemas
+from utils.configs import BigBioConfig
+from utils.constants import Tasks
+
+_CITATION = """\
+@article{bunescu2005comparitive,
+    title={Comparative experiments on learning information extractors for proteins and their interactions.},
+    author={Bunescu, Razvan et al.},
+    journal={Artificial intelligence in medicine},
+    volume={33},
+    number={2},
+    pages={139-155},
+    year={2005}
+}
+"""
+
+_DATASETNAME = "aimed_interactions"
+
+_DESCRIPTION = """\
+225 PubMed article titles/abstracts annotated with protein entities and
+protein-protein interactions, suitable for Named Entity Recognition and
+Relation Extraction.
+"""
+
+_HOMEPAGE = ""
+
+_LICENSE = ""  # TODO: NULL
+
+_URLS = {
+    _DATASETNAME: ["https://www.cs.utexas.edu/ftp/mooney/bio-data/interactions.tar.gz"],
+}
+
+_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]
+
+_SOURCE_VERSION = "1.0.0"
+_BIGBIO_VERSION = "1.0.0"
+
+
+# Module-level counter so that every generated passage/entity/relation id is
+# unique across the whole dataset.
+NEXT_ID = {"val": 1}
+
+
+def gen_id():
+    new_id = NEXT_ID["val"]
+    NEXT_ID["val"] += 1
+    return new_id
+
+
+def _get_example_text(example: dict) -> str:
+    """
+    Concatenate all text from passages in an example of a KB schema
+    :param example: An instance of the KB schema
+    """
+    return " ".join([t for p in example["passages"] for t in p["text"]])
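+
+# Illustrative sketch (assumed example, not part of the loader's runtime path):
+# passages are joined with a single space, so the concatenated text can drift
+# relative to offsets computed over the raw document. For instance:
+#
+#   >>> example = {"passages": [{"text": ["Title."]}, {"text": ["Abstract."]}]}
+#   >>> _get_example_text(example)
+#   'Title. Abstract.'
+#
+# This single-space join is the likely source of the small offset shifts that
+# fix_offsets() below repairs.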
+
+
+def fix_offsets(entries):
+    """
+    In some cases, entity and passage offsets are off by one on their starting
+    indices, and sometimes on their end indices as well. This didn't happen in
+    the original scratch notebook and is hard to diagnose, but it may be caused
+    by the way the example text is reconstructed in the test (passages joined
+    with a single space). As a workaround, this makes a second pass through the
+    entries, checks each offset against the concatenated text, and corrects the
+    slightly misaligned ones.
+    """
+    for entry in entries:
+        fulltext = _get_example_text(entry)
+        for passage in entry["passages"]:
+            for ix, offset_text in enumerate(zip(passage["offsets"], passage["text"])):
+                offset, text = offset_text
+                start, end = offset
+                if fulltext[start:end] != text:
+                    if fulltext[start - 1 : end - 1] == text:
+                        passage["offsets"][ix] = [start - 1, end - 1]
+                    elif fulltext[start - 1 : end] == text:
+                        passage["offsets"][ix] = [start - 1, end]
+                    elif fulltext[start + 1 : end + 1] == text:
+                        passage["offsets"][ix] = [start + 1, end + 1]
+                    elif fulltext[start + 1 : end] == text:
+                        passage["offsets"][ix] = [start + 1, end]
+                    elif fulltext[start + 2 : end + 2] == text:
+                        passage["offsets"][ix] = [start + 2, end + 2]
+        for entity in entry["entities"]:
+            for ix, offset_text in enumerate(zip(entity["offsets"], entity["text"])):
+                offset, text = offset_text
+                start, end = offset
+                if fulltext[start:end] != text:
+                    if fulltext[start - 1 : end - 1] == text:
+                        entity["offsets"][ix] = [start - 1, end - 1]
+                    elif fulltext[start - 1 : end] == text:
+                        entity["offsets"][ix] = [start - 1, end]
+                    elif fulltext[start + 1 : end + 1] == text:
+                        entity["offsets"][ix] = [start + 1, end + 1]
+                    elif fulltext[start + 1 : end] == text:
+                        entity["offsets"][ix] = [start + 1, end]
+
+                start, end = entity["offsets"][ix]
+                if fulltext[start:end] != text:
+                    # Surface any leftover mismatch rather than silently keeping it.
+                    print(f"Unresolved offset mismatch: expected {len(text)} chars, got {len(fulltext[start:end])}")
+    return entries
+
+
+def load(fpath):
+    with open(fpath, "r") as f:
+        content = f.read()
+    return content
+
+
+def wrap_xml(xml_text: str, tag="root") -> str:
+    """
+    The standard library xml module requires that XML have some kind of root
+    element. This wraps the <ArticleTitle>/<AbstractText> content of the
+    PubMed abstracts in a <root> element for the etree parser.
+    """
+    xml_text = f"<{tag}>{xml_text}</{tag}>"
+    return xml_text
+
+
+def passages_to_xml(text):
+    ti_ix = text.find("TI - ")
+    ab_ix = text.find("AB - ")
+    ad_ix = text.rfind("AD -")
+
+    ti_slice = text[ti_ix:ab_ix]
+    ab_slice = text[ab_ix:ad_ix]
+    ad_slice = text[ad_ix:]
+
+    ti_node = wrap_xml(ti_slice, "ArticleTitle")
+    ab_node = wrap_xml(ab_slice, "AbstractText")
+    ad_node = wrap_xml(ad_slice, "ADText")
+
+    xml_text = wrap_xml(f"{ti_node}{ab_node}{ad_node}")
+    return xml_text
+
+
+def fix_prot_pairs(xml_text):
+    """Quote the bare ``pair`` attribute values in <p1>/<p2> tags so that the XML is parseable."""
+    return re.sub(r"<p(\d) +pair=(\d+)>", lambda match: f'<p{match.group(1)} pair="{match.group(2)}">', xml_text)
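+
+# Illustrative sketch: the raw AIMed markup is assumed to carry unquoted pair
+# attributes (e.g. "<p1  pair=1>"), which ElementTree rejects; fix_prot_pairs
+# should then behave like this (assumed input, shown for clarity only):
+#
+#   >>> fix_prot_pairs("<p1  pair=1><prot>Ras</prot></p1>")
+#   '<p1 pair="1"><prot>Ras</prot></p1>'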
+
+
+def fix_whitespace(xml_text):
+    """In the dataset, all XML tags are followed by two whitespace characters.
+    This removes those two characters."""
+    return re.sub("(<.*?>)  ", lambda match: match.group(1), xml_text)
+
+
+def preprocess_xml_text(text):
+    xml_text = fix_prot_pairs(text)
+    xml_text = passages_to_xml(xml_text)
+    xml_text = fix_whitespace(xml_text)
+    return xml_text
+
+
+def loadtree(fpath):
+    text = load(fpath)
+    xml_text = preprocess_xml_text(text)
+    tree = ElementTree.fromstring(xml_text)
+    return tree
+
+
+def textify(t):
+    """Recursively flatten an ElementTree node (text, children, and tail) into a string."""
+    s = []
+    if t.text:
+        s.append(t.text)
+    for child in t:
+        s.extend(textify(child))
+    if t.tail:
+        s.append(t.tail)
+    return "".join(s)
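+
+# Illustrative sketch of textify on a small tree (assumed example):
+#
+#   >>> node = ElementTree.fromstring("<root>A <prot>Ras</prot> mutant</root>")
+#   >>> textify(node)
+#   'A Ras mutant'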
+
+
+def process_passage(node, start_idx):
+
+    if node.tag == "ArticleTitle":
+        _type = "title"
+    elif node.tag == "AbstractText":
+        _type = "abstract"
+    elif node.tag == "ADText":
+        _type = "ad"
+    else:
+        raise ValueError(
+            f'node.tag should be one of ["ArticleTitle", "AbstractText", "ADText"]. Received node.tag == {node.tag}'
+        )
+
+    _text = textify(node)
+    _offsets = [start_idx, start_idx + len(_text)]
+
+    passage = {"id": gen_id(), "type": _type, "text": [_text], "offsets": [_offsets]}
+    return passage
+
+
+def get_passages(t):
+
+    state = {"current_idx": 0, "str_chunks": [], "passages": []}
+
+    def _textify(t):
+        if t.text:
+            state["str_chunks"].append(t.text)
+            state["current_idx"] += len(t.text)
+        for child in t:
+            if child.tag in ["ArticleTitle", "AbstractText", "ADText"]:
+                parsed_passage = process_passage(child, start_idx=state["current_idx"])
+                state["passages"].append(parsed_passage)
+            _textify(child)
+        if t.tail:
+            state["str_chunks"].append(t.tail)
+            state["current_idx"] += len(t.tail)
+
+    _textify(t)
+    return state
+
+
+def textify_prot(t):
+    """Textify a <prot> node. Like textify(), but leaves out the node's tail."""
+    s = []
+    if t.text:
+        s.append(t.text)
+    for child in t:
+        s.extend(textify(child))
+
+    return "".join(s)
+
+
+def process_entity(node, start_idx):
+
+    if not hasattr(node, "tag"):
+        return
+
+    if node.tag == "prot":
+        _type = "protein"
+    else:
+        raise ValueError(f'node.tag should be one of ["prot"]. Received node.tag == {node.tag}')
+
+    _text = textify_prot(node)
+    _offsets = [start_idx, start_idx + len(_text)]
+
+    entity = {"id": gen_id(), "type": _type, "text": [_text], "offsets": [_offsets], "normalized": []}
+    return entity
+
+
+def get_entities_and_relations(t):
+
+    state = {
+        "current_idx": 0,
+        "str_chunks": [],
+        "entities": [],
+        "pairs": defaultdict(lambda: {"id": gen_id(), "type": "protein-protein interaction", "normalized": []}),
+    }
+
+    def _textify(t, parent_tag="root"):
+        if t.text:
+            state["str_chunks"].append(t.text)
+            state["current_idx"] += len(t.text)
+        for child in t:
+            if child.tag in ["p1", "p2"]:
+                n = child.tag[1]
+                pair_num = child.get("pair")
+                prot = child.find("prot")
+                if prot is not None:
+                    entity = process_entity(prot, start_idx=state["current_idx"])
+                    state["pairs"][pair_num][f"arg{n}_id"] = entity["id"]
+                    state["entities"].append(entity)
+
+            if child.tag in ["prot"] and parent_tag not in ["p1", "p2"]:
+                parsed_entity = process_entity(child, start_idx=state["current_idx"])
+                state["entities"].append(parsed_entity)
+
+            _textify(child, parent_tag=t.tag)
+        if t.tail:
+            state["str_chunks"].append(t.tail)
+            state["current_idx"] += len(t.tail)
+
+    _textify(t)
+    relations = []
+    for relation in state["pairs"].values():
+        # Only keep pairs where both interaction partners were found.
+        if "arg1_id" in relation and "arg2_id" in relation:
+            relations.append(relation)
+
+    state["relations"] = relations
+    del state["pairs"]
+    return state
+
+
+def parse_article(fpath, fulltext=False):
+
+    doc_id = re.search(r"_.*?(\d+)", fpath).group(1)
+
+    text = load(fpath)
+    xml_text = preprocess_xml_text(text)
+    t = ElementTree.fromstring(xml_text)
+
+    entities_relations = get_entities_and_relations(t)
+    entities = entities_relations["entities"]
+    relations = entities_relations["relations"]
+    passages = get_passages(t)["passages"]
+    result = {
+        "id": gen_id(),
+        "document_id": doc_id,
+        "passages": passages,
+        "entities": entities,
+        "events": [],
+        "coreferences": [],
+        "relations": relations,
+    }
+
+    if fulltext:
+        result["fulltext"] = textify(t)
+
+    return result
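+
+# Illustrative sketch of the pair handling above (assumed markup, for clarity
+# only): an interacting pair is tagged as
+#
+#   <p1  pair=1><prot>Rad51</prot></p1> binds <p2  pair=1><prot>Brca2</prot></p2>
+#
+# Each <p1>/<p2> contributes one protein entity, and the shared pair number
+# links the two entity ids into a single "protein-protein interaction" relation.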
+ """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + BUILDER_CONFIGS = [ + BigBioConfig( + name="aimed_interactions_source", + version=SOURCE_VERSION, + description="AIMedInteractions source schema", + schema="source", + subset_id="aimed_interactions", + ), + BigBioConfig( + name="aimed_interactions_bigbio_kb", + version=BIGBIO_VERSION, + description="AIMedInteractions BigBio schema", + schema="bigbio_kb", + subset_id="aimed_interactions", + ), + ] + + DEFAULT_CONFIG_NAME = "aimed_interactions_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = schemas.kb_features + elif self.config.schema == "bigbio_kb": + features = schemas.kb_features + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "data_dir": data_dir, + "split": "train", + }, + ) + ] + + def _generate_examples(self, data_dir: str, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + fpaths = glob(data_dir[0] + "/*/**") + # Downloaded data contains a corrupted, duplicate file + fpaths = [x for x in fpaths if ("abstract" in x)] + + entries = [parse_article(p, False) for p in fpaths] + entries = fix_offsets(entries) + + if self.config.schema == "source": + for key, example in enumerate(entries): + yield key, example + elif self.config.schema == "bigbio_kb": + for key, example in enumerate(entries): + yield key, example + + +# This template is based on the following template from the datasets package: +# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py diff --git a/biodatasets/aimed_proteins/aimed_proteins.py b/biodatasets/aimed_proteins/aimed_proteins.py new file mode 100644 index 000000000..0d775d9c0 --- /dev/null +++ b/biodatasets/aimed_proteins/aimed_proteins.py @@ -0,0 +1,368 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This dataset contains 225 annotated article titles/abstracts taken from PubMed. +Each entry is annotated in XML format with tagged protein entities, and as such +the dataset is suitable for the task of Named Entity Recognition. + +Dataset was downloaded from https://www.cs.utexas.edu/ftp/mooney/bio-data/proteins.tar.gz +and parsed into the bigbio_kb schema format. Because the orginal paper doesn't +share analysis code, and doesn't list an explicit schema, the schema for +"source" is also parsed into bigbio_kb. 
+
+The dataset was originally collated and analyzed in the following paper:
+Comparative experiments on learning information extractors for proteins and their interactions.
+Bunescu, Razvan et al. Artificial intelligence in medicine. 33, 2 139-155. 2005.
+"""
+
+import os
+import re
+from glob import glob
+from typing import Dict, List, Tuple
+from xml.etree import ElementTree
+
+import datasets
+
+from utils import schemas
+from utils.configs import BigBioConfig
+from utils.constants import Tasks
+
+_CITATION = """\
+@article{bunescu2005comparitive,
+    title={Comparative experiments on learning information extractors for proteins and their interactions.},
+    author={Bunescu, Razvan et al.},
+    journal={Artificial intelligence in medicine},
+    volume={33},
+    number={2},
+    pages={139-155},
+    year={2005}
+}
+"""
+
+_DATASETNAME = "aimed_proteins"
+
+_DESCRIPTION = """\
+225 PubMed article titles/abstracts annotated with protein entities, suitable
+for Named Entity Recognition.
+"""
+
+_HOMEPAGE = ""
+
+_LICENSE = ""  # TODO: NULL
+
+_URLS = {
+    _DATASETNAME: ["https://www.cs.utexas.edu/ftp/mooney/bio-data/proteins.tar.gz"],
+}
+
+_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION]
+
+_SOURCE_VERSION = "1.0.0"
+_BIGBIO_VERSION = "1.0.0"
+
+
+def _get_example_text(example: dict) -> str:
+    """
+    Concatenate all text from passages in an example of a KB schema
+    :param example: An instance of the KB schema
+    """
+    return " ".join([t for p in example["passages"] for t in p["text"]])
+
+
+def wrap_xml(xml_text: str) -> str:
+    """
+    The standard library xml module requires that XML have some kind of root
+    element. This wraps the <ArticleTitle> and <AbstractText> content of the
+    PubMed abstracts in a <root> element for the etree parser.
+    """
+    xml_text = f"<root>{xml_text}</root>"
+    return xml_text
+
+
+def remove_question_mark_tags(xml_text):
+    """Removes invalid question mark tags, denoted by <?>...</?>, from XML and returns the modified text"""
+    clean_text = re.sub(r"<\?>(.*?)</\?>", lambda match: match.group(1), xml_text)
+    return clean_text
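+
+# Illustrative sketch of the question mark tag removal (assumed input, shown
+# only to make the transformation concrete):
+#
+#   >>> remove_question_mark_tags("a <?>tagged</?> span")
+#   'a tagged span'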
+
+
+def preprocess_xml(xml_text: str) -> str:
+    """Applies all preprocessing steps to XML text."""
+    xml_text = wrap_xml(xml_text)
+    xml_text = remove_question_mark_tags(xml_text)
+    return xml_text
+
+
+def textify(t):
+    """Recursively iterates through ElementTree nodes and converts to text."""
+    s = []
+    if t.text:
+        s.append(t.text)
+    for child in t:
+        s.extend(textify(child))
+    if t.tail:
+        s.append(t.tail)
+    return "".join(s)
+
+
+def process_passage(node, start_idx, _id=0):
+    """Converts an ElementTree node representing an article passage into the appropriate schema."""
+    if node.tag == "ArticleTitle":
+        _type = "title"
+    elif node.tag == "AbstractText":
+        _type = "abstract"
+    else:
+        raise ValueError(f'node.tag should be one of ["ArticleTitle", "AbstractText"]. Received node.tag == {node.tag}')
+
+    _text = textify(node)
+    _offsets = [start_idx, start_idx + len(_text)]
+    passage = {
+        "id": _id,
+        "type": _type,
+        "text": [_text],
+        "offsets": [_offsets],
+    }
+    return passage
+
+
+def get_passages(t):
+    """Gets all passages from an ElementTree representing an annotated entry."""
+    state = {"current_idx": 0, "str_chunks": [], "passages": []}
+
+    def _textify(t):
+        if t.text:
+            state["str_chunks"].append(t.text)
+            state["current_idx"] += len(t.text)
+        for child in t:
+            if child.tag in ["ArticleTitle", "AbstractText"]:
+                parsed_passage = process_passage(child, start_idx=state["current_idx"])
+                state["passages"].append(parsed_passage)
+            _textify(child)
+        if t.tail:
+            state["str_chunks"].append(t.tail)
+            state["current_idx"] += len(t.tail)
+
+    _textify(t)
+    return state
+
+
+def textify_prot(t):
+    """Handles textifying proteins. Like textify(), but leaves out the tag's tail attribute."""
+    s = []
+    if t.text:
+        s.append(t.text)
+    for child in t:
+        s.extend(textify(child))
+    return "".join(s)
+
+
+def process_entity(node, start_idx, _id=0):
+    """Converts an ElementTree node representing an entity to the appropriate schema."""
+    if node.tag == "prot":
+        _type = "protein"
+    else:
+        raise ValueError(f'node.tag should be one of ["prot"]. Received node.tag == {node.tag}')
+    _text = textify_prot(node)
+    _offsets = [start_idx, start_idx + len(_text)]
+    entity = {
+        "id": _id,
+        "type": _type,
+        "text": [_text],
+        "offsets": [_offsets],
+        "normalized": [],
+    }
+    return entity
+
+
+def get_entities(t):
+    """Gets all entities from the root ElementTree object."""
+    state = {"current_idx": 0, "str_chunks": [], "entities": []}
+
+    def _textify(t):
+        if t.text:
+            state["str_chunks"].append(t.text)
+            state["current_idx"] += len(t.text)
+        for child in t:
+            if child.tag in ["prot"]:
+                parsed_entity = process_entity(child, start_idx=state["current_idx"])
+                state["entities"].append(parsed_entity)
+            _textify(child)
+        if t.tail:
+            state["str_chunks"].append(t.tail)
+            state["current_idx"] += len(t.tail)
+
+    _textify(t)
+    return state
+
+
+def parse_article(fpath, _id=0, fulltext=False):
+    """Loads an entry from a filepath and parses it into the dataset schema."""
+    doc_id = re.search(r"abstract(\d-\d+)", os.path.basename(fpath)).group(1)
+    with open(fpath, "r") as f:
+        content = f.read()
+    xml_text = preprocess_xml(content)
+    t = ElementTree.fromstring(xml_text)
+    if t is not None:
+        entities = get_entities(t)["entities"]
+        passages = get_passages(t)["passages"]
+        result = {
+            "id": _id,
+            "document_id": doc_id,
+            "passages": passages,
+            "entities": entities,
+            "events": [],
+            "coreferences": [],
+            "relations": [],
+        }
+        if fulltext:
+            result["fulltext"] = textify(t)
+        return result
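+
+# Illustrative note: the doc_id regex above implies filenames like
+# "abstract1-23" (section number, dash, article number); e.g. (assumed name):
+#
+#   >>> re.search(r"abstract(\d-\d+)", "abstract1-23").group(1)
+#   '1-23'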
+
+
+def fix_ids(entries):
+    """Resets all ids, passage ids, and entity ids so that each is unique."""
+    next_id = 0
+    for entry in entries:
+        entry["id"] = next_id
+        next_id += 1
+        for entity in entry["entities"]:
+            entity["id"] = next_id
+            next_id += 1
+        for passage in entry["passages"]:
+            passage["id"] = next_id
+            next_id += 1
+    return entries
+
+
+def fix_offsets(entries):
+    """
+    In some cases, entity and passage offsets are off by one on their starting
+    indices, and sometimes on their end indices as well. This didn't happen in
+    the original scratch notebook and is hard to diagnose, but it may be caused
+    by the way the example text is reconstructed in the test (passages joined
+    with a single space). As a workaround, this makes a second pass through the
+    entries, checks each offset against the concatenated text, and corrects the
+    slightly misaligned ones.
+    """
+    for entry in entries:
+        fulltext = _get_example_text(entry)
+        for passage in entry["passages"]:
+            for ix, offset_text in enumerate(zip(passage["offsets"], passage["text"])):
+                offset, text = offset_text
+                start, end = offset
+                if fulltext[start:end] != text:
+                    if fulltext[start - 1 : end - 1] == text:
+                        passage["offsets"][ix] = [start - 1, end - 1]
+                    elif fulltext[start - 1 : end] == text:
+                        passage["offsets"][ix] = [start - 1, end]
+        for entity in entry["entities"]:
+            for ix, offset_text in enumerate(zip(entity["offsets"], entity["text"])):
+                offset, text = offset_text
+                start, end = offset
+                if fulltext[start:end] != text:
+                    if fulltext[start - 1 : end - 1] == text:
+                        entity["offsets"][ix] = [start - 1, end - 1]
+                    elif fulltext[start - 1 : end] == text:
+                        entity["offsets"][ix] = [start - 1, end]
+
+                start, end = entity["offsets"][ix]
+                if fulltext[start:end] != text:
+                    # Surface any leftover mismatch rather than silently keeping it.
+                    print(f"Unresolved offset mismatch: expected {len(text)} chars, got {len(fulltext[start:end])}")
+    return entries
+
+
+class AIMedProteinsDataset(datasets.GeneratorBasedBuilder):
+    """
+    This dataset contains 225 annotated article titles/abstracts taken from PubMed.
+    Each entry is annotated in XML format with tagged protein entities, and as such
+    the dataset is suitable for the task of Named Entity Recognition.
+
+    The dataset was originally collated and analyzed in the following paper:
+    Comparative experiments on learning information extractors for proteins and their interactions.
+    Bunescu, Razvan et al. Artificial intelligence in medicine. 33, 2 139-155. 2005.
+    """
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)
+
+    BUILDER_CONFIGS = [
+        BigBioConfig(
+            name="aimed_proteins_source",
+            version=SOURCE_VERSION,
+            description="AIMed source schema",
+            schema="source",
+            subset_id="aimed",
+        ),
+        BigBioConfig(
+            name="aimed_proteins_bigbio_kb",
+            version=BIGBIO_VERSION,
+            description="AIMed BigBio schema",
+            schema="bigbio_kb",
+            subset_id="aimed",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = "aimed_proteins_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        # The source data has no explicit schema, so both configs use kb_features.
+        if self.config.schema == "source":
+            features = schemas.kb_features
+        elif self.config.schema == "bigbio_kb":
+            features = schemas.kb_features
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        urls = _URLS[_DATASETNAME]
+        data_dir = dl_manager.download_and_extract(urls)
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "data_dir": data_dir,
+                    "split": "train",
+                },
+            )
+        ]
+
+    def _generate_examples(self, data_dir: str, split: str) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+        fpaths = glob(data_dir[0] + "/*/**")
+        # Downloaded data contains a corrupted, duplicate file
+        fpaths = [x for x in fpaths if ("abstract" in x) and ("2-14~" not in x)]
+        entries = [parse_article(fpath, 0, True) for fpath in fpaths]
+        entries = fix_ids(entries)
+        entries = fix_offsets(entries)
+        for entry in entries:
+            if "fulltext" in entry:
+                del entry["fulltext"]
+
+        if self.config.schema == "source":
+            for key, example in enumerate(entries):
+                yield key, example
+        elif self.config.schema == "bigbio_kb":
+            for key, example in enumerate(entries):
+                yield key, example
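+
+# Usage sketch (assumed local path, shown for illustration only): either loader
+# can be exercised directly with the datasets library, e.g.
+#
+#   from datasets import load_dataset
+#
+#   ds = load_dataset("biodatasets/aimed_proteins/aimed_proteins.py", name="aimed_proteins_bigbio_kb")
+#   print(ds["train"][0]["entities"][0])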
+
+
+# This template is based on the following template from the datasets package:
+# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py