## Format BC2GM dataset to tsv.

This code reads the BC2GM dataset from its raw tsv files, structures it into sentences with tokens and NER tags, and exports it in Arrow format. Paths are relative to the project root, as this code uses `pyhere`.

## Imports

In [None]:
from typing import List
from pathlib import Path

from tqdm import tqdm
from pyhere import here
from datasets import ClassLabel, Dataset, Features, Value, Sequence, DatasetDict

## Create dataset

Each input file must be named after the split it contains (e.g. `train.tsv` for the _train_ split), because the file name without its extension is used as the split name.

In [None]:
# Raw tsv files, one per split; the file stem is used later as the split name
input_files = [
    "data/bc2gm/raw/train.tsv",
    "data/bc2gm/raw/test.tsv",
    "data/bc2gm/raw/devel.tsv",
]

# Folder (relative to the project root) where the processed dataset is saved
output_dir = "data/bc2gm/processed"

Initialize `DatasetDict` and `Features` schema.

In [None]:
# NER label names used to build the ClassLabel feature below
entities_list = ["O", "B-GENE", "I-GENE"]

# Schema applied to every split: the raw sentence, its tokens, and one
# class label per token
features_schema = Features(
    {
        "sentence": Value(dtype="string"),
        "tokens": Sequence(feature=Value(dtype="string")),
        "ner_tags": Sequence(
            ClassLabel(num_classes=len(entities_list), names=entities_list)
        ),
    }
)

# Empty container; one Dataset per split is added to it later
ner_dataset = DatasetDict()

In [None]:
def _build_entry(words: List[str], labels: List[str]) -> dict:
    """Build one dataset row from the tokens and tags of a single sentence."""
    return {
        "sentence": " ".join(words),
        "tokens": words,
        "ner_tags": labels,
    }


# Iterate over all the input files
for file_name in input_files:

    print(f"Processing {file_name}")

    # Read lines of input file. The encoding is explicit so that parsing does
    # not depend on the platform's default encoding.
    with open(here(file_name), "r", encoding="utf-8") as file:
        lines = file.readlines()

    # Initialize values
    dataset_entries = []
    sentence_words: List[str] = []
    label_items: List[str] = []

    # Iterate over each line (1-based numbering for error messages)
    for line_number, line in tqdm(enumerate(lines, start=1), total=len(lines)):

        stripped_line = line.strip()

        # Non-blank line: it must contain a word and a label separated by a tab
        if stripped_line:
            line_elements = stripped_line.split("\t")
            if len(line_elements) < 2:
                raise ValueError(
                    f"{file_name}:{line_number}: expected '<word>\\t<label>', "
                    f"got {line!r}"
                )

            # Append word and label
            sentence_words.append(line_elements[0])
            label_items.append(line_elements[1])

        # Blank line: end of the current sentence. The guard skips repeated
        # blank lines so that no empty entries are created.
        elif sentence_words:
            dataset_entries.append(_build_entry(sentence_words, label_items))

            # Reset values
            sentence_words = []
            label_items = []

    # Flush the last sentence when the file does not end with a blank line;
    # otherwise it would be silently dropped.
    if sentence_words:
        dataset_entries.append(_build_entry(sentence_words, label_items))

    # Create Dataset and append to DatasetDict (split name = file stem)
    dataset_split_name = Path(file_name).stem
    ner_dataset[dataset_split_name] = Dataset.from_list(dataset_entries, features=features_schema)

    # Remove dataset_entries from memory
    del dataset_entries

In [None]:
# Show the DatasetDict built above as the cell output
ner_dataset

## Save dataset

In [None]:
# Resolve the output folder with pyhere, then write the dataset to it
processed_path = str(here(output_dir))
ner_dataset.save_to_disk(processed_path)

In [None]:
# To load the dataset run:

import os
from datasets import load_dataset

# Map each split name to the Arrow file saved for it under output_dir
data_files = {
    split_name: str(
        here(os.path.join(output_dir, f"{split_name}/data-00000-of-00001.arrow"))
    )
    for split_name in ("train", "test", "devel")
}

raw_dataset = load_dataset("arrow", data_files=data_files)
raw_dataset