In [None]:
%pip install -r ../requirements.txt

In [None]:
%pip install -e ../

In [None]:
%pip install datasets

In [None]:
import pandas as pd
from pathlib import Path

from datasets import load_dataset

from typing import List, Dict

from unique_batches.utils.io import read_token_level_annotated_data

# Load dataset

In [None]:
# Load the `tner/mit_restaurant` dataset through the `datasets` library.
# The result is indexed by split name further down ('train', 'validation', 'test').
dataset = load_dataset("tner/mit_restaurant")

In [None]:
# Display the loaded dataset object to inspect what it contains.
dataset

In [None]:
# Materialize every split as its own DataFrame; the three names are kept
# available in case individual splits are needed later.
train_df, validation_df, test_df = (
    pd.DataFrame(dataset[split]) for split in ("train", "validation", "test")
)

# Stack the splits into a single frame with a fresh 0..n-1 index: the exported
# file does not distinguish between train / validation / test rows.
df = pd.concat([train_df, validation_df, test_df], ignore_index=True)

In [None]:
# Display the combined frame (train + validation + test rows).
df

# Clean: drop utterances made up only of numbers

In [None]:
def is_only_numeric(tokens: List[str]):
    """Tell whether every token consists solely of decimal digits.

    Args:
        tokens: The tokens of one utterance.

    Returns:
        True when no token fails ``str.isdecimal()``; an empty token list
        therefore also yields True.
    """
    # Every token is inspected (no early exit), then the answer is simply
    # whether any offender was found.
    non_numeric = [token for token in tokens if not token.isdecimal()]
    return not non_numeric

In [None]:
# Keep only the utterances that contain at least one non-numeric token.
# The explicit .copy() turns the filtered selection into an independent frame,
# so that adding columns to `df` afterwards does not trigger pandas'
# SettingWithCopyWarning.
df = df[~df['tokens'].apply(is_only_numeric)].copy()

# Convert to the token-level annotation format

In [None]:
# Maps every tag string ("O" plus the B-/I- variants of each entity type) to
# its integer id.
# NOTE(review): the ids presumably have to match the integers stored in the
# dataset's `tags` column -- confirm against the dataset card.
# The ids are not grouped per entity type: "I-Price" (15) and "I-Cuisine" (16)
# sit apart from "B-Price" (9) and "B-Cuisine" (14), so do not re-number them.
label2id = {
    "O": 0,
    "B-Rating": 1,
    "I-Rating": 2,
    "B-Amenity": 3,
    "I-Amenity": 4,
    "B-Location": 5,
    "I-Location": 6,
    "B-RestaurantName": 7,
    "I-RestaurantName": 8,
    "B-Price": 9,
    "B-Hours": 10,
    "I-Hours": 11,
    "B-Dish": 12,
    "I-Dish": 13,
    "B-Cuisine": 14,
    "I-Price": 15,
    "I-Cuisine": 16
}

In [None]:
# Inverse lookup table: integer id -> tag string.
id2label = dict(zip(label2id.values(), label2id.keys()))

In [None]:
def tags_to_labels(tags: List[int], dictionary: Dict[int, str]) -> List[str]:
    """Translate integer tag ids into their string labels.

    Args:
        tags: The tag ids of one utterance, in token order.
        dictionary: Mapping from tag id to label string.

    Returns:
        The labels corresponding to ``tags``, in the same order.

    Raises:
        KeyError: If a tag id is not present in ``dictionary``.
    """
    # Index with [] rather than .get(): an unknown id must fail loudly instead
    # of yielding None, which would end up as the text "None" once the labels
    # are formatted into strings.
    return [dictionary[tag] for tag in tags]

In [None]:
# Add a `labels` column holding, for every row, the string labels of its tag
# ids; `dictionary=id2label` is forwarded by `apply` to `tags_to_labels`.
df['labels'] = df['tags'].apply(tags_to_labels, dictionary=id2label)

In [None]:
def merge_tokens_labels(tokens: List[str], labels: List[str]) -> str:
    """Serialize one utterance as space-separated ``token|label`` pairs.

    Args:
        tokens: The tokens of the utterance.
        labels: The label of each token, aligned with ``tokens``.

    Returns:
        A single string such as ``"cheap|B-Price sushi|B-Cuisine"``; the empty
        string when there are no tokens.

    Raises:
        ValueError: If ``tokens`` and ``labels`` differ in length.
    """
    # zip() would silently drop the surplus items of the longer sequence, which
    # would produce a truncated annotation without any hint of the mismatch.
    if len(tokens) != len(labels):
        raise ValueError(
            f"Got {len(tokens)} tokens but {len(labels)} labels; they must be aligned one-to-one"
        )
    return " ".join(f"{token}|{label}" for token, label in zip(tokens, labels))

In [None]:
def _annotate_row(row):
    """Build the `token|label ...` string for a single row of the frame."""
    return merge_tokens_labels(row['tokens'], row['labels'])


# Add an `annotation` column with the serialized form of every utterance.
df['annotation'] = df.apply(_annotate_row, axis=1)

In [None]:
# Display the frame, which by now also carries the `labels` and `annotation` columns.
df

In [None]:
# Every row gets the same placeholder domain and intent; afterwards only the
# three columns that are written to disk are kept, in this order.
export_columns = ['domain', 'intent', 'annotation']
df = df.assign(domain='dummyDomain', intent='dummyIntent')[export_columns]

df

In [None]:
# Absolute location of the datastore directory; the relative part is resolved
# against the current working directory. resolve() already returns an absolute
# path, so no further .absolute() call is needed.
datastore = Path("../datastore/").resolve()

dataset_path = datastore / "mit_restaurant" / "MITrestaurant.tsv"
# exist_ok=True makes directory creation idempotent and avoids the race between
# an explicit existence check and the mkdir call.
dataset_path.parent.mkdir(parents=True, exist_ok=True)

# Tab-separated export without index and without a header row.
df.to_csv(dataset_path, sep="\t", index=False, header=False)

# Check: read the exported file back

In [None]:
# Read the exported file back with the project's own reader and display the
# result, to confirm that the file written above can be parsed.
mit_df = read_token_level_annotated_data(dataset_path)
mit_df

In [None]:
# Gather every distinct value that occurs in the per-row `tags` sequences.
tags = {tag for ner_tags in mit_df["tags"] for tag in ner_tags}

print(f"The dataset contains {len(tags)} distinct named entities")