## The Dataset

### MIT Restaurant Dataset

https://groups.csail.mit.edu/sls/downloads/restaurant/

https://huggingface.co/datasets/tner/mit_restaurant


```
{
    'tags': [0, 0, 0, 0, 0, 0, 0, 0, 5, 3, 4, 0],
    'tokens': ['can', 'you', 'find', 'the', 'phone', 'number', 'for', 'the', 'closest', 'family', 'style', 'restaurant']
}

```

```
{
    "O": 0,
    "B-Rating": 1,
    "I-Rating": 2,
    "B-Amenity": 3,
    "I-Amenity": 4,
    "B-Location": 5,
    "I-Location": 6,
    "B-Restaurant_Name": 7,
    "I-Restaurant_Name": 8,
    "B-Price": 9,
    "B-Hours": 10,
    "I-Hours": 11,
    "B-Dish": 12,
    "I-Dish": 13,
    "B-Cuisine": 14,
    "I-Price": 15,
    "I-Cuisine": 16
}

```

In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [2]:
# hide warnings
import warnings
warnings.filterwarnings('ignore')

# https://groups.csail.mit.edu/sls/downloads/restaurant/


In [3]:
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets
!pip install seqeval
!pip install evaluate

Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m66.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.45.1
    Uninstalling transformers-4.45.1:
      Successfully uninstalled transformers-4.45.1
Successfully installed transformers-4.46.3
Collecting accelerate
  Downloading accelerate-1.1.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.1.1-py3-none-any.whl (333 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m333.2/333.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting

In [4]:
import pandas as pd
import json
import requests

In [5]:
response = requests.get("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/mit_restaurant_search_ner/train.bio")
response = response.text
response = response.splitlines()

In [6]:
train_tokens = []
train_tags = []

temp_tokens = []
temp_tags = []
for line in response:
    if line != "":
        tag, token = line.strip().split("\t")
        temp_tags.append(tag)
        temp_tokens.append(token)
    else:
        train_tokens.append(temp_tokens)
        train_tags.append(temp_tags)

        temp_tokens, temp_tags = [], []

In [7]:
len(train_tokens), len(train_tags)

(7659, 7659)

In [8]:
response = requests.get("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/mit_restaurant_search_ner/test.bio")
response = response.text
response = response.splitlines()

test_tokens = []
test_tags = []

temp_tokens = []
temp_tags = []
for line in response:
    if line != "":
        tag, token = line.strip().split("\t")
        temp_tags.append(tag)
        temp_tokens.append(token)
    else:
        test_tokens.append(temp_tokens)
        test_tags.append(temp_tags)

        temp_tokens, temp_tags = [], []

len(test_tokens), len(test_tags)


(1520, 1520)

## HuggingFace Dataset Prep

In [9]:
from datasets import Dataset, DatasetDict

df = pd.DataFrame({'tokens': train_tokens, 'ner_tags_str': train_tags})
train = Dataset.from_pandas(df)

df = pd.DataFrame({'tokens': test_tokens, 'ner_tags_str': test_tags})
test = Dataset.from_pandas(df)

dataset = DatasetDict({'train': train, 'test': test, 'validation': test})

dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 7659
    })
    test: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 1520
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 1520
    })
})

In [10]:
dataset['train'][0]

{'tokens': ['2', 'start', 'restaurants', 'with', 'inside', 'dining'],
 'ner_tags_str': ['B-Rating', 'I-Rating', 'O', 'O', 'B-Amenity', 'I-Amenity']}

In [11]:
unique_tags = set()
for tag in dataset['train']['ner_tags_str']:
    unique_tags.update(tag)

unique_tags = list(set([x[2:] for x in list(unique_tags) if x!='O']))

tag2index = {"O": 0}
for i, tag in enumerate(unique_tags):
    tag2index[f'B-{tag}'] = len(tag2index)
    tag2index[f'I-{tag}'] = len(tag2index)

index2tag = {v:k for k,v in tag2index.items()}

In [12]:
dataset = dataset.map(lambda example: {"ner_tags": [tag2index[tag] for tag in example['ner_tags_str']]})

Map:   0%|          | 0/7659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags_str', 'ner_tags'],
        num_rows: 7659
    })
    test: Dataset({
        features: ['tokens', 'ner_tags_str', 'ner_tags'],
        num_rows: 1520
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags_str', 'ner_tags'],
        num_rows: 1520
    })
})

In [14]:
dataset["train"][0]["ner_tags"]

[7, 8, 0, 0, 1, 2]

## Model Building


In [15]:
from transformers import AutoTokenizer

In [16]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [17]:
dataset['train'][2]

{'tokens': ['5', 'star', 'resturants', 'in', 'my', 'town'],
 'ner_tags_str': ['B-Rating',
  'I-Rating',
  'O',
  'B-Location',
  'I-Location',
  'I-Location'],
 'ner_tags': [7, 8, 0, 3, 4, 4]}

In [18]:
input = dataset['train'][2]['tokens']
output = tokenizer(input, is_split_into_words=True)
tokenizer.convert_ids_to_tokens(output.input_ids)

['[CLS]', '5', 'star', 'rest', '##ura', '##nts', 'in', 'my', 'town', '[SEP]']

In [19]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            # if id=-100 then loss is not calculated
            if word_idx is None:
                label_ids.append(-100)

            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            
            else:
                label_ids.append(-100)

            previous_word_idx = word_idx
        
        labels.append(label_ids)

    tokenized_inputs['labels'] = labels

    return tokenized_inputs
    


In [20]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/7659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

In [21]:
tokenized_dataset['train'][2]

{'tokens': ['5', 'star', 'resturants', 'in', 'my', 'town'],
 'ner_tags_str': ['B-Rating',
  'I-Rating',
  'O',
  'B-Location',
  'I-Location',
  'I-Location'],
 'ner_tags': [7, 8, 0, 3, 4, 4],
 'input_ids': [101, 1019, 2732, 2717, 4648, 7666, 1999, 2026, 2237, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 7, 8, 0, -100, -100, 3, 4, 4, -100]}

Note: 
- 0, -100, -100
- 'rest', '##ura', '##nts'
- For the loss function we put -100 on '##ura', '##nts' for example to align

In [22]:
dataset['train'][2]

{'tokens': ['5', 'star', 'resturants', 'in', 'my', 'town'],
 'ner_tags_str': ['B-Rating',
  'I-Rating',
  'O',
  'B-Location',
  'I-Location',
  'I-Location'],
 'ner_tags': [7, 8, 0, 3, 4, 4]}

## Data Collation and Metrics


In [23]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [24]:
import evaluate
import numpy as np

metric = evaluate.load('seqeval')
label_names = list(tag2index)

def compute_metrics(eval_preds):
    logits, labels = eval_preds

    predictions = np.argmax(logits, axis=-1)
    true_labels = [[label_names[l] for l in label if l!=-100] for label in labels]

    true_predictions = [[label_names[p] for p, l in zip(prediction, label) if l != -100] 
                        for prediction, label in zip(predictions, labels)]
    
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": all_metrics['overall_precision'],
        'recall': all_metrics['overall_recall'],
        'f1': all_metrics['overall_f1'],
        'accuracy': all_metrics['overall_accuracy'],
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

## Model Training

In [25]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(model_ckpt, id2label=index2tag, label2id=tag2index)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
from transformers import TrainingArguments, Trainer


In [27]:
args = TrainingArguments("finetuned-ner", 
                         evaluation_strategy='epoch',
                         save_strategy='epoch',
                         learning_rate=2e-5,
                         num_train_epochs=3,
                         weight_decay=0.01)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [28]:
trainer = Trainer(model=model, args=args,
                  train_dataset=tokenized_dataset['train'],
                  eval_dataset=tokenized_dataset['validation'],
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  tokenizer=tokenizer)

In [29]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.643,0.307555,0.73551,0.793651,0.763475,0.90904
2,0.2479,0.284806,0.775083,0.811746,0.792991,0.916971
3,0.2035,0.285495,0.773763,0.808889,0.790936,0.91669


TrainOutput(global_step=2874, training_loss=0.31571376166081544, metrics={'train_runtime': 2739.1258, 'train_samples_per_second': 8.388, 'train_steps_per_second': 1.049, 'total_flos': 105239751014754.0, 'train_loss': 0.31571376166081544, 'epoch': 3.0})

In [30]:
trainer.save_model("ner_distilbert")

## Prediction and Load & Save Model

In [31]:
from transformers import pipeline

checkpoint = "ner_distilbert"
pipe = pipeline('token-classification', model=checkpoint, aggregation_strategy='simple')

In [32]:
pipe("which restaurant serves the best shushi in new york?")

[{'entity_group': 'Rating',
  'score': 0.97273916,
  'word': 'best',
  'start': 28,
  'end': 32},
 {'entity_group': 'Dish',
  'score': 0.9182242,
  'word': 'shushi',
  'start': 33,
  'end': 39},
 {'entity_group': 'Location',
  'score': 0.90122294,
  'word': 'new york',
  'start': 43,
  'end': 51}]