In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
!pip install -U transformers
!pip install  -U accelerate
!pip install -U datasets

Collecting transformers
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.46.2-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-

In [3]:
import pandas as pd
import json
import requests

In [4]:
train = pd.read_csv("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/refs/heads/master/mit_restaurant_search_ner/train.bio",
                    sep='\t', header=None)
train.head()

Unnamed: 0,0,1
0,B-Rating,2
1,I-Rating,start
2,O,restaurants
3,O,with
4,B-Amenity,inside


In [5]:
response = requests.get("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/refs/heads/master/mit_restaurant_search_ner/train.bio")
response=response.text

In [6]:
response = response.splitlines()

In [7]:
train_tokens = []
train_tags = []

temp_tokens = []
temp_tags = []

for line in response:
  if line != "":
    tag, token = line.strip().split("\t")
    temp_tags.append(tag)
    temp_tokens.append(token)
  else:
      train_tokens.append(temp_tokens)
      train_tags.append(temp_tags)

      temp_tokens, temp_tags = [], []


In [8]:
len(train_tokens), len(train_tags)

(7659, 7659)

In [9]:
response = requests.get("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/refs/heads/master/mit_restaurant_search_ner/test.bio")
response=response.text
response = response.splitlines()

test_tokens = []
test_tags = []

temp_tokens = []
temp_tags = []

for line in response:
  if line != "":
    tag, token = line.strip().split("\t")
    temp_tags.append(tag)
    temp_tokens.append(token)
  else:
      test_tokens.append(temp_tokens)
      test_tags.append(temp_tags)

      temp_tokens, temp_tags = [], []

len(test_tokens), len(test_tags)

(1520, 1520)

**HuggingFace Dataset Prep**

In [10]:
from datasets import Dataset, DatasetDict

df = pd.DataFrame({'tokens': train_tokens, 'ner_tags_str': train_tags})
train = Dataset.from_pandas(df)

df = pd.DataFrame({'tokens': test_tokens, 'ner_tags_str': test_tags})
test = Dataset.from_pandas(df)

dataset = DatasetDict({'train': train, 'test': test, 'validation': test})
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 7659
    })
    test: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 1520
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 1520
    })
})

In [11]:
dataset['train'][0]

{'tokens': ['2', 'start', 'restaurants', 'with', 'inside', 'dining'],
 'ner_tags_str': ['B-Rating', 'I-Rating', 'O', 'O', 'B-Amenity', 'I-Amenity']}

In [12]:
unique_tags = set()

for tag in dataset['train']['ner_tags_str']:
  unique_tags.update(tag)

unique_tags = list(set([x[2:] for x in list(unique_tags) if x != 'O']))

tag2index = {"O": 0}

for i, tag in enumerate(unique_tags):
  tag2index[f'B-{tag}'] = len(tag2index)
  tag2index[f'I-{tag}'] = len(tag2index)

index2tag = {v:k for k,v in tag2index.items()}

In [13]:
tag2index, index2tag

({'O': 0,
  'B-Location': 1,
  'I-Location': 2,
  'B-Price': 3,
  'I-Price': 4,
  'B-Rating': 5,
  'I-Rating': 6,
  'B-Cuisine': 7,
  'I-Cuisine': 8,
  'B-Hours': 9,
  'I-Hours': 10,
  'B-Restaurant_Name': 11,
  'I-Restaurant_Name': 12,
  'B-Dish': 13,
  'I-Dish': 14,
  'B-Amenity': 15,
  'I-Amenity': 16},
 {0: 'O',
  1: 'B-Location',
  2: 'I-Location',
  3: 'B-Price',
  4: 'I-Price',
  5: 'B-Rating',
  6: 'I-Rating',
  7: 'B-Cuisine',
  8: 'I-Cuisine',
  9: 'B-Hours',
  10: 'I-Hours',
  11: 'B-Restaurant_Name',
  12: 'I-Restaurant_Name',
  13: 'B-Dish',
  14: 'I-Dish',
  15: 'B-Amenity',
  16: 'I-Amenity'})

In [14]:
dataset = dataset.map(lambda ex: {"ner_tags": [tag2index[tag] for tag in ex['ner_tags_str']]})

Map:   0%|          | 0/7659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

In [15]:
dataset['train'][0]

{'tokens': ['2', 'start', 'restaurants', 'with', 'inside', 'dining'],
 'ner_tags_str': ['B-Rating', 'I-Rating', 'O', 'O', 'B-Amenity', 'I-Amenity'],
 'ner_tags': [5, 6, 0, 0, 15, 16]}

**Model Building**

In [16]:
from transformers import AutoTokenizer

In [17]:
model_ckpt = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [18]:
input = dataset['train'][2]['tokens']
#Split into words is true because we splitted them before
output = tokenizer(input, is_split_into_words=True)

In [19]:
output

{'input_ids': [101, 1019, 2732, 2717, 4648, 7666, 1999, 2026, 2237, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [20]:
tokenizer.convert_ids_to_tokens(output.input_ids)

['[CLS]', '5', 'star', 'rest', '##ura', '##nts', 'in', 'my', 'town', '[SEP]']

In [21]:
def tokenize_and_align_labels(examples):
  tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

  labels = []
  for i, label in enumerate(examples['ner_tags']):
    word_ids = tokenized_inputs.word_ids(batch_index=i)

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
      #if id=-100 then loss is not calculated
      if word_idx is None:
        label_ids.append(-100)

      elif word_idx != previous_word_idx:
        label_ids.append(label[word_idx])

      else:
        label_ids.append(-100)

      previous_word_idx = word_idx

    labels.append(label_ids)

  tokenized_inputs['labels'] = labels
  return tokenized_inputs

In [22]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/7659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

In [23]:
tokenized_dataset['train'][2]

{'tokens': ['5', 'star', 'resturants', 'in', 'my', 'town'],
 'ner_tags_str': ['B-Rating',
  'I-Rating',
  'O',
  'B-Location',
  'I-Location',
  'I-Location'],
 'ner_tags': [5, 6, 0, 1, 2, 2],
 'input_ids': [101, 1019, 2732, 2717, 4648, 7666, 1999, 2026, 2237, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 5, 6, 0, -100, -100, 1, 2, 2, -100]}

**Data Collation and Matrices**

In [24]:
!pip install seqeval
!pip install evaluate

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=81f436929f38d3c0efb8c6222e939173ed7040b7944b438719ab4931a9745507
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [25]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [26]:
import evaluate
import numpy as np


metric = evaluate.load('seqeval')
label_names = list(tag2index)

def compute_metrics(eval_preds):
  logits, labels = eval_preds

  predictions = np.argmax(logits, axis= -1)
  true_labels = [[label_names[l] for l in label if l!=-100] for label in labels]

  true_predictions = [[label_names[p] for p,l in zip(prediction, label) if l!=-100]
                      for prediction, label in zip(predictions, labels)]

  all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

  return {
      "precision": all_metrics['overall_precision'],
      "recall": all_metrics['overall_recall'],
      "f1": all_metrics['overall_f1'],
      "accuracy": all_metrics['overall_accuracy'],
  }


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

**Model Training**

In [27]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(model_ckpt, id2label=index2tag, label2id=tag2index)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
from transformers import TrainingArguments, Trainer

In [29]:
args = TrainingArguments("finetuned-ner", evaluation_strategy='epoch', save_strategy='epoch',
                         learning_rate = 2e-5,
                         num_train_epochs = 3,
                         weight_decay = 0.01,

)

In [30]:
trainer = Trainer(model = model, args=args, train_dataset=tokenized_dataset['train'],
                  eval_dataset = tokenized_dataset['validation'],
                  data_collator = data_collator,
                  compute_metrics = compute_metrics,
                  tokenizer = tokenizer)

In [31]:
import wandb
wandb.init(mode="disabled")
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.6263,0.305963,0.731492,0.790476,0.759841,0.907496
2,0.2479,0.282969,0.772214,0.802857,0.787237,0.915778
3,0.2015,0.28685,0.774695,0.806667,0.790358,0.917462


TrainOutput(global_step=2874, training_loss=0.31101561554953217, metrics={'train_runtime': 155.9904, 'train_samples_per_second': 147.298, 'train_steps_per_second': 18.424, 'total_flos': 105239751014754.0, 'train_loss': 0.31101561554953217, 'epoch': 3.0})

In [33]:
trainer.save_model('ner_distilbert')

**Prediction and Load & Save Model**

In [34]:
from transformers import pipeline

ckpt = "ner_distilbert"
pipe = pipeline('token-classification', model = ckpt, aggregation_strategy = 'simple')

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [35]:
pipe("which restaurant serves the best sushi in new york?")

[{'entity_group': 'Rating',
  'score': 0.9772924,
  'word': 'best',
  'start': 28,
  'end': 32},
 {'entity_group': 'Dish',
  'score': 0.79689693,
  'word': 'sushi',
  'start': 33,
  'end': 38},
 {'entity_group': 'Location',
  'score': 0.9361863,
  'word': 'new york',
  'start': 42,
  'end': 50}]