# NER Restaurant Reviews

---
Aparna Dutta (aparnadutta@brandeis.edu)

# Setup 


In [1]:
# Check that PyTorch can see a CUDA device; the training cells below request
# embeddings_storage_mode='gpu'.
import torch 
torch.cuda.is_available()

True

In [2]:
# Mount Google Drive in the Colab runtime; the corpus and experiment directories
# used below live under drive/MyDrive.
# NOTE(review): later cells use the relative path 'drive/MyDrive/...', which presumably
# relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install flair==0.10 transformers==4.17.0

Collecting flair==0.10
  Downloading flair-0.10-py3-none-any.whl (322 kB)
[K     |████████████████████████████████| 322 kB 9.5 MB/s 
[?25hCollecting transformers==4.17.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 68.6 MB/s 
Collecting segtok>=1.5.7
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.4 MB/s 
[?25hCollecting gdown==3.12.2
  Downloading gdown-3.12.2.tar.gz (8.2 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting sentencepiece==0.1.95
  Downloading sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 52.4 MB/s 
[?25hCollecting bpemb>=0.3.2
  Downloading bpemb-0.3.3-py3-none-any.whl (19 kB)
Collecting mo

## Load data

In [4]:
# Seed Flair before any model is built, then display the installed version
# (expected to match the flair==0.10 pin from the install cell).
import flair
flair.set_seed(42)
flair.__version__

'0.10'

In [5]:
from collections import Counter
from typing import Optional, List, Tuple

from flair.data import Corpus, FlairDataset
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger

# Pieces of a BIO label such as "B-DISH": the B/I/O prefix and the delimiter that
# separates it from the entity type.
DELIM = "-"
BEGIN = "B"
INSIDE = "I"
OUTSIDE = "O"
# Token that marks a document boundary in the CoNLL-format data files.
DOCSTART = "-DOCSTART-"

# Counter keys used by count_invalid_transitions for each kind of invalid transition.
O_2_I = "O_to_I"
I_2_I_TYPE = "I_to_I_TYPE"
B_2_I_TYPE = "B_to_I_TYPE"

In [6]:
def read_conll_format(datadir: str, token_col: int, label_col: int) -> Corpus:
    """Build a ColumnCorpus from the CoNLL-style files found in ``datadir``.

    ``token_col`` and ``label_col`` are the column indices holding the token text and
    the NER label. The directory is expected to contain train.txt, dev.txt and
    test.txt, and "-DOCSTART-" is passed as the document separator token.
    """
    return ColumnCorpus(
        data_folder=datadir,
        column_format={token_col: 'text', label_col: 'ner'},
        train_file='train.txt',
        dev_file='dev.txt',
        test_file='test.txt',
        document_separator_token=DOCSTART,
    )

In [7]:
# Token text is read from column 0 and the NER label from column 1 of each data file.
rest_review_corpus = read_conll_format("drive/MyDrive/COSI_217B_NER_Project/Model/data/", 0, 1)

2022-05-13 19:18:15,462 Reading data from drive/MyDrive/COSI_217B_NER_Project/Model/data
2022-05-13 19:18:15,473 Train: drive/MyDrive/COSI_217B_NER_Project/Model/data/train.txt
2022-05-13 19:18:15,476 Dev: drive/MyDrive/COSI_217B_NER_Project/Model/data/dev.txt
2022-05-13 19:18:15,478 Test: drive/MyDrive/COSI_217B_NER_Project/Model/data/test.txt


In [8]:
# The tag type we are training the model to predict. It must match the 'ner' column
# name that read_conll_format assigns to the label column.
tag_type = 'ner'

# Make the tag dictionary from the corpus; it is passed to every SequenceTagger below.
tag_dictionary = rest_review_corpus.make_label_dictionary(tag_type)
print(tag_dictionary)

2022-05-13 19:18:20,914 Computing label dictionary. Progress:


100%|██████████| 3662/3662 [00:00<00:00, 5658.57it/s]

2022-05-13 19:18:21,570 Corpus contains the labels: ner (#91976)
2022-05-13 19:18:21,575 Created (for label 'ner') Dictionary with 16 tags: <unk>, O, B-CUISINE, B-DISH, I-DISH, B-EST, I-EST, B-FOOD, I-FOOD, B-TYPE, B-LOC, I-LOC, I-TYPE, B-DIET, I-CUISINE, I-DIET
Dictionary with 16 tags: <unk>, O, B-CUISINE, B-DISH, I-DISH, B-EST, I-EST, B-FOOD, I-FOOD, B-TYPE, B-LOC, I-LOC, I-TYPE, B-DIET, I-CUISINE, I-DIET





# Training a BiLSTM-CRF model with stacked GloVe + Flair embeddings

In [21]:
from flair.embeddings import WordEmbeddings
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, CharacterEmbeddings

# Stack GloVe word vectors with the forward and backward "fast" Flair embeddings
# so the tagger receives one combined representation per token.
stacked_embeddings = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast'),
])

## Creating a sequence tagger


In [22]:
from flair.models import SequenceTagger
# LSTM + CRF sequence tagger over the stacked GloVe/Flair embeddings, predicting the
# labels collected in tag_dictionary.
glove_tagger = SequenceTagger(
    embeddings=stacked_embeddings, 
    tag_dictionary=tag_dictionary,
    tag_type='ner',
    hidden_size=512,  # RNN hidden size
    use_crf=True,
    use_rnn=True,
    rnn_type="LSTM",  # Bidirectional (the model printout below shows bidirectional=True)
    # Only this dropout is set to zero; the model printout below still lists
    # WordDropout(p=0.05) and LockedDropout(p=0.5).
    dropout=0.0,
    reproject_embeddings=False,
)

## Training the model


In [11]:
from flair.trainers import ModelTrainer
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR

In [23]:
# Experiment name and root directory; checkpoints and logs go to f'{EXPT_DIR}{EXPT_NAME}'.
# NOTE(review): the inference cell further down loads from '{EXPT_DIR}stacked_embeds/',
# not from this experiment's directory — confirm which run is meant to be evaluated.
EXPT_NAME = 'glove-crf-flair'
EXPT_DIR = 'drive/MyDrive/COSI_217B_NER_Project/Model/expts/'

# Creates a model trainer with our tagger and our corpus
glove_trainer = ModelTrainer(glove_tagger, rest_review_corpus)

# Run training with specified parameters.
# The first parameter is where to store models and final predictions
# The returned dict (shown in the cell output) holds the per-epoch train/dev loss
# histories, the dev score history and the final test score.
glove_trainer.train(f'{EXPT_DIR}{EXPT_NAME}',
              learning_rate=5.0e-3,
              mini_batch_size=32,
              max_epochs=10,
              scheduler=OneCycleLR,
              embeddings_storage_mode='gpu',  # presumably needs the CUDA device checked in the first cell
              optimizer=AdamW,
)

2022-05-13 19:34:54,678 ----------------------------------------------------------------------------------------------------
2022-05-13 19:34:54,680 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
        (decoder): Linear(in_features=1024, out_features=275, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
        (decoder): Linear(in_features=1024, out_features=275, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (rnn): LSTM(2148, 512, batch_first=True, bidirectional=True)


{'dev_loss_history': [tensor(0.1989, device='cuda:0'),
  tensor(0.1577, device='cuda:0'),
  tensor(0.1323, device='cuda:0'),
  tensor(0.1199, device='cuda:0'),
  tensor(0.1130, device='cuda:0'),
  tensor(0.1108, device='cuda:0'),
  tensor(0.1073, device='cuda:0'),
  tensor(0.1096, device='cuda:0'),
  tensor(0.1099, device='cuda:0'),
  tensor(0.1091, device='cuda:0')],
 'dev_score_history': [0.4412811387900356,
  0.6032258064516128,
  0.6466250709018717,
  0.6796116504854369,
  0.6865503657850309,
  0.6948088990302338,
  0.692175899486008,
  0.6903553299492385,
  0.6913580246913581,
  0.6929577464788732],
 'test_score': 0.6835689907362261,
 'train_loss_history': [0.4074051519305828,
  0.21006210432826533,
  0.16248362883698622,
  0.13477498966371992,
  0.11802282633652446,
  0.10099334344352091,
  0.09030308109206138,
  0.08378022758299407,
  0.07910684453846942,
  0.07700276136336102]}

## Running inference


In [13]:
def predict(
        tagger: SequenceTagger, corpus_section: FlairDataset
) -> List[List[Tuple[str, str]]]:
    """Return sentences with tokens tagged by the specified tagger.

    The return value is a list of sentences, with each sentence represented as a list of
    (text, tag) tuples representing each token. Predictions are stored on the sentences
    under the label name 'pred_ner', so the sentences in ``corpus_section`` are annotated
    as a side effect.
    """
    # Materialize the section once and read the predictions back from the very same
    # Sentence objects that were tagged. Iterating corpus_section a second time would
    # return untagged sentences if the dataset builds its Sentence objects on each pass.
    sentences = list(corpus_section)
    tagger.predict(sentences=sentences, label_name='pred_ner')
    return [
        [(token.text, token.get_tag('pred_ner').value) for token in sent]
        for sent in sentences
    ]

def write_predictions(tagger: SequenceTagger, corpus_section: FlairDataset, outpath: str) -> None:
    """Tag ``corpus_section`` with ``tagger`` and write the predictions to ``outpath``.

    The file is written as UTF-8 with one "token tag" pair per line and an empty line
    after every sentence.
    """
    # Predict before opening the file: mode 'w' truncates on open, so a failure while
    # tagging would otherwise wipe a previously written predictions file.
    tagged_sentences = predict(tagger, corpus_section)
    with open(outpath, 'w', encoding="utf8") as outfile:
        for sent in tagged_sentences:
            for text, tag in sent:
                print(f"{text} {tag}", file=outfile)
            print(file=outfile)

In [28]:
# Load the best checkpoint of the 'stacked_embeds' experiment and write its predictions
# for the test split next to it.
# NOTE(review): the training cell above saves to f'{EXPT_DIR}{EXPT_NAME}' with
# EXPT_NAME = 'glove-crf-flair'; no cell shown here writes a 'stacked_embeds' directory —
# confirm which experiment this is meant to load.
model = SequenceTagger.load(f'{EXPT_DIR}stacked_embeds/best-model.pt')
write_predictions(model, rest_review_corpus.test, f"{EXPT_DIR}stacked_embeds/test-sys-out-best.bio")

2022-05-13 19:45:06,294 loading file drive/MyDrive/COSI_217B_NER_Project/Model/expts/stacked_embeds/best-model.pt


In [15]:
# evaluate the model output using seqscore
! pip install seqscore

Collecting seqscore
  Downloading seqscore-0.4.1-py3-none-any.whl (23 kB)
Installing collected packages: seqscore
Successfully installed seqscore-0.4.1


In [29]:
import subprocess

# Score the 'stacked_embeds' test predictions against the gold test file with seqscore.
seq_out = subprocess.run(["seqscore", "score", "--labels", "BIO",
                          "--repair-method", "conlleval",
                          "--reference", "drive/MyDrive/COSI_217B_NER_Project/Model/data/test.txt",
                          f"{EXPT_DIR}stacked_embeds/test-sys-out-best.bio"],
                         capture_output=True,
                         text=True)

print("Experiment name:", 'stacked_embeds')
print(seq_out.stdout)
# capture_output=True also captures stderr, so without this a failed seqscore run
# would show nothing but an empty line.
if seq_out.returncode != 0:
    print(f"seqscore exited with status {seq_out.returncode}:\n{seq_out.stderr}")

Experiment name: stacked_embeds
| Type    |   Precision |   Recall |     F1 |   Reference |   Predicted |   Correct |
|---------|-------------|----------|--------|-------------|-------------|-----------|
| ALL     |       72.71 |    69.62 |  71.13 |        1060 |        1015 |       738 |
| CUISINE |       70.00 |    80.00 |  74.67 |          35 |          40 |        28 |
| DISH    |       48.24 |    33.06 |  39.23 |         124 |          85 |        41 |
| EST     |       82.28 |    78.31 |  80.25 |         166 |         158 |       130 |
| FOOD    |       71.09 |    72.56 |  71.82 |         583 |         595 |       423 |
| LOC     |       86.99 |    92.24 |  89.54 |         116 |         123 |       107 |
| TYPE    |       64.29 |    25.00 |  36.00 |          36 |          14 |         9 |



In [19]:
def count_invalid_transitions(filepath: str, delimiter: str = " ") -> Counter:
    """Tally the invalid BIO label transitions found in a CoNLL-format file.

    The returned Counter is keyed by the kind of invalid transition, for example:
    {
        "O_to_I": 3,
        "I_to_I_TYPE": 2,
        "B_to_I_TYPE": 4,
    }
    An I- label on the first token of a sentence is counted as "O_to_I".
    """
    violations = Counter()
    for sentence in read_labels(filepath, delimiter):
        # A sentence that opens with an I- label has no preceding B- to continue.
        if sentence[0][0] == INSIDE:
            violations[O_2_I] += 1
        for prev, curr in zip(sentence, sentence[1:]):
            # Only a transition into an I- label can be invalid.
            if curr[0] != INSIDE:
                continue
            if prev[0] == OUTSIDE:
                violations[O_2_I] += 1
            elif prev[1] != curr[1]:
                # The entity type changed without a new B- label.
                if prev[0] == INSIDE:
                    violations[I_2_I_TYPE] += 1
                elif prev[0] == BEGIN:
                    violations[B_2_I_TYPE] += 1
    return violations


def read_labels(filepath: str, delimiter: str = " ") -> List[List[Tuple[str, str]]]:
    """Read the label column of a CoNLL-format file, grouped by sentence.

    Each line is split on ``delimiter``; the first field is the token and the last field
    is its label. Empty lines end the current sentence, and "-DOCSTART-" lines are skipped.

    Returns a list of sentences, each a list of (prefix, entity_type) tuples, e.g.
    "B-DISH" -> ("B", "DISH"). A label without a delimiter, such as "O", is returned as
    (label, label).
    """
    def split_label(label: str) -> Tuple[str, str]:
        if DELIM not in label:
            return label, label
        # Split on the first delimiter only, so an entity type that itself contains a
        # hyphen stays intact: "B-FAST-FOOD" -> ("B", "FAST-FOOD").
        prefix, entity_type = label.split(DELIM, maxsplit=1)
        return prefix, entity_type

    sent_labels = []
    with open(filepath, encoding='utf8') as file:
        labels = []
        for line in file:
            fields = line.rstrip().split(delimiter)
            word, label = fields[0], fields[-1]
            if word and word != DOCSTART:
                labels.append(split_label(label))
            elif (not word) and labels:
                # Empty line: close the sentence collected so far.
                sent_labels.append(labels)
                labels = []
        # The file may end without a trailing empty line.
        if labels:
            sent_labels.append(labels)
    return sent_labels

In [20]:
# Count the invalid BIO transitions in the test predictions stored under the current
# experiment directory (EXPT_NAME).
# NOTE(review): the inference cell above writes its predictions under 'stacked_embeds/',
# not under f"{EXPT_DIR}{EXPT_NAME}/" — confirm this file exists and is the intended one.
invalid_transitions = count_invalid_transitions(f"{EXPT_DIR}{EXPT_NAME}/test-sys-out-best.bio")
print(invalid_transitions)

Counter({'B_to_I_TYPE': 2, 'O_to_I': 1})


# Transformer models

In [None]:
from flair.embeddings import TransformerWordEmbeddings
# NOTE(review): EXPT_DIR now points at a different Drive folder than the one used by
# the GloVe experiments above ('COSI_217B_NER_Project/Model/expts/') — confirm this is
# intended; the evaluation cell at the end resets it.
EXPT_NAME = 'distilroberta-lstm-crf'
EXPT_DIR = 'drive/MyDrive/information_extraction/final/expts/'

# Initialize fine-tuneable transformer embeddings
transformer_embeddings = TransformerWordEmbeddings(
    model='distilroberta-base',
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
    use_context=False,
    layer_mean=False
)

In [None]:
# Initialize basic sequence tagger with transformer model (no CRF, no RNN, no reprojection)
# NOTE(review): EXPT_NAME is 'distilroberta-lstm-crf', yet both use_rnn and use_crf are
# False here — confirm the experiment name matches the configuration.
tagger = SequenceTagger(
    embeddings=transformer_embeddings, 
    tag_dictionary=tag_dictionary,
    tag_type='ner',
    hidden_size=512,  # presumably has no effect while use_rnn=False — TODO confirm
    use_crf=False,
    use_rnn=False,
    # rnn_type="LSTM",  # Bidirectional
    reproject_embeddings=False,
)

In [None]:
# Fine-tune the transformer tagger on the restaurant review corpus. Compared with the
# GloVe/Flair run, this uses a smaller learning rate (5e-6 vs 5e-3), a smaller
# mini-batch (16 vs 32) and fewer epochs (5 vs 10). Output goes to f'{EXPT_DIR}{EXPT_NAME}'.
trainer = ModelTrainer(tagger, rest_review_corpus)
trainer.train(f'{EXPT_DIR}{EXPT_NAME}',
              learning_rate=5.0e-6,
              mini_batch_size=16,
              max_epochs=5,
              scheduler=OneCycleLR,
              embeddings_storage_mode='gpu',
              weight_decay=0.0,
              optimizer=AdamW,
)

In [None]:
! pip install seqscore

Collecting seqscore
  Downloading seqscore-0.4.1-py3-none-any.whl (23 kB)
Installing collected packages: seqscore
Successfully installed seqscore-0.4.1


In [None]:
import subprocess

# Switch back to the GloVe experiment root and select the run to evaluate.
EXPT_DIR = 'drive/MyDrive/COSI_217B_NER_Project/Model/expts/'
EXPT_NAME = 'glove-crf-stacked'

# Load the best checkpoint of that run and write its predictions for the test split.
model = SequenceTagger.load(f'{EXPT_DIR}{EXPT_NAME}/best-model.pt')
write_predictions(model, rest_review_corpus.test, f"{EXPT_DIR}{EXPT_NAME}/test-sys-out-best.bio")

# Score the predictions against the gold test file with seqscore.
seq_out = subprocess.run(["seqscore", "score", "--labels", "BIO",
                          "--repair-method", "conlleval",
                          "--reference", "drive/MyDrive/COSI_217B_NER_Project/Model/data/test.txt",
                          f"{EXPT_DIR}{EXPT_NAME}/test-sys-out-best.bio"],
                         capture_output=True,
                         text=True)
print(seq_out.stdout)
# capture_output=True also captures stderr, so without this a failed seqscore run
# would show nothing but an empty line.
if seq_out.returncode != 0:
    print(f"seqscore exited with status {seq_out.returncode}:\n{seq_out.stderr}")

2022-05-12 20:49:39,940 loading file drive/MyDrive/COSI_217B_NER_Project/Model/expts/glove-crf-stacked/best-model.pt
| Type    |   Precision |   Recall |     F1 |   Reference |   Predicted |   Correct |
|---------|-------------|----------|--------|-------------|-------------|-----------|
| ALL     |       72.33 |    69.06 |  70.66 |        1060 |        1012 |       732 |
| CUISINE |       72.97 |    77.14 |  75.00 |          35 |          37 |        27 |
| DISH    |       45.12 |    29.84 |  35.92 |         124 |          82 |        37 |
| EST     |       85.42 |    74.10 |  79.35 |         166 |         144 |       123 |
| FOOD    |       69.68 |    74.10 |  71.82 |         583 |         620 |       432 |
| LOC     |       88.33 |    91.38 |  89.83 |         116 |         120 |       106 |
| TYPE    |       77.78 |    19.44 |  31.11 |          36 |           9 |         7 |

