In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Installing Spacy and downloading model



In [2]:
!pip install -U spacy -q

In [3]:
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 5.1 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Importing packages and loading model

In [4]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [5]:
nlp = spacy.load("en_core_web_sm")

In [6]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [7]:
from spacy.lang.en.examples import sentences 
doc = nlp(sentences[0])

from spacy import displacy
spacy.displacy.render(doc, style="ent", jupyter=True)

## Loading annotated data

In [8]:
import json
f = open('/content/drive/MyDrive/Pioneer Alpha/train.json')
TRAIN_DATA = json.load(f)

In [9]:
import json
f = open('/content/drive/MyDrive/Pioneer Alpha/dev.json')
DEV_DATA = json.load(f)

In [10]:
TRAIN_DATA

{'annotations': [['Lasagne are a type of pasta.\r',
   {'entities': [[0, 7, 'FOOD'], [22, 27, 'FOOD']]}],
  ['The Miso Chicken dumpling was really good and I simply loved the bento so much.\r',
   {'entities': [[4, 25, 'FOOD'], [65, 70, 'FOOD']]}],
  ['Beef Wellington is a steak dish made out of fillet steak coated with pastry.\r',
   {'entities': [[0, 15, 'FOOD'],
     [21, 26, 'FOOD'],
     [44, 56, 'FOOD'],
     [69, 75, 'FOOD']]}],
  ['I have finished my meal with my favourite waffle from their menu.\r',
   {'entities': [[42, 48, 'FOOD']]}],
  ['The tehari would have tasted better if they added potato and used a good amount of mustard oil.\r',
   {'entities': [[4, 10, 'FOOD'], [50, 56, 'FOOD'], [83, 90, 'FOOD']]}],
  ['The service was top notch and the pizza was to die for.\r',
   {'entities': [[34, 39, 'FOOD']]}],
  ['Authentic Italian pizza. \r', {'entities': [[18, 23, 'FOOD']]}],
  ['No other pizza compares to this one.\r', {'entities': [[9, 14, 'FOOD']]}],
  ['The seafood soup 

In [11]:
DEV_DATA

{'annotations': [['This cake tastes like it has cheese in it.\r',
   {'entities': [[5, 9, 'FOOD'], [29, 35, 'FOOD']]}],
  ['I always have a boiled egg for breakfast.\r',
   {'entities': [[16, 26, 'FOOD']]}],
  ['You can substitute oil for butter in this recipe.\r',
   {'entities': [[27, 33, 'FOOD']]}],
  ['This margarine is full of additives – just look at the label!\r',
   {'entities': [[5, 14, 'FOOD']]}],
  ['Yogurt is usually very low in fat.\r', {'entities': [[0, 6, 'FOOD']]}],
  ['I like cottage cheese with fruit.\r',
   {'entities': [[7, 21, 'FOOD'], [27, 32, 'FOOD']]}],
  ['Ice cream is popular among children.\r', {'entities': [[0, 9, 'FOOD']]}],
  ['She skimmed the milk of its cream.\r',
   {'entities': [[16, 20, 'FOOD'], [28, 33, 'FOOD']]}],
  ['I usually have a sandwich for lunch.\r', {'entities': [[17, 25, 'FOOD']]}],
  ['She sliced off a piece of sausage.\r', {'entities': [[26, 33, 'FOOD']]}],
  ['I’d like a hamburger and an iced tea.\r',
   {'entities': [[11, 20, 'FOOD'], 

In [12]:
db = DocBin()

In [13]:
import pandas as pd
import os
from tqdm import tqdm
from spacy.tokens import DocBin

In [14]:
for text, annot in tqdm(TRAIN_DATA['annotations']): 
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents 
    db.add(doc)

100%|██████████| 66/66 [00:00<00:00, 1927.13it/s]


In [15]:
for text, annot in tqdm(DEV_DATA['annotations']): 
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents 
    db.add(doc)

100%|██████████| 20/20 [00:00<00:00, 2646.33it/s]


In [16]:
db.to_disk("./train_data.spacy")
db.to_disk("./dev_data.spacy")

## Revision data

In [17]:
df = pd.read_csv("/content/drive/MyDrive/Pioneer Alpha/rev.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [18]:
df.drop(df.columns.difference(['Article']), 1, inplace=True)

  """Entry point for launching an IPython kernel.


In [19]:
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [20]:
nlp = spacy.load("en_core_web_sm")

In [21]:
import re

In [24]:
revision_texts = []

In [23]:
revisions = []
for doc in nlp.pipe(revision_texts, batch_size=50, disable=["tagger", "parser"]):
    if len(doc.ents) > 0:
        revisions.append((doc.text, {"entities": [(e.start_char, e.end_char, e.label_) for e in doc.ents]}))



In [25]:
import random

In [26]:
# create arrays to store the revision data
TRAIN_REVISION_DATA = []
TEST_REVISION_DATA = []

# create dictionaries to keep count of the different entities
TRAIN_ENTITY_COUNTER = {}
TEST_ENTITY_COUNTER = {}

# This will help distribute the entities (i.e. we don't want 1000 PERSON entities, but only 80 ORG entities)
REVISION_SENTENCE_SOFT_LIMIT = 100

# helper function for incrementing the revision counters
def increment_revision_counters(entity_counter, entities):
    for entity in entities:
        label = entity[2]
        if label in entity_counter:
            entity_counter[label] += 1
        else:
            entity_counter[label] = 1

random.shuffle(revisions)
for revision in revisions:
    # get the entities from the revision sentence
    entities = revision[1]["entities"]

    # simple hack to make sure spaCy entities don't get too one-sided
    should_append_to_train_counter = 0
    for _, _, label in entities:
        if label in TRAIN_ENTITY_COUNTER and TRAIN_ENTITY_COUNTER[label] > REVISION_SENTENCE_SOFT_LIMIT:
            should_append_to_train_counter -= 1
        else:
            should_append_to_train_counter += 1

    # simple switch for deciding whether to append to train data or test data
    if should_append_to_train_counter >= 0:
        TRAIN_REVISION_DATA.append(revision)
        increment_revision_counters(TRAIN_ENTITY_COUNTER, entities)
    else:
        TEST_REVISION_DATA.append(revision)
        increment_revision_counters(TEST_ENTITY_COUNTER, entities)

In [27]:
TRAIN_ENTITY_COUNTER

{'CARDINAL': 149,
 'DATE': 204,
 'EVENT': 74,
 'FAC': 88,
 'GPE': 157,
 'LANGUAGE': 27,
 'LAW': 33,
 'LOC': 102,
 'MONEY': 103,
 'NORP': 120,
 'ORDINAL': 108,
 'ORG': 209,
 'PERCENT': 101,
 'PERSON': 191,
 'PRODUCT': 75,
 'QUANTITY': 75,
 'TIME': 105,
 'WORK_OF_ART': 102}

In [28]:
TEST_ENTITY_COUNTER

{'CARDINAL': 1601,
 'DATE': 2322,
 'EVENT': 17,
 'FAC': 23,
 'GPE': 1944,
 'LANGUAGE': 6,
 'LAW': 4,
 'LOC': 89,
 'MONEY': 119,
 'NORP': 900,
 'ORDINAL': 283,
 'ORG': 3257,
 'PERCENT': 95,
 'PERSON': 3528,
 'PRODUCT': 14,
 'QUANTITY': 12,
 'TIME': 140,
 'WORK_OF_ART': 35}

In [29]:
ner = nlp.get_pipe("ner")
ner.add_label("FOOD")

1

In [30]:
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

##Preparing for training

In [31]:
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [32]:
!python -m spacy train /content/config.cfg --output ./ --gpu-id 0 --paths.train /content/train_data.spacy --paths.dev /content/dev_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-06-24 21:10:59,187] [INFO] Set up nlp object from config
[2022-06-24 21:10:59,197] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-06-24 21:10:59,201] [INFO] Created vocabulary
[2022-06-24 21:10:59,202] [INFO] Finished initializing nlp object
[2022-06-24 21:11:13,434] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     45.33    3.01    2.97    3.06    0.03
 13     200         79.19   1757.06  100.00  100.00  100.00    1.00
 29     400         52.37     21.50  100.00  100.00  100.00    1.00
 50     600         65.19     23.46  100.00  100.00  100.00    1.00
 74     800         76.23     19.71  100.00  100.00  100.00 

In [34]:
nlp_ner = spacy.load("/content/model-best")

In [35]:
doc = nlp_ner('''Out of your chicken breast rut? Pork chops are truly "the other white meat" — a lean option that's just as versatile and delicious. You'll love these chops breaded and seared, marinated and grilled, or browned and simmered in a fantastically flavorful sauce. We've included recipes for thick- and thin-cut chops to fit your family's needs. Just like chicken, pork also adapts to other cuisines (it even outranks chicken as the most popular protein in the world). Enjoy flavors from China, Italy, Germany, and the U.S.''')

In [36]:
spacy.displacy.render(doc, style="ent", jupyter=True)

**NB:**

Here my code actually follows the ways mentioned in spacy documentation to add new named entity to existing model. Though I added revision data with all other entities that were in the pre trained model, still the final model doesn't predict any other entities except for FOOD due to catastrophic forgetting. Though according to this note - https://deepnote.com/@isaac-aderogba/Spacy-Food-Entities-2cc2d19c-c3ac-4321-8853-0bcf2ef565b3 shows the similar way to train a new entity into the pipeline.

Another good solution can be to make a train set with FOOD labels that has all other labels annotated as well. Feding such data to the existing models can be resulted into predicting all the desired entities.