<a href="https://colab.research.google.com/gist/johnidm/27e3b2ff50e592bc37183907ba97d31d/untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Train a Custom Named Entity Recognition Model with spaCy v3

In [8]:
# !pip install -U spacy -q

In [9]:
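# The cells below load pt_core_news_md; download pt_core_news_sm instead if you switch to the smaller model.
# !python -m spacy download pt_core_news_md -q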

In [10]:
# !python -m spacy info
# !python -m spacy info pt_core_news_sm

### NER

In [11]:
import spacy

# Name of the pretrained Portuguese pipeline used in this section
# ("pt_core_news_sm" is the alternative that was tried previously).
PRETRAINED_MODEL = "pt_core_news_md"

nlp = spacy.load(PRETRAINED_MODEL)

# List the pipeline components in the order they are applied.
print(nlp.pipe_names)

['tok2vec', 'morphologizer', 'parser', 'lemmatizer', 'attribute_ruler', 'ner']


In [16]:
# Sample sentence that mentions a cryptocurrency by name and by ticker symbol.
text = """
O Bitcoin (BTC) recuperou parte das perdas registradas em meio à 
batalha regulatória.
"""

# Run the pipeline (`nlp` is created in an earlier cell) and print the
# entity spans it found in the text.
doc = nlp(text)
print(doc.ents)

(Bitcoin, BTC)


In [20]:
# Analyse a sentence that mentions a person and two places.
doc = nlp(
    """
Meu nome é Johnny B. Goode e hoje estou
tocando em Hollywood no Teatro Álvaro de Carvalho
"""
)

# Emit one "LABEL : text" line per detected entity.
for ent in doc.ents:
    print(ent.label_, ":", ent.text)

PER : Johnny B. Goode
LOC : Hollywood
LOC : Teatro Álvaro de Carvalho


In [32]:
from spacy import displacy

# Custom highlight for PER entities
# (previous palette: "linear-gradient(90deg, #aa9cfc, #fc9ce7)").
colors = dict(PER="linear-gradient(60deg, #aa8cfc, #fc7ce7)")
options = dict(colors=colors)

# Render the entities of the most recently processed `doc` inline.
displacy.render(doc, style="ent", options=options, jupyter=True)

### Training a new pipeline

In [47]:
# `import urllib` alone does not load the `urllib.request` submodule, so the
# submodule is imported explicitly instead of relying on another package
# having imported it first.
import urllib.request

import spacy

url = "https://gist.githubusercontent.com/johnidm/157acebd00fcb70d8044b43cc02ab884/raw/99a97a9d1f866dab9e2b54378f039fc435ffbf4e/document.txt"

# nlp = spacy.load("pt_core_news_sm")
nlp = spacy.load("pt_core_news_md")

# Download the sample document; the context manager closes the HTTP response
# as soon as the body has been read.
with urllib.request.urlopen(url) as response:
    document = response.read().decode("utf-8")
doc = nlp(document)

# Show what the pretrained pipeline recognises before any customisation.
spacy.displacy.render(doc, style="ent", jupyter=True)

In [60]:
# `import urllib` alone does not load the `urllib.request` submodule, so the
# submodule is imported explicitly instead of relying on another package
# having imported it first.
import urllib.request

import spacy

url = "https://gist.githubusercontent.com/johnidm/157acebd00fcb70d8044b43cc02ab884/raw/99a97a9d1f866dab9e2b54378f039fc435ffbf4e/document.txt"

# Reload the pretrained pipeline in this cell so that `add_pipe` below always
# operates on a freshly loaded pipeline.
# nlp = spacy.load("pt_core_news_sm")
nlp = spacy.load("pt_core_news_md")

# Add a rule-based entity ruler, positioned before the "ner" component.
entity_ruler = nlp.add_pipe("entity_ruler", before="ner")

# A single one-token pattern: the token's lowercase form must be one of the
# listed cryptocurrency names; matches are labelled CRYPTO.
patterns = [
    {
        "label": "CRYPTO",
        "pattern": [
            {
                'LOWER': {
                    'IN': ['bitcoin', 'tether', 'ether', 'ethereum', 'eth']
                }
            }
        ]
    }
]

entity_ruler.add_patterns(patterns)

# Download the sample document; the context manager closes the HTTP response
# as soon as the body has been read.
with urllib.request.urlopen(url) as response:
    document = response.read().decode("utf-8")
doc = nlp(document)

spacy.displacy.render(doc, style="ent", jupyter=True)

In [67]:
import json
# `import urllib` alone does not load the `urllib.request` submodule, so the
# submodule is imported explicitly.
import urllib.request


url = "https://gist.githubusercontent.com/johnidm/0971d537443515fce71ab28907ecaef5/raw/f1cc41b94345516720bcc98c1984581f028b9486/dataset.json"

# Download and parse the annotated dataset; the context manager closes the
# HTTP response once the payload has been read.
with urllib.request.urlopen(url) as response:
    data = json.loads(response.read().decode("utf-8"))

dataset = data["annotations"]

# The first 30 examples are used for training; the remainder becomes the
# development split.
TRAIN_DATA = dataset[:30]
DEV_DATA = dataset[30:]

In [68]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm


def convert(path, dataset):
    """Build a ``DocBin`` from annotated examples and write it to ``path``.

    Args:
        path: Destination handed to ``DocBin.to_disk``.
        dataset: Iterable of ``(text, annot)`` pairs in which
            ``annot["entities"]`` yields ``(start, end, label)`` triples of
            character offsets into ``text``.

    An entity is dropped, with a "Skipping entity" message, whenever
    ``Doc.char_span`` returns ``None`` for its offsets.
    """
    blank_nlp = spacy.blank("pt")
    doc_bin = DocBin()

    for text, annot in tqdm(dataset):
        doc = blank_nlp.make_doc(text)

        spans = []
        for start, end, label in annot["entities"]:
            candidate = doc.char_span(
                start, end, label=label, alignment_mode="contract"
            )
            if candidate is not None:
                spans.append(candidate)
                continue
            print("Skipping entity")

        doc.ents = spans
        doc_bin.add(doc)

    doc_bin.to_disk(path)
    
# Serialize the training split first, then the development split.
for out_path, split in (("train.spacy", TRAIN_DATA), ("dev.spacy", DEV_DATA)):
    convert(out_path, split)

100%|██████████| 30/30 [00:00<00:00, 1843.22it/s]
100%|██████████| 11/11 [00:00<00:00, 1700.48it/s]


In [70]:
# Generate a training config (config.cfg) for a Portuguese pipeline containing
# only an NER component, optimised for efficiency.
# NOTE(review): --force presumably overwrites an existing config.cfg — confirm.
!python -m spacy init config config.cfg --lang pt --pipeline ner --optimize efficiency --force

[38;5;4mℹ Generated config template specific for your use case[0m
- Language: pt
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [71]:
# Train with the generated config on train.spacy / dev.spacy, writing the
# output to the current directory (a later cell loads "model-best" from there).
!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./dev.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[2023-08-11 13:09:38,915] [INFO] Set up nlp object from config
[2023-08-11 13:09:38,921] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-08-11 13:09:38,923] [INFO] Created vocabulary
[2023-08-11 13:09:38,924] [INFO] Finished initializing nlp object
[2023-08-11 13:09:39,076] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     33.00    0.00    0.00    0.00    0.00
 13     200          3.89    636.03  100.00  100.00  100.00    1.00
 29     400          0.00      0.00  100.00  100.00  100.00    1.00
 50     600          0.00      0.00  100.00  100.00  100.00    1.00
 

### Load the model

In [74]:
# `import urllib` alone does not load the `urllib.request` submodule, so the
# submodule is imported explicitly.
import urllib.request


url = "https://gist.githubusercontent.com/johnidm/157acebd00fcb70d8044b43cc02ab884/raw/99a97a9d1f866dab9e2b54378f039fc435ffbf4e/document.txt"

# Download the evaluation document; the context manager closes the HTTP
# response as soon as the body has been read.
with urllib.request.urlopen(url) as response:
    document = response.read().decode("utf-8")

# Preview the first 60 characters (kept as the last expression so the
# notebook displays it).
document[:60]

'Já o bioinformata Marcel Ribeiro-Dantas, pesquisador na área'

In [77]:
import spacy

# Load the pipeline stored under "model-best" and run it on the document
# fetched in the previous cell.
nlp = spacy.load("model-best")
doc = nlp(document)

# Restrict the visualisation to CRYPTO entities and give them a custom colour
# (previous palette: "linear-gradient(315deg, #f5d020, #f53803)").
colors = dict(CRYPTO="linear-gradient(315deg, #f66820, #f66823)")
options = dict(ents=["CRYPTO"], colors=colors)

spacy.displacy.render(doc, style="ent", jupyter=True, options=options)