# Package install

In [1]:
%pip install spacy 

Note: you may need to restart the kernel to use updated packages.


# Package import 

In [3]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

# Creating space

The first line of code creates a new instance of a blank spaCy language model for English language. 

The second line of code creates an empty DocBin object. DocBin is a container for spaCy Doc objects, which are used to represent documents in spaCy. DocBin objects are useful for storing a large number of documents in a space-efficient format, and for serializing and deserializing spaCy documents for processing and analysis. The db object can be used to add Doc objects and then later serialize the collection of documents to disk or to a network socket.

In [3]:
nlp = spacy.blank('en')
db = DocBin()

# Data lecture (JSON)

In [4]:
f = open('training_data.json')
TRAIN_DATA = json.load(f)
TRAIN_DATA


{'classes': ['CRYPTO', 'ORG', 'TIME', 'VALUE', 'CHANGE', 'PERSON'],
 'annotations': [['Cryptocurrency prices today surged with Bitcoin trading at $48,089.82, a 2.6% increase in the last 24 hours. Ethereum surged marginally to $3,838.45, a 1.3% increase in the last 24 hours.',
   {'entities': [[0, 14, 'CRYPTO'],
     [40, 47, 'CRYPTO'],
     [59, 69, 'VALUE'],
     [73, 77, 'CHANGE'],
     [99, 108, 'TIME'],
     [109, 117, 'CRYPTO'],
     [139, 148, 'VALUE'],
     [152, 156, 'CHANGE'],
     [178, 186, 'TIME']]}],
  ['Dogecoin went up 15.5% at $0.181994 in the last 24 hours, after Tesla Inc chief Elon Musk said on Tuesday the electric carmaker will accept Dogecoin as payment for merchandise on a test basis, sending the meme-based cryptocurrency up over 20%. Dogecoin, popular among retail investors, raced up to $0.20 after the tweet.',
   {'entities': [[0, 8, 'CRYPTO'],
     [17, 22, 'CHANGE'],
     [26, 35, 'VALUE'],
     [48, 56, 'TIME'],
     [64, 73, 'ORG'],
     [80, 89, 'PERSON'],


In [5]:
for text, annot in tqdm (TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot['entities']:
        span = doc.char_span(start, end, label=label, alignment_mode='contract')
        if span is None:
            print('Skipping entity')
        else:
            ents.append(span)
        doc.ents = ents
        db.add(doc)

db.to_disk('./training_data.spacy')

100%|██████████| 8/8 [00:00<00:00, 839.66it/s]


# Train the model

In [6]:
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [7]:
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-04-26 16:29:28,814] [INFO] Set up nlp object from config
[2023-04-26 16:29:28,819] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-04-26 16:29:28,820] [INFO] Created vocabulary
[2023-04-26 16:29:28,822] [INFO] Finished initializing nlp object
[2023-04-26 16:29:28,996] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     19.04    4.00    2.18   24.64    0.04
  6     200        207.18   2287.68   75.57   63.26   93.84    0.76
 15     400         66.99   1503.93   73.82   82.46   66.82    0.74
 26     600         66.51   1747.40   77.63   72.24   83.89    0.78
 40     800         75.15   2161.78   78.09   72.00   85.31    

# Test the model

In [9]:
nlp_ner = spacy.load('./model-best/')


In [10]:
doc = nlp_ner('''The S&P 500 and the Dow hit all-time highs on Thursday, extending a record-setting run as a dip in weekly jobless claims allayed fears over the economic damage from a rampant surge in COVID-19 infections in the United States.

The blue-chip Dow was set to rise for a seventh straight session, its longest streak of gains since March.

The Labor Department's data showed that the number of Americans filing for new unemployment claims slipped to 198,000 in the week leading up to Christmas, from 205,000 a week earlier. Economists polled by Reuters had forecast 208,000 applications for the latest week.''')

In [11]:
spacy.displacy.render(doc, style='ent', jupyter=True)