In [1]:
import spacy
from spacy import displacy

In [3]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [4]:
import json
 
with open('data/flight_spacy_dataset.json', 'r') as f:
    training_data = json.load(f)

In [5]:
training_data[0]

{'text': 'Show me flights from Boston to Paris on Dec 16, 2025.',
 'entities': [[21, 27, 'SOURCE'],
  [31, 36, 'DESTINATION'],
  [40, 52, 'DEPART_DATE']],
 'meta': {'DEPART_DATE_40_52': {'normalized': '2025-12-16',
   'fmt': 'short_comma'}}}

In [6]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en")
doc_bin = DocBin()

In [7]:
from spacy.util import filter_spans

for training_example  in tqdm(training_data): 
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents 
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy") 

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:00<00:00, 2238.77it/s]


In [8]:
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

⚠ To generate a more effective transformer-based config (GPU-only), install the
spacy-transformers package and re-run this command. The config generated now
does not use transformers.
ℹ Generated config template specific for your use case
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
✔ Auto-filled config with all values
✔ Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [9]:

!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy

ℹ Saving to output directory: .
ℹ Using CPU
[1m
✔ Initialized pipeline
[1m
ℹ Pipeline: ['tok2vec', 'ner']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     57.41    0.00    0.00    0.00    0.00
  1     200        203.16   2819.60   99.14   99.09   99.20    0.99
  2     400         15.21     49.68  100.00  100.00  100.00    1.00
  4     600          0.04      0.04   99.96   99.96   99.96    1.00
  6     800          5.51      3.30  100.00  100.00  100.00    1.00
  8    1000          0.01      0.02  100.00  100.00  100.00    1.00
 12    1200          0.02      0.04  100.00  100.00  100.00    1.00
 16    1400        128.63     45.78   99.84   99.84   99.84    1.00
 20    1600        134.89     42.08  100.00  100.00  100.00    1.00
 26    1800         31.30     11.19  100.00  100.00  100.00    1.00
 33    2000         12.13      3.11  100.00  100

In [12]:

nlp = spacy.load("model-best")
doc = nlp("I want a round-trip flight from Los Angeles to Bangalore on 2025-11-29.")

print(doc.ents)
displacy.render(doc, style="ent", jupyter=True)

(round-trip, Los Angeles, Bangalore, 2025-11-29)
