In [31]:
! pip install -U spacy -q

In [32]:
# Record the environment: prints the installed spaCy version, its location,
# the platform/Python version and the pipelines that are already installed.
!python -m spacy info

[1m

spaCy version    3.8.7                         
Location         /usr/local/lib/python3.12/dist-packages/spacy
Platform         Linux-6.1.123+-x86_64-with-glibc2.35
Python version   3.12.11                       
Pipelines        en_core_web_lg (3.8.0), en_core_web_sm (3.8.0)



In [33]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en") # blank English pipeline; used below only through make_doc() to tokenize the training texts
db = DocBin() # container that the annotated Docs are added to and that is later written to disk

In [34]:
import json
# Read the annotated examples. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open for the kernel's lifetime,
# and the encoding is stated explicitly so the result does not depend on the
# platform's default locale.
with open('training_data.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [35]:
# Display the loaded data to eyeball its structure: a dict holding the label
# set under 'classes' and, under 'annotations', a list of
# [text, {'entities': [[start, end, label], ...]}] pairs.
TRAIN_DATA

{'classes': ['ORG', 'DATE', 'PRODUCTS', 'LOCATION', 'AMT', 'PERSON'],
 'annotations': [['In Q2 2024, JPMorgan Chase reported revenue of 39.2 billion U.S. dollars. CEO Jamie Dimon highlighted strong growth in investment banking services across North America. \r',
   {'entities': [[3, 10, 'DATE'],
     [12, 26, 'ORG'],
     [47, 73, 'AMT'],
     [78, 89, 'PERSON'],
     [120, 149, 'PRODUCTS'],
     [157, 170, 'LOCATION']]}],
  ['Goldman Sachs earned 12.7 billion U.S. dollars in Q1 2024 from trading and asset management in Europe. David Solomon noted increased client activity in derivatives. \r',
   {'entities': [[0, 13, 'ORG'],
     [20, 46, 'AMT'],
     [50, 57, 'DATE'],
     [63, 91, 'PRODUCTS'],
     [95, 101, 'LOCATION'],
     [103, 116, 'PERSON']]}],
  ['In Q3 2023, Morgan Stanley generated 13.5 billion U.S. dollars through wealth management operations in Asia. CEO James Gorman emphasized the resilience of advisory services. \r',
   {'entities': [[3, 10, 'DATE'],
     [12, 26, 'ORG'

In [36]:
# Convert the character-offset annotations into spaCy Docs and serialize them.
#
# The DocBin is (re)created here so that this cell is idempotent: re-running it
# no longer appends the same documents a second time to a DocBin left over in
# the kernel from an earlier execution.
db = DocBin()

for text, annot in tqdm(TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" keeps only the tokens that lie completely inside
        # [start, end); char_span returns None when no usable span remains.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Say exactly which annotation was dropped so the offsets can be
            # corrected in the source data instead of being lost silently.
            print(
                f"Skipping entity {text[start:end]!r} "
                f"[{start}:{end}] ({label}) in text starting {text[:40]!r}: "
                "offsets do not align with token boundaries"
            )
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./training_data.spacy") # save the docbin object

100%|██████████| 10/10 [00:00<00:00, 770.60it/s]

Skipping entity





In [37]:
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize accuracy

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: accuracy
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# Download the large English pipeline package.
# NOTE(review): presumably required for the static word vectors referenced by
# the accuracy-optimized config -- confirm against the vectors setting in
# config.cfg. The `spacy info` output near the top already lists
# en_core_web_lg (3.8.0), so this download may be redundant in that environment.
! python -m spacy download en_core_web_lg


Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [38]:
# Train the NER pipeline from config.cfg, writing results to the current
# directory (the loading cell further down reads the `model-best` folder).
# NOTE: --paths.dev points at the same file as --paths.train, so the
# ENTS_F/ENTS_P/ENTS_R columns in the log measure how well the model fits the
# training examples, not how it performs on unseen text. Use a held-out
# .spacy file for --paths.dev to get a meaningful evaluation.
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     49.73    0.00    0.00    0.00    0.00
 66     200         45.57   2288.61  100.00  100.00  100.00    1.00
140     400          0.00      0.00  100.00  100.00  100.00    1.00
240     600          0.00      0.00  100.00  100.00  100.00    1.00
340     800          0.00      0.00  100.00  100.00  100.00    1.00
498    1000          0.00      0.00  100.00  100.00  100.00    1.00
698    1200          0.00      0.00  100.00  100.00  100.00    1.00
898    1400         86.32    233.48  100.00  100.00  100.00    1.00
1098    1600         29.19     30.19   94.92   94.92   94.92    0.95
1298    1800         58.48     58.20  100.00  100.

In [39]:
# Load the best checkpoint written by the training cell. Training used
# `--output ./`, so the model is resolved relative to the working directory
# instead of hardcoding Colab's absolute /content path, which breaks anywhere
# else the notebook is run.
nlp_ner = spacy.load("./model-best")

In [44]:
# Run the trained pipeline over a newline-separated batch of sample sentences.
# NOTE: at least the Goldman Sachs, Morgan Stanley and JPMorgan Chase sentences
# also appear in TRAIN_DATA (shown above), so this is a sanity check of the
# trained model rather than an evaluation on unseen text.
doc = nlp_ner("\n".join((
    "Goldman Sachs earned 12.7 billion U.S. dollars in Q1 2024 from trading and asset management in Europe. David Solomon noted increased client activity in derivatives.",
    "In Q3 2023, Morgan Stanley generated 13.5 billion U.S. dollars through wealth management operations in Asia. CEO James Gorman emphasized the resilience of advisory services.",
    "Citigroup posted revenue of 19.4 billion U.S. dollars in Q2 2023, driven by strong credit card lending in the United States. Jane Fraser said digital banking adoption remained robust.",
    "In Q1 2024, Bank of America reported revenue of 26.1 billion U.S. dollars from consumer banking operations in North America. Brian Moynihan highlighted strong mortgage demand.",
    "Wells Fargo achieved revenue of 20.8 billion U.S. dollars in Q4 2023, supported by commercial lending growth in the United States. CEO Charles Scharf emphasized efficiency improvements.",
    "HSBC generated 15.9 billion U.S. dollars in Q2 2023 from global retail banking in Europe. Noel Quinn stated that digital transformation initiatives were accelerating.",
    "Barclays reported revenue of 13.2 billion U.S. dollars in Q3 2023 from investment services in the United Kingdom. C.S. Venkatakrishnan pointed to stable trading income.",
    "In Q1 2024, Deutsche Bank posted revenue of 14.7 billion U.S. dollars from corporate banking in Germany. Christian Sewing highlighted strong fee-based services.",
    "Visa earned 8.9 billion U.S. dollars in Q4 2023 from payment processing across North America. CEO Ryan McInerney emphasized growth in contactless transactions.",
    "In Q2 2024, JPMorgan Chase reported revenue of 39.2 billion U.S. dollars. CEO Jamie Dimon highlighted strong growth in investment banking services across North America.",
)))

In [45]:
# Visualize the entities predicted for `doc`, highlighted inline in the
# notebook output (style="ent" selects the entity visualizer).
spacy.displacy.render(doc, style="ent", jupyter=True)