# 1. Annotating a SpaCy Dataset

In [1]:
!pip install spacy

import spacy
from spacy.tokens import DocBin

# Load a blank model for English
nlp = spacy.blank("en")

# Create training examples [(text, {"entities": [(start, end, label)]})]
TRAIN_DATA = [
    ("Google was founded in 1998.", {"entities": [(0, 6, "ORG")]}),
    ("Larry Page co-founded Google.", {"entities": [(0, 10, "PERSON"), (23, 29, "ORG")]}),
]

# Convert to SpaCy format
doc_bin = DocBin()
for text, annot in TRAIN_DATA:
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label)
        if span:
            ents.append(span)
    doc.ents = ents
    doc_bin.add(doc)

# Save
doc_bin.to_disk("train.spacy")




# 2. Training a SpaCy Blank

In [2]:
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency


[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [3]:
!python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./train.spacy --output ./output


[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00      7.70   36.36   22.22  100.00    0.36
200     200          0.74    126.72  100.00  100.00  100.00    1.00
400     400          0.00      0.00  100.00  100.00  100.00    1.00
600     600          0.00      0.00  100.00  100.00  100.00    1.00
800     800          0.00      0.00  100.00  100.00  100.00    1.00
1000    1000          0.00      0.00  100.00  100.00  100.00    1.00
1200    1200          0.00      0.00  100.00  100.00  100.00    1.00
1400    1400          0.00      0.00  100.00  100.00  100.00    1.00
1600    1600          0.00      0.00  100.00  100.00  100.00   

In [4]:
nlp2 = spacy.load("./output/model-best")
doc = nlp2("Sundar Pichai is the CEO of Google.")
print([(ent.text, ent.label_) for ent in doc.ents])


[('Sundar', 'ORG'), ('Pichai is', 'PERSON')]


# 3. Git Workflow (in Colab)

In [6]:
# Configure Git (first time)
!git config --global user.name "Indraneel Pothuri"
!git config --global user.email "indraneelpothuri@email.com"

# Clone your repo
!git clone https://github.com/amalsalilan/Infosys-Springboard-Internship-FinanceInsight.git
%cd your-repo

# Add files (example: training data and config)
!cp /content/train.spacy .
!git add train.spacy config.cfg

# Commit changes
!git commit -m "Added training data and config"

# Push (you may need a token instead of password)
!git push origin Indraneel


Cloning into 'Infosys-Springboard-Internship-FinanceInsight'...
remote: Enumerating objects: 470, done.[K
remote: Counting objects: 100% (142/142), done.[K
remote: Compressing objects: 100% (123/123), done.[K
remote: Total 470 (delta 72), reused 18 (delta 18), pack-reused 328 (from 2)[K
Receiving objects: 100% (470/470), 6.79 MiB | 23.33 MiB/s, done.
Resolving deltas: 100% (156/156), done.
[Errno 2] No such file or directory: 'your-repo'
/content
cp: '/content/train.spacy' and './train.spacy' are the same file
fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git
