In [9]:
import spacy
import random
from spacy.tokens import DocBin
from tqdm.notebook import tqdm  # for progress bars

In [12]:
import os

# Function to load data in IOB format (e.g., CoNLL format)
def load_data(file_path):
    sentences = []
    sentence = []

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line == "":
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            elif not line.startswith("-DOCSTART-"):
                parts = line.split()
                token = parts[0]
                label = parts[-1]  # Assuming the label is in the last column
                sentence.append((token, label))
        if sentence:
            sentences.append(sentence)
    return sentences

# Load training and validation data
train_data = load_data("/content/eng.train")
dev_data = load_data("/content/eng.testa")
test_data = load_data("/content/eng.testb")

# Function to convert data to spaCy's DocBin format
def convert_to_spacy(data, output_path):
    nlp = spacy.blank("en")  # Initialize blank English model
    db = DocBin()  # Create a DocBin for storing processed data

    for sentence in tqdm(data):
        tokens = [token for token, label in sentence]
        labels = [label for token, label in sentence]

        doc = nlp.make_doc(" ".join(tokens))  # Create Doc object from text
        ents = []

        for i, (token, label) in enumerate(sentence):
            if label != "O":
                label_type = label.split("-")[-1]  # Get entity type
                if label.startswith("B-"):
                    # Find the start and end based on token index instead of character position
                    start = i
                    end = i + 1
                    ents.append(spacy.tokens.Span(doc, start=start, end=end, label=label_type))

        doc.ents = ents
        db.add(doc)

    db.to_disk(output_path)  # Save to disk

# Convert data and save it
convert_to_spacy(train_data, "train.spacy")
convert_to_spacy(dev_data, "dev.spacy")


  0%|          | 0/14041 [00:00<?, ?it/s]

  0%|          | 0/3250 [00:00<?, ?it/s]

In [15]:
# Initialize transformer-based config for NER
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency --gpu

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: GPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [21]:
# Train the model with transformer-based config
!python -m spacy train /content/config.cfg --output /content/output --paths.train /content/train.spacy --paths.dev /content/dev.spacy --gpu-id 0


[38;5;2m✔ Created output directory: /content/output[0m
[38;5;4mℹ Saving to output directory: /content/output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     44.28    0.00    0.00    0.00    0.00
  0     200         65.13   3271.47   46.69   52.91   41.79    0.47
  0     400        152.94   2449.57   62.64   63.93   61.41    0.63
  0     600        200.24   2463.64   69.82   70.61   69.05    0.70
  0     800        252.84   2659.69   71.33   72.76   69.96    0.71
  0    1000        302.93   3057.73   72.52   76.53   68.92    0.73
  1    1200        373.69   3299.78   76.10   77.21   75.03    0.76
  1    1400        476.12   3113.21   76.30   78.91   73.86    0.76
  1    1600        552.79   3820.62   77.00  

In [22]:
# Load the best trained model
nlp_trained = spacy.load("/content/output/model-best")

# Prepare test data by converting each test sentence to plain text
test_sentences = [" ".join([token for token, label in sentence]) for sentence in test_data]

# Process test sentences through the model to get predictions
test_docs = list(nlp_trained.pipe(test_sentences))



In [23]:
from spacy import displacy

# Display sample predictions
for i, doc in enumerate(test_docs[:5]):  # Show first 5 examples
    print(f"Document {i+1}:")
    print("Text:", doc.text)
    if doc.ents:
        displacy.render(doc, style="ent", jupyter=True)
    else:
        print("No entities found.")
    print("\n" + "-"*50 + "\n")

Document 1:
Text: SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .



--------------------------------------------------

Document 2:
Text: Nadim Ladki



--------------------------------------------------

Document 3:
Text: AL-AIN , United Arab Emirates 1996-12-06



--------------------------------------------------

Document 4:
Text: Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday .



--------------------------------------------------

Document 5:
Text: But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan .



--------------------------------------------------



In [34]:
from sklearn.metrics import classification_report

# Modified to include both tokens and labels
true_entities = [[(token, label) for token, label in sentence] for sentence in test_data]
predicted_entities = [[ent.label_ if ent else "O" for ent in doc.ents] for doc in test_docs]

# Modified logic to handle length differences and align predictions with true labels
y_true = []
y_pred = []

for true_sent, pred_doc in zip(true_entities, test_docs):
    pred_labels = ["O"] * len(true_sent)  # Initialize with 'O' for all tokens

    # Align based on token text matching
    pred_idx = 0
    for true_idx, (true_token, true_label) in enumerate(true_sent): # Unpack now works
        if pred_idx < len(pred_doc) and true_token == pred_doc[pred_idx].text:
            if pred_doc[pred_idx].ent_iob_ != "O":
                pred_labels[true_idx] = pred_doc[pred_idx].ent_type_
            pred_idx += 1

    y_true.extend([label for _, label in true_sent])  # Extract true labels
    y_pred.extend(pred_labels)

# Print classification report
print(classification_report(y_true, y_pred, zero_division=0))

              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00      1668
      B-MISC       0.00      0.00      0.00       702
       B-ORG       0.00      0.00      0.00      1661
       B-PER       0.00      0.00      0.00      1617
       I-LOC       0.00      0.00      0.00       257
      I-MISC       0.00      0.00      0.00       216
       I-ORG       0.00      0.00      0.00       835
       I-PER       0.00      0.00      0.00      1156
         LOC       0.00      0.00      0.00         0
        MISC       0.00      0.00      0.00         0
           O       0.91      0.99      0.95     38323
         ORG       0.00      0.00      0.00         0
         PER       0.00      0.00      0.00         0

    accuracy                           0.82     46435
   macro avg       0.07      0.08      0.07     46435
weighted avg       0.75      0.82      0.78     46435

