In [1]:
# Install necessary libraries (Run this in a cell if not installed)
!pip install spacy scikit-learn pandas
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m93.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import spacy
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

# Load the pre-trained small English pipeline ("en_core_web_sm").
# The resulting `nlp` object is called on raw text in the later cells to get
# a parsed document whose entities (`doc.ents`) and per-token entity types
# (`token.ent_type_`) are used for evaluation and visualization.
nlp = spacy.load("en_core_web_sm")

# --- 1. Create Sample Real-World Data (News/Social Media style) ---
# Each test case is a pair:
#     (text, {"entities": [(start_char, end_char, label), ...]})
# Offsets are character positions into `text`; `end_char` is exclusive, so
# text[start_char:end_char] is the annotated surface string (shown in the
# trailing comment next to every span).
test_data = [
    (
        "Apple Inc. is planning to open a new store in San Francisco next June.",
        {
            "entities": [
                (0, 10, "ORG"),    # "Apple Inc."
                (46, 59, "GPE"),   # "San Francisco"
                (65, 69, "DATE"),  # "June"
            ]
        },
    ),
    (
        "Elon Musk tweeted about Dogecoin on Tuesday.",
        {
            "entities": [
                (0, 9, "PERSON"),  # "Elon Musk"
                (36, 43, "DATE"),  # "Tuesday"
            ]
        },
    ),
    (
        # 'Olympics' would usually count as an EVENT, but the small model may
        # struggle with it, so only the clear-cut entities are annotated here.
        "The Olympics will be held in Paris in 2024.",
        {
            "entities": [
                (29, 34, "GPE"),   # "Paris"
                (38, 42, "DATE"),  # "2024"
            ]
        },
    ),
    (
        "Google bought YouTube for $1.65 billion.",
        {
            "entities": [
                (0, 6, "ORG"),      # "Google"
                (14, 21, "ORG"),    # "YouTube"
                (26, 39, "MONEY"),  # "$1.65 billion"
            ]
        },
    ),
]

print(f"Loaded {len(test_data)} test cases.")

Loaded 4 test cases.


In [3]:
# --- 2. Run NER Prediction ---

# Parallel, token-level label sequences accumulated over all test sentences:
# y_true[i] is the gold label of the i-th token, y_pred[i] what spaCy predicted.
y_true = []
y_pred = []

print("--- Extraction Results ---")

for text, annotations in test_data:
    gold_spans = annotations['entities']  # [(start_char, end_char, label), ...]
    doc = nlp(text)

    # Show the raw entity spans spaCy extracted for this sentence.
    print(f"\nText: {text}")
    print(f"Predicted: {[(ent.text, ent.label_) for ent in doc.ents]}")

    # Scoring is done per token rather than per span: every token gets either
    # an entity label or 'O' (outside any entity) on both the gold and the
    # predicted side.
    for token in doc:
        # Gold label: the first annotated span that contains the token's
        # starting character offset (token.idx); 'O' when no span does.
        gold_label = next(
            (
                span_label
                for span_start, span_end, span_label in gold_spans
                if span_start <= token.idx < span_end
            ),
            "O",
        )
        y_true.append(gold_label)

        # Predicted label: token.ent_type_ is an empty string for tokens that
        # are not part of a predicted entity, which maps to 'O'.
        y_pred.append(token.ent_type_ or "O")

print("\nProcessing complete.")

--- Extraction Results ---

Text: Apple Inc. is planning to open a new store in San Francisco next June.
Predicted: [('Apple Inc.', 'ORG'), ('San Francisco', 'GPE'), ('next June', 'DATE')]

Text: Elon Musk tweeted about Dogecoin on Tuesday.
Predicted: [('Elon Musk', 'PERSON'), ('Dogecoin', 'ORG'), ('Tuesday', 'DATE')]

Text: The Olympics will be held in Paris in 2024.
Predicted: [('Olympics', 'EVENT'), ('Paris', 'GPE'), ('2024', 'DATE')]

Text: Google bought YouTube for $1.65 billion.
Predicted: [('Google', 'ORG'), ('YouTube', 'PRODUCT'), ('$1.65 billion', 'MONEY')]

Processing complete.


In [4]:
# --- 3. Calculate Metrics ---

# Collect every label that occurs in either the gold or the predicted
# sequence, alphabetically ordered. The non-entity tag 'O' is deliberately
# kept in the evaluation, but moved to the end so the entity rows come first
# in the report.
labels = sorted(set(y_true) | set(y_pred))
if 'O' in labels:
    labels = [tag for tag in labels if tag != 'O'] + ['O']

print("\n--- Model Evaluation Metrics ---")

# Share of tokens whose predicted label matches the gold label
accuracy = accuracy_score(y_true, y_pred)
print(f"Overall Accuracy: {accuracy:.2f}")

# Per-label precision / recall / F1 table; zero_division=0 scores labels
# without any gold or predicted tokens as 0 instead of warning.
report = classification_report(y_true, y_pred, labels=labels, zero_division=0)
print("\nDetailed Classification Report:")
print(report)

# Support-weighted averages over all labels (the fourth return value, the
# support, is not needed for the weighted average and is discarded).
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true,
    y_pred,
    average='weighted',
    zero_division=0,
)

# Summary block framed by two 30-character rules
print(
    "\n".join(
        [
            "-" * 30,
            f"Weighted Precision : {precision:.2f}",
            f"Weighted Recall    : {recall:.2f}",
            f"Weighted F1 Score  : {f1:.2f}",
            "-" * 30,
        ]
    )
)


--- Model Evaluation Metrics ---
Overall Accuracy: 0.90

Detailed Classification Report:
              precision    recall  f1-score   support

        DATE       0.75      1.00      0.86         3
       EVENT       0.00      0.00      0.00         0
         GPE       1.00      1.00      1.00         3
       MONEY       1.00      1.00      1.00         3
         ORG       0.75      0.75      0.75         4
      PERSON       1.00      1.00      1.00         2
     PRODUCT       0.00      0.00      0.00         0
           O       1.00      0.88      0.94        26

    accuracy                           0.90        41
   macro avg       0.69      0.70      0.69        41
weighted avg       0.96      0.90      0.93        41

------------------------------
Weighted Precision : 0.96
Weighted Recall    : 0.90
Weighted F1 Score  : 0.93
------------------------------


In [5]:
# --- 4. Visualization ---
from spacy import displacy

# Take the text of the first test case ("Apple Inc. is planning...") and
# parse it again so there is a document to draw.
example_text = test_data[0][0]
doc = nlp(example_text)

# style="ent" selects the entity-highlighting view; jupyter=True asks
# displaCy to render directly into the notebook output.
print("\n--- Entity Visualization ---")
displacy.render(doc, jupyter=True, style="ent")


--- Entity Visualization ---
