In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install spacy
!python -m spacy download en_core_web_trf


Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy-curated-transformers<1.0.0,>=0.2.2 (from en-core-web-trf==3.8.0)
  Downloading spacy_curated_transformers-0.3.1-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<1.0.0,>=0.2.2->en-core-web-trf==3.8.0)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl.metadata (965 bytes)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<1.0.0,>=0.2.2->en-core-web-trf==3.8.0)
  Downloading curated_tokenizers-0.0.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading spacy_curated_transformers-0.3.1-py2.py3-none-any.whl (237 kB)
[2K   [90m━━━━━

In [1]:
import pandas as pd
PATH ="/content/drive/MyDrive/news_project/data/cleaned/balanced_10000_records.csv"
df = pd.read_csv(PATH)
df.head()

Unnamed: 0,News ID,Category,Topic,Headline,News body,Title entity,Entity content
0,N27021,travel,travelnews,Most Dangerous Vacation Destinations,The following list is based on travel alerts a...,{},{}
1,N105998,music,musicnews,Pharrell Williams Guarantees Internships to 11...,Pharrell Williams surprised the 2019 graduatin...,{'Harlem': 'Harlem'},"{'Harlem': {'type': 'item', 'id': 'Q105676692'..."
2,N83893,foodanddrink,newstrends,Eat up! Grandwich competition starts July 1,"GRAND RAPIDS, Mich. - One of Grand Rapids' tas...",{},{}
3,N62196,news,newsus,DHS predicts up to 25 percent drop in migrant ...,WASHINGTON Border Patrol agents are on track...,{},{}
4,N77148,music,music-celebrity,Katy Perry Steps Out in Lingerie Look for Date...,Katy Perry's latest look works for a night in ...,"{'Orlando Bloom': 'Orlando Bloom', 'London': '...","{'Orlando Bloom': {'type': 'item', 'id': 'Q444..."


In [2]:
import re

def clean_text_only_chars(text):
    # Remove everything except alphabets and spaces
    text = re.sub(r'[^A-Za-z\s]', ' ', text)
    # Replace multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [3]:
# GPU load

import spacy

# MUST CALL BEFORE LOADING MODEL
spacy.require_gpu()

nlp = spacy.load("en_core_web_trf")

# OPTIONAL SPEED BOOST
nlp.disable_pipes("parser", "lemmatizer", "attribute_ruler")

print("Loaded spaCy Transformer Model on GPU 🚀")


Loaded spaCy Transformer Model on GPU 🚀


In [4]:
df["Headline"] = df["Headline"].astype(str).apply(clean_text_only_chars)
df["News body"] = df["News body"].astype(str).apply(clean_text_only_chars)
df["Title entity"] = df["Title entity"].astype(str)

In [5]:
import ast

def generate_bio_using_spacy_and_title(doc, title_value):
    tokens = [tok.text for tok in doc]
    tags = ["O"] * len(tokens)

    # 1️⃣ spaCy labeling
    for ent in doc.ents:
        tags[ent.start] = "B-" + ent.label_
        for i in range(ent.start + 1, ent.end):
            tags[i] = "I-" + ent.label_

    # 2️⃣ Title entity fallback
    try:
        ent_dict = ast.literal_eval(title_value)
    except:
        ent_dict = {}

    for surface, expanded in ent_dict.items():
        surface_clean = surface.replace("'s", "").strip()
        stoks = surface_clean.split()
        n = len(stoks)

        for i in range(len(tokens)-n+1):
            # If spaCy already labeled → skip
            if tokens[i:i+n] == stoks and tags[i] == "O":
                tags[i] = "B-MISC"
                for j in range(i+1, i+n):
                    tags[j] = "I-MISC"

    return tokens, tags


In [6]:
from tqdm import tqdm
import numpy as np

CHUNK_SIZE = 50000
total_rows = len(df)

print("Starting processing...")

for start in range(0, total_rows, CHUNK_SIZE):

    end = min(start + CHUNK_SIZE, total_rows)
    chunk_df = df.iloc[start:end]

    print(f"\n🚀 Processing rows {start} to {end}")

    # STEP-1: Create text input
    text_batch = (
        chunk_df["Headline"] + ". " + chunk_df["News body"]
    ).tolist()

    # STEP-2: Run spaCy transformer NER in batches
    docs = list(nlp.pipe(text_batch, batch_size=64))

    chunk_sentences = []
    chunk_labels = []

    # STEP-3: Apply BIO mapping
    for doc, (_, row) in tqdm(zip(docs, chunk_df.iterrows()), total=len(chunk_df)):
        tokens, tags = generate_bio_using_spacy_and_title(
            doc, row["Title entity"]
        )
        chunk_sentences.append(tokens)
        chunk_labels.append(tags)

    # STEP-4: Save output of this chunk
    np.save(f"ner_sentences_{start}_{end}.npy", np.array(chunk_sentences, dtype=object), allow_pickle=True)
    np.save(f"ner_labels_{start}_{end}.npy", np.array(chunk_labels, dtype=object), allow_pickle=True)

    print(f"✅ Saved chunk {start}-{end} successfully!")


Starting processing...

🚀 Processing rows 0 to 50000


  dlpack_tensor = xp_tensor.toDlpack()  # type: ignore


OutOfMemoryError: CUDA out of memory. Tried to allocate 364.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 98.12 MiB is free. Process 8083 has 14.64 GiB memory in use. Of the allocated memory 12.40 GiB is allocated by PyTorch, and 2.11 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [6]:
from tqdm import tqdm
import numpy as np
import gc
import torch

texts = (df["Headline"] + ". " + df["News body"]).tolist()

sentences = []
labels = []

BATCH_SIZE = 16  # 🔥 LOWER Batch = LOWER GPU USAGE

print("Starting processing 10K rows...")

docs = nlp.pipe(texts, batch_size=BATCH_SIZE)

for doc, (_, row) in tqdm(zip(docs, df.iterrows()), total=len(df)):
    tokens, tags = generate_bio_using_spacy_and_title(doc, row["Title entity"])
    sentences.append(tokens)
    labels.append(tags)

# FREE GPU CACHE
torch.cuda.empty_cache()
gc.collect()

np.save("/content/drive/MyDrive/news_project/ner_sentences.npy", np.array(sentences, dtype=object), allow_pickle=True)
np.save("/content/drive/MyDrive/news_project/ner_labels.npy", np.array(labels, dtype=object), allow_pickle=True)

print("\n🎉 DONE — Successfully processed 10K rows!")


Starting processing 10K rows...


  dlpack_tensor = xp_tensor.toDlpack()  # type: ignore
100%|██████████| 10000/10000 [11:09<00:00, 14.95it/s]



🎉 DONE — Successfully processed 10K rows!


In [7]:
sentences

Output hidden; open in https://colab.research.google.com to view.

In [2]:
import numpy as np

sentences = np.load("/content/drive/MyDrive/news_project/ner_sentences.npy", allow_pickle=True)
labels = np.load("/content/drive/MyDrive/news_project/ner_labels.npy", allow_pickle=True)

print(len(sentences), len(labels))
print(sentences[0])
print(labels[0])


10000 10000
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'B-GPE', 'O', 'B-GPE', 'I-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'B-GPE', 'O', 'B-DATE', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'I-GPE', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O

In [3]:
tag_list = sorted(list({tag for seq in labels for tag in seq}))
tag2id = {tag: i for i, tag in enumerate(tag_list)}
id2tag = {i: tag for tag, i in tag2id.items()}

print(tag2id)


{'B-CARDINAL': 0, 'B-DATE': 1, 'B-EVENT': 2, 'B-FAC': 3, 'B-GPE': 4, 'B-LANGUAGE': 5, 'B-LAW': 6, 'B-LOC': 7, 'B-MISC': 8, 'B-MONEY': 9, 'B-NORP': 10, 'B-ORDINAL': 11, 'B-ORG': 12, 'B-PERCENT': 13, 'B-PERSON': 14, 'B-PRODUCT': 15, 'B-QUANTITY': 16, 'B-TIME': 17, 'B-WORK_OF_ART': 18, 'I-CARDINAL': 19, 'I-DATE': 20, 'I-EVENT': 21, 'I-FAC': 22, 'I-GPE': 23, 'I-LAW': 24, 'I-LOC': 25, 'I-MISC': 26, 'I-MONEY': 27, 'I-NORP': 28, 'I-ORG': 29, 'I-PERCENT': 30, 'I-PERSON': 31, 'I-PRODUCT': 32, 'I-QUANTITY': 33, 'I-TIME': 34, 'I-WORK_OF_ART': 35, 'O': 36}


In [4]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

encodings = tokenizer(
    sentences.tolist(),
    is_split_into_words=True,
    padding=True,
    truncation=True,
    return_offsets_mapping=True
)

aligned_labels = []

for i in range(len(sentences)):
    word_ids = encodings.word_ids(batch_index=i)
    sample_labels = labels[i]
    prev_word = None
    aligned = []

    for w in word_ids:
        if w is None:
            aligned.append(-100)  # ignore in loss
        else:
            aligned.append(tag2id[sample_labels[w]])

    aligned_labels.append(aligned)

encodings.pop("offset_mapping")


Output hidden; open in https://colab.research.google.com to view.

In [10]:
import torch
from torch.utils.data import Dataset

class NerDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k,v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item


In [11]:
from sklearn.model_selection import train_test_split
import numpy as np

train_idx, val_idx = train_test_split(
    np.arange(len(aligned_labels)),
    test_size=0.1,
    random_state=42
)

train_dataset = torch.utils.data.Subset(dataset, train_idx)
val_dataset   = torch.utils.data.Subset(dataset, val_idx)


In [12]:
from transformers import BertForTokenClassification

model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert_ner_output",
    save_strategy="epoch",
    eval_strategy="epoch",  # valid dataset only
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    report_to="none"
)


In [19]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset  # only to monitor val loss
)


In [20]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.2687,0.191531
2,0.1589,0.17568


TrainOutput(global_step=1126, training_loss=0.21375816711100548, metrics={'train_runtime': 1805.4391, 'train_samples_per_second': 9.97, 'train_steps_per_second': 0.624, 'total_flos': 4704829913088000.0, 'train_loss': 0.21375816711100548, 'epoch': 2.0})

In [21]:
trainer.save_model("/content/drive/MyDrive/news_project/ner_trans/bert_ner_model")
tokenizer.save_pretrained("/content/drive/MyDrive/news_project/ner_trans/bert_ner_model")


('/content/drive/MyDrive/news_project/ner_trans/bert_ner_model/tokenizer_config.json',
 '/content/drive/MyDrive/news_project/ner_trans/bert_ner_model/special_tokens_map.json',
 '/content/drive/MyDrive/news_project/ner_trans/bert_ner_model/vocab.txt',
 '/content/drive/MyDrive/news_project/ner_trans/bert_ner_model/added_tokens.json',
 '/content/drive/MyDrive/news_project/ner_trans/bert_ner_model/tokenizer.json')

In [23]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=5e645c0e965edf589ccdf182493621842963ef873a8150db774036d0d7446062
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [24]:
# EVAL
import torch
import numpy as np
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score
import pandas as pd
from transformers import BertForTokenClassification

# ---------------------------------------
# 1. Load trained BERT model
# ---------------------------------------
MODEL_PATH = "/content/drive/MyDrive/news_project/ner_trans/bert_ner_model"  # change if needed
model = BertForTokenClassification.from_pretrained(MODEL_PATH)
model.eval()

print("Loaded Success")

Loaded Success


In [25]:
# ---------------------------------------
# 2. Create validation dataset again
# ---------------------------------------
X_val = torch.tensor([encodings['input_ids'][i] for i in val_idx])
mask_val = torch.tensor([encodings['attention_mask'][i] for i in val_idx])
Y_val = [aligned_labels[i] for i in val_idx]

print("Validation items:", len(Y_val))

Validation items: 1000


In [26]:
# ---------------------------------------
# 3. Predict
# ---------------------------------------
batch_size = 32
all_preds = []

for i in range(0, len(X_val), batch_size):
    batch_ids = X_val[i:i+batch_size]
    batch_mask = mask_val[i:i+batch_size]

    with torch.no_grad():
        outputs = model(batch_ids, attention_mask=batch_mask)
        preds = outputs.logits.argmax(-1).tolist()

    all_preds.extend(preds)

In [27]:
# ---------------------------------------
# 4. Convert predictions to tag text
# ---------------------------------------
y_true = []
y_pred = []

for true_seq, pred_seq in zip(Y_val, all_preds):
    for t, p in zip(true_seq, pred_seq):
        if t != -100:
            y_true.append(id2tag[t])
            y_pred.append(id2tag[p])

In [28]:
# ---------------------------------------
# 5. Calculate Metrics
# ---------------------------------------
precision = precision_score([y_true], [y_pred])
recall = recall_score([y_true], [y_pred])
f1 = f1_score([y_true], [y_pred])

print("\nClassification Report:")
print(classification_report([y_true], [y_pred]))

print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)



Classification Report:
              precision    recall  f1-score   support

    CARDINAL       0.84      0.91      0.88      1558
        DATE       0.77      0.85      0.81      3913
       EVENT       0.53      0.69      0.60       590
         FAC       0.58      0.73      0.64      1185
         GPE       0.82      0.85      0.84      4606
    LANGUAGE       0.75      0.12      0.21        24
         LAW       0.51      0.56      0.54        32
         LOC       0.65      0.56      0.60       619
        MISC       0.52      0.23      0.32      1098
       MONEY       0.49      0.42      0.45        52
        NORP       0.86      0.83      0.84       744
     ORDINAL       0.91      0.96      0.94       714
         ORG       0.74      0.78      0.76      9348
     PERCENT       0.18      0.14      0.15        22
      PERSON       0.85      0.87      0.86     12163
     PRODUCT       0.64      0.63      0.63      1970
    QUANTITY       0.50      0.60      0.55       174
   

In [31]:
# ---------------------------------------
# 6. Load previous model result CSV
# ---------------------------------------
csv_path = "/content/drive/MyDrive/news_project/data/dl_model_comparison.csv"
df_prev = pd.read_csv(csv_path)

In [32]:
# ---------------------------------------
# 7. Append new row
# ---------------------------------------
new_row = {
    "Model": "Transformer",
    "Embedding": "BERT-base",
    "Precision": round(precision, 4),
    "Recall": round(recall, 4),
    "F1_Score": round(f1, 4)
}

df_prev = pd.concat([df_prev, pd.DataFrame([new_row])], ignore_index=True)

# ---------------------------------------
# 8. Save updated file
# ---------------------------------------
df_prev.to_csv(csv_path, index=False)
print("\nUpdated saved to:", csv_path)

print("\nFinal Table:")
print(df_prev)


Updated saved to: /content/drive/MyDrive/news_project/data/dl_model_comparison.csv

Final Table:
         Model  Embedding  Precision  Recall  F1_Score
0       BiLSTM   Word2Vec     0.8972  0.5958    0.6526
1       BiLSTM      GloVe     0.8148  0.4661    0.5405
2   BiLSTM-CRF   Word2Vec     0.8723  0.5528    0.6180
3   BiLSTM-CRF      GloVe     0.8168  0.4503    0.5305
4  Transformer  BERT-base     0.7607  0.7951    0.7775


In [40]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification

model_path = "/content/drive/MyDrive/news_project/ner_trans/bert_ner_model"
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForTokenClassification.from_pretrained(model_path)
model.eval()

# LOAD REAL TAG MAP FROM MODEL
id2tag = model.config.id2label

In [44]:
def extract_entities(tokens, tags):
    entities = {}
    current_entity_tokens = []
    current_type = None

    for token, tag in zip(tokens, tags):

        if tag.startswith("B-"):
            # save previous entity
            if current_type:
                entity_text = " ".join(current_entity_tokens)
                entities.setdefault(current_type, []).append(entity_text)

            # start new entity
            current_type = tag.split("-")[1]
            current_entity_tokens = [token]

        elif tag.startswith("I-") and current_type:
            current_entity_tokens.append(token)

        else:
            # save previous entity if exists
            if current_type:
                entity_text = " ".join(current_entity_tokens)
                entities.setdefault(current_type, []).append(entity_text)

            current_type = None
            current_entity_tokens = []

    # catch last entity
    if current_type:
        entity_text = " ".join(current_entity_tokens)
        entities.setdefault(current_type, []).append(entity_text)

    ## FIX: remove single-letter splits like ['V','ira','t'] → 'Virat'
    for ent_type, ent_list in entities.items():
        fixed_list = []
        for ent in ent_list:
            # remove spaces inside a broken name
            if len(ent.split()) > 1 and len(ent.replace(" ", "")) <= 15:
                ent = ent.replace(" ", "")
            fixed_list.append(ent)
        entities[ent_type] = fixed_list

    return entities


In [45]:
def predict_entities(input_text):
    encoding = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding=True
    )

    with torch.no_grad():
        output = model(
            input_ids=encoding["input_ids"],
            attention_mask=encoding["attention_mask"]
        )
        logits = output.logits
        predictions = torch.argmax(logits, dim=2)[0].tolist()

    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])
    tags = [id2tag[p] for p in predictions]

    # Remove special tokens [CLS] and [SEP]
    clean_tokens = []
    clean_tags = []

    for token, tag in zip(tokens, tags):
        if token not in ["[CLS]", "[SEP]"]:
            clean_tokens.append(token.replace("##", ""))  # remove BPE splits
            clean_tags.append(tag)

    entities = extract_entities(clean_tokens, clean_tags)
    return clean_tokens, clean_tags, entities


In [47]:
sample_text = "Oliver Gavin says he finds it baffling that Corvette stablemate Marcel Fassler was judged to blame for the crash that eliminated the car from the Le Mans Hours Fassler in the car he shared with Gavin and Tommy Milner suffered a heavy crash at the Porsche Curves just beyond one quarter distance after the Swiss driver tagged the Dempsey Proton Porsche of Japanese gentleman driver Satoshi Hoshino The three time Le Mans winner was taken to the medical centre and then a local hospital for a CT scan but escape the impact with nothing worse than bruising While the stewards assessed Fassler a euro fine and six penalty points on his licence Gavin said he held Hoshino entirely to blame for the incident pointing out the bronze rated driver had changed his line at the last second What happened with Marcel is just crazy Gavin told Motorsport com For us to get turned into the wall by an Am driver who is clearly out of his depth looks terrified he d been off numerous times already When you are competing in multi class racing and you re making split decisions you re looking for cues you re picking up all the body language from the car in front And he Hoshino gave absolutely every single indication he was keeping out of the way He was km h slower he let the previous two cars go by on the inside and then he just does something random and pull right down on the racing line takes us clean out of the race That sort of thing is something that needs to be looked at again He added The guy driving our car Fassler has won Le Mans three times so he certainly knows what he s doing He knows the deal and how to get it done So for the fine to be given to him and the penalty points it s just I m battling to understand how that s right Fassler s crash left the car of Antonio Garcia and Mike Rockenfeller and Jan Magnussen carrying Corvette s hopes and heading into the morning hours the trio were locked in a close fight with the leading AF Corse Ferrari However when Magnussen pitted under the safety car in hour he was held at the end of the pitlane giving the Ferrari crew a one minute lead it could nurse to the finish Shortly after Magnussen spun at the Porsche Curves while trying to claw back the lost ground hitting the barriers an incident for which the Dane accepted the blame When the safety car split us the win was gone for sure but we could have still had second Magnussen told Motorsport com Then I didn t keep enough heat in the tyres So when I clipped the kerb at the Porsche Curves I spun and hit the wall and damaged the suspension Horrible feeling I can t even begin how to describe how I felt for the guys all that hard work to come away from nothing They deserved a lot better Gavin said watching the challenge of the crew fall apart in the final hours was painful for him to watch It was very hard to see what happened to the sister car with the safety car and then Jan s accident he said They deserved the victory today they were fast all race and led for long periods That s just very tough but it s the nature of the race It chooses you and it chose the Ferrari car today"

tokens, labels, entities = predict_entities(sample_text)

print("\nToken Predictions:")
for t, l in zip(tokens, labels):
    print(f"{t:12} --> {l}")

print("\nExtracted Entities:")
print(entities)



Token Predictions:
Oliver       --> B-PERSON
Gavin        --> I-PERSON
says         --> O
he           --> O
finds        --> O
it           --> O
b            --> O
af           --> O
f            --> O
ling         --> O
that         --> O
Co           --> B-ORG
rvette       --> B-ORG
stable       --> O
mate         --> O
Marcel       --> B-PERSON
F            --> I-PERSON
ass          --> I-PERSON
ler          --> I-PERSON
was          --> O
judged       --> O
to           --> O
blame        --> O
for          --> O
the          --> O
crash        --> O
that         --> O
eliminated   --> O
the          --> O
car          --> O
from         --> O
the          --> B-EVENT
Le           --> I-EVENT
Mans         --> I-EVENT
Hours        --> I-EVENT
F            --> B-PERSON
ass          --> B-PERSON
ler          --> B-PERSON
in           --> O
the          --> O
car          --> O
he           --> O
shared       --> O
with         --> O
Gavin        --> B-PERSON
and          --> O
Tomm