In [1]:
pip install transformers seqeval pandas

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=7eba960f8c34d8085fa900889250a7eed7c550e372c60d72143533635eb27b9d
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [11]:
from google.colab import files
uploaded = files.upload()

Saving NER-test.tsv to NER-test (1).tsv


In [13]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
from seqeval.metrics import classification_report, f1_score

# Load test file
df = pd.read_csv("NER-test.tsv", sep="\t")

# Create sentence groups
sentences = df.groupby("sentence_id")["token"].apply(list).tolist()
sentence_ids = df["sentence_id"].unique()

# Load Jean-Baptiste model
ner_model = pipeline("ner", model="Jean-Baptiste/roberta-large-ner-english", aggregation_strategy="simple")

# Predict using Jean model
all_predictions = []
for tokens in tqdm(sentences):
    text = " ".join(tokens)
    preds = ner_model(text)
    all_predictions.append(preds)

Device set to use cpu
100%|██████████| 15/15 [00:07<00:00,  2.02it/s]


In [14]:
def get_labels_from_predictions(tokens, preds):
    labels = ['O'] * len(tokens)
    for ent in preds:
        entity = ent['entity_group']
        word_span = ent['word'].split()  # space-based tokenization
        start = ent['start']
        end = ent['end']
        label_prefix = 'B-'
        for i, tok in enumerate(tokens):
            if tok in word_span:
                labels[i] = f"{label_prefix}{entity}"
                label_prefix = 'I-'
    return labels


In [15]:
predicted_flat = []
for tokens, pred in zip(sentences, all_predictions):
    predicted_flat.extend(get_labels_from_predictions(tokens, pred))

# Add predicted labels to original df
df["predicted_label"] = predicted_flat

# ✅ OPTIONAL: Map Jean-Baptiste labels to match your gold test set (if needed)
label_map = {
    'B-PER': 'B-PERSON', 'I-PER': 'I-PERSON',
    'B-ORG': 'B-ORG', 'I-ORG': 'I-ORG',
    'B-LOC': 'B-LOCATION', 'I-LOC': 'I-LOCATION',
    'B-MISC': 'B-WORK_OF_ART', 'I-MISC': 'I-WORK_OF_ART',
    'O': 'O'
}
df['predicted_label'] = df['predicted_label'].replace(label_map)

# Group sentence-level predictions
true = df.groupby("sentence_id")["BIO_NER_tag"].apply(list).tolist()
pred = df.groupby("sentence_id")["predicted_label"].apply(list).tolist()

# ✅ Evaluation
print(classification_report(true, pred))
print("F1-score:", f1_score(true, pred))

              precision    recall  f1-score   support

    LOCATION       0.75      1.00      0.86         3
         ORG       0.67      0.50      0.57         8
      PERSON       0.92      0.92      0.92        12
 WORK_OF_ART       0.62      0.83      0.71         6

   micro avg       0.77      0.79      0.78        29
   macro avg       0.74      0.81      0.76        29
weighted avg       0.77      0.79      0.77        29

F1-score: 0.7796610169491527


In [2]:
from transformers import pipeline
import pandas as pd
from seqeval.metrics import classification_report, f1_score


In [3]:
ner_model = pipeline("ner", model="Jean-Baptiste/roberta-large-ner-english", aggregation_strategy="simple")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/849 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cpu


In [5]:
from google.colab import files
uploaded = files.upload()

Saving NER-test.tsv to NER-test.tsv


In [6]:
df_test = pd.read_csv("NER-test.tsv", sep="\t")
df_test = df_test.dropna(subset=["token", "BIO_NER_tag"])  # ensure clean rows


In [7]:
sentences = df_test.groupby("sentence_id")["token"].apply(list).tolist()
true_labels = df_test.groupby("sentence_id")["BIO_NER_tag"].apply(list).tolist()

In [8]:
pred_labels = []

for tokens in sentences:
    sentence = " ".join(tokens)
    ner_results = ner_model(sentence)

    # Create a default list with 'O' for each token
    predicted = ['O'] * len(tokens)

    for entity in ner_results:
        word = entity['word']
        entity_label = entity['entity_group']
        start = entity['start']
        end = entity['end']
        word_text = sentence[start:end]

        # match token index
        for idx, token in enumerate(tokens):
            if token in word_text and predicted[idx] == 'O':
                predicted[idx] = f'B-{entity_label}'
                break

    pred_labels.append(predicted)


In [9]:
label_map = {
    'PERSON': 'PER',
    'ORGANIZATION': 'ORG',
    'LOCATION': 'LOC',
    'WORK_OF_ART': 'MISC',
    'MISC': 'MISC',
    'PER': 'PER',
    'ORG': 'ORG',
    'LOC': 'LOC'
}

# Flattened map
pred_labels_mapped = [[label_map.get(tag.split("-")[-1], 'O') if tag != 'O' else 'O' for tag in seq] for seq in pred_labels]
true_labels_mapped = [[label_map.get(tag.split("-")[-1], 'O') if tag != 'O' else 'O' for tag in seq] for seq in true_labels]


In [10]:
print(classification_report(true_labels_mapped, pred_labels_mapped))
print("F1-score:", f1_score(true_labels_mapped, pred_labels_mapped))


              precision    recall  f1-score   support

          ER       0.08      0.08      0.08        12
         ISC       0.12      0.17      0.14         6
          OC       0.25      0.33      0.29         3

   micro avg       0.12      0.14      0.13        21
   macro avg       0.15      0.19      0.17        21
weighted avg       0.12      0.14      0.13        21

F1-score: 0.13333333333333333


