<a href="https://colab.research.google.com/github/avichbe/mlops-home_ass/blob/main/ModelEvaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load the dataset that will be the base to the evaluation

In [6]:
!pip install datasets
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=1de74ba90d5629f6997c9be805444157167da6991faa1e71c619a2c4f94af070
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [1]:
def load_data(file_path):
    sentences = []
    sentence = []

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line:
                if len(line.split()) == 2:
                    word, label = line.split()
                    sentence.append((word, label))
                else:
                    continue
            else:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
        if sentence:
            sentences.append(sentence)

    return sentences

# Load the dataset from '/content/test.txt'
sentences = load_data('/content/test.txt')
print(f"Loaded {len(sentences)} sentences from the dataset.")

Loaded 664 sentences from the dataset.


After loading the data, I want to preprocsses it

In [2]:
# Prepare data for evaluation
def prepare_data(sentences):
    texts = [' '.join([word for word, tag in sentence]) for sentence in sentences]
    true_entities = [[tag for word, tag in sentence] for sentence in sentences]
    return texts, true_entities


In [14]:
# Flatten nested lists of tags for metric calculation
def flatten(list_of_lists):
    return [item for sublist in list_of_lists for item in sublist]

In [3]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import time

In [4]:
from sklearn.metrics import precision_recall_fscore_support
from datasets import load_metric

In [11]:
# Evaluate the model
# Evaluate the model
def evaluate_model(model_name, texts, true_entities):
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    # Create NER pipeline
    ner_pipeline = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

    # Load metric
    metric = load_metric("seqeval")

    predictions = []
    total_latency = 0

    # Run NER and calculate latency
    for text in texts:
        start_time = time.time()
        entities = ner_pipeline(text)
        end_time = time.time()

        # Extract predicted tags
        predicted_tags = ["O"] * len(text.split())  # Initialize with "O"
        for entity in entities:
            # Check if the entity start index is within the bounds of predicted_tags
            if entity['start'] < len(predicted_tags):
                predicted_tags[entity['start']] = entity['entity_group']

        predictions.append(predicted_tags)
        total_latency += (end_time - start_time)

    # Calculate precision, recall, and F1
    results = metric.compute(predictions=predictions, references=true_entities)

    average_latency = total_latency / len(texts)

    print(f"Model: {model_name}")
    print(f"Precision: {results['overall_precision']:.2f}")
    print(f"Recall: {results['overall_recall']:.2f}")
    print(f"F1-Score: {results['overall_f1']:.2f}")
    print(f"Average Latency: {average_latency:.4f} seconds")

    return results, average_latency

if __name__ == "__main__":
    # Path to the test file
    test_file_path = 'test.txt'  # Replace with your actual test file path

    # Load test data
    sentences = load_data(test_file_path)
    texts, true_entities = prepare_data(sentences)

    # Evaluate SecureBert-NER
    securebert_results, securebert_latency = evaluate_model("CyberPeace-Institute/SecureBERT-NER", texts, true_entities)


KeyboardInterrupt: 

In [15]:
def evaluate_model(model_name, texts, true_entities):
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    # Create NER pipeline
    ner_pipeline = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

    # Load seqeval metric
    metric = load_metric("seqeval")

    predictions = []
    total_latency = 0

    # Run NER and calculate latency
    for text in texts:
        start_time = time.time()
        entities = ner_pipeline(text)
        end_time = time.time()

        # Initialize predicted tags
        predicted_tags = ["O"] * len(text.split())

        # Extract predicted tags
        for entity in entities:
            entity_label = entity['entity_group']
            start_idx, end_idx = entity['start'], entity['end']

            # Convert the character index to word index
            tokenized_text = text.split()
            for i in range(start_idx, end_idx):
                # Ensure indices are within bounds
                if i < len(predicted_tags):
                    predicted_tags[i] = entity_label

        predictions.append(predicted_tags)
        total_latency += (end_time - start_time)

    # Flatten the true and predicted entities for seqeval
    flat_true_entities = true_entities
    flat_predicted_entities = predictions

    # Compute metrics
    results = metric.compute(predictions=flat_predicted_entities, references=flat_true_entities)

    average_latency = total_latency / len(texts)

    print(f"Model: {model_name}")
    print(f"Precision: {results['overall_precision']:.2f}")
    print(f"Recall: {results['overall_recall']:.2f}")
    print(f"F1-Score: {results['overall_f1']:.2f}")
    print(f"Average Latency: {average_latency:.4f} seconds")

    return results, average_latency

if __name__ == "__main__":
    # Path to the test file
    test_file_path = 'test.txt'  # Replace with your actual test file path

    # Load test data
    sentences = load_data(test_file_path)
    texts, true_entities = prepare_data(sentences)

    # Evaluate SecureBert-NER
    securebert_precision, securebert_recall, securebert_f1, securebert_latency = evaluate_model("CyberPeace-Institute/SecureBERT-NER", texts, true_entities)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: CyberPeace-Institute/SecureBERT-NER
Precision: 0.00
Recall: 0.00
F1-Score: 0.00
Average Latency: 0.2245 seconds


ValueError: not enough values to unpack (expected 4, got 2)