In [2]:
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass, field

import numpy as np
import pandas as pd
from seqeval.metrics import classification_report as seq_classification
from seqeval.metrics import f1_score as seq_f1
from seqeval.metrics import precision_score as seq_precision
from seqeval.metrics import recall_score as seq_recall
from sklearn.metrics import classification_report as skl_classification
from sklearn.metrics import f1_score as skl_f1
from sklearn.metrics import precision_score as skl_precision
from sklearn.metrics import recall_score as skl_recall
from torch import nn
from seqeval.scheme import IOB1, IOB2, IOE1, IOE2, IOBES, BILOU, auto_detect

# Tagging-scheme names accepted in the evaluation config, mapped to the seqeval
# scheme class that is passed to the strict-mode metrics.
VALID_SCHEMES = dict(
    IOB1=IOB1,
    IOB2=IOB2,
    IOE1=IOE1,
    IOE2=IOE2,
    IOBES=IOBES,
    BILOU=BILOU,
)
class EvaluationStrategy(ABC):
    """Shared helpers for the token-level and entity-level evaluation strategies."""

    def __init__(self, inv_map):
        """Store the id -> label mapping and the label id that marks ignored positions.

        Args:
            inv_map: Mapping from label id to label string.
        """
        self.inv_map = inv_map
        # Same sentinel that nn.CrossEntropyLoss skips by default.
        self.ignore_index = nn.CrossEntropyLoss().ignore_index

    def align_predictions(self, predictions, truth):
        """Turn raw scores and label ids into per-sequence lists of label strings.

        Positions whose gold label equals ``self.ignore_index`` are dropped from
        both the gold and the predicted sequence.

        Args:
            predictions: Scores that are arg-maxed over axis 2.
            truth: Gold label ids, indexed as ``truth[i, j]``.

        Returns:
            Tuple ``(truth_list, pred_list)`` of lists of label-string lists.

        Raises:
            ValueError: If the two result lists differ in length.
        """
        label_ids = np.argmax(predictions, axis=2)
        n_sequences, n_positions = label_ids.shape

        gold_sequences = []
        predicted_sequences = []
        for row in range(n_sequences):
            gold_row, predicted_row = [], []
            for col in range(n_positions):
                if truth[row, col] == self.ignore_index:
                    continue
                gold_row.append(self.inv_map[truth[row][col]])
                predicted_row.append(self.inv_map[label_ids[row][col]])
            gold_sequences.append(gold_row)
            predicted_sequences.append(predicted_row)

        # Both lists receive exactly one entry per sequence above, so this guard
        # is purely defensive.
        if len(gold_sequences) != len(predicted_sequences):
            raise ValueError("Aligned predictions and truth have mismatched lengths.")
        return gold_sequences, predicted_sequences

    def create_classification_report(self, results):
        """Parse a plain-text classification report into a DataFrame.

        The first line (the header) is skipped and blank lines are ignored.
        Every remaining line is split on whitespace; all cells stay strings.

        Args:
            results: Report text as returned by a ``classification_report`` call.

        Returns:
            DataFrame with columns Tag, Precision, Recall, F1, Support.
        """
        column_names = ["Tag", "Precision", "Recall", "F1", "Support"]
        rows = []
        body_lines = results.strip().split("\n")[1:]
        for raw_line in body_lines:
            if not raw_line.strip():
                continue
            fields = raw_line.split()
            # Two-word row names such as "macro avg" yield six fields; drop the
            # second word so the row fits the five columns.
            if len(fields) > 5:
                fields.pop(1)
            rows.append(fields)
        return pd.DataFrame(rows, columns=column_names)

class TokenEvaluationStrategy(EvaluationStrategy):
    """Token-level evaluation: every kept token is scored as one sample."""

    def compute_metrics(self, true_labels, predictions):
        """Compute macro-averaged token metrics plus a per-tag report.

        Args:
            true_labels: Gold label ids to be aligned with ``predictions`` via
                ``align_predictions``, or label-string sequences that are
                already aligned.
            predictions: Model scores for ``align_predictions``, or
                label-string sequences that are already aligned.

        Returns:
            dict with macro "Precision", "Recall" and "F1", the parsed
            "classification" DataFrame, and the flattened labels under "output".
        """
        try:
            truth_list, pred_list = self.align_predictions(predictions, true_labels)
        except Exception:
            # Alignment fails on inputs that are not raw scores/ids; those are
            # treated as already aligned. Catching Exception (rather than a bare
            # except) lets KeyboardInterrupt and SystemExit propagate.
            logging.info('The labels already aligned, proceed with evaluation')
            truth_list, pred_list = true_labels, predictions

        flat_truth = [item for sublist in truth_list for item in sublist]
        flat_preds = [item for sublist in pred_list for item in sublist]
        report = skl_classification(y_true=flat_truth, y_pred=flat_preds, digits=4)
        report = self.create_classification_report(report)
        cleaned_report = self.clean_report(report)
        return {
            "Precision": skl_precision(
                y_true=flat_truth, y_pred=flat_preds, average="macro"
            ),
            "Recall": skl_recall(
                y_true=flat_truth, y_pred=flat_preds, average="macro"
            ),
            "F1": skl_f1(
                y_true=flat_truth, y_pred=flat_preds, average="macro"
            ),
            "classification": cleaned_report,
            "output": {"y_true": flat_truth, "y_pred": flat_preds},
        }

    def clean_report(self, report):
        """Normalise the "accuracy" row of a parsed report.

        The accuracy line of the text report carries fewer numbers than the
        per-tag lines, so after whitespace splitting its value sits in the
        'Precision' column and its support in the 'Recall' column. This spreads
        the accuracy over Precision/Recall/F1, moves the support into 'Support'
        and renames the row to "accuracy/micro".

        Args:
            report: DataFrame produced by ``create_classification_report``.

        Returns:
            A modified copy; the input frame is left untouched.
        """
        report = report.copy()
        mask = report["Tag"] == "accuracy"
        accuracy_row = report[mask]
        if not accuracy_row.empty:
            # The accuracy value was parsed into the 'Precision' column ...
            accuracy_value = accuracy_row["Precision"].values[0]
            # ... and the support count into the 'Recall' column.
            accuracy_support = accuracy_row["Recall"].values[0]

            report.loc[mask, "Precision"] = accuracy_value
            report.loc[mask, "Recall"] = accuracy_value
            report.loc[mask, "F1"] = accuracy_value
            report.loc[mask, "Support"] = accuracy_support

            # Rename the tag from 'accuracy' to 'accuracy/micro' for clarity
            report.loc[mask, "Tag"] = "accuracy/micro"
        return report
    
class EntityEvaluationStrategy(EvaluationStrategy):
    """Entity-level evaluation with seqeval, in strict and in default mode."""

    def compute_metrics(self, true_labels, predictions, entity_config):
        """Evaluate entity spans in strict and non-strict mode.

        Args:
            true_labels: Gold label ids to be aligned with ``predictions`` via
                ``align_predictions``, or label-string sequences that are
                already aligned.
            predictions: Model scores for ``align_predictions``, or
                label-string sequences that are already aligned.
            entity_config: Mapping that may hold a 'scheme' key naming one of
                ``VALID_SCHEMES``. ``None`` is treated like an empty mapping.

        Returns:
            dict with the "strict" and "non_strict" metric dicts and the
            aligned sequences under "output".
        """
        # No default is supplied: a missing key yields None, which makes the
        # strict evaluation auto-detect the scheme.
        scheme = (entity_config or {}).get('scheme')

        try:
            truth_list, pred_list = self.align_predictions(predictions, true_labels)
        except Exception:
            # Alignment fails on inputs that are not raw scores/ids; those are
            # treated as already aligned. Catching Exception (rather than a bare
            # except) lets KeyboardInterrupt and SystemExit propagate.
            logging.info('The labels already aligned, proceed with evaluation')
            truth_list, pred_list = true_labels, predictions

        strict_outputs = self._evaluate_strict(truth_list, pred_list, scheme)
        non_strict_outputs = self._evaluate_non_strict(truth_list, pred_list)

        return {
            "strict": strict_outputs,
            "non_strict": non_strict_outputs,
            "output": {"y_true": truth_list, "y_pred": pred_list}
        }

    def _evaluate_strict(self, truth_list, pred_list, scheme):
        """Score with seqeval's strict mode under ``scheme``.

        Args:
            truth_list: Gold label-string sequences.
            pred_list: Predicted label-string sequences.
            scheme: Key of ``VALID_SCHEMES``; any other value (including None)
                makes the scheme be auto-detected.

        Returns:
            dict with micro "Precision", "Recall", "F1" and the parsed
            "classification" report.
        """
        if scheme is not None and scheme in VALID_SCHEMES:
            scheme_class = VALID_SCHEMES[scheme]
            report_kwargs = {"mode": "strict", "scheme": scheme_class}
        else:
            if scheme is None:
                logging.info("The scheme is unspecified; seqeval will auto-detect the scheme.")
            else:
                logging.warning(
                    "Unknown scheme %r; seqeval will auto-detect the scheme.", scheme
                )
            # Detect from the gold labels: predictions may lack some prefixes
            # (e.g. a model that never emits an I- tag), which would make
            # detection fail or pick a scheme the annotation does not use.
            scheme_class = auto_detect(truth_list, False)
            # The report is still called without a scheme, as before.
            report_kwargs = {"mode": "strict"}

        score_kwargs = {"mode": "strict", "scheme": scheme_class}
        return self._summarise(truth_list, pred_list, report_kwargs, score_kwargs)

    def _evaluate_non_strict(self, truth_list, pred_list,):
        """Score with seqeval's default mode.

        Returns:
            dict with micro "Precision", "Recall", "F1" and the parsed
            "classification" report.
        """
        return self._summarise(truth_list, pred_list, {}, {})

    def _summarise(self, truth_list, pred_list, report_kwargs, score_kwargs):
        """Run the seqeval report and the three micro-averaged scores once."""
        report = seq_classification(
            y_true=truth_list, y_pred=pred_list, digits=4, **report_kwargs
        )
        precision = seq_precision(
            y_true=truth_list, y_pred=pred_list, average="micro", **score_kwargs
        )
        recall = seq_recall(
            y_true=truth_list, y_pred=pred_list, average="micro", **score_kwargs
        )
        f1 = seq_f1(
            y_true=truth_list, y_pred=pred_list, average="micro", **score_kwargs
        )
        return {
            "Precision": precision,
            "Recall": recall,
            "F1": f1,
            "classification": self.create_classification_report(report),
        }



class Evaluation:
    """Runs token-level and entity-level evaluation over one set of predictions
    and reshapes the raw metrics into result tables."""

    def __init__(self, inv_map, y_true, y_pred, evaluation_config):
        """Keep the inputs and build one strategy object per evaluation level."""
        self.truths = y_true
        self.predictions = y_pred
        self.evaluation_config = evaluation_config
        self.token_strategy = TokenEvaluationStrategy(inv_map)
        self.entity_strategy = EntityEvaluationStrategy(inv_map)

    def evaluate(self):
        """Return the raw metrics of both strategies, keyed by level."""
        return {
            "Token_Level": self.token_strategy.compute_metrics(
                self.truths, self.predictions
            ),
            "Entity_Level": self.entity_strategy.compute_metrics(
                self.truths, self.predictions, self.evaluation_config
            ),
        }

    def _prepare_results(self, metrics):
        """Split token-level metrics into (summary table, report, outputs)."""
        summary_table = pd.DataFrame.from_dict(self._round_and_slice(metrics))
        return summary_table, metrics["classification"], metrics["output"]

    def _prepare_entity_results(self, metrics):
        """Build the summary tables and collect the reports for both entity modes."""
        strict_metrics, loose_metrics = metrics["strict"], metrics["non_strict"]
        strict_table = pd.DataFrame.from_dict(self._round_and_slice(strict_metrics))
        loose_table = pd.DataFrame.from_dict(self._round_and_slice(loose_metrics))
        return {
            'entity_strict_results': strict_table,
            'entity_non_strict_results': loose_table,
            'entity_strict_report': strict_metrics['classification'],
            'entity_non_strict_report': loose_metrics['classification'],
            'output': metrics["output"],
        }

    def _round_and_slice(self, dictionary):
        """Keep only Precision/Recall/F1, rounded to 4 places and wrapped in
        one-element lists so the result converts to a single-row DataFrame."""
        summary = {}
        for name in ("Precision", "Recall", "F1"):
            summary[name] = [round(dictionary[name], 4)]
        return summary

    def generate_results(self):
        """Evaluate and return every table, report and output in one flat dict."""
        metrics = self.evaluate()
        token_results, token_report, token_outputs = self._prepare_results(
            metrics["Token_Level"]
        )
        entity_level = self._prepare_entity_results(metrics["Entity_Level"])

        collected = {
            "token_results": token_results,
            "token_report": token_report,
            "token_outputs": token_outputs,
        }
        for name in (
            "entity_strict_results",
            "entity_non_strict_results",
            "entity_strict_report",
            "entity_non_strict_report",
        ):
            collected[name] = entity_level[name]
        collected["entity_outputs"] = entity_level['output']
        return collected


@dataclass
class Metrics:
    """Container for the tables, reports and raw outputs of one evaluation run."""

    token_results: pd.DataFrame = field(default_factory=pd.DataFrame)
    token_report: pd.DataFrame = field(default_factory=pd.DataFrame)
    token_outputs: dict = field(default_factory=dict)
    entity_strict_results: pd.DataFrame = field(default_factory=pd.DataFrame)
    entity_non_strict_results: pd.DataFrame = field(default_factory=pd.DataFrame)
    entity_strict_report: pd.DataFrame = field(default_factory=pd.DataFrame)
    entity_non_strict_report: pd.DataFrame = field(default_factory=pd.DataFrame)
    entity_outputs: dict = field(default_factory=dict)

    @staticmethod
    def from_dict(data: dict):
        """Create an instance from a dictionary.

        Accepts both the in-memory form (DataFrame values) and the serialised
        form produced by ``to_dict`` (lists of records), so
        ``Metrics.from_dict(m.to_dict())`` yields a usable instance.

        Args:
            data: Mapping holding every field of the dataclass.

        Returns:
            A ``Metrics`` instance whose table fields are DataFrames.

        Raises:
            ValueError: If one or more required keys are missing.
        """
        required_keys = [
            "token_results", "token_report", "token_outputs",
            "entity_strict_results", "entity_non_strict_results",
            "entity_strict_report", "entity_non_strict_report", "entity_outputs"
        ]
        missing_keys = [key for key in required_keys if key not in data]
        if missing_keys:
            raise ValueError(f"Missing required keys in data: {missing_keys}")

        # Fields that to_dict() flattens into record lists; rebuild them here.
        frame_keys = (
            "token_results", "token_report",
            "entity_strict_results", "entity_non_strict_results",
            "entity_strict_report", "entity_non_strict_report",
        )
        # Work on a shallow copy so the caller's mapping is not modified.
        normalised = dict(data)
        for key in frame_keys:
            value = normalised[key]
            if not isinstance(value, pd.DataFrame):
                normalised[key] = pd.DataFrame(value)
        return Metrics(**normalised)

    def to_dict(self):
        """Return a JSON-friendly dict: tables become lists of row records."""
        return {
            "token_results": self.token_results.to_dict(orient="records"),
            "token_report": self.token_report.to_dict(orient="records"),
            "token_outputs": self.token_outputs,
            "entity_strict_results": self.entity_strict_results.to_dict(orient="records"),
            "entity_non_strict_results": self.entity_non_strict_results.to_dict(orient="records"),
            "entity_strict_report": self.entity_strict_report.to_dict(orient="records"),
            "entity_non_strict_report": self.entity_non_strict_report.to_dict(orient="records"),
            "entity_outputs": self.entity_outputs,
        }
def combine_results(metrics, strict_label="IOB2"):
    """Stack the three summary tables into one frame with a "Type" column.

    Args:
        metrics: Object exposing ``entity_non_strict_results``,
            ``entity_strict_results`` and ``token_results`` DataFrames.
        strict_label: Value written to "Type" for the strict entity row.
            Defaults to "IOB2", the label that used to be hard-coded; pass the
            scheme actually used when it differs.

    Returns:
        DataFrame with the rows in the order non-strict, strict, token and a
        fresh 0..n-1 index. The input frames are copied, not modified.
    """
    entity_results = metrics.entity_non_strict_results.copy()
    entity_strict_results = metrics.entity_strict_results.copy()
    token_results = metrics.token_results.copy()
    entity_results["Type"] = "Non Strict"
    entity_strict_results["Type"] = strict_label
    token_results["Type"] = "Token"
    df_combined = pd.concat(
        [entity_results, entity_strict_results, token_results]
    ).reset_index(drop=True)
    return df_combined

In [None]:
import json
import pandas as pd
from pathlib import Path

def run_evaluation(dataset, model, base_dir=None, corpus_file=None, evaluation_config=None):
    """Re-evaluate the saved predictions of one experiment and rewrite its reports.

    Loads ``y_true``/``y_pred`` from the experiment's ``evaluation_metrics.json``,
    backs up the existing report files, recomputes all metrics and writes the
    new reports as JSON-lines files into the experiment's results folder.

    Args:
        dataset: Corpus name; also the key into ``corpora.json``.
        model: Model name; ``f'{dataset}_{model}'`` names the experiment folder.
        base_dir: Folder holding the experiment folders. Defaults to the
            original hard-coded BaseLineExperiment location.
        corpus_file: Path of ``corpora.json``. Defaults to the original
            hard-coded location.
        evaluation_config: Config passed to ``Evaluation``. Defaults to
            ``{'scheme': 'IOB2'}``.

    Notes:
        A backup is a byte-for-byte copy and is created only if it does not
        exist yet, so running this function again cannot replace the original
        reports in the backup with already-regenerated ones.
    """
    if base_dir is None:
        base_dir = Path('/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment')
    if corpus_file is None:
        corpus_file = Path('/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/ExperimentData/corpora.json')
    if evaluation_config is None:
        evaluation_config = {'scheme': 'IOB2'}

    variant = f'{dataset}_{model}'
    variant_folder = Path(base_dir) / variant

    # File paths
    evaluation_metrics_file = variant_folder / 'fine_tuning/evaluation_metrics.json'

    # Load evaluation metrics and corpus (JSON is UTF-8; do not rely on the
    # platform default encoding)
    with open(evaluation_metrics_file, 'r', encoding='utf-8') as file:
        entity_outputs = json.load(file)
    with open(corpus_file, 'r', encoding='utf-8') as file:
        corpora = json.load(file)

    # More paths based on the model folder
    results_folder = variant_folder / 'extractions/results'

    # Keep a copy of the reports that are about to be superseded
    for filename in ('token_report.json', 'entity_report.json', 'results.json'):
        original_path = results_folder / filename
        backup_path = results_folder / f"{Path(filename).stem}-backup.json"
        if backup_path.exists():
            print(f"Backup {backup_path} already exists; keeping it")
            continue
        if not original_path.exists():
            print(f"No {filename} found in {results_folder}; nothing to back up")
            continue
        backup_path.write_bytes(original_path.read_bytes())
        print(f"Backed up {filename} to {backup_path}")

    # Extract necessary data for evaluation
    y_true = entity_outputs['entity_outputs']['y_true']
    y_pred = entity_outputs['entity_outputs']['y_pred']
    corpus = corpora[dataset]
    labels_map = corpus["labels_map"]
    inv_labels_map = {v: k for k, v in labels_map.items()}

    # Recompute every metric from the stored label sequences
    evaluator = Evaluation(
        inv_labels_map, y_true, y_pred, evaluation_config
    )
    results = evaluator.generate_results()
    metrics = Metrics.from_dict(results)

    savings = {
        'token_report': metrics.token_report,
        'entity_non_strict_report': metrics.entity_non_strict_report,
        'entity_strict_report': metrics.entity_strict_report,
        'results': combine_results(metrics)  # one row per evaluation type
    }
    # Save updated reports
    for key, report_df in savings.items():
        if report_df.empty:
            continue  # Skip if no data, or adapt as necessary
        report_path = results_folder / f"{key}.json"
        report_df.to_json(report_path, lines=True, orient='records')
        print(f"Saved updated {key} to {report_path}")

In [None]:
# Regenerate the reports of the conll2003 / bert experiment.
dataset_name, model_name = 'conll2003', 'bert'

run_evaluation(dataset=dataset_name, model=model_name)


Backed up token_report.json to /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/conll2003_bert/extractions/results/token_report-backup.json
Backed up entity_report.json to /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/conll2003_bert/extractions/results/entity_report-backup.json
Backed up results.json to /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/conll2003_bert/extractions/results/results-backup.json
Saved updated token_report to /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/conll2003_bert/extractions/results/token_report.json
Saved updated entity_non_strict_

In [None]:
# Regenerate the reports of the ANERCorp_CamelLab / arabertv02 experiment.
# The two names used to hold each other's values and were swapped back in the
# call; they now say what they contain and are passed in (dataset, model) order.
dataset_name = 'ANERCorp_CamelLab'
model_name = 'arabertv02'

run_evaluation(dataset_name, model_name)


Backed up token_report.json to /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/ANERCorp_CamelLab_arabertv02/extractions/results/token_report-backup.json
Backed up entity_report.json to /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/ANERCorp_CamelLab_arabertv02/extractions/results/entity_report-backup.json
Backed up results.json to /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/ANERCorp_CamelLab_arabertv02/extractions/results/results-backup.json
Saved updated token_report to /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/ANERCorp_CamelLab_arabertv02/extractions/re

In [33]:
# Results folder of the conll2003_bert experiment; the report files compared in
# the following cells are read from here.
base_test = Path('/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/conll2003_bert/extractions/results')

In [35]:

# Backed-up entity report (written before regeneration).
scrapped = pd.read_json(base_test / 'entity_report-backup.json', lines=True)

# Regenerated non-strict entity report.
entity = pd.read_json(base_test / 'entity_non_strict_report.json', lines=True)

# Regenerated strict entity report.
strict = pd.read_json(base_test / 'entity_strict_report.json', lines=True)




In [41]:
# Row-by-row check: does the regenerated non-strict F1 equal the backed-up F1
# once the latter is rounded to 4 decimals?
entity['F1'] == scrapped['F1'].round(4)

0    True
1    True
2    True
3    True
4    True
5    True
6    True
Name: F1, dtype: bool

In [None]:
# Display the F1 column of the regenerated non-strict entity report.
entity['F1']

Unnamed: 0,Tag,Precision,Recall,F1,Support
0,LOC,0.9206,0.9317,0.9261,1668
1,MISC,0.7907,0.8234,0.8067,702
2,ORG,0.8886,0.9073,0.8978,1661
3,PER,0.9604,0.9604,0.9604,1617
4,micro,0.9058,0.9193,0.9125,5648
5,macro,0.8901,0.9057,0.8978,5648
6,weighted,0.9064,0.9193,0.9128,5648


# loose

In [None]:
import json
from pathlib import Path
corpus_name = 'conll2003'
model_name = 'bert'
variant = f'{corpus_name}_{model_name}'

file_name = f'/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/{variant}/fine_tuning/evaluation_metrics.json'
corpus_file = '/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/ExperimentData/corpora.json'
with open(file_name, 'r', encoding='utf-8') as file:
    entity_outputs = json.load(file)  # Use json.load() to read file, not json.loads()

with open(corpus_file, 'r', encoding='utf-8') as file:
    corpora = json.load(file)  # Use json.load() to read file, not json.loads()


# The results folder is derived from the same `variant` as the metrics file
# above. It used to be hard-wired to ANERCorp_CamelLab_arabertv02, so the
# reports of one experiment could be written into another experiment's folder.
base_folder = Path(f'/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/{variant}/extractions/results')
original_token_report_path = base_folder / 'token_report.json'
scrapped_token_report_path = base_folder / 'token_report-backup.json'
original_entity_report_path = base_folder / 'entity_report.json'
scrapped_entity_report_path = base_folder / 'entity_report-backup.json'
# NOTE(review): run_evaluation writes the strict report as
# 'entity_strict_report.json'; confirm which file name downstream readers expect.
strict_entity_report_path = base_folder / 'strict_entity_report.json'
original_results_path = base_folder / 'results.json'
scrapped_results_path = base_folder / 'results-backup.json'

In [26]:
# Stored gold and predicted label sequences of the fine-tuned model.
y_true = entity_outputs['entity_outputs']['y_true']
y_pred = entity_outputs['entity_outputs']['y_pred']

# Invert the corpus label map so that ids map back to label strings.
corpus = corpora[corpus_name]
labels_map = corpus["labels_map"]
inv_labels_map = dict((index, label) for label, index in labels_map.items())

evaluation_config = dict(scheme='IOB2')


In [27]:
# Run both evaluation levels and wrap the result tables in a Metrics object.
evaluator = Evaluation(
    inv_map=inv_labels_map,
    y_true=y_true,
    y_pred=y_pred,
    evaluation_config=evaluation_config,
)
results = evaluator.generate_results()
metrics = Metrics.from_dict(results)

In [28]:
# Overall Precision/Recall/F1 of the non-strict, strict and token-level
# evaluations, one row each.
combine_results(metrics)

Unnamed: 0,Precision,Recall,F1,Type
0,0.9058,0.9193,0.9125,Non Strict
1,0.9126,0.9184,0.9155,IOB2
2,0.8945,0.9156,0.9047,Token


In [31]:
# Per-tag token-level report.
metrics.token_report

Unnamed: 0,Tag,Precision,Recall,F1,Support
0,B-LOC,0.9287,0.9371,0.9329,1668
1,B-MISC,0.8296,0.839,0.8343,702
2,B-ORG,0.9082,0.9169,0.9125,1661
3,B-PER,0.9665,0.9647,0.9656,1617
4,I-LOC,0.8495,0.9222,0.8843,257
5,I-MISC,0.688,0.7454,0.7156,216
6,I-ORG,0.8962,0.9305,0.913,835
7,I-PER,0.9871,0.9905,0.9888,1156
8,O,0.9968,0.9939,0.9953,38323
9,accuracy/micro,0.9829,0.9829,0.9829,46435


In [30]:
# Per-entity-type report from the strict evaluation.
metrics.entity_strict_report

Unnamed: 0,Tag,Precision,Recall,F1,Support
0,LOC,0.9234,0.9317,0.9275,1668
1,MISC,0.8099,0.8191,0.8144,702
2,ORG,0.8974,0.9061,0.9017,1661
3,PER,0.9622,0.9604,0.9613,1617
4,micro,0.9126,0.9184,0.9155,5648
5,macro,0.8982,0.9043,0.9012,5648
6,weighted,0.9127,0.9184,0.9155,5648


In [29]:
# Per-entity-type report from the non-strict evaluation.
metrics.entity_non_strict_report

Unnamed: 0,Tag,Precision,Recall,F1,Support
0,LOC,0.9206,0.9317,0.9261,1668
1,MISC,0.7907,0.8234,0.8067,702
2,ORG,0.8886,0.9073,0.8978,1661
3,PER,0.9604,0.9604,0.9604,1617
4,micro,0.9058,0.9193,0.9125,5648
5,macro,0.8901,0.9057,0.8978,5648
6,weighted,0.9064,0.9193,0.9128,5648


In [6]:
# Back up the existing reports. The *_path variables already include
# base_folder, so they are used as they are: joining them onto base_folder
# again only worked because pathlib drops the left operand when the right one
# is absolute.
token_report = pd.read_json(original_token_report_path, lines=True)
token_report.to_json(scrapped_token_report_path, lines=True, orient='records')

entity_report = pd.read_json(original_entity_report_path, lines=True)
entity_report.to_json(scrapped_entity_report_path, lines=True, orient='records')

results = pd.read_json(original_results_path, lines=True)
results.to_json(scrapped_results_path, lines=True, orient='records')

In [None]:
# Write the regenerated reports. The *_path variables already include
# base_folder, so they are used as they are: joining them onto base_folder
# again only worked because pathlib drops the left operand when the right one
# is absolute.
combine_results(metrics).to_json(
    original_results_path, lines=True, orient='records'
)
metrics.token_report.to_json(
    original_token_report_path, lines=True, orient='records'
)

metrics.entity_non_strict_report.to_json(
    original_entity_report_path, lines=True, orient='records'
)
metrics.entity_strict_report.to_json(
    strict_entity_report_path, lines=True, orient='records'
)


In [None]:
# NOTE(review): this cell performs the same four writes as the cell before it;
# one of the two can probably be removed.
# The *_path variables already include base_folder, so they are used as they
# are: joining them onto base_folder again only worked because pathlib drops
# the left operand when the right one is absolute.
combine_results(metrics).to_json(
    original_results_path, lines=True, orient='records'
)
metrics.token_report.to_json(
    original_token_report_path, lines=True, orient='records'
)

metrics.entity_non_strict_report.to_json(
    original_entity_report_path, lines=True, orient='records'
)
metrics.entity_strict_report.to_json(
    strict_entity_report_path, lines=True, orient='records'
)