In [35]:
import json
from pathlib import Path
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from collections import Counter

class TokenConfusion:
    def __init__(self, token_outputs):
        """
        Initialize the TokenConfusion class.
        
        Args:
            token_outputs (dict): A dictionary containing 'y_true' and 'y_pred' lists.
        """
        self.y_true = token_outputs['y_true']
        self.y_pred = token_outputs['y_pred']
        self.tags = list(set(self.y_true) | set(self.y_pred))
        self.cm = confusion_matrix(self.y_true, self.y_pred, labels=self.tags)
        self.cm_df = pd.DataFrame(self.cm, index=self.tags, columns=self.tags)
        self.zero_diagonal_cm_df = self.cm_df.copy()
        np.fill_diagonal(self.zero_diagonal_cm_df.values, 0)  # Zero out the diagonal elements

    def generate_confusion_matrix(self):
        """
        Generate and process the confusion matrix to remove diagonal elements.
        """
        cm = confusion_matrix(self.y_true, self.y_pred, labels=self.tags)
        cm_df = pd.DataFrame(cm, index=self.tags, columns=self.tags)
        np.fill_diagonal(cm_df.values, 0)  # Zero out the diagonal elements
        self.misclassification_matrix = cm_df

    def analyze_errors(self):
        """
        Analyze the confusion matrix to calculate TP, FP, FN, and TN for each tag.
        Also detail the breakdown of FPs and FNs.
        """
        results = {
            'confusion_matrix': {},
            'false_negatives': {tag: Counter() for tag in self.tags},
            'false_positives': {tag: Counter() for tag in self.tags}
        }
        # cm = confusion_matrix(self.y_true, self.y_pred, labels=self.tags)
        # cm_df = pd.DataFrame(cm, index=self.tags, columns=self.tags)
        # np.fill_diagonal(cm_df.values, 0)  # Zero out the diagonal elements
        # diagonal_cm_df = pd.DataFrame(cm.copy(), index=self.tags, columns=self.tags)
        # print(diagonal_cm_df)
        

        # Populate results
        for tag in self.tags:
            TP = int(self.cm_df.at[tag, tag])
            FP = int(self.zero_diagonal_cm_df[tag].sum())
            FN = int(self.zero_diagonal_cm_df.loc[tag].sum())
            TN = int(self.cm_df.values.sum()) - (TP + FP + FN)
            results['confusion_matrix'][tag] = {'TP': TP, 'FP': FP, 'FN': FN, 'TN': TN}

            # Analyzing false negatives
            for i, actual_tag in enumerate(self.tags):
                if actual_tag != tag:
                    FN_count = int(self.cm_df.at[actual_tag, tag])
                    FN_count = int(self.cm_df.at[actual_tag, tag])
                    if FN_count > 0:
                        results['false_negatives'][tag][actual_tag] += FN_count

            # Analyzing false positives
            for j, pred_tag in enumerate(self.tags):
                if pred_tag != tag:
                    FP_count = int(self.cm_df.at[tag, pred_tag])
                    FP_count = int(self.cm_df.at[tag, pred_tag])
                    if FP_count > 0:
                        results['false_positives'][tag][pred_tag] += FP_count

        self.results = results

    def get_results(self):
        """
        Return the analysis results.
        
        Returns:
            dict: A dictionary containing the confusion matrix, and details on false negatives and false positives.
        """
        self.generate_confusion_matrix()
        self.analyze_errors()
        return self.results
    
    def get_misclassification(self):
        """
        Return the analysis results.
        
        Returns:
            dict: A dictionary containing the confusion matrix, and details on false negatives and false positives.
        """
        return self.misclassification_matrix


def save_json(file_path, data):
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)


def process_token_confusion(model_name, dataset_name):
    base_folder = Path(f'/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/{dataset_name}_{model_name}')
    evaluation_metrics_file = base_folder / 'fine_tuning/evaluation_metrics.json'
    token_confusion_matrix_path = base_folder / "extractions/results/token_confusion_matrix.json"
    token_misclassifications_path = base_folder / "extractions/results/token_misclassifications.json"

    with open(evaluation_metrics_file, 'r') as file:
        output = json.load(file)
    token_outputs = output['token_outputs']

    confusion = TokenConfusion(token_outputs)
    results = confusion.get_results()
    misclassification = confusion.get_misclassification()
    save_json(token_confusion_matrix_path, results)
    misclassification.to_json(token_misclassifications_path, orient='index')
    import pprint
    pprint.pprint(results)
    pprint.pprint(misclassification)

In [36]:
model_name = 'arabertv02'
dataset_name = 'ANERCorp_CamelLab'
process_token_confusion(model_name, dataset_name)

{'confusion_matrix': {'B-LOC': {'FN': 35, 'FP': 66, 'TN': 24257, 'TP': 633},
                      'B-MISC': {'FN': 76, 'FP': 34, 'TN': 24722, 'TP': 159},
                      'B-ORG': {'FN': 95, 'FP': 76, 'TN': 24465, 'TP': 355},
                      'B-PERS': {'FN': 104, 'FP': 88, 'TN': 24045, 'TP': 754},
                      'I-LOC': {'FN': 15, 'FP': 16, 'TN': 24892, 'TP': 68},
                      'I-MISC': {'FN': 97, 'FP': 15, 'TN': 24811, 'TP': 68},
                      'I-ORG': {'FN': 76, 'FP': 49, 'TN': 24667, 'TP': 199},
                      'I-PERS': {'FN': 64, 'FP': 51, 'TN': 24299, 'TP': 577},
                      'O': {'FN': 144, 'FP': 311, 'TN': 3064, 'TP': 21472}},
 'false_negatives': {'B-LOC': Counter({'B-ORG': 19,
                                       'O': 14,
                                       'I-MISC': 12,
                                       'B-PERS': 8,
                                       'I-LOC': 7,
                                       'B-MISC':

In [37]:
model_name = 'bert'
dataset_name = 'conll2003'
process_token_confusion(model_name, dataset_name)

{'confusion_matrix': {'B-LOC': {'FN': 105, 'FP': 120, 'TN': 44647, 'TP': 1563},
                      'B-MISC': {'FN': 113, 'FP': 121, 'TN': 45612, 'TP': 589},
                      'B-ORG': {'FN': 138, 'FP': 154, 'TN': 44620, 'TP': 1523},
                      'B-PER': {'FN': 57, 'FP': 54, 'TN': 44764, 'TP': 1560},
                      'I-LOC': {'FN': 20, 'FP': 42, 'TN': 46136, 'TP': 237},
                      'I-MISC': {'FN': 55, 'FP': 73, 'TN': 46146, 'TP': 161},
                      'I-ORG': {'FN': 58, 'FP': 90, 'TN': 45510, 'TP': 777},
                      'I-PER': {'FN': 11, 'FP': 15, 'TN': 45264, 'TP': 1145},
                      'O': {'FN': 235, 'FP': 123, 'TN': 7989, 'TP': 38088}},
 'false_negatives': {'B-LOC': Counter({'B-ORG': 57,
                                       'B-MISC': 25,
                                       'O': 24,
                                       'B-PER': 12,
                                       'I-ORG': 2}),
                     'B-MISC': Counte

# Loose 

In [112]:
model_name = 'bert'
dataset_name = 'conll2003'
base_folder = Path(f'/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/{dataset_name}_{model_name}')
token_confusion_matrix_path = base_folder / "extractions/results/token_confusion_matrix.json"
with open(token_confusion_matrix_path, 'r') as file:
	output = json.load(file)  # Use json.load() to read file, not json.loads()

In [113]:
output

{'confusion_matrix': {'I-ORG': {'TP': 0, 'FP': 49, 'FN': 76, 'TN': 581},
  'B-PERS': {'TP': 0, 'FP': 88, 'FN': 104, 'TN': 514},
  'B-MISC': {'TP': 0, 'FP': 34, 'FN': 76, 'TN': 596},
  'I-MISC': {'TP': 0, 'FP': 15, 'FN': 97, 'TN': 594},
  'B-ORG': {'TP': 0, 'FP': 76, 'FN': 95, 'TN': 535},
  'I-PERS': {'TP': 0, 'FP': 51, 'FN': 64, 'TN': 591},
  'I-LOC': {'TP': 0, 'FP': 16, 'FN': 15, 'TN': 675},
  'O': {'TP': 0, 'FP': 311, 'FN': 144, 'TN': 251},
  'B-LOC': {'TP': 0, 'FP': 66, 'FN': 35, 'TN': 605}},
 'false_negatives': {'I-ORG': {'B-PERS': 2,
   'I-MISC': 14,
   'B-ORG': 1,
   'I-PERS': 5,
   'O': 25,
   'B-LOC': 2},
  'B-PERS': {'I-ORG': 2,
   'I-MISC': 1,
   'B-ORG': 15,
   'I-PERS': 35,
   'O': 34,
   'B-LOC': 1},
  'B-MISC': {'B-PERS': 4,
   'I-MISC': 5,
   'B-ORG': 7,
   'I-PERS': 1,
   'O': 14,
   'B-LOC': 3},
  'I-MISC': {'I-ORG': 2, 'B-PERS': 1, 'B-MISC': 1, 'I-PERS': 1, 'O': 10},
  'B-ORG': {'I-ORG': 9,
   'B-PERS': 12,
   'B-MISC': 10,
   'I-MISC': 2,
   'I-PERS': 1,
   'O': 39,


In [73]:
model_name = 'arabertv02'
dataset_name = 'ANERCorp_CamelLab'
base_folder = Path(f'/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/{dataset_name}_{model_name}')
evaluation_metrics_file = base_folder / 'fine_tuning/evaluation_metrics.json'
with open(evaluation_metrics_file, 'r') as file:
	output = json.load(file)  # Use json.load() to read file, not json.loads()
token_outputs = output['token_outputs']
confusion = TokenConfusion(token_outputs)
confusion.generate_confusion_matrix()
confusion.analyze_errors()
results = confusion.get_results()
misclassification = confusion.get_misclassification()

import pprint
pprint.pprint(results)
pprint.pprint(misclassification)


{'confusion_matrix': {'B-LOC': {'FN': 35, 'FP': 66, 'TN': 605, 'TP': 0},
                      'B-MISC': {'FN': 76, 'FP': 34, 'TN': 596, 'TP': 0},
                      'B-ORG': {'FN': 95, 'FP': 76, 'TN': 535, 'TP': 0},
                      'B-PERS': {'FN': 104, 'FP': 88, 'TN': 514, 'TP': 0},
                      'I-LOC': {'FN': 15, 'FP': 16, 'TN': 675, 'TP': 0},
                      'I-MISC': {'FN': 97, 'FP': 15, 'TN': 594, 'TP': 0},
                      'I-ORG': {'FN': 76, 'FP': 49, 'TN': 581, 'TP': 0},
                      'I-PERS': {'FN': 64, 'FP': 51, 'TN': 591, 'TP': 0},
                      'O': {'FN': 144, 'FP': 311, 'TN': 251, 'TP': 0}},
 'false_negatives': {'B-LOC': Counter({'B-ORG': 19,
                                       'O': 14,
                                       'I-MISC': 12,
                                       'B-PERS': 8,
                                       'I-LOC': 7,
                                       'B-MISC': 4,
                               

In [75]:
model_name = 'bert'
dataset_name = 'conll2003'
base_folder = Path(f'/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/{dataset_name}_{model_name}')
evaluation_metrics_file = base_folder / 'fine_tuning/evaluation_metrics.json'
with open(evaluation_metrics_file, 'r') as file:
	output = json.load(file)  # Use json.load() to read file, not json.loads()
token_outputs = output['token_outputs']
confusion = TokenConfusion(token_outputs)
confusion.generate_confusion_matrix()
confusion.analyze_errors()
results = confusion.get_results()
misclassification = confusion.get_misclassification()

import pprint
pprint.pprint(results)
pprint.pprint(misclassification)


{'confusion_matrix': {'B-LOC': {'FN': 105, 'FP': 120, 'TN': 567, 'TP': 0},
                      'B-MISC': {'FN': 113, 'FP': 121, 'TN': 558, 'TP': 0},
                      'B-ORG': {'FN': 138, 'FP': 154, 'TN': 500, 'TP': 0},
                      'B-PER': {'FN': 57, 'FP': 54, 'TN': 681, 'TP': 0},
                      'I-LOC': {'FN': 20, 'FP': 42, 'TN': 730, 'TP': 0},
                      'I-MISC': {'FN': 55, 'FP': 73, 'TN': 664, 'TP': 0},
                      'I-ORG': {'FN': 58, 'FP': 90, 'TN': 644, 'TP': 0},
                      'I-PER': {'FN': 11, 'FP': 15, 'TN': 766, 'TP': 0},
                      'O': {'FN': 235, 'FP': 123, 'TN': 434, 'TP': 0}},
 'false_negatives': {'B-LOC': Counter({'B-ORG': 57,
                                       'B-MISC': 25,
                                       'O': 24,
                                       'B-PER': 12,
                                       'I-ORG': 2}),
                     'B-MISC': Counter({'O': 53,
                             

In [76]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.figure_factory as ff
import numpy as np
import plotly.express as px


fig = px.imshow(
    misclassification,
    labels=dict(x="Predicted label", y="True label", color="Number of Tokens"),
    x=misclassification.columns,
    y=misclassification.index,
    text_auto=True,  # Automatically add text on each cell
    color_continuous_scale='Blues'
)

# Update layout for better clarity
fig.update_layout(
    title="Confusion Matrix",
    xaxis_title="Predicted label",
    yaxis_title="True label",
    xaxis=dict(tickangle=-45),
    margin=dict(t=50, l=200),
    width=800,
    height=600
)

fig.show()


In [34]:
cm_df

Unnamed: 0,I-ORG,B-PERS,B-MISC,I-MISC,B-ORG,I-PERS,I-LOC,O,B-LOC
I-ORG,0,2,0,2,9,6,5,50,2
B-PERS,2,0,4,1,12,33,0,44,8
B-MISC,0,0,0,1,10,4,0,57,4
I-MISC,14,1,5,0,2,1,3,59,12
B-ORG,1,15,7,0,0,0,4,49,19
I-PERS,5,35,1,1,1,0,1,20,0
I-LOC,0,0,0,0,0,0,0,8,7
O,25,34,14,10,39,6,2,0,14
B-LOC,2,1,3,0,3,1,1,24,0


In [21]:
from sklearn.metrics import confusion_matrix
import numpy as np
from collections import Counter

# Calculate confusion matrix
cm = confusion_matrix(y_true, y_pred, labels=tags)
cm_df = pd.DataFrame(cm, index=tags, columns=tags)

# Structures to hold results
results = {
    'confusion_matrix': {},
    'false_negatives': {tag: Counter() for tag in tags},
    'false_positives': {tag: Counter() for tag in tags}
}
# Populate results
for tag in tags:
    TP = cm_df.loc[tag, tag]
    FP = cm_df[tag].sum() - TP
    FN = cm_df.loc[tag].sum() - TP
    TN = cm_df.sum().sum() - (TP + FP + FN)  # Total all cells - (TP + FP + FN for this tag)

    results['confusion_matrix'][tag] = {'TP': TP, 'FP': FP, 'FN': FN, 'TN': TN}
    
    # Analyzing false negatives
    for i, actual_tag in enumerate(tags):
        if actual_tag != tag:
            FN_count = cm_df.at[actual_tag, tag]
            if FN_count > 0:
                results['false_negatives'][tag][actual_tag] += FN_count
    
    # Analyzing false positives
    for j, pred_tag in enumerate(tags):
        if pred_tag != tag:
            FP_count = cm_df.at[tag, pred_tag]
            if FP_count > 0:
                results['false_positives'][tag][pred_tag] += FP_count

# Example of output
import pprint
pprint.pprint(results)

{'confusion_matrix': {'B-LOC': {'FN': 35, 'FP': 66, 'TN': 24257, 'TP': 633},
                      'B-MISC': {'FN': 76, 'FP': 34, 'TN': 24722, 'TP': 159},
                      'B-ORG': {'FN': 95, 'FP': 76, 'TN': 24465, 'TP': 355},
                      'B-PERS': {'FN': 104, 'FP': 88, 'TN': 24045, 'TP': 754},
                      'I-LOC': {'FN': 15, 'FP': 16, 'TN': 24892, 'TP': 68},
                      'I-MISC': {'FN': 97, 'FP': 15, 'TN': 24811, 'TP': 68},
                      'I-ORG': {'FN': 76, 'FP': 49, 'TN': 24667, 'TP': 199},
                      'I-PERS': {'FN': 64, 'FP': 51, 'TN': 24299, 'TP': 577},
                      'O': {'FN': 144, 'FP': 311, 'TN': 3064, 'TP': 21472}},
 'false_negatives': {'B-LOC': Counter({'B-ORG': 19,
                                       'O': 14,
                                       'I-MISC': 12,
                                       'B-PERS': 8,
                                       'I-LOC': 7,
                                       'B-MISC':

In [18]:
results


{'confusion_matrix': {'I-ORG': {'TP': 199, 'FP': 49, 'FN': 76},
  'B-PERS': {'TP': 754, 'FP': 88, 'FN': 104},
  'B-MISC': {'TP': 159, 'FP': 34, 'FN': 76},
  'I-MISC': {'TP': 68, 'FP': 15, 'FN': 97},
  'B-ORG': {'TP': 355, 'FP': 76, 'FN': 95},
  'I-PERS': {'TP': 577, 'FP': 51, 'FN': 64},
  'I-LOC': {'TP': 68, 'FP': 16, 'FN': 15},
  'O': {'TP': 21472, 'FP': 311, 'FN': 144},
  'B-LOC': {'TP': 633, 'FP': 66, 'FN': 35}},
 'false_negatives': {'I-ORG': Counter({'O': 25,
           'I-MISC': 14,
           'I-PERS': 5,
           'B-PERS': 2,
           'B-LOC': 2,
           'B-ORG': 1}),
  'B-PERS': Counter({'I-PERS': 35,
           'O': 34,
           'B-ORG': 15,
           'I-ORG': 2,
           'I-MISC': 1,
           'B-LOC': 1}),
  'B-MISC': Counter({'O': 14,
           'B-ORG': 7,
           'I-MISC': 5,
           'B-PERS': 4,
           'B-LOC': 3,
           'I-PERS': 1}),
  'I-MISC': Counter({'O': 10,
           'I-ORG': 2,
           'B-PERS': 1,
           'B-MISC': 1,
         

In [10]:
import pandas as pd
from sklearn.metrics import confusion_matrix
import numpy as np

# Example data
y_true = ['B-PER', 'I-PER', 'O', 'B-ORG', 'I-ORG', 'O', 'B-LOC', 'O', 'I-PER', 'B-LOC', 'I-PER', 'B-PER']
y_pred = ['B-PER', 'B-PER', 'O', 'B-ORG', 'I-ORG', 'O', 'B-LOC', 'B-ORG', 'O', 'O', 'O', 'O']
tags = list(set(y_true) | set(y_pred))

# Calculate confusion matrix and create DataFrame
cm = confusion_matrix(y_true, y_pred, labels=tags)
cm_df = pd.DataFrame(cm, index=tags, columns=tags)

# FP and FN analysis
fp_details = {}
fn_details = {}

for tag in tags:
    # For FP, we need to find where this tag was predicted, but wasn't the true tag
    fp_index = cm_df.columns != tag
    fp_details[tag] = cm_df.loc[tag, fp_index]

    # For FN, we need to find where this tag was true, but wasn't predicted
    fn_index = cm_df.index != tag
    fn_details[tag] = cm_df.loc[fn_index, tag]



In [9]:
fp_details

{'I-ORG': {},
 'B-ORG': {'O': 1},
 'I-PER': {},
 'O': {'I-PER': 2, 'B-LOC': 1, 'B-PER': 1},
 'B-LOC': {},
 'B-PER': {'I-PER': 1}}

In [6]:
# FP and FN detailed breakdown
fp_details = {}
fn_details = {}

for tag in tags:
    # FP: sum of the column for 'tag' excluding the diagonal
    fp_details[tag] = {pred_tag: cm_df.at[pred_tag, tag] for pred_tag in tags if pred_tag != tag and cm_df.at[pred_tag, tag] > 0}

    # FN: sum of the row for 'tag' excluding the diagonal
    fn_details[tag] = {true_tag: cm_df.at[tag, true_tag] for true_tag in tags if true_tag != tag and cm_df.at[tag, true_tag] > 0}


In [7]:
# Print the results in a more digestible format
print("False Positives Detail:")
for tag, errors in fp_details.items():
    print(f"  {tag} was incorrectly predicted as:")
    for err_tag, count in errors.items():
        print(f"    {err_tag}: {count} times")

print("\nFalse Negatives Detail:")
for tag, errors in fn_details.items():
    print(f"  {tag} was missed and incorrectly labeled as:")
    for err_tag, count in errors.items():
        print(f"    {err_tag}: {count} times")


False Positives Detail:
  I-ORG was incorrectly predicted as:
  B-ORG was incorrectly predicted as:
    O: 1 times
  I-PER was incorrectly predicted as:
  O was incorrectly predicted as:
    I-PER: 2 times
    B-LOC: 1 times
    B-PER: 1 times
  B-LOC was incorrectly predicted as:
  B-PER was incorrectly predicted as:
    I-PER: 1 times

False Negatives Detail:
  I-ORG was missed and incorrectly labeled as:
  B-ORG was missed and incorrectly labeled as:
  I-PER was missed and incorrectly labeled as:
    O: 2 times
    B-PER: 1 times
  O was missed and incorrectly labeled as:
    B-ORG: 1 times
  B-LOC was missed and incorrectly labeled as:
    O: 1 times
  B-PER was missed and incorrectly labeled as:
    O: 1 times
