In [1]:
import pandas as pd
from pathlib import Path
import logging
from seqeval.scheme import auto_detect, Entities
from seqeval.metrics.sequence_labeling import get_entities
from tqdm.auto import tqdm
tqdm.pandas()
from experiment_utils import env_setup
from experiment_utils.pipelines import DataExtractionPhase
from experiment_utils.tokenization import TokenizationWorkflowManager
from experiment_utils.analysis import DataExtractor

2024-12-12 18:17:38 - INFO - PyTorch version 2.2.2 available.


In [2]:
class EntityAnnotator:
    def __init__(self, y_true, y_pred=None):
        """Initialize the annotator with true and predicted labels."""
        self.y_true = y_true
        self.y_pred = y_pred
        self.scheme = auto_detect(self.y_true, False)
    
    def extract_entities(self):
        """Extract entities based on the scheme detected."""
        self.entities_strict_true = Entities(self.y_true, self.scheme, False)
        if self.y_pred:
            self.entities_strict_pred = Entities(self.y_pred, self.scheme, False)
            logging.info('Predictions are skipped')
        else:
            self.entities_strict_pred = None
        
        self.entities_true = get_entities(self.y_true)
        if self.y_pred:
            self.entities_pred = get_entities(self.y_pred)
            logging.info('Predictions are skipped')
        else:
            self.entities_pred = None

    def process_strict_entities(self, y_true, entities_true, sen_id):
        """Process entities strictly, labeling full spans in a sentence."""
        max_len = len(y_true[sen_id])
        results = ['O'] * max_len
        for idx in range(max_len):
            for entity in entities_true[sen_id]:
                _, t, s, e = entity.to_tuple()
                if s == idx and (e-s) > 0:
                    for i in range(e-s):
                        results[s + i] = t
                elif (e-s) == 0:
                    results[s] = t
        return results
    
    def process_non_strict_entities(self, y_true, sen_id):
        """Process entities non-strictly, marking only the start and end of each entity."""
        max_len = len(y_true[sen_id])
        results = ['O'] * max_len
        for entity in get_entities(y_true[sen_id]):
            t, s, e = entity
            if s == e:
                # If start and end are the same, only mark the start
                results[s] = t
            else:
                # Mark all indices from start to end inclusive
                for i in range(s, e + 1):
                    results[i] = t
        return results

    def process_sentences(self, analysis_data, y_data, entities, label_column, strict=False):
        """Annotate sentences with entity information, either strictly or non-strictly."""
        entity_annotations = []
        for sentence_id, sentence_df in analysis_data.groupby('sentence_ids'):
            if strict: 
                results = self.process_strict_entities(y_data, entities, sentence_id)
            else:
                results = self.process_non_strict_entities(y_data, sentence_id)
            original_series = sentence_df[label_column]
            is_metadata = original_series.apply(lambda x: x not in ['[CLS]', '[SEP]', 'IGNORED'])
            new_series = original_series.copy()
            new_series.loc[is_metadata] = results
            entity_annotations.append(new_series)
        return pd.concat(entity_annotations)

    def annotate_entity_info(self, analysis_data):
        """Add annotated entity information to the analysis data for both true and predicted labels."""
        self.extract_entities()  # Ensure entities are extracted before processing
        analysis_data['strict_true_entities'] = self.process_sentences(analysis_data, self.y_true, self.entities_strict_true.entities, 'true_labels', True)
        if self.entities_strict_pred:
            analysis_data['strict_pred_entities'] = self.process_sentences(analysis_data, self.y_pred, self.entities_strict_pred.entities, 'pred_labels', True)
        analysis_data['true_entities'] = self.process_sentences(analysis_data, self.y_true, self.entities_true, 'true_labels')
        if self.entities_pred:
            analysis_data['pred_entities'] = self.process_sentences(analysis_data, self.y_pred, self.entities_pred, 'pred_labels')
        
        # The `return analysis_data` statement in the `annotate_entity_info` method of the `EntityAnnotator` class is returning the analysis data with the added annotated entity information for both true and predicted labels. This method processes the sentences in the analysis data, annotates them with entity information either strictly or non-strictly based on the specified parameters, and then adds these annotations as new columns in the analysis data DataFrame. Finally, it returns the updated analysis data DataFrame with the added entity annotations.
        return analysis_data


def annotate_error(true_label, pred_label):
        # If both are the same, it's correct (no error)
        if true_label == pred_label:
            return "No Errors"
        
        # Handle cases where one or both labels are 'O'
        if true_label == 'O' and pred_label != 'O':
            return "Chunk"  # False entity predicted
        if true_label != 'O' and pred_label == 'O':
            return "Exclusion"  # Missed entity and chunk boundary
        
        # Extract entity types without position tags (like "B-", "I-")
        true_entity = true_label.split("-")[-1] if "-" in true_label else true_label
        pred_entity = pred_label.split("-")[-1] if "-" in pred_label else pred_label

        # If entity types are different (e.g., LOC vs. PER)
        if true_entity != pred_entity:
            # If both entity type and position (B- vs I-) are wrong
            return "Type and Chunk" if true_label[0] != pred_label[0] else "Type"

        # If entity types are the same but position tags (B- vs I-) are wrong
        return "Chunk"
    
    
    
def global_ids_from_df(df):
        return (
            df["token_ids"].astype(str)
            + "_"
            + df["sentence_ids"].astype(str)
            + "_"
            + df["token_positions"].astype(str)
            + "_"
            + df["labels"].astype(str)
        ).values

In [3]:

def process_and_update_analysis_data(dataset_name, model_name, data_type, train_meta_data):
    # Setup file paths
    base_path = Path(f"/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/{dataset_name}_{model_name}/extractions/analysis")
    original_data_path = base_path / f"{data_type}.json"
    scrapped_data_path = base_path / f"{data_type}-backup.json"
    
    # Read original data
    analysis_data = pd.read_json(original_data_path, lines=True)

    train_data = analysis_data.merge(train_meta_data, on='global_id')
    # Process data
    core_data = analysis_data[analysis_data['labels'] != -100].copy()
    y_true = core_data.groupby('sentence_ids')['true_labels'].apply(list).tolist()

    # Modify data
    annotator = EntityAnnotator(y_true)  # Assuming EntityAnnotator is defined elsewhere
    updated_analysis_data = annotator.annotate_entity_info(train_data)

    # Misalignment checking
    updated_analysis_data['true_aligned_scheme'] = updated_analysis_data['true_entities'] == updated_analysis_data['strict_true_entities']
    # updated_analysis_data['pred_aligned_scheme'] = updated_analysis_data['pred_entities'] == updated_analysis_data['strict_pred_entities']

    # Drop unused columns
    # updated_analysis_data = updated_analysis_data.drop(['tr_entity', 'pr_entity'], axis=1)
    # return updated_analysis_data

    # Annotate error types
    # updated_analysis_data['error_type'] = updated_analysis_data.apply(
    #     lambda row: annotate_error(row['true_labels'], row['pred_labels']), 
    #     axis=1
    # )

    # Backup original data
    analysis_data.to_json(scrapped_data_path, lines=True, orient='records')
    print("Original data backed up.")

    # Save updated data
    updated_analysis_data.to_json(original_data_path, lines=True, orient='records')
    print("Updated data saved.")
    
    



def extract_train_meta_data(experiment_base_folder, experiment_name, variant):
    data_extractor = DataExtractionPhase(experiment_base_folder, experiment_name, variant)

    tokenization_outputs_manager = TokenizationWorkflowManager(
                    data_extractor.data_manager.corpus, data_extractor.extraction_manager.tokenization_config
                )
    split = 'train'
    analysis_flat_data = DataExtractor(
                tokenization_outputs=tokenization_outputs_manager.get_split(split),
            )
    analysis_df = analysis_flat_data.to_df()

    vocab_map = tokenization_outputs_manager.tokenizer.vocab
    label_map = data_extractor.data_manager.labels_map

    analysis_df['token_ids'] = analysis_df['tokens'].progress_apply(lambda x: vocab_map.get(x, -1))
    analysis_df['labels'] = analysis_df['true_labels'].progress_apply(lambda x: label_map.get(x, -100))


    analysis_df["global_id"] = global_ids_from_df(analysis_df).copy()

    return analysis_df[[col for col in analysis_df.columns if col not in ['sentence_ids', 'token_positions', 'true_labels', 'token_ids',  'labels']]]



In [7]:
dataset_name = 'ANERCorp_CamelLab'
model_name = 'arabertv02'
data_type = 'train_data'
base_path = Path(f"/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/{dataset_name}_{model_name}/extractions/analysis")
original_data_path = base_path / f"{data_type}.json"
analysis_data = pd.read_json(original_data_path, lines=True)

In [4]:
base_folder = env_setup.init('My Drive', drive_mount='drive')
experiment_base_folder = base_folder / 'Final Year Experiments/Thesis-Experiments/Experiments'
experiment_name = "BaseLineExperiment-Test"


2024-12-12 18:17:38 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com


In [5]:
dataset_name = 'ANERCorp_CamelLab'
model_name = 'arabertv02'
variant = f"{dataset_name}_{model_name}"
data_type = 'train_data'
train_meta_data = extract_train_meta_data(experiment_base_folder, experiment_name, variant)
process_and_update_analysis_data(dataset_name,  model_name, data_type, train_meta_data)

2024-12-12 18:17:38 - INFO - Experiment manager set up successfully.
2024-12-12 18:17:38 - INFO - Extraction manager set up successfully.
2024-12-12 18:17:38 - INFO - Results manager set up successfully.
2024-12-12 18:17:38 - INFO - Fine Tuning manager set up successfully.
2024-12-12 18:17:38 - INFO - Tokenization Config validated successfully
2024-12-12 18:17:39 - INFO - Dataset manager set up successfully.
2024-12-12 18:17:39 - INFO - Tokenization Config validated successfully
2024-12-12 18:17:39 - INFO - Loading Tokenizer aubmindlab/bert-base-arabertv02, Lower Case False
2024-12-12 18:17:39 - INFO - Loading Preprocessor aubmindlab/bert-base-arabertv02
2024-12-12 18:17:39 - INFO - Processing train split


  0%|          | 0/4149 [00:00<?, ?it/s]

2024-12-12 18:17:53 - INFO - Extracting train subwords


  0%|          | 0/4149 [00:00<?, ?it/s]

2024-12-12 18:17:53 - INFO - Processing test split


  0%|          | 0/961 [00:00<?, ?it/s]

2024-12-12 18:17:56 - INFO - Extracting tokenization features...


  0%|          | 0/147082 [00:00<?, ?it/s]

  0%|          | 0/147082 [00:00<?, ?it/s]

Original data backed up.
Updated data saved.


In [6]:
model_name = 'bert'
dataset_name = 'conll2003'
variant = f"{dataset_name}_{model_name}"
data_type = 'train_data'
train_meta_data = extract_train_meta_data(experiment_base_folder, experiment_name, variant)
process_and_update_analysis_data(dataset_name,  model_name, data_type, train_meta_data)
train_meta_data = extract_train_meta_data(experiment_base_folder, experiment_name, variant)

2024-12-12 18:18:05 - INFO - Experiment manager set up successfully.
2024-12-12 18:18:06 - INFO - Extraction manager set up successfully.
2024-12-12 18:18:08 - INFO - Results manager set up successfully.
2024-12-12 18:18:09 - INFO - Fine Tuning manager set up successfully.
2024-12-12 18:18:09 - INFO - Tokenization Config validated successfully
2024-12-12 18:18:09 - INFO - Dataset manager set up successfully.
2024-12-12 18:18:09 - INFO - Tokenization Config validated successfully
2024-12-12 18:18:09 - INFO - Loading Tokenizer bert-base-cased, Lower Case False


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

2024-12-12 18:18:10 - INFO - Processing train split


  0%|          | 0/14041 [00:00<?, ?it/s]

2024-12-12 18:18:23 - INFO - Extracting train subwords


  0%|          | 0/14041 [00:00<?, ?it/s]

2024-12-12 18:18:23 - INFO - Processing validation split


  0%|          | 0/3250 [00:00<?, ?it/s]

2024-12-12 18:18:26 - INFO - Processing test split


  0%|          | 0/3453 [00:00<?, ?it/s]

2024-12-12 18:18:29 - INFO - Extracting tokenization features...


  0%|          | 0/300677 [00:00<?, ?it/s]

  0%|          | 0/300677 [00:00<?, ?it/s]

Original data backed up.


2024-12-12 18:18:48 - INFO - Experiment manager set up successfully.
2024-12-12 18:18:48 - INFO - Extraction manager set up successfully.
2024-12-12 18:18:48 - INFO - Results manager set up successfully.
2024-12-12 18:18:48 - INFO - Fine Tuning manager set up successfully.
2024-12-12 18:18:48 - INFO - Tokenization Config validated successfully


Updated data saved.


2024-12-12 18:18:48 - INFO - Dataset manager set up successfully.
2024-12-12 18:18:48 - INFO - Tokenization Config validated successfully
2024-12-12 18:18:48 - INFO - Loading Tokenizer bert-base-cased, Lower Case False
2024-12-12 18:18:49 - INFO - Processing train split


  0%|          | 0/14041 [00:00<?, ?it/s]

2024-12-12 18:19:01 - INFO - Extracting train subwords


  0%|          | 0/14041 [00:00<?, ?it/s]

2024-12-12 18:19:01 - INFO - Processing validation split


  0%|          | 0/3250 [00:00<?, ?it/s]

2024-12-12 18:19:04 - INFO - Processing test split


  0%|          | 0/3453 [00:00<?, ?it/s]

2024-12-12 18:19:07 - INFO - Extracting tokenization features...


  0%|          | 0/300677 [00:00<?, ?it/s]

  0%|          | 0/300677 [00:00<?, ?it/s]

# loose

In [11]:
dataset_name = 'ANERCorp_CamelLab'
model_name = 'arabertv02'
data = 'train_data'
original_data_path = Path(f"/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/{dataset_name}_{model_name}/extractions/analysis/{data}.json")
scrapped_data_path = Path(f"/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/{dataset_name}_{model_name}/extractions/analysis/{data}-backup.json")
analysis_data = pd.read_json(
	original_data_path,
	lines=True
)


In [30]:
base_folder = env_setup.init('My Drive', drive_mount='drive')
experiment_base_folder = base_folder / 'Final Year Experiments/Thesis-Experiments/Experiments'
experiment_name = "BaseLineExperiment-Test"
variant = "ANERCorp_CamelLab_arabertv02"

data_extractor = DataExtractionPhase(experiment_base_folder, experiment_name, variant)

tokenization_outputs_manager = TokenizationWorkflowManager(
                data_extractor.data_manager.corpus, data_extractor.extraction_manager.tokenization_config
            )
split = 'train'
analysis_flat_data = DataExtractor(
            tokenization_outputs=tokenization_outputs_manager.get_split(split),
        )
analysis_df = analysis_flat_data.to_df()

vocab_map = tokenization_outputs_manager.tokenizer.vocab
label_map = data_extractor.data_manager.labels_map

analysis_df['token_ids'] = analysis_df['tokens'].progress_apply(lambda x: vocab_map.get(x, -1))
analysis_df['labels'] = analysis_df['true_labels'].progress_apply(lambda x: label_map.get(x, -100))


analysis_df["global_id"] = global_ids_from_df(analysis_df).copy()

analysis_df[['global_id', 'words', 'tokens', 'core_tokens', 'word_pieces', 'token_selector_id']]


2024-12-12 17:56:44 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com
2024-12-12 17:56:44 - INFO - Experiment manager set up successfully.
2024-12-12 17:56:44 - INFO - Extraction manager set up successfully.
2024-12-12 17:56:44 - INFO - Results manager set up successfully.
2024-12-12 17:56:44 - INFO - Fine Tuning manager set up successfully.
2024-12-12 17:56:44 - INFO - Tokenization Config validated successfully
2024-12-12 17:56:44 - INFO - Dataset manager set up successfully.
2024-12-12 17:56:44 - INFO - Tokenization Config validated successfully
2024-12-12 17:56:44 - INFO - Loading Tokenizer aubmindlab/bert-base-arabertv02, Lower Case False
2024-12-12 17:56:44 - INFO - Loading Preprocessor aubmindlab/bert-base-arabertv02
2024-12-12 17:56:44 - INFO - Processing train split


  0%|          | 0/4149 [00:00<?, ?it/s]

2024-12-12 17:56:59 - INFO - Extracting train subwords


  0%|          | 0/4149 [00:00<?, ?it/s]

2024-12-12 17:57:00 - INFO - Processing test split


  0%|          | 0/961 [00:00<?, ?it/s]

2024-12-12 17:57:03 - INFO - Extracting tokenization features...


  0%|          | 0/147082 [00:00<?, ?it/s]

  0%|          | 0/147082 [00:00<?, ?it/s]

Unnamed: 0,global_id,words,tokens,core_tokens,word_pieces,token_selector_id
0,2_0_0_-100,[CLS],[CLS],[CLS],[CLS],[CLS]@#0@#0
1,19876_0_1_5,فرانكفورت,فرانكفورت,فرانكفورت,[فرانكفورت],فرانكفورت@#1@#0
2,14_0_2_0,(د,(,(,"[(, د]",(@#2@#0
3,120_0_3_-100,(د,د,IGNORED,"[(, د]",IGNORED@#3@#0
4,113_0_4_0,ب,ب,ب,[ب],ب@#4@#0
...,...,...,...,...,...,...
147077,1259_4148_28_0,مختلف,مختلف,مختلف,[مختلف],مختلف@#28@#4148
147078,4537_4148_29_0,أنحاء,أنحاء,أنحاء,[أنحاء],أنحاء@#29@#4148
147079,10776_4148_30_0,المصنع,المصنع,المصنع,[المصنع],المصنع@#30@#4148
147080,20_4148_31_0,.,.,.,[.],.@#31@#4148


In [33]:
train_meta_data = analysis_df[['global_id', 'words', 'tokens', 'core_tokens', 'word_pieces', 'token_selector_id']]

In [35]:
merge = analysis_data.merge(train_meta_data, on='global_id')

In [38]:
(merge['token_ids_x'] == merge['token_ids_y']).value_counts()

True    147082
Name: count, dtype: int64

In [13]:
core_data = analysis_data[analysis_data['labels']!= -100].copy()
y_true = core_data.groupby('sentence_ids')['true_labels'].apply(list).tolist()


In [20]:
annotator = EntityAnnotator(y_true)
updated_analysis_data = annotator.annotate_entity_info(analysis_data)


In [22]:
misalignment_mask = updated_analysis_data['true_entities'] == updated_analysis_data['strict_true_entities']
updated_analysis_data['aligned_scheme'] = misalignment_mask

In [23]:
updated_analysis_data['aligned_scheme'].value_counts()

aligned_scheme
True     147040
False        42
Name: count, dtype: int64

In [24]:
updated_analysis_data[~updated_analysis_data['aligned_scheme']]

Unnamed: 0,sentence_ids,token_positions,x,y,labels,losses,token_ids,global_id,true_labels,strict_true_entities,true_entities,aligned_scheme
3048,90,14,11.905856,14.384854,8,3.961651,810,810_90_14_8,I-MISC,O,MISC,False
17596,542,195,11.775333,17.912809,6,6.372937,549,549_542_195_6,I-LOC,O,LOC,False
24512,735,3,8.009236,19.33563,4,4.770611,6850,6850_735_3_4,I-ORG,O,ORG,False
24513,735,4,6.925725,19.213617,4,0.003018,7469,7469_735_4_4,I-ORG,O,ORG,False
24776,743,12,5.914831,23.640896,2,0.028497,11082,11082_743_12_2,I-PERS,O,PERS,False
27606,833,22,5.808229,23.061789,2,0.002659,23400,23400_833_22_2,I-PERS,O,PERS,False
27655,834,28,5.809838,23.057411,2,0.00297,23400,23400_834_28_2,I-PERS,O,PERS,False
27755,838,5,5.806953,23.055321,2,0.003825,23400,23400_838_5_2,I-PERS,O,PERS,False
27772,839,8,5.664155,23.239843,2,0.00397,39378,39378_839_8_2,I-PERS,O,PERS,False
27788,839,24,5.727841,23.209948,2,0.001895,13532,13532_839_24_2,I-PERS,O,PERS,False


In [7]:
updated_analysis_data = updated_analysis_data.drop(['tr_entity', 'pr_entity'], axis=1)

In [8]:
updated_analysis_data['error_type'] = updated_analysis_data.apply(
            lambda row: annotate_error(
                row['true_labels'], 
                row['pred_labels']
                ), 
            axis=1
            )


In [9]:
updated_analysis_data['error_type'].value_counts()

error_type
No Errors         29005
Exclusion           311
Chunk               236
Type                124
Type and Chunk       35
Name: count, dtype: int64

In [12]:
# I have saved the modified data in the same name as the original and scrapped the other ones.
updated_analysis_data.to_json(
	original_data_path,
	lines=True, orient='records'
)


analysis_data.to_json(
	scrapped_data_path,
	lines=True, orient='records'
)


# Examples

In [12]:
updated_analysis_data[['true_entities', 'strict_true_entities', 'tr_entity', 'true_labels']].iloc[200:250]

Unnamed: 0,true_entities,strict_true_entities,tr_entity,true_labels
200,O,O,O,O
201,LOC,LOC,LOC,B-LOC
202,O,O,O,O
203,O,O,O,O
204,O,O,O,O
205,O,O,O,O
206,O,O,O,O
207,O,O,O,O
208,O,O,O,O
209,O,O,O,O


In [20]:
updated_analysis_data[updated_analysis_data['true_entities'] != updated_analysis_data['strict_true_entities']][['true_entities', 'strict_true_entities', 'tr_entity', 'true_labels', 'agreements']].head(50)

Unnamed: 0,true_entities,strict_true_entities,tr_entity,true_labels,agreements
379,PERS,O,PERS,I-PERS,False
415,ORG,O,ORG,I-ORG,False
1171,PERS,O,PERS,I-PERS,False
1196,MISC,O,MISC,I-MISC,False
1197,MISC,O,MISC,I-MISC,False
1198,MISC,O,MISC,I-MISC,False
1207,PERS,O,PERS,I-PERS,False
2810,ORG,O,ORG,I-ORG,False
2928,LOC,O,LOC,I-LOC,False
3208,PERS,O,PERS,I-PERS,False


In [22]:
updated_analysis_data[updated_analysis_data['sentence_ids'] == 130]

Unnamed: 0,sentence_ids,token_positions,words,tokens,word_pieces,core_tokens,true_labels,token_selector_id,pred_labels,agreements,...,prediction_entropy,prediction_max_entropy,confidence,variability,pre_x,pre_y,strict_true_entities,strict_pred_entities,true_entities,pred_entities
4560,130,0,[CLS],[CLS],[CLS],[CLS],[CLS],[CLS]@#0@#130,[CLS],True,...,0.003089,3.169925,0.999807,0.314201,2.536585,-0.162095,[CLS],[CLS],[CLS],[CLS]
4561,130,1,فيينا,فيينا,[فيينا],فيينا,B-LOC,فيينا@#1@#130,O,False,...,0.004867,3.169925,0.999685,0.314158,1.04824,4.441457,LOC,O,LOC,O
4562,130,2,المعلومات,المعلومات,[المعلومات],المعلومات,O,المعلومات@#2@#130,O,True,...,0.001409,3.169925,0.99992,0.314241,0.268964,12.65027,O,O,O,O
4563,130,3,التي,التي,[التي],التي,O,التي@#3@#130,O,True,...,0.000854,3.169925,0.999954,0.314254,-3.795791,14.442162,O,O,O,O
4564,130,4,بثها,بثها,[بثها],بثها,O,بثها@#4@#130,O,True,...,0.001411,3.169925,0.999921,0.314242,-2.03539,12.893243,O,O,O,O
4565,130,5,موقع,موقع,[موقع],موقع,O,موقع@#5@#130,O,True,...,0.003905,3.169925,0.999746,0.31418,1.437478,12.406041,O,O,O,O
4566,130,6,غلوبو,غلوب,"[غلوب, ##و]",غلوب,B-ORG,غلوب@#6@#130,B-ORG,True,...,0.524218,3.169925,0.924509,0.287821,0.794667,7.32289,ORG,ORG,ORG,ORG
4567,130,7,غلوبو,##و,"[غلوب, ##و]",IGNORED,IGNORED,IGNORED@#7@#130,IGNORED,True,...,1.640223,3.169925,0.654458,0.198654,0.597,8.079787,IGNORED,IGNORED,IGNORED,IGNORED
4568,130,8,على,على,[على],على,O,على@#8@#130,O,True,...,0.00208,3.169925,0.99988,0.314227,-2.472588,15.070498,O,O,O,O
4569,130,9,شبكة,شبكة,[شبكة],شبكة,O,شبكة@#9@#130,O,True,...,0.004785,3.169925,0.999698,0.314163,-1.828349,13.865242,O,O,O,O


In [352]:
updated_analysis_data[updated_analysis_data['pred_entities'] != updated_analysis_data['strict_pred_entities']][['pred_entities', 'strict_pred_entities', 'pred_labels', 'true_labels']]

Unnamed: 0,pred_entities,strict_pred_entities,pred_labels,true_labels
1181,ORG,O,I-ORG,I-ORG
1182,ORG,O,I-ORG,I-ORG
1183,ORG,O,I-ORG,I-ORG
1210,ORG,O,I-ORG,I-ORG
1384,LOC,O,I-LOC,I-ORG
3266,PERS,O,I-PERS,I-PERS
4324,ORG,O,I-ORG,I-ORG
4437,MISC,O,I-MISC,O
4580,MISC,O,I-MISC,I-MISC
4739,ORG,O,I-ORG,I-ORG


In [355]:
analysis_data[analysis_data['sentence_ids'] == 33][['pred_entities', 'strict_pred_entities', 'pred_labels', 'true_labels']]

Unnamed: 0,pred_entities,strict_pred_entities,pred_labels,true_labels
1176,[CLS],[CLS],[CLS],[CLS]
1177,O,O,O,O
1178,O,O,O,O
1179,O,O,O,O
1180,O,O,O,B-ORG
1181,ORG,O,I-ORG,I-ORG
1182,ORG,O,I-ORG,I-ORG
1183,ORG,O,I-ORG,I-ORG
1184,O,O,O,O
1185,O,O,O,O


In [None]:
for e, e_df in updated_analysis_data.groupby('error_type'):

    display(e_df.sample(20))