In [14]:
from tqdm.auto import tqdm
tqdm.pandas()
from experiment_utils import env_setup
from experiment_utils.pipelines import DataExtractionPhase
from experiment_utils.tokenization import TokenizationWorkflowManager
from experiment_utils.analysis import DataExtractor

In [3]:
# Initialise the environment via the project's env_setup helper and keep the
# folder object it returns; later cells join sub-paths onto it with `/`.
# NOTE(review): the captured log shows a Google Drive directory being located;
# presumably 'My Drive' is resolved inside it — confirm in env_setup.init.
base_folder = env_setup.init('My Drive', drive_mount='drive')

2024-12-12 17:32:25 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com


In [4]:
# Folder under base_folder that is passed to DataExtractionPhase as the
# experiments root.
experiment_base_folder = base_folder / 'Final Year Experiments/Thesis-Experiments/Experiments'

In [None]:
def global_ids_from_df(df):
    """Build one composite string identifier per row of ``df``.

    The identifier is the string form of the ``token_ids``, ``sentence_ids``,
    ``token_positions`` and ``labels`` columns (in that order) joined with
    underscores, e.g. ``"<token_id>_<sentence_id>_<position>_<label>"``.

    Args:
        df: DataFrame containing the four columns listed above.

    Returns:
        The ``.values`` of the concatenated string Series, one entry per row.

    Raises:
        KeyError: if any of the four columns is missing from ``df``.
    """
    id_columns = ("token_ids", "sentence_ids", "token_positions", "labels")
    pieces = [df[column].astype(str) for column in id_columns]

    # Concatenate left to right, inserting the separator before each new piece.
    combined = pieces[0]
    for piece in pieces[1:]:
        combined = combined + "_" + piece
    return combined.values

In [8]:
# this folder has the most up to date configs
experiment_name = "BaseLineExperiment-Test"
variant = "ANERCorp_CamelLab_arabertv02"

# Set up the extraction phase for this experiment/variant pair under the
# experiments root folder.
data_extractor = DataExtractionPhase(
    experiment_base_folder,
    experiment_name,
    variant,
)

2024-12-12 17:36:20 - INFO - Experiment manager set up successfully.
2024-12-12 17:36:20 - INFO - Extraction manager set up successfully.
2024-12-12 17:36:20 - INFO - Results manager set up successfully.
2024-12-12 17:36:20 - INFO - Fine Tuning manager set up successfully.
2024-12-12 17:36:20 - INFO - Tokenization Config validated successfully
2024-12-12 17:36:21 - INFO - Dataset manager set up successfully.


In [11]:
# Split whose tokenization outputs are analysed below.
split = 'train'

# Build the tokenization workflow from the corpus and the tokenization config
# already held by the extraction phase.
tokenization_outputs_manager = TokenizationWorkflowManager(
    data_extractor.data_manager.corpus,
    data_extractor.extraction_manager.tokenization_config,
)

# Wrap the selected split's outputs in a DataExtractor and flatten them into a
# table (analysis_df is indexed by column and displayed as a frame later on).
analysis_flat_data = DataExtractor(
    tokenization_outputs=tokenization_outputs_manager.get_split(split),
)
analysis_df = analysis_flat_data.to_df()

2024-12-12 17:38:57 - INFO - Tokenization Config validated successfully
2024-12-12 17:38:57 - INFO - Loading Tokenizer aubmindlab/bert-base-arabertv02, Lower Case False
2024-12-12 17:38:58 - INFO - Loading Preprocessor aubmindlab/bert-base-arabertv02
2024-12-12 17:38:58 - INFO - Processing train split


  0%|          | 0/4149 [00:00<?, ?it/s]

2024-12-12 17:39:12 - INFO - Extracting train subwords


  0%|          | 0/4149 [00:00<?, ?it/s]

2024-12-12 17:39:12 - INFO - Processing test split


  0%|          | 0/961 [00:00<?, ?it/s]

2024-12-12 17:39:16 - INFO - Extracting tokenization features...


In [33]:
# Lookup tables: tokenizer vocabulary and the dataset's label map.
vocab_map = tokenization_outputs_manager.tokenizer.vocab
label_map = data_extractor.data_manager.labels_map

# Encode every token through the vocabulary; tokens missing from it become -1.
analysis_df['token_ids'] = analysis_df['tokens'].progress_apply(
    lambda token: vocab_map.get(token, -1)
)
# Encode every true label through the label map; unmapped labels become -100.
analysis_df['labels'] = analysis_df['true_labels'].progress_apply(
    lambda label: label_map.get(label, -100)
)


  0%|          | 0/147082 [00:00<?, ?it/s]

  0%|          | 0/147082 [00:00<?, ?it/s]

In [34]:
# Attach the composite row identifier built by global_ids_from_df; this needs
# the token_ids and labels columns to exist on analysis_df already.
analysis_df["global_id"] = global_ids_from_df(analysis_df).copy()

In [15]:
# Show the composite id next to the word/token-level columns.
# NOTE(review): the saved output for this cell lists different columns (no
# global_id), so it appears to come from an earlier run — re-run to refresh.
analysis_df[['global_id', 'words', 'tokens', 'core_tokens', 'word_pieces', 'token_selector_id']]


Unnamed: 0,sentence_ids,token_positions,words,tokens,word_pieces,core_tokens,true_labels,token_selector_id
0,0,0,[CLS],[CLS],[CLS],[CLS],[CLS],[CLS]@#0@#0
1,0,1,فرانكفورت,فرانكفورت,[فرانكفورت],فرانكفورت,B-LOC,فرانكفورت@#1@#0
2,0,2,(د,(,"[(, د]",(,O,(@#2@#0
3,0,3,(د,د,"[(, د]",IGNORED,IGNORED,IGNORED@#3@#0
4,0,4,ب,ب,[ب],ب,O,ب@#4@#0
...,...,...,...,...,...,...,...,...
147077,4148,28,مختلف,مختلف,[مختلف],مختلف,O,مختلف@#28@#4148
147078,4148,29,أنحاء,أنحاء,[أنحاء],أنحاء,O,أنحاء@#29@#4148
147079,4148,30,المصنع,المصنع,[المصنع],المصنع,O,المصنع@#30@#4148
147080,4148,31,.,.,[.],.,O,.@#31@#4148


In [25]:
import pandas as pd
from pathlib import Path

dataset_name = 'ANERCorp_CamelLab'
model_name = 'arabertv02'
data = 'train_data'

# Locate the analysis extraction of the original 'BaseLineExperiment' run
# (note: not the '-Test' experiment used for data_extractor above).
# The path is derived from experiment_base_folder rather than hard-coding an
# absolute, machine- and account-specific location, so the notebook runs
# wherever env_setup.init resolves the drive.
original_data_path = (
    Path(experiment_base_folder)
    / 'BaseLineExperiment'
    / f"{dataset_name}_{model_name}"
    / 'extractions'
    / 'analysis'
    / f"{data}.json"
)

# Fail early with an explicit message instead of a parser error from pandas.
if not original_data_path.is_file():
    raise FileNotFoundError(f"Extraction file not found: {original_data_path}")

# The file is read as JSON Lines (one record per line).
# NOTE(review): despite its name, `test` holds whatever `data` selects
# ('train_data' here); the name is kept in case later cells rely on it.
test = pd.read_json(original_data_path, lines=True)
