In [2]:
# Colab-only setup: read the DEFORMER_TOKEN secret and pip-install the
# deformer-extractor repository from its `error-handling` branch.
if 'google.colab' in str(get_ipython()):
    from google.colab import userdata
    access_token = userdata.get('DEFORMER_TOKEN')
    # NOTE(review): the token is interpolated into the shell command/URL, so it
    # may show up in pip's output — clear this cell's output before sharing.
    !pip install git+https://$access_token@github.com/ay94/deformer-extractor.git@error-handling

In [5]:
from experiment_utils.model_outputs import ModelOutputWorkflowManager, PretrainedModelOutputWorkflowManager
from experiment_utils.tokenization import TokenizationWorkflowManager
from experiment_utils.evaluation import Metrics, Evaluation
from experiment_utils.train import DatasetManager
from experiment_utils.configurations import  ConfigWorkflowManager
from experiment_utils.analysis import DataTransformer, LabelAligner, DataExtractor, ClusterAnalysis, TrainingImpact, Entity, UtilityFunctions



In [6]:
from experiment_utils import colab
from experiment_utils.general_utils import FileHandler
# Locate the Drive root (presumably what `colab.init` returns — TODO confirm)
# and point a FileHandler at the fine-tuning experiment folder.
base_folder = colab.init('My Drive')
config_path = base_folder / 'Final Year Experiments/Class Imbalance/1_fineTuning'
fh = FileHandler(config_path)


import torch
model_name='arabertv02'
data_name='ANERCorp_CamelLab'
# Load the pickled evaluation outputs stored for this model/dataset pair.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
training_outputs = fh.load_pickle(
            f"evalOutputs/{model_name}_{data_name}_regular_outputs.pkl"
        )

# Load the saved model file onto the CPU.
# NOTE(review): a later cell reads `model.bert`, so this is presumably a fully
# pickled model object; recent PyTorch releases default `torch.load` to
# weights_only=True, which may reject such files — confirm on your version.
load_model_path = fh.file_path / f"trainOutputs/{model_name}_{data_name}_regular.bin"
model = torch.load(load_model_path, map_location=torch.device('cpu'))

2024-08-05 07:48:50 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com


In [7]:
# Collect the token-level (skl_*) and entity-level (seq_*) test metrics under
# the keys handed to Metrics.from_dict.
results_dict = dict(
    token_results=training_outputs.test_metrics.skl_results,
    token_report=training_outputs.test_metrics.skl_report,
    token_outputs=training_outputs.test_metrics.skl_output,
    entity_results=training_outputs.test_metrics.seq_results,
    entity_report=training_outputs.test_metrics.seq_report,
    entity_outputs=training_outputs.test_metrics.seq_output,
)

# Rebuild a Metrics object from the plain dictionary.
results = Metrics.from_dict(results_dict)

In [9]:
# Inspect the entity-level outputs held by the Metrics object.
# NOTE(review): this renders the complete nested label lists; consider
# previewing a slice so the saved notebook output stays small.
results.entity_outputs

{'y_true': [['B-LOC',
   'B-LOC',
   'O',
   'B-PERS',
   'I-PERS',
   'O',
   'O',
   'O',
   'O',
   'B-PERS',
   'I-PERS',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-LOC',
   'B-LOC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  ['B-PERS',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-LOC',
   'B-LOC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  ['O',
   'O',
   'O',
   'O',
   'O',
 

In [5]:
# Read the corpora definitions (corpora.json) from the experiment data folder.
corpora_path = base_folder / 'Final Year Experiments/Thesis-Experiments/ExperimentData'
corpora_fh = FileHandler(corpora_path)
corpora  = corpora_fh.load_json('corpora.json')

# Read config.yaml from the scripts folder.
# NOTE(review): `config_path` is rebound here; an earlier cell pointed it at
# the fine-tuning experiment folder, so cell order matters for later uses.
config_path = base_folder / 'Final Year Experiments/Thesis-Experiments/scripts'
config_fh = FileHandler(config_path)
config = config_fh.load_yaml('config.yaml')

In [6]:
# Build the configuration manager from the scripts folder and its config.yaml.
# NOTE(review): relies on `config_path` as last assigned in the preceding cell.
config_manager = ConfigWorkflowManager(config_path, 'config.yaml')

In [7]:
# Display the dataset name selected by the configuration.
config_manager.dataset_name

'ANERCorp_CamelLab'

In [8]:
# NERData = dict()
# NERData['splits'] = {'train': [{'id': sen[0], 'words': sen[1], 'tags': sen[2]}for sen in training_outputs.data['train']],
#                      'test': [{'id': sen[0], 'words': sen[1], 'tags': sen[2]}for sen in training_outputs.data['test']]}
# NERData['labels'] = training_outputs.data['labels']
# NERData['labels_map'] = training_outputs.data['labels_map']
# NERData['inv_labels'] = training_outputs.data['inv_labels']

In [9]:
# Create the dataset manager for the configured dataset, passing the corpora
# folder and the tokenization settings from the configuration manager.
data_manager = DatasetManager(corpora_path, config_manager.dataset_name, config_manager.tokenization_config)
# model_outputs = ModelOutputWorkflowManager(model, data_manager, config_manager.training_config, split)

2024-08-05 06:27:12 - INFO - Tokenization Config validated successfully


In [10]:
from transformers import AutoModel

In [11]:
# Load the checkpoint named by `config_manager.model_path` as a bare AutoModel
# (no task head); kept separate from the `model` loaded from disk earlier.
pretrained_model = AutoModel.from_pretrained(config_manager.model_path)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Run the tokenization workflow over the dataset manager's corpus using the
# configured tokenization settings (the log below shows train and test splits).
tokenization_outputs = TokenizationWorkflowManager(data_manager.corpus, config_manager.tokenization_config)

2024-08-05 06:27:16 - INFO - Tokenization Config validated successfully
2024-08-05 06:27:16 - INFO - Loading Tokenizer aubmindlab/bert-base-arabertv02
2024-08-05 06:27:17 - INFO - Loading Preprocessor aubmindlab/bert-base-arabertv02
2024-08-05 06:27:17 - INFO - Processing train split


  0%|          | 0/4149 [00:00<?, ?it/s]

2024-08-05 06:27:37 - INFO - Extracting train subwords


  0%|          | 0/4149 [00:00<?, ?it/s]

2024-08-05 06:27:37 - INFO - Processing test split


  0%|          | 0/961 [00:00<?, ?it/s]

In [None]:
# Build a dataloader for the 'test' split; 16 is presumably the batch size —
# TODO confirm against DatasetManager.get_dataloader.
ld = data_manager.get_dataloader('test', 16)

2024-08-05 06:09:34 - INFO - Loading Preprocessor: aubmindlab/bert-base-arabertv02
2024-08-05 06:09:34 - INFO - Loading Tokenizer: aubmindlab/bert-base-arabertv02, lower_case: False


In [None]:
# Collect outputs of the pretrained model for the 'test' split using the
# training configuration.
model_outputs = PretrainedModelOutputWorkflowManager(pretrained_model, data_manager, config_manager.training_config, 'test')

2024-08-05 06:09:34 - INFO - Training Config validated successfully
2024-08-05 06:09:34 - INFO - Loading Preprocessor: aubmindlab/bert-base-arabertv02
2024-08-05 06:09:34 - INFO - Loading Tokenizer: aubmindlab/bert-base-arabertv02, lower_case: False


  0%|          | 0/121 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [1]:
# NOTE(review): the recorded output is a NameError from a fresh kernel
# (execution count 1); the cell that assigns `model_outputs` must run first.
model_outputs

NameError: name 'model_outputs' is not defined

In [24]:
from dataclasses import dataclass, field, asdict
from typing import Optional, List
import torch
import numpy as np
import pandas as pd

@dataclass
class DataExtractor:
    """Flatten model and tokenization outputs into token-level fields.

    NOTE(review): this definition shadows the ``DataExtractor`` imported from
    ``experiment_utils.analysis`` earlier in the notebook — confirm that the
    override is intended.

    Constructor fields:
        tokenization_outputs: Re-iterable collection of per-sentence objects
            exposing ``words_df``, ``tokens_df``, ``word_pieces_df``,
            ``core_tokens_df``, ``labels_df`` and ``sentence_index_df``.
        model_outputs: Sequence of objects exposing ``last_hidden_states`` and,
            optionally, ``labels``, ``losses`` and ``input_ids`` tensors.
        aligner: Optional object providing ``align_labels()``.
        transformer: Optional object providing ``apply_umap(tensor)``.

    Every other field is ``init=False`` and is populated by the methods below.
    """

    tokenization_outputs: Optional[list] = field(default_factory=list)
    model_outputs: Optional[list] = field(default_factory=list)
    aligner: Optional['LabelAligner'] = field(default=None, repr=False)
    transformer: Optional['DataTransformer'] = field(default=None, repr=False)
    last_hidden_states: Optional[torch.Tensor] = field(init=False, default=None)
    labels: Optional[torch.Tensor] = field(init=False, default=None)
    losses: Optional[torch.Tensor] = field(init=False, default=None)
    token_ids: Optional[torch.Tensor] = field(init=False, default=None)
    words: Optional[list] = field(init=False, default_factory=list)
    tokens: Optional[list] = field(init=False, default_factory=list)
    word_pieces: Optional[list] = field(init=False, default_factory=list)
    core_tokens: Optional[list] = field(init=False, default_factory=list)
    true_labels: Optional[list] = field(init=False, default_factory=list)
    pred_labels: Optional[list] = field(init=False, default_factory=list)
    sentence_ids: Optional[list] = field(init=False, default_factory=list)
    token_positions: Optional[list] = field(init=False, default_factory=list)
    token_selector_id: Optional[list] = field(init=False, default_factory=list)
    agreements: Optional[list] = field(init=False, default_factory=list)
    x: Optional[list] = field(init=False, default_factory=list)
    y: Optional[list] = field(init=False, default_factory=list)

    def process_outputs(self):
        """Run each stage whose inputs are available.

        Features are extracted only when both ``model_outputs`` and
        ``tokenization_outputs`` are truthy; label alignment and UMAP run only
        when an ``aligner`` / ``transformer`` was supplied.
        """
        if self.model_outputs and self.tokenization_outputs:
            self.extract_features()
        if self.aligner:
            self.align_labels()
        if self.transformer:
            self.apply_umap()

    def extract_features(self):
        """Extract features from the model outputs.

        Concatenates the per-output tensors and flattens the per-sentence
        tokenization attributes into flat lists.

        Returns:
            ``self``, so calls can be chained.
        """
        self.last_hidden_states = torch.cat(
            [s.last_hidden_states for s in self.model_outputs]
        )
        # Optional tensors: presence is decided by the first output only, and
        # the target field is left untouched when the attribute is missing.
        optional_tensors = (
            ('labels', 'labels'),
            ('losses', 'losses'),
            ('input_ids', 'token_ids'),
        )
        for source_attr, target_attr in optional_tensors:
            if hasattr(self.model_outputs[0], source_attr):
                setattr(
                    self,
                    target_attr,
                    torch.cat([getattr(s, source_attr) for s in self.model_outputs]),
                )

        self.words = [word for sentence in self.tokenization_outputs for word in sentence.words_df]
        self.tokens = [token for sentence in self.tokenization_outputs for token in sentence.tokens_df]
        self.word_pieces = [wp for sentence in self.tokenization_outputs for wp in sentence.word_pieces_df]
        self.core_tokens = [ct for sentence in self.tokenization_outputs for ct in sentence.core_tokens_df]
        self.true_labels = [label for sentence in self.tokenization_outputs for label in sentence.labels_df]
        self.sentence_ids = [index for sentence in self.tokenization_outputs for index in sentence.sentence_index_df]
        # Position of each token inside its own sentence (restarts at 0).
        self.token_positions = [
            position
            for sentence in self.tokenization_outputs
            for position in range(len(sentence.tokens_df))
        ]
        # Selector id format: "<core_token>@#<position>@#<sentence_index>".
        self.token_selector_id = [
            f"{core_token}@#{token_position}@#{sentence_index}"
            for core_token, token_position, sentence_index in
            zip(self.core_tokens, self.token_positions, self.sentence_ids)
        ]
        return self

    def align_labels(self):
        """Align labels according to aligner's method.

        Flattens the aligner's per-sentence labels into ``pred_labels`` and
        stores the element-wise comparison with ``true_labels`` (a NumPy
        array) in ``agreements``.

        Returns:
            ``self``, so calls can be chained.
        """
        aligned_labels = self.aligner.align_labels()
        self.pred_labels = [label for sentence in aligned_labels for label in sentence]
        # NOTE(review): assumes true_labels and pred_labels have equal length;
        # otherwise the comparison is not element-wise — confirm upstream.
        self.agreements = np.array(self.true_labels) == np.array(self.pred_labels)
        return self

    def apply_umap(self):
        """Apply dimension reduction using UMAP.

        Passes ``last_hidden_states`` to ``transformer.apply_umap`` and unpacks
        the result into ``x`` and ``y`` (so it must unpack into two items).

        Returns:
            ``self``, so calls can be chained.
        """
        coordinates = self.transformer.apply_umap(self.last_hidden_states)
        self.x, self.y = coordinates
        return self

    @staticmethod
    def _has_content(value) -> bool:
        """Return True when ``value`` should be exported by ``to_dict``."""
        if value is None:
            return False
        if isinstance(value, torch.Tensor):
            # A tensor that has been set is always exported.
            return True
        if isinstance(value, np.ndarray):
            # ``array != []`` is an element-wise comparison: it either raises
            # or yields an ambiguous truth value, so test the size instead.
            return value.size > 0
        return value != []

    def to_dict(self):
        """Convert extracted data to a dictionary.

        Iterates over every dataclass field (including the constructor inputs)
        and keeps those that are neither ``None`` nor empty. Tensors and NumPy
        arrays are converted with ``tolist()``; other values are stored as-is.

        Returns:
            dict mapping field name to value, in field-declaration order.
        """
        data_dict = {}
        for field_name in self.__dataclass_fields__:
            value = getattr(self, field_name)
            if not self._has_content(value):
                continue
            if isinstance(value, (torch.Tensor, np.ndarray)):
                data_dict[field_name] = value.tolist()
            else:
                data_dict[field_name] = value
        return data_dict

    def to_df(self):
        """Convert data to pandas DataFrame and compute global ID.

        NOTE(review): ``to_dict`` also returns ``tokenization_outputs`` and
        ``model_outputs``; if their lengths differ from the token-level lists,
        ``pd.DataFrame`` will reject the dictionary — confirm and filter if so.

        Returns:
            DataFrame built from ``to_dict()`` with an added ``global_id``
            column produced by ``UtilityFunctions.global_ids_from_df``.
        """
        data_dict = self.to_dict()
        df = pd.DataFrame(data_dict)
        df['global_id'] = UtilityFunctions.global_ids_from_df(df)
        return df


In [None]:
# Instantiate the DataExtractor class
# Instantiate the DataExtractor class with the test-split outputs.
# NOTE(review): requires `tokenization_outputs` and `model_outputs` from the
# earlier cells to exist in the kernel.
data_extractor = DataExtractor(
    tokenization_outputs=tokenization_outputs.test,  # test-split tokenization outputs
    model_outputs=model_outputs.test,                # test-split model outputs
    # aligner=aligner,                            # Replace with your aligner (Optional)
    # transformer=transformer                     # Replace with your transformer (Optional)
)

# Populate the tensor and token-level fields on the extractor
data_extractor.extract_features()

# # Align labels if aligner is provided
# if data_extractor.aligner:
#     data_extractor.align_labels()

# # Apply UMAP if transformer is provided
# if data_extractor.transformer:
#     data_extractor.apply_umap()




In [None]:
# Export every populated extractor field as a plain dictionary.
output = data_extractor.to_dict()

In [None]:
# NOTE(review): displays the whole dictionary (tensors are exported as nested
# lists by to_dict); consider showing `output.keys()` or a slice instead.
output

In [None]:
class OutputGenerationPipeline:
    """Create the model-output and tokenization workflow managers for a split."""

    def __init__(self, model, data_manager, config_manager):
        """Store the model and the two managers used by ``run``."""
        self.model, self.data_manager, self.config_manager = (
            model,
            data_manager,
            config_manager,
        )

    def run(self, split):
        """Build both workflow managers and return them in a dictionary.

        Args:
            split: Forwarded to ``ModelOutputWorkflowManager``; the
                tokenization manager is built without it.

        Returns:
            dict with keys ``"model_outputs"`` and ``"tokenization_outputs"``.
        """
        generated = {}
        generated["model_outputs"] = ModelOutputWorkflowManager(
            self.model,
            self.data_manager,
            self.config_manager.training_config,
            split,
        )
        # pretrained_model_outputs workflow
        generated["tokenization_outputs"] = TokenizationWorkflowManager(
            self.data_manager.corpus,
            self.config_manager.tokenization_config,
        )
        return generated


In [None]:
# Build the output-generation pipeline from the loaded model and the managers.
# The class defined in this notebook is `OutputGenerationPipeline`; the former
# name `OutputGenerationWorkflow` is undefined on a fresh kernel.
output_generator = OutputGenerationPipeline(model, data_manager, config_manager)

In [None]:
# Generate model and tokenization outputs for the 'test' split.
# NOTE(review): the returned dictionary is only displayed, not stored; the
# recorded log timestamps span several minutes for this call.
output_generator.run('test')

2024-08-04 16:24:33 - INFO - Training Config validated successfully
2024-08-04 16:24:33 - INFO - Specific Split test being processed
2024-08-04 16:24:33 - INFO - Loading Preprocessor: aubmindlab/bert-base-arabertv02
2024-08-04 16:24:33 - INFO - Loading Tokenizer: aubmindlab/bert-base-arabertv02, lower_case: False


  0%|          | 0/121 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2024-08-04 16:28:52 - INFO - Tokenization Config validated successfully
2024-08-04 16:28:52 - INFO - Loading Tokenizer aubmindlab/bert-base-arabertv02
2024-08-04 16:28:52 - INFO - Loading Preprocessor aubmindlab/bert-base-arabertv02
2024-08-04 16:28:52 - INFO - Processing train split


  0%|          | 0/4149 [00:00<?, ?it/s]

2024-08-04 16:29:04 - INFO - Extracting train subwords


  0%|          | 0/4149 [00:00<?, ?it/s]

2024-08-04 16:29:05 - INFO - Processing test split


  0%|          | 0/961 [00:00<?, ?it/s]

{'model_outputs': <experiment_utils.model_outputs.ModelOutputWorkflowManager at 0x11122a740>,
 'tokenization_outputs': <experiment_utils.tokenization.TokenizationWorkflowManager at 0x11122a5f0>}

In [None]:
class AnalysisExtractionPipeline:
    """Bundle the analysis manager, entity evaluation and training-impact helpers.

    NOTE(review): besides its arguments, ``__init__`` still reads the notebook
    globals ``data_manager``, ``model_outputs``, ``tokenization_outputs`` and
    ``model``; they must exist in the kernel before instantiation.
    """

    def __init__(self, output_pipeline, metrics, config_manager):
        """Create the three analysis components.

        Args:
            output_pipeline: Mapping read with ``.get`` for the keys
                ``'tokenization_outputs'`` and ``'model_outputs'``.
            metrics: Object exposing ``entity_outputs``; also forwarded to the
                analysis manager.
            config_manager: Forwarded to the analysis manager.
        """
        # Data Analysis workflow
        # The ``metrics`` argument is forwarded here; previously the
        # notebook-level ``results`` variable was used and the argument ignored.
        # NOTE(review): `AnalysisWrokflowManager` is not imported in the visible
        # cells and its spelling looks like a typo of "Workflow" — confirm the
        # actual class name before running.
        self.analysis_manager = AnalysisWrokflowManager(
            config_manager,
            metrics,
            output_pipeline.get('tokenization_outputs'),
            output_pipeline.get('model_outputs'),
            data_manager,
        )

        self.entity_evaluation = Entity(
            metrics.entity_outputs
        )
        # Training impact workflow
        # NOTE(review): the model name is hard-coded and the inputs come from
        # notebook globals rather than from `output_pipeline` — confirm intent.
        self.training_impact = TrainingImpact(
            model_outputs.data['test'], tokenization_outputs, 'aubmindlab/bert-base-arabertv02', model.bert
        )

    def run(self, split):
        """Run the analysis manager for ``split`` and label its three results.

        Returns:
            dict with keys ``"analysis_data"``, ``"average_silhouette_score"``
            and ``"kmeans_metrics"``, in the order returned by the manager.
        """
        analysis_data, average_silhouette_score, kmeans_metrics = self.analysis_manager.run(split)
        return {
            "analysis_data": analysis_data,
            "average_silhouette_score": average_silhouette_score,
            "kmeans_metrics": kmeans_metrics,
        }
