In [1]:
if 'google.colab' in str(get_ipython()):
    from google.colab import userdata
    access_token = userdata.get('DEFORMER_TOKEN')
    !pip install git+https://$access_token@github.com/ay94/deformer-extractor.git@error-handling

In [2]:
import torch

from experiment_utils import colab
from experiment_utils.general_utils import FileHandler
from experiment_utils.pipelines import OutputGenerationPipeline, AnalysisExtractionPipeline, ExperimentInitializer, ResultsSaver
from experiment_utils.evaluation import Metrics
from experiment_utils.train import DatasetManager
from experiment_utils.configurations import ExperimentConfig, ExtractionConfigManager, ResultsConfigManager


2024-08-12 12:38:52 - INFO - PyTorch version 1.13.1 available.


In [3]:
# Resolve the Google Drive root through the project's colab helper (the saved
# log of this cell reports the Drive directory it found), then point at the
# folder that holds the thesis experiments.
# NOTE(review): base_folder is presumably a pathlib.Path, given the `/` join — confirm.
base_folder = colab.init('My Drive')
experiment_base_folder = base_folder / 'Final Year Experiments/Thesis-Experiments/Experiments'

2024-08-12 12:39:01 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com


In [15]:
# Top-level description of the experiment: its name, the corpus/model pair it
# uses, and the file names of the three YAML configuration files.
experiment_config = dict(
    experiment_name="DashboardTest-Shuffle",
    corpora_path="ExperimentData",
    variant="ANERCorp_CamelLab_arabertv02",
    dataset_name="ANERCorp_CamelLab",
    model_name="arabertv02",
    model_path="aubmindlab/bert-base-arabertv02",
    results_config="results_config.yaml",
    extraction_config="extraction_config.yaml",
    fine_tuning_config="fine_tuning_config.yaml",
)


# Nested settings in two groups:
#   fine_tuning -> args / model / evaluation
#   extraction  -> tokenization / umap / clustering
extraction_config = dict(
    fine_tuning=dict(
        args=dict(
            train_batch_size=16,
            test_batch_size=8,
            shuffle=True,
            num_workers=4,
            epochs=4,
            splits=4,
            learning_rate=5e-5,
            warmup_ratio=0.1,
            max_grad_norm=1.0,
            accumulation_steps=1,
            logging_step=50,
        ),
        model=dict(
            model_path="aubmindlab/bert-base-arabertv02",
            dropout_rate=0.3,
            enable_attentions=False,
            enable_hidden_states=False,
            initialize_output_layer=False,
        ),
        evaluation=dict(scheme=None, mode=None),
    ),
    extraction=dict(
        tokenization=dict(
            tokenizer_path="aubmindlab/bert-base-arabertv02",
            preprocessor_path="aubmindlab/bert-base-arabertv02",
            max_seq_len=256,
            strategy=dict(type="core", index=0, schema=None),
        ),
        umap=dict(
            n_neighbors=5,
            min_dist=0.1,
            metric="cosine",
            random_state=1,
            verbose=True,
            normalize_embeddings=False,
        ),
        clustering=dict(
            init_method="k-means++",
            n_init=10,
            random_state=1,
            n_clusters=[3, 4, 9],
            # Integer keys cannot be passed as keyword arguments, so this
            # mapping stays a literal; its keys mirror n_clusters above.
            n_clusters_map={
                3: "boundary_clusters",
                4: "entity_clusters",
                9: "token_clusters",
            },
            silhouette_metric="cosine",
            norm="l2",
        ),
    ),
)

# Output layout for saved results. Every artifact is stored as JSON under
# "extractions", its filename equals its key, and only the sub-folder varies,
# so the entries are generated from (artifact, folder) pairs.
results_config = {
    "results_dir": "extractions",
    **{
        artifact: {"folder": folder, "filename": artifact, "format": "json"}
        for artifact, folder in (
            ("analysis_data", "analysis"),
            ("entity_report", "results"),
            ("token_report", "results"),
            ("results", "results"),
            ("kmeans_results", "results"),
            ("entity_confusion_data", "results"),
            ("attention_weights_similarity", "matrices"),
            ("centroids_avg_similarity_matrix", "matrices"),
            ("attention_similarity_matrix", "matrices"),
            ("train_df", "analysis"),
        )
    },
}

# Names and locations of the fine-tuning outputs (metrics file and the two
# saved model formats).
fine_tuning_config = dict(
    save_dir="fine_tuning",  # Base directory for saving outputs
    metrics=dict(folder="metrics", filename="evaluation_metrics", format="json"),
    model=dict(
        # The "folder": "models" entry is left disabled for both model files.
        state_dict=dict(filename="model_state_dict", format="pth"),
        binary=dict(filename="model_binary", format="bin"),
    ),
)

In [16]:
# setup experiment folder
# ExperimentInitializer receives the experiments base folder plus the four
# configuration dicts (experiment, extraction, results, fine-tuning).
# NOTE(review): setup_experiment() presumably creates the experiment folder and
# writes the YAML config files named in experiment_config — confirm against
# experiment_utils.pipelines.
manager = ExperimentInitializer(experiment_base_folder, experiment_config, extraction_config, results_config, fine_tuning_config)
manager.setup_experiment()

# Previous Extraction

In [7]:
# setup managers
# Builds the configuration managers for the 'BaseLineExperiment' /
# 'ANERCorp_CamelLab_arabertv02' variant, then the dataset manager from the
# corpora directory, dataset name and tokenization config they expose.
# NOTE(review): the saved output of this cell is
#   ValueError: Experiment Config doesn't exist please review the path
# and the experiment created by this notebook is named "DashboardTest-Shuffle",
# not "BaseLineExperiment" — confirm the experiment name/path before relying on
# Restart & Run All.
experiment_manager = ExperimentConfig.from_dict(experiment_base_folder, 'BaseLineExperiment', 'ANERCorp_CamelLab_arabertv02')
extraction_manager = ExtractionConfigManager(experiment_manager.extraction_dir)
results_manager = ResultsConfigManager(experiment_manager.results_dir)
data_manager = DatasetManager(experiment_manager.corpora_dir, experiment_manager.dataset_name, extraction_manager.tokenization_config)


ValueError: Experiment Config doesn't exist please review the path

In [8]:
# Load the artifacts of an earlier fine-tuning run: the pickled evaluation
# outputs and the saved model, then wrap the test metrics in a Metrics object
# for the analysis pipeline.
# (`torch` is imported in the notebook's import cell.)
finetuning_path = base_folder / 'Final Year Experiments/Class Imbalance/1_fineTuning'
fh = FileHandler(finetuning_path)

model_name = 'arabertv02'
data_name = 'ANERCorp_CamelLab'

# SECURITY: torch.load unpickles arbitrary Python objects, and fh.load_pickle
# presumably does the same — only point these at files you produced yourself.
training_outputs = fh.load_pickle(
    f"evalOutputs/{model_name}_{data_name}_regular_outputs.pkl"
)

load_model_path = fh.file_path / f"trainOutputs/{model_name}_{data_name}_regular.bin"
# map_location remaps the stored tensors onto the CPU, so no GPU is needed to
# open the checkpoint.
model = torch.load(load_model_path, map_location=torch.device('cpu'))

# Re-key the stored test metrics into the names handed to Metrics.from_dict:
# skl_* fields become the token-level entries, seq_* the entity-level ones.
test_metrics = training_outputs.test_metrics
results_dict = {
    "token_results": test_metrics.skl_results,
    "token_report": test_metrics.skl_report,
    "token_outputs": test_metrics.skl_output,
    "entity_results": test_metrics.seq_results,
    "entity_report": test_metrics.seq_report,
    "entity_outputs": test_metrics.seq_output,
}

evaluation_results = Metrics.from_dict(results_dict)

In [None]:
# Data Extraction Pipeline

In [10]:
# Build the output-generation pipeline from the loaded fine-tuned model, the
# dataset and extraction managers, and the pretrained model path taken from
# the experiment configuration.
output_generation_pipeline = OutputGenerationPipeline(model, data_manager, extraction_manager, experiment_manager.model_path)

In [11]:
# Run output generation for the 'test' split. The saved log of this cell shows
# fine-tuned and pretrained model outputs being generated for that split,
# followed by tokenization outputs.
output_generation_pipeline.run('test')

INFO:root:Loading pretrained model from: aubmindlab/bert-base-arabertv02


config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
INFO:root:Generating model outputs for split: test
INFO:root:Training Config validated successfully
INFO:root:Specific Split test being processed
INFO:root:Loading Preprocesso

tokenizer_config.json:   0%|          | 0.00/381 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/825k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

  0%|          | 0/121 [00:00<?, ?it/s]

  self.pid = os.fork()
INFO:root:Generating pretrained model outputs for split: test
INFO:root:Training Config validated successfully
INFO:root:Loading Preprocessor: aubmindlab/bert-base-arabertv02
INFO:root:Loading Tokenizer: aubmindlab/bert-base-arabertv02, lower_case: False


  0%|          | 0/121 [00:00<?, ?it/s]

INFO:root:Generating tokenization outputs
INFO:root:Tokenization Config validated successfully
INFO:root:Loading Tokenizer aubmindlab/bert-base-arabertv02
INFO:root:Loading Preprocessor aubmindlab/bert-base-arabertv02
INFO:root:Processing train split


  0%|          | 0/4149 [00:00<?, ?it/s]

INFO:root:Extracting train subwords


  0%|          | 0/4149 [00:00<?, ?it/s]

INFO:root:Processing test split


  0%|          | 0/961 [00:00<?, ?it/s]

In [12]:
# Wire the analysis stage to the already-run output pipeline, the evaluation
# metrics loaded from the fine-tuning run, and the extraction/results managers.
# The split matches the one passed to output_generation_pipeline.run.
analysis_extraction_pipeline = AnalysisExtractionPipeline(
    output_pipeline=output_generation_pipeline,
    evaluation_results=evaluation_results,
    extraction_manager=extraction_manager,
    results_manager=results_manager,
    split='test'
)


In [13]:
# Execute the analysis workflow. Per the saved log this covers feature
# extraction, UMAP, clustering/silhouette scoring, annotation steps and
# attention similarity, and took about 109 seconds for the analysis part.
analysis_extraction_pipeline.run()

INFO:root:UMAP Config validated successfully
INFO:root:Analysis extraction pipeline initialized successfully.
INFO:root:Extracting model features...
INFO:root:Extracting tokenization features...
INFO:root:Aligning labels...
INFO:root:Applying UMAP...
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


UMAP(angular_rp_forest=True, metric='cosine', n_jobs=1, random_state=1, verbose=True)
Sat Aug 10 05:24:04 2024 Construct fuzzy simplicial set
Sat Aug 10 05:24:04 2024 Finding Nearest Neighbors
Sat Aug 10 05:24:04 2024 Building RP forest with 14 trees
Sat Aug 10 05:24:10 2024 NN descent for 15 iterations
	 1  /  15
	 2  /  15
	 3  /  15
	 4  /  15
	 5  /  15
	 6  /  15
	Stopping threshold met -- exiting after 6 iterations
Sat Aug 10 05:24:27 2024 Finished Nearest Neighbor Search
Sat Aug 10 05:24:30 2024 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs


INFO:root:Extracting model features...
INFO:root:Extracting tokenization features...
INFO:root:Applying UMAP...
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Sat Aug 10 05:24:51 2024 Finished embedding
UMAP(angular_rp_forest=True, metric='cosine', n_jobs=1, random_state=1, verbose=True)
Sat Aug 10 05:24:51 2024 Construct fuzzy simplicial set
Sat Aug 10 05:24:51 2024 Finding Nearest Neighbors
Sat Aug 10 05:24:51 2024 Building RP forest with 14 trees
Sat Aug 10 05:24:52 2024 NN descent for 15 iterations
	 1  /  15
	 2  /  15
	 3  /  15
	 4  /  15
	 5  /  15
	 6  /  15
	 7  /  15
	Stopping threshold met -- exiting after 7 iterations
Sat Aug 10 05:24:57 2024 Finished Nearest Neighbor Search
Sat Aug 10 05:24:57 2024 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs


INFO:root:Clustering Config validated successfully
INFO:root:Calculating Silhouette Score
INFO:root:Processing K=3
INFO:root:Processing K=4
INFO:root:Processing K=9
INFO:root:Calculating Centorid Average Similarity Matrix
INFO:root:Annotating all...
INFO:root:Annotating consistency...


	completed  180  /  200 epochs
Sat Aug 10 05:25:15 2024 Finished embedding


Extract Subwords:   0%|          | 0/23049 [00:00<?, ?it/s]

Calculate Consistency:   0%|          | 0/29711 [00:00<?, ?it/s]

INFO:root:Annotating token entropy...
INFO:root:Annotating word entropy...
INFO:root:Annotating entity...
INFO:root:Annotating error types...
INFO:root:Annotating prediction entropy...
INFO:root:Annotating pretrained coordinates...
INFO:root:Analysis workflow execution time: 108.85310864448547 seconds
INFO:root:Computing attention similarities


Computing attention similarities:   0%|          | 0/961 [00:00<?, ?it/s]

INFO:root:Comparing weights


In [16]:
# ResultsSaver is the saver class imported from experiment_utils.pipelines in
# this notebook's import cell. The name used here before, DataSaver, is not
# imported anywhere, so it raises NameError on a fresh kernel.
# NOTE(review): assumes ResultsSaver is constructed from the results manager,
# as DataSaver was — confirm against experiment_utils.pipelines.
saver = ResultsSaver(results_manager)

In [17]:
# Hand the outputs of the analysis pipeline to the saver.
# NOTE(review): save_all and .outputs are project APIs not shown in this
# notebook; where the files land presumably follows the results configuration
# held by results_manager — confirm.
saver.save_all(analysis_extraction_pipeline.outputs)