In [1]:
import sys
import os
from pathlib import Path
# This appends the directory one level up (the root of your project) to the sys.path.
# Modify the path depending on the location of modules you want to import.
sys.path.append(os.path.abspath('../../'))

from config.config_managers import DashboardConfigManager
from dataManager import DataManager
from dash import Dash
import pandas as pd
import plotly.express as px
from abc import ABC, abstractmethod
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

2025-03-25 00:35:12 - INFO - PyTorch version 2.2.2 available.


In [None]:
import logging
from dataclasses import dataclass, field
from typing import Any, Dict
import re
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from experiment_utils.utils import FileHandler
from flask_caching import Cache
from tqdm.autonotebook import tqdm
import torch
from torch.nn import Module
from typing import Optional
from transformers import PreTrainedModel
from transformers import AutoModelForTokenClassification
from experiment_utils.train import DatasetManager
from experiment_utils.config_managers import ExtractionConfigManager


# Variant name -> Hugging Face checkpoint identifier. DataLoader looks the
# variant up here to validate it and to fetch the pretrained model via
# AutoModelForTokenClassification.from_pretrained.
MODEL_MAP = {
    "ANERCorp_CamelLab_arabertv02": "aubmindlab/bert-base-arabertv02",
    "conll2003_bert": "bert-base-cased",
}

# Variant name -> dataset name handed to DatasetManager in DataLoader.load_data.
# Keys must stay in sync with MODEL_MAP: a variant missing here raises KeyError
# when its train/test split is loaded.
DATA_MAP = {
    "ANERCorp_CamelLab_arabertv02": "ANERCorp_CamelLab",
    "conll2003_bert": "conll2003",
}


@dataclass
class DashboardData:
    """Bundle of every artefact the dashboard holds for one variant.

    All tabular fields default to an empty DataFrame, figures to an empty
    ``go.Figure`` and matrices to an empty array, so an instance can be built
    from a partial dictionary (see ``from_dict``).

    On construction ``__post_init__`` rounds the float columns of
    ``analysis_data``, ``kmeans_results`` and ``results`` and adds derived
    columns to ``analysis_data``. Both steps modify the frames that were
    passed in (no copy is taken).
    """

    analysis_data: pd.DataFrame = field(default_factory=pd.DataFrame)
    train_data: pd.DataFrame = field(default_factory=pd.DataFrame)
    kmeans_results: pd.DataFrame = field(default_factory=pd.DataFrame)
    results: pd.DataFrame = field(default_factory=pd.DataFrame)
    entity_non_strict_report: pd.DataFrame = field(default_factory=pd.DataFrame)
    entity_strict_report: pd.DataFrame = field(default_factory=pd.DataFrame)
    token_report: pd.DataFrame = field(default_factory=pd.DataFrame)
    token_confusion_matrix: pd.DataFrame = field(default_factory=pd.DataFrame)
    token_misclassifications: pd.DataFrame = field(default_factory=pd.DataFrame)
    entity_non_strict_confusion_data: pd.DataFrame = field(default_factory=pd.DataFrame)
    non_strict_entity_misclassifications: pd.DataFrame = field(default_factory=pd.DataFrame)
    entity_strict_confusion_data: pd.DataFrame = field(default_factory=pd.DataFrame)
    strict_entity_misclassifications: pd.DataFrame = field(default_factory=pd.DataFrame)
    centroids_avg_similarity_matrix: pd.DataFrame = field(default_factory=pd.DataFrame)
    attention_weights_similarity_heatmap: go.Figure = field(default_factory=go.Figure)
    attention_weights_similarity_matrix: np.ndarray = field(
        default_factory=lambda: np.array([])
    )
    attention_similarity_heatmap: go.Figure = field(default_factory=go.Figure)
    attention_similarity_matrix: np.ndarray = field(
        default_factory=lambda: np.array([])
    )
    pretrained_model: Optional[PreTrainedModel] = None
    fine_tuned_model: Optional[Module] = None
    # Filled with whatever DatasetManager.get_dataset returns, which is not
    # necessarily a torch Module, hence the loose annotation.
    train_dataset: Optional[Any] = None
    test_dataset: Optional[Any] = None

    def __post_init__(self):
        """Round float columns and add the derived columns of ``analysis_data``.

        Every derived column is computed only when its source columns are
        present. This keeps ``DashboardData()`` (all defaults) and partially
        loaded variants constructible instead of failing with ``KeyError``.
        """
        # Round float columns to four decimal places
        for frame in (self.analysis_data, self.kmeans_results, self.results):
            self.round_floats(frame)

        analysis = self.analysis_data
        if not isinstance(analysis, pd.DataFrame):
            # The loader yields None for a file it could not read; nothing to derive.
            return

        # Convert list to string in the 'Word Pieces' column of analysis_data if it exists
        if "Word Pieces" in analysis.columns:
            analysis["Word Pieces"] = analysis["Word Pieces"].apply(
                lambda x: ", ".join(x) if isinstance(x, list) else ("" if pd.isna(x) else x)
            )

        # Normalize PER Entity Tag
        tag_mapping = {
            'B-PERS': 'B-PER',
            'I-PERS': 'I-PER'
        }
        for label_col in ("True Labels", "Pred Labels"):
            if label_col in analysis.columns:
                analysis[label_col] = analysis[label_col].replace(tag_mapping)

        # Share of training occurrences that were (in)consistent; 0 when the
        # token never occurred in training (zero denominator).
        for target, count_col in (
            ("Consistency Ratio", "Consistency Count"),
            ("Inconsistency Ratio", "Inconsistency Count"),
        ):
            if self._has_columns(analysis, count_col, "Total Train Occurrences"):
                analysis[target] = self._ratio_or_zero(
                    analysis[count_col], analysis["Total Train Occurrences"]
                )

        if self._has_columns(analysis, "True Labels", "Pred Labels"):
            analysis["Confusion Components"] = self.classify_ner(
                analysis, "True Labels", "Pred Labels"
            )

        # Entropies normalised by their maximum; 0/0 is filled with 0 because
        # the plain division would produce NaN.
        for target, raw_col, max_col in (
            ("Token Ambiguity", "Local Token Entropy", "Token Max Entropy"),
            ("Word Ambiguity", "Local Word Entropy", "Word Max Entropy"),
            ("Prediction Uncertainty", "Prediction Entropy", "Prediction Max Entropy"),
        ):
            if self._has_columns(analysis, raw_col, max_col):
                analysis[target] = self.normalized_entropy(analysis, raw_col, max_col)

    def is_loaded(self, attribute):
        """Checks if the given attribute is loaded based on its type.

        DataFrames, figures, arrays and dicts count as loaded when non-empty.
        Any other type (including ``None`` and, as written, the model and
        dataset fields) is reported as not loaded.
        """
        attr_value = getattr(self, attribute)
        if isinstance(attr_value, pd.DataFrame):
            return not attr_value.empty
        elif isinstance(attr_value, go.Figure):
            return len(attr_value.data) > 0
        elif isinstance(attr_value, np.ndarray):
            return attr_value.size > 0
        elif isinstance(attr_value, dict):
            return bool(attr_value)  # Returns True if the dictionary is non-empty
        return False  # Default case if the attribute type is unrecognized

    @staticmethod
    def round_floats(df):
        """Round every float column of ``df`` to 4 decimals, modifying ``df``.

        Anything that is not a DataFrame (e.g. ``None`` from a failed load) is
        ignored.
        """
        if not isinstance(df, pd.DataFrame):
            return
        for col in df.select_dtypes(include=["float"]).columns:
            df[col] = df[col].round(4)

    @staticmethod
    def from_dict(dict_data: Dict[str, Any]):
        """Build an instance from a mapping of field name to value.

        Keys that are not dataclass fields raise ``TypeError`` (standard
        keyword-argument behaviour).
        """
        return DashboardData(**dict_data)

    @staticmethod
    def _has_columns(df, *columns):
        """Return True when ``df`` is a DataFrame containing every named column."""
        return isinstance(df, pd.DataFrame) and all(col in df.columns for col in columns)

    @staticmethod
    def _ratio_or_zero(numerator, denominator):
        """Element-wise ``numerator / denominator``, with 0 where the denominator is 0."""
        return np.where(denominator != 0, numerator / denominator, 0)

    @staticmethod
    def normalized_entropy(df, raw_entropy, max_entropy):
        """Return ``df[raw_entropy] / df[max_entropy]`` as a float array.

        Special cases, one per row:
          * both values 0   -> 0
          * both values -1  -> -1 (presumably an "undefined" sentinel kept
            as-is; TODO confirm with the producer of these columns)
          * max is 0 but raw is not -> 0
          * everything else -> the plain ratio (NaN inputs stay NaN)
        """
        result = np.full(df.shape[0], np.nan)
        zero_mask = (df[raw_entropy] == 0) & (df[max_entropy] == 0)
        result[zero_mask] = 0
        negative_one_mask = (df[raw_entropy] == -1) & (df[max_entropy] == -1)
        result[negative_one_mask] = -1
        valid_mask = (df[max_entropy] != 0) & ~zero_mask & ~negative_one_mask
        result[valid_mask] = df[raw_entropy][valid_mask] / df[max_entropy][valid_mask]
        zero_div_mask = (df[max_entropy] == 0) & (df[raw_entropy] != 0)
        result[zero_div_mask] = 0
        return result

    @staticmethod
    def classify_ner(df, true_label_col, pred_label_col):
        """Label every row as TP, FP, FN or TN from its true/predicted tags.

        * TP: prediction equals the true label and is not "O".
        * FP: prediction differs from the true label and is not "O" (this
          includes a wrong entity tag on a token that is a true entity).
        * FN: true label is an entity but the prediction is "O".
        * TN: everything else, i.e. both labels are "O".
        """
        conditions = [
            (df[pred_label_col] == df[true_label_col]) & (df[pred_label_col] != "O"),
            (df[pred_label_col] != df[true_label_col]) & (df[pred_label_col] != "O"),
            (df[pred_label_col] != df[true_label_col]) & (df[true_label_col] != "O") & (df[pred_label_col] == "O"),
        ]
        choices = ["TP", "FP", "FN"]
        return np.select(conditions, choices, default="TN")  # Return only the classification column

    
class DataLoader:
    def __init__(self, config_manager, variant_name):
        self.config_manager = config_manager
        
        self.variant = variant_name
        self.data_dir = config_manager.data_dir / variant_name
        self.dashboard_data = {}

    def load(self, file_name, file_config):
        file_handler = FileHandler(self.data_dir / file_config["folder"])
        file_type = file_config.get("type", None)
        file_format = file_config["format"]
        file_path = file_handler.file_path / f"{file_name}.{file_format}"

        try:
            if file_path.exists():
                match file_format:
                    case "npy": 
                        return file_handler.load_numpy(file_path.with_suffix(".npy"))

                    # Handle regular JSON data files
                    case "json":
                        # Check if there's a specific type of JSON handling required
                        if file_type and file_type =='dict':
                            return file_handler.load_json(file_path)
                        elif file_type and file_type =='index':
                            return pd.read_json(file_path, orient='index')
                        else:
                            data = file_handler.read_json(file_path)
                        if (
                            "column_mappings" in file_config
                            and file_config["column_mappings"]
                        ):
                            data = self.apply_column_mappings(
                                data, file_config["column_mappings"]
                            )
                        return data
                    case _:
                        logging.warning("File does not exist: %s", file_path)
                        
        except Exception as e:
            logging.error("Failed to load data from %s: %s", file_path, e)
            return None

    def apply_column_mappings(
        self, data: pd.DataFrame, column_mappings: dict
    ) -> pd.DataFrame:
        """Rename columns in the DataFrame based on provided mappings."""
        return data.rename(columns=column_mappings)
    
    def load_model(self):
        
        model_name = MODEL_MAP.get(self.variant)

        if not model_name:
            raise ValueError(f"No pretrained model mapping found for variant: {self.variant}")

        # Path to the .bin file
        model_path = self.data_dir / "fine_tuning" / "model_binary.bin"

        if not model_path.exists():
            raise FileNotFoundError(f"Fine-tuned model not found at: {model_path}")

        model = torch.load(model_path, map_location="cpu")

        model.eval()

        return model
    
    def load_data(self, split):
        extraction_config_dir = self.config_manager.data_dir / self.variant / 'configs/extraction_config.yaml'
        
        corpora_dir = self.config_manager.corpora_dir

        extraction_config = ExtractionConfigManager(extraction_config_dir)

        tokenization_config = extraction_config.tokenization_config
        data_manager = DatasetManager(
                corpora_dir,
                DATA_MAP[self.variant],
                tokenization_config,
                False
            )
        return data_manager.get_dataset(split)

            

    def load_all(self):

        logging.info("Loading Dashboard Data from  %s", self.data_dir)
        for file_name, file_config in tqdm(self.config_manager.data_config.items()):
            self.dashboard_data[file_name] = self.load(file_name, file_config)
        logging.info('Loading Fine tuned Model')
        self.dashboard_data["fine_tuned_model"] = self.load_model()
        logging.info('Loading Pre Trained Model')
        self.dashboard_data["pretrained_model"] = AutoModelForTokenClassification.from_pretrained(
            MODEL_MAP[self.variant]
        )
        logging.info('Loading Data')
        self.dashboard_data["train_dataset"] = self.load_data('train')
        self.dashboard_data["test_dataset"] = self.load_data('test')


class DataManager:
    """Loads variant data on demand and keeps it in a filesystem cache.

    ``variants_data`` mirrors, in memory, whatever has been fetched from or
    written to the cache during this object's lifetime.
    """

    def __init__(self, config_manager, server) -> None:
        """Attach a filesystem cache to ``server`` and pick up cached variants.

        Args:
            config_manager: object exposing ``variants`` (iterable of variant
                names) and the attributes ``DataLoader`` reads.
            server: the Flask application the cache is bound to.
        """
        self.config_manager = config_manager
        self.variants = config_manager.variants
        # Passing the app to the constructor already initialises the extension
        # with this config, so no separate init_app call is needed.
        self.cache = Cache(
            server,
            config={
                "CACHE_TYPE": "filesystem",
                "CACHE_DIR": "cache-directory",
                "CACHE_DEFAULT_TIMEOUT": 3600,  # Cache timeout of 1 hour
            },
        )
        self.variants_data = self.load_all_variants_from_cache()

    def load_all_variants_from_cache(self):
        """Return ``{variant: cached value}`` for every variant present in the cache."""
        data = {}
        for variant in self.variants:
            cached_data = self.cache.get(variant)
            # "is not None" matches the miss test used by the other methods;
            # a truthiness test would drop a cached-but-falsy entry.
            if cached_data is not None:
                data[variant] = cached_data
        return data

    def load_variant(self, variant):
        """Loads data for a specific variant, with caching.

        On a cache miss the variant is loaded from disk through ``DataLoader``,
        wrapped in ``DashboardData`` and written back to the cache.
        """
        cached_data = self.cache.get(variant)
        if cached_data is None:
            loader = DataLoader(self.config_manager, variant)
            loader.load_all()
            data = DashboardData.from_dict(loader.dashboard_data)
            self.variants_data[variant] = data
            self.cache.set(variant, data)  # Cache the newly loaded data
            return data  # Return the new data
        self.variants_data[variant] = cached_data
        return cached_data  # Return the cached data if it was already loaded

    def load_data(self):
        """Loads data for all variants using the load_variant method for consistency."""
        for variant in self.variants:
            # Delegate the loading and caching to load_variant method
            self.variants_data[variant] = self.load_variant(variant)
        return self.variants_data

    def is_data_loaded(self):
        """Checks if all variants have data loaded in the cache."""
        return all(self.cache.get(variant) is not None for variant in self.variants)

    def is_any_variant_loaded(self):
        """
        Check if any variant is loaded in the cache.

        Returns:
            bool: True if at least one variant is loaded, False otherwise.
        """
        return any(self.cache.get(variant) is not None for variant in self.variants)


In [56]:
# Location of the dashboard YAML config. Set the DASHBOARD_CONFIG_PATH
# environment variable to run the notebook on another machine; the original
# author's path is kept as the fallback.
CONFIG_PATH = Path(
    os.environ.get(
        "DASHBOARD_CONFIG_PATH",
        "/Users/ay227/Desktop/Final-Year/Thesis-Experiments/Online-Dashboard-Phase/analysis-config.yaml",
    )
)
config_manager = DashboardConfigManager(CONFIG_PATH)
dev_config = config_manager.development_config

app = Dash(__name__, suppress_callback_exceptions=True)

app_config = config_manager.app_config
server = app.server  # Flask server instance for caching
variants_data = None  # NOTE(review): not read by any visible cell

# DataManager here is the class defined in the cell above, which shadows the
# one imported from the dataManager module in the first cell.
data_manager = DataManager(config_manager, server)
# Loads (or fetches from the cache) every configured variant.
dash_data = data_manager.load_data()

2025-03-25 01:01:24 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com
2025-03-25 01:01:24 - INFO - Loading Dashboard Data from  /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/ANERCorp_CamelLab_arabertv02


  0%|          | 0/18 [00:00<?, ?it/s]

2025-03-25 01:01:28 - INFO - Loading Fine tuned Model
2025-03-25 01:01:28 - INFO - Loading Pre Trained Model
Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-03-25 01:01:29 - INFO - Loading Data
2025-03-25 01:01:29 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com
2025-03-25 01:01:29 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com
2025-03-25 01:01:29 - INFO - Tokenization Config validated successfully
2025-03-25 01:01:29 - INFO - Loading Preprocessor: aubmindlab/bert-base-arabertv02
2025-03-25 01:01:29 - INFO - Loading Tokenizer: 

  0%|          | 0/18 [00:00<?, ?it/s]

2025-03-25 01:01:44 - INFO - Loading Fine tuned Model
2025-03-25 01:02:09 - INFO - Loading Pre Trained Model


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-03-25 01:02:17 - INFO - Loading Data
2025-03-25 01:02:17 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com
2025-03-25 01:02:17 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com
2025-03-25 01:02:17 - INFO - Tokenization Config validated successfully
2025-03-25 01:02:17 - INFO - Loading Tokenizer: bert-base-cased, lower_case: False
2025-03-25 01:02:17 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com
20

In [58]:
# Show the pretrained (not fine-tuned) checkpoint stored for the ANERCorp variant.
dash_data['ANERCorp_CamelLab_arabertv02'].pretrained_model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

2025-03-25 00:38:54 - INFO - Tokenization Config validated successfully


In [16]:
# NOTE(review): `variant` is not defined in any visible cell; this relies on
# leftover kernel state and fails on Restart & Run All. Define it first.
variant.split('_')[0]

'ANERCorp'

In [35]:
# NOTE(review): `corpora_dir` is only a local inside DataLoader.load_data in the
# visible cells; at notebook level it comes from leftover kernel state.
corpora_dir

PosixPath('/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/ExperimentData')

ANERCorp_CamelLab


In [38]:
# NOTE(review): in the visible cells `data_manager` is bound to the dashboard
# DataManager, which defines no get_dataset; this cell presumably ran against an
# earlier DatasetManager binding. TODO confirm and rename to avoid the clash.
data_manager.get_dataset('train').__getitem__(0)

2025-03-25 00:44:28 - INFO - Loading Tokenizer: bert-base-cased, lower_case: False


{'input_ids': tensor([  101,   585, 19775, 21273, 28494, 28492, 28496, 19775, 28477,   113,
           573,   566,   562,   114,   562, 28490, 28495, 17754,   565, 28477,
         28480, 25717,   579, 17754, 28475, 28490, 23525,   565, 28495, 28484,
         16070, 28475, 19775, 28475, 28477,   585, 16070,   562, 28495, 26259,
         21273, 16070, 28475,   565, 26259, 28484,   565, 28495, 28475, 28496,
         28495,   562, 17754,   578, 19775, 28494, 28475, 28477,   579, 17754,
         28475, 28490, 23525,   565, 28495, 28484, 16070, 28475, 19775, 28475,
         28477,   585, 16070,   562, 28495, 26259, 21273, 16070, 28475,   568,
         28496, 28475, 28479, 15389,   583, 28475, 26259, 28475,   579, 28490,
         28476, 28475,   585, 16070,   582, 28495,   575, 28494, 28496, 18191,
           565, 28495, 28484, 28496, 28493,   565, 28495, 18191, 28475, 28481,
         28495, 16070, 23525,   592, 28475, 28495, 28486, 25717, 19775, 28475,
         28477,   592, 15389, 16070,   