In [1]:
import pandas as pd
from experiment_utils.env_setup import init
from experiment_utils.utils import FileHandler
from pathlib import Path


2024-08-30 22:28:18 - INFO - PyTorch version 2.2.2 available.


In [2]:
from experiment_utils.utils import FileHandler
from experiment_utils.env_setup import init
from pathlib import Path
from dataclasses import dataclass, field
from typing import Dict, Any, List
import yaml

@dataclass
class DevelopmentConfig:
    debug: bool = False
    port: int = 8000
    def __post_init__(self):
        if not isinstance(self.debug, bool):
            raise ValueError(f"Expected boolean for debug, got {type(self.debug).__name__}")
        if not (1 <= self.port <= 65535):
            raise ValueError("Port must be between 1 and 65535")
    @staticmethod
    def from_dict(config_dict: Dict[str, Any]):
        return DevelopmentConfig(**config_dict)


@dataclass
class TabConfig:
    tab_value: str
    tab_label: str
    
@dataclass
class AppConfig:
    tabs: List[TabConfig] = field(default_factory=list)
    variants: List[str] = field(default_factory=list)

    def __post_init__(self):
        if not all(isinstance(tab, TabConfig) for tab in self.tabs):
            raise ValueError("Tabs must be a list of TabConfig instances")
        if not all(isinstance(variant, str) for variant in self.variants):
            raise ValueError("Variants must be a list of strings")

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]):
        tabs = [TabConfig(**tab) for tab in config_dict.get('tabs', [])]
        variants = config_dict.get('variants', [])
        return AppConfig(tabs=tabs, variants=variants)

class DashboardConfigManager:
    def __init__(self, config_path: Path):
        self.config_path = config_path
        config_fh = FileHandler(config_path.parent)
        try:
            self.config = config_fh.load_yaml(config_path.name)
        except FileNotFoundError as e:
            raise FileNotFoundError(f"Configuration file not found at {config_path}") from e
        except yaml.YAMLError as e:
            raise ValueError("Error parsing YAML configuration.") from e
        except ValueError as e:
            raise ValueError("Validation error in configuration.") from e

 
    @property
    def development_config(self) -> DevelopmentConfig:
        return DevelopmentConfig.from_dict(
            self.config.get("development", {})
        )

    @property
    def app_config(self) -> AppConfig:
        return AppConfig.from_dict(
            self.config.get("dashboard", {})
        )
    @property
    def data_dir(self) -> Path:
        base_folder = init()
        return base_folder / self.config.get("dashboard", {}).get('data_dir', '')

    @property
    def data_config(self) -> Dict:
        return self.config.get("dashboard", {}).get("dashboard_data", {}).get("data", {})

    @property
    def variants(self) -> Dict:
        return self.config.get("dashboard", {}).get("variants", {})
    
    @property
    def dataset_tab(self) -> Dict:
        return self.config.get("dashboard", {}).get("dataset_tab", {})
    
    @property
    def decision_tab(self) -> Dict:
        return self.config.get("dashboard", {}).get("decision_tab", {})



In [3]:
import logging
from tqdm.autonotebook import tqdm
from experiment_utils.utils import FileHandler
import pandas as pd
from flask_caching import Cache
import numpy as np

from dataclasses import dataclass, field
from typing import Dict, Any
import plotly.graph_objects as go

@dataclass
class DashboardData:
    analysis_data: pd.DataFrame = field(default_factory=pd.DataFrame)
    train_data: pd.DataFrame = field(default_factory=pd.DataFrame)
    kmeans_results: pd.DataFrame = field(default_factory=pd.DataFrame)
    results: pd.DataFrame = field(default_factory=pd.DataFrame)
    entity_report: pd.DataFrame = field(default_factory=pd.DataFrame)
    token_report: pd.DataFrame = field(default_factory=pd.DataFrame)
    entity_confusion_data: pd.DataFrame = field(default_factory=pd.DataFrame)
    centroids_avg_similarity_matrix: pd.DataFrame = field(default_factory=pd.DataFrame)
    attention_weights_similarity_heatmap: go.Figure = field(default_factory=go.Figure)
    attention_weights_similarity_matrix: np.ndarray = field(default_factory=lambda: np.array([]))
    attention_similarity_heatmap: go.Figure = field(default_factory=go.Figure)
    attention_similarity_matrix: np.ndarray = field(default_factory=lambda: np.array([]))

    def __post_init__(self):
        # Round float columns to four decimal places
        self.round_floats(self.analysis_data)
        self.round_floats(self.kmeans_results)
        self.round_floats(self.results)

        # Convert list to string in the 'Word Pieces' column of analysis_data if it exists
        if 'Word Pieces' in self.analysis_data.columns:
            self.analysis_data['Word Pieces'] = self.analysis_data['Word Pieces'].apply(
                lambda x: ', '.join(x) if isinstance(x, list) else x
            )
        
        self.analysis_data['Consistency Ratio'] = np.where(
            self.analysis_data['Total Train Occurrences'] != 0,  # Condition to check for non-zero denominator
            self.analysis_data['Consistency Count'] / self.analysis_data['Total Train Occurrences'],  # Normal calculation if denominator is not zero
            0
        )

        self.analysis_data['Inconsistency Ratio'] = np.where(
            self.analysis_data['Total Train Occurrences'] != 0,
            self.analysis_data['Inconsistency Count'] / self.analysis_data['Total Train Occurrences'],
            0
        )
        self.analysis_data['Normalized Token Entropy'] = DashboardData.normalized_entropy(self.analysis_data, 'Local Token Entropy', 'Token Max Entropy')  # filling 0/0 division as it generates Nan
        self.analysis_data['Normalized Word Entropy'] = DashboardData.normalized_entropy(self.analysis_data, 'Local Token Entropy', 'Token Max Entropy')  # filling 0/0 division as it generates Nan
        self.analysis_data['Normalized Prediction Entropy'] = DashboardData.normalized_entropy(self.analysis_data, 'Prediction Entropy', 'Prediction Max Entropy')  # filling 0/0 division as it generates Nan
    
    def is_loaded(self, attribute):
        """Checks if the given attribute is loaded based on its type."""
        attr_value = getattr(self, attribute)
        if isinstance(attr_value, pd.DataFrame):
            return not attr_value.empty
        elif isinstance(attr_value, go.Figure):
            return len(attr_value.data) > 0  # Check if the figure has data
        return False  # Default case if the attribute type is unrecognized

    @staticmethod
    def round_floats(df):
        for col in df.select_dtypes(include=['float']).columns:
            df[col] = df[col].round(4)
            
    @staticmethod
    def from_dict(dict_data: Dict[str, Any]):
        return DashboardData(**dict_data)
    
    @staticmethod
    def normalized_entropy(df, raw_entropy, max_entropy):
        result = np.full(df.shape[0], np.nan)
        zero_mask = (df[raw_entropy] == 0) & (df[max_entropy] == 0)
        result[zero_mask] = 0
        negative_one_mask = (df[raw_entropy] == -1) & (df[max_entropy] == -1)
        result[negative_one_mask] = -1
        valid_mask = (df[max_entropy] != 0) & ~zero_mask & ~negative_one_mask
        result[valid_mask] = df[raw_entropy][valid_mask] / df[max_entropy][valid_mask]
        zero_div_mask = (df[max_entropy] == 0) & (df[raw_entropy] != 0)
        result[zero_div_mask] = 0
        return result

class DataLoader:
    def __init__(self, config_manager, variant_name):
        self.data_config = config_manager.data_config
        self.data_dir = config_manager.data_dir / variant_name
        self.dashboard_data = {}

    def load(self, file_name, file_config):
        file_handler = FileHandler(self.data_dir / file_config['folder'])
        file_type = file_config["format"]
        file_path = file_handler.file_path / f"{file_name}.{file_type}"

        try:
            if file_path.exists():
                # Load Plotly figures specifically
                if file_type == "npy":
                    return file_handler.load_numpy(file_path.with_suffix('.npy'))

                # Handle regular JSON data files
                elif file_type == "json":
                    data = file_handler.read_json(file_path)
                    if "column_mappings" in file_config and file_config["column_mappings"]:
                        data = self.apply_column_mappings(data, file_config["column_mappings"])
                    return data
            else:
                logging.warning("File does not exist: %s", file_path)
        except Exception as e:
            logging.error("Failed to load data from %s: %s", file_path, e)
            return None
        
    def apply_column_mappings(self, data: pd.DataFrame, column_mappings: dict) -> pd.DataFrame:
        """ Rename columns in the DataFrame based on provided mappings. """
        return data.rename(columns=column_mappings)


    def load_all(self):
        
        logging.info("Loading Dashboard Data from  %s", self.data_dir)
        for file_name, file_config in tqdm(self.data_config.items()):
            self.dashboard_data[file_name] = self.load(file_name, file_config)
            
            


class DataManager:
    def __init__(self, config_manager, server) -> None:
        self.config_manager = config_manager
        self.variants = config_manager.variants
        self.cache = Cache(server, config={
            'CACHE_TYPE': 'filesystem',
            'CACHE_DIR': 'cache-directory',
            'CACHE_DEFAULT_TIMEOUT': 3600  # Cache timeout of 1 hour
        })
        self.cache.init_app(server)
        self.variants_data = self.load_all_variants_from_cache()
    
    def load_all_variants_from_cache(self):
        data = {}
        for variant in self.variants:
            cached_data = self.cache.get(variant)
            if cached_data:
                data[variant] = cached_data
        return data

    
    def load_variant(self, variant):
        """Loads data for a specific variant, with caching."""
        cached_data = self.cache.get(variant)
        if cached_data is None:
            loader = DataLoader(self.config_manager, variant)
            loader.load_all()
            data = DashboardData.from_dict(loader.dashboard_data)
            self.variants_data[variant] = data
            self.cache.set(variant, data)  # Cache the newly loaded data
            return data  # Return the new data
        self.variants_data[variant] = cached_data
        return cached_data  # Return the cached data if it was already loaded
    
    def load_data(self):
        """Loads data for all variants using the load_variant method for consistency."""
        for variant in self.variants:
            # Delegate the loading and caching to load_variant method
            self.variants_data[variant] = self.load_variant(variant)
        return self.variants_data

    def is_data_loaded(self):
        """Checks if all variants have data loaded in the cache."""
        for variant in self.variants:
            if self.cache.get(variant) is None:
                return False  # Return False if any variant is not loaded
        return True  # Return True if all variants are loaded
    
    def is_any_variant_loaded(self):
        """
        Check if any variant is loaded in the cache.

        Returns:
            bool: True if at least one variant is loaded, False otherwise.
        """
        for variant in self.variants:
            if self.cache.get(variant) is not None:
                return True  # Return True if any variant is loaded
        return False  # Return False if no variants are loaded



In [4]:
CONFIG_PATH = Path("/Users/ay227/Desktop/Final-Year/Thesis-Experiments/Online-Dashboard-Phase/dashboard-config.yaml")
config_manager = DashboardConfigManager(CONFIG_PATH)
dev_config = config_manager.development_config    

In [5]:

from dash import Dash, dcc, html, Output, Input, State
app = Dash(__name__, suppress_callback_exceptions=True)

app_config = config_manager.app_config
server = app.server  # Flask server instance for caching
variants_data = None

data_manager = DataManager(config_manager, server)

In [6]:
data_manager.load_data()

2024-08-30 22:28:37 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com
2024-08-30 22:28:37 - INFO - Loading Dashboard Data from  /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/DashboardTest-v0/ANERCorp_CamelLab_arabertv02


  0%|          | 0/12 [00:00<?, ?it/s]

2024-08-30 22:28:40 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com
2024-08-30 22:28:40 - INFO - Loading Dashboard Data from  /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/DashboardTest-v0/conll2003_bert


  0%|          | 0/12 [00:00<?, ?it/s]



{'ANERCorp_CamelLab_arabertv02': DashboardData(analysis_data=       Sentence Ids  Token Positions     Words    Tokens Word Pieces  \
 0                 0                0     [CLS]     [CLS]       [CLS]   
 1                 0                1  الصالحية  الصالحية    الصالحية   
 2                 0                2    المفرق    المفرق      المفرق   
 3                 0                3         -         -           -   
 4                 0                4       غيث       غيث         غيث   
 ...             ...              ...       ...       ...         ...   
 29706           960               10    للوليد     ##ليد  للو, ##ليد   
 29707           960               11        بن        بن          بن   
 29708           960               12      طلال      طلال        طلال   
 29709           960               13         .         .           .   
 29710           960               14     [SEP]     [SEP]       [SEP]   
 
       Core Tokens True Labels Token Selector Id Pred Labels  

In [9]:
isinstance(data_manager.variants_data['ANERCorp_CamelLab_arabertv02'].attention_similarity_matrix, np.ndarray)

True

In [93]:
df = data_manager.variants_data['ANERCorp_CamelLab_arabertv02'].analysis_data

In [80]:
df.columns

Index(['Sentence Ids', 'Token Positions', 'Words', 'Tokens', 'Word Pieces',
       'Core Tokens', 'True Labels', 'Token Selector Id', 'Pred Labels',
       'Agreements', 'X', 'Y', 'Labels', 'Losses', 'Token Ids', 'Global Id',
       'True Token Score', 'Pred Token Score', 'K=3', 'Boundary Clusters',
       'K=4', 'Entity Clusters', 'K=9', 'Token Clusters', 'Consistency Count',
       'Inconsistency Count', 'Total Train Occurrences', 'Local Token Entropy',
       'Token Max Entropy', 'Dataset Token Entropy', 'Local Word Entropy',
       'Word Max Entropy', 'Dataset Word Entropy', 'Tokenization Rate',
       'TR Entity', 'PR Entity', 'Error Type', 'O Confidence',
       'B-PERS Confidence', 'I-PERS Confidence', 'B-ORG Confidence',
       'I-ORG Confidence', 'B-LOC Confidence', 'I-LOC Confidence',
       'B-MISC Confidence', 'I-MISC Confidence', 'Prediction Entropy',
       'Prediction Max Entropy', 'Token Confidence', 'Variability', 'Pre X',
       'Pre Y', 'Normalized Token Entropy', 'N

In [81]:
df['Word Pieces'].apply(
                lambda x: ', '.join(x) if isinstance(x, list) else x
            )

0             [CLS]
1          الصالحية
2            المفرق
3                 -
4               غيث
            ...    
29706    للو, ##ليد
29707            بن
29708          طلال
29709             .
29710         [SEP]
Name: Word Pieces, Length: 29711, dtype: object

In [75]:
# Example DataFrame
data = {
    'Raw Entropy': [0, -1, 5, 2.5, 3, 7],
    'Max Entropy': [0, -1, 10, 4, 0, 0]  # Including zero and non-zero cases
}

df = pd.DataFrame(data)

# Apply the function
df['Normalized Entropy'] = normalized_entropy(df, 'Raw Entropy', 'Max Entropy')

print(df)

   Raw Entropy  Max Entropy  Normalized Entropy
0          0.0            0               0.000
1         -1.0           -1              -1.000
2          5.0           10               0.500
3          2.5            4               0.625
4          3.0            0                 inf
5          7.0            0                 inf


In [62]:
df['Inconsistency Ratio'].nunique()

334

In [74]:
df[df['Token Max Entropy']==0]['Local Token Entropy'].isna().value_counts()

Local Token Entropy
False    16035
Name: count, dtype: int64