In [1]:
import sys
import os
from pathlib import Path
# This appends the directory one level up (the root of your project) to the sys.path.
# Modify the path depending on the location of modules you want to import.
sys.path.append(os.path.abspath('../../'))

from config.config_managers import DashboardConfigManager
from dataManager import DataManager
from dash import Dash
import pandas as pd
import plotly.express as px
from abc import ABC, abstractmethod
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

2025-05-13 06:36:16 - INFO - PyTorch version 2.2.2 available.


In [2]:
# Location of the dashboard configuration file.
# Set the DASHBOARD_CONFIG_PATH environment variable to point the notebook at a
# different file (e.g. on another machine); when it is unset the previous
# hard-coded location is used, so existing setups behave exactly as before.
CONFIG_PATH = Path(
    os.environ.get(
        "DASHBOARD_CONFIG_PATH",
        "/Users/ahmed/Desktop/Dashboard/analysis-config.yaml",
    )
)
config_manager = DashboardConfigManager(CONFIG_PATH)
dev_config = config_manager.development_config

# The Dash app is only instantiated to obtain its Flask server, which the
# DataManager receives below.
app = Dash(__name__, suppress_callback_exceptions=True)

app_config = config_manager.app_config
server = app.server  # Flask server instance for caching
variants_data = None

# Load every configured variant; the analysis classes below expect a mapping of
# variant name -> object exposing an ``analysis_data`` DataFrame.
data_manager = DataManager(config_manager, server)
dash_data = data_manager.load_data()

In [3]:
import pandas as pd
import plotly.express as px

class BaseAnalysis:
    """Shared data preparation, aggregation and plotting for variant analyses.

    The per-variant frames found in ``dash_data`` are stacked into a single
    DataFrame carrying a ``Language`` column. Subclasses summarise their own
    metric columns with ``compute_summary`` / ``format_long_data`` and render
    the result with one of the ``plot_*`` methods.
    """

    # Maps a variant key of ``dash_data`` to the language label used in plots.
    VARIANT_MAPPING = {
        "ANERCorp_CamelLab_arabertv02": "Arabic",
        "conll2003_bert": "English"
    }

    # ``True Labels`` values whose rows are excluded by ``filter_and_standardize``.
    SPECIAL_LABELS = ("[SEP]", "[CLS]", "IGNORED")
    # Alternative tag spellings and the spelling they are normalised to.
    LABEL_ALIASES = {"B-PERS": "B-PER", "I-PERS": "I-PER"}
    # Column-name suffixes produced by ``compute_summary`` for the two aggregates.
    _MEAN_SUFFIX = " mean"
    _STD_SUFFIX = " std"

    def __init__(self, dash_data):
        """Store ``dash_data`` and build the combined analysis DataFrame.

        Args:
            dash_data: Mapping of variant name to an object exposing an
                ``analysis_data`` DataFrame.
        """
        self.dash_data = dash_data
        self.analysis_df = self._prepare_data()

    def _prepare_data(self):
        """Stack every variant's ``analysis_data`` into one DataFrame.

        Each frame is copied (the source data is never mutated) and tagged with
        a ``Language`` column taken from ``VARIANT_MAPPING``; unmapped variants
        keep their own key as the label.

        Raises:
            ValueError: If ``dash_data`` contains no variants.
        """
        frames = []
        for data_name, data_content in self.dash_data.items():
            frame = data_content.analysis_data.copy()
            frame["Language"] = self.VARIANT_MAPPING.get(data_name, data_name)
            frames.append(frame)
        if not frames:
            # pd.concat would raise the same exception type with a vaguer message.
            raise ValueError("dash_data contains no variants to analyse.")
        return pd.concat(frames)

    def filter_and_standardize(self, df, variables):
        """Drop special-token rows and normalise entity tag spellings.

        Args:
            df: Frame with at least a ``True Labels`` column.
            variables: Metric columns of interest. Currently unused; kept so
                existing callers do not need to change.

        Returns:
            A new DataFrame (``df`` is left untouched) without rows whose
            ``True Labels`` is one of ``SPECIAL_LABELS`` and with
            ``LABEL_ALIASES`` applied to ``True Labels`` and, when present,
            ``Pred Labels``.
        """
        keep = ~df["True Labels"].isin(list(self.SPECIAL_LABELS))
        cleaned = df[keep].copy()

        # ``Pred Labels`` is optional: analyses that never look at predictions
        # should not fail just because the column is absent.
        for column in ("True Labels", "Pred Labels"):
            if column in cleaned.columns:
                cleaned[column] = cleaned[column].replace(self.LABEL_ALIASES)

        return cleaned

    def compute_summary(self, df, variables, groupby_cols):
        """Aggregate ``variables`` to mean and standard deviation per group.

        Returns:
            A flat DataFrame with the ``groupby_cols`` followed by
            ``"<variable> mean"`` and ``"<variable> std"`` columns.
        """
        summary = df.groupby(groupby_cols)[variables].agg(["mean", "std"]).reset_index()
        # reset_index leaves (name, "") tuples for the group keys and
        # (variable, aggregate) tuples for the metrics; join them into strings.
        summary.columns = [
            ' '.join(col).strip() if isinstance(col, tuple) else col
            for col in summary.columns
        ]
        return summary

    def format_long_data(self, summary_df, groupby_cols, metric_label):
        """Reshape a ``compute_summary`` result into long format for plotting.

        Args:
            summary_df: Frame with ``"<metric> mean"`` / ``"<metric> std"``
                columns plus identifier columns.
            groupby_cols: Identifier columns used (together with the metric
                name) to pair each mean with its standard deviation.
            metric_label: Name of the output column holding the metric name.

        Returns:
            One row per identifier combination and metric with the columns
            ``Mean Value``, ``Std Dev`` (both rounded to 3 decimals) and
            ``Text Label`` (the bar annotation).
        """
        mean_suffix, std_suffix = self._MEAN_SUFFIX, self._STD_SUFFIX

        # Match on the suffix rather than on a substring so that metric names
        # which themselves contain "mean" are classified correctly.
        mean_cols = [
            col for col in summary_df.columns
            if isinstance(col, str) and col.endswith(mean_suffix)
        ]
        std_cols = [col[:-len(mean_suffix)] + std_suffix for col in mean_cols]
        stat_cols = set(mean_cols) | set(std_cols)
        id_cols = [col for col in summary_df.columns if col not in stat_cols]

        mean_long = summary_df.melt(
            id_vars=id_cols,
            value_vars=mean_cols,
            var_name=metric_label,
            value_name="Mean Value"
        )
        std_long = summary_df.melt(
            id_vars=id_cols,
            value_vars=std_cols,
            var_name=metric_label,
            value_name="Std Dev"
        )

        # Strip the aggregate suffix so both frames share the bare metric name.
        mean_long[metric_label] = mean_long[metric_label].str[:-len(mean_suffix)]
        std_long[metric_label] = std_long[metric_label].str[:-len(std_suffix)]

        summary_long = mean_long.merge(
            std_long, on=list(groupby_cols) + [metric_label], how="left"
        )

        # Round values for readability
        summary_long["Mean Value"] = summary_long["Mean Value"].round(3)
        summary_long["Std Dev"] = summary_long["Std Dev"].round(3)

        # Bar annotation: mean, a plus/minus sign and the standard deviation,
        # separated by HTML line breaks.
        summary_long["Text Label"] = [
            f"{mean} <br>±<br>{std}"
            for mean, std in zip(summary_long["Mean Value"], summary_long["Std Dev"])
        ]

        return summary_long

    def _grouped_bar(self, summary_long_df, x, metric_label, title, score_label, **bar_kwargs):
        """Build the grouped bar figure shared by all ``plot_*`` methods.

        Bars show ``Mean Value`` coloured by ``Language`` and annotated with
        ``Text Label``; ``bar_kwargs`` are forwarded to ``px.bar`` (facets,
        height, spacing). The figure is returned without being shown.
        """
        fig = px.bar(
            summary_long_df,
            x=x,
            y="Mean Value",
            color="Language",
            barmode="group",
            text="Text Label",
            title=title,
            labels={"Mean Value": score_label, metric_label: "Metrics"},
            **bar_kwargs,
        )
        fig.update_traces(textfont=dict(size=12))
        fig.update_layout(template="plotly_white", showlegend=True)
        return fig

    def plot_high_level(self, summary_long_df, metric_label, title):
        """Show one bar group per metric, coloured by language."""
        fig = self._grouped_bar(summary_long_df, metric_label, metric_label, title, "Score")
        fig.show()

    def plot_confidence(self, summary_long_df, metric_label, title, height=600):
        """Show one bar group per metric, with one facet row per ``Type`` value.

        Args:
            height: Figure height in pixels (defaults to the previous fixed 600).
        """
        fig = self._grouped_bar(
            summary_long_df, metric_label, metric_label, title, "Average Score",
            facet_row="Type",
            facet_row_spacing=0.15,
        )
        fig.update_layout(height=height, margin=dict(t=50, b=50, l=50, r=50))
        fig.show()

    def plot_token_confidence(self, summary_long_df, metric_label, title, height=600):
        """Show one bar group per true label, with one facet row per ``Type`` value.

        Args:
            height: Figure height in pixels (defaults to the previous fixed 600).
        """
        fig = self._grouped_bar(
            summary_long_df, "True Labels", metric_label, title, "Average Score",
            facet_row="Type",
            facet_row_spacing=0.15,
        )
        fig.update_layout(height=height, margin=dict(t=50, b=50, l=50, r=50))
        fig.show()

    def plot_entity_level(self, summary_long_df, metric_label, title, height):
        """Show one bar group per true label, with one facet row per metric."""
        fig = self._grouped_bar(
            summary_long_df, "True Labels", metric_label, title, "Average Score",
            facet_row=metric_label,
            height=height,
            facet_row_spacing=0.0001,
        )
        fig.update_layout(margin=dict(t=60, b=60, l=80, r=80))
        # Metrics can live on very different scales, so give every facet row
        # its own independent, auto-ranged y-axis.
        fig.for_each_yaxis(lambda yaxis: yaxis.update(matches=None, autorange=True))
        fig.show()


In [4]:

class AmbiguityAnalysis(BaseAnalysis):
    """Entropy-based ambiguity metrics compared across languages and entity tags."""

    VARIABLES = {
        "Token Level": ["Dataset Token Entropy", "Normalized Token Entropy"],
        "Word Level": ["Dataset Word Entropy", "Normalized Word Entropy"]
    }

    def _language_overview(self, level, title):
        """Summarise the ``level`` metrics per language and show the bar chart."""
        metrics = self.VARIABLES[level]
        rows = self.analysis_df.copy()

        # -1 is the OOV entropy marker: keep only rows where no metric carries it.
        rows = rows[rows[metrics].ne(-1).all(axis=1)]

        per_language = self.compute_summary(rows, metrics, groupby_cols=["Language"])
        long_form = self.format_long_data(per_language, ["Language"], "Ambiguity Metrics")
        self.plot_high_level(long_form, "Ambiguity Metrics", title)

    def _entity_breakdown(self, level, title, height):
        """Summarise the ``level`` metrics per language and true label and plot them."""
        metrics = self.VARIABLES[level]
        rows = self.filter_and_standardize(self.analysis_df, metrics)

        # -1 is the OOV entropy marker: keep only rows where no metric carries it.
        rows = rows[rows[metrics].ne(-1).all(axis=1)]

        per_entity = self.compute_summary(rows, metrics, groupby_cols=["Language", "True Labels"])
        long_form = self.format_long_data(per_entity, ["Language", "True Labels"], "Ambiguity Metrics")
        self.plot_entity_level(long_form, "Ambiguity Metrics", title, height)

    def analyze_token_high_level(self):
        """Plot token-level entropy metrics aggregated per language."""
        self._language_overview("Token Level", "High-Level Token Ambiguity Analysis")

    def analyze_word_high_level(self):
        """Plot word-level entropy metrics aggregated per language."""
        self._language_overview("Word Level", "High-Level Word Ambiguity Analysis")

    def analyze_token_entity_level(self, height=800):
        """Plot token-level entropy metrics per language and entity tag."""
        self._entity_breakdown("Token Level", "Entity-Level Token Ambiguity Analysis", height)

    def analyze_word_entity_level(self, height=800):
        """Plot word-level entropy metrics per language and entity tag."""
        self._entity_breakdown("Word Level", "Entity-Level Word Ambiguity Analysis", height)




# class ConsistencyAnalysis(BaseAnalysis):
#     """Class for analyzing consistency metrics across model variants."""

#     VARIABLES = {
#         "Consistency Variables": ["Consistency Ratio", "Inconsistency Ratio"],
#         "Consistency Absolute": ["Consistency Count", "Inconsistency Count"],
#         "OOV": ["Dataset Token Entropy", "Normalized Token Entropy"]
#     }

#     def analyze_high_level(self):
#         """Perform high-level consistency analysis across languages."""
#         variables = self.VARIABLES["Consistency Variables"]
#         oov_variables = self.VARIABLES["OOV"]
#         df = self.analysis_df.copy()
#         print(df.columns)
#         df = df[(df[oov_variables] != -1).all(axis=1)]
#         summary = self.compute_summary(df, variables, groupby_cols=["Language"])
#         summary_long = self.format_long_data(summary, ["Language"], "Consistency Metrics")
#         self.plot_high_level(summary_long, "Consistency Metrics", "High-Level Consistency Analysis")

#     def analyze_entity_level(self,  height=800):
#         """Perform entity-level consistency analysis."""
#         variables = self.VARIABLES["Consistency Variables"]
#         oov_variables = self.VARIABLES["OOV"]
#         df = self.filter_and_standardize(self.analysis_df, variables)
#         df = df[(df[oov_variables] != -1).all(axis=1)]
#         summary = self.compute_summary(df, variables, groupby_cols=["Language", "True Labels"])
#         summary_long = self.format_long_data(summary, ["Language", "True Labels"], "Consistency Metrics")
#         self.plot_entity_level(summary_long, "Consistency Metrics", "Entity-Level Consistency Analysis", height)



class ConsistencyAnalysis(BaseAnalysis):
    """Consistency / inconsistency metrics compared across languages and entity tags.

    Every analysis works on the combined data with out-of-vocabulary rows
    removed (rows where an ``OOV`` entropy column equals -1).
    """

    VARIABLES = {
        "Consistency Variables": ["Consistency Ratio", "Inconsistency Ratio"],
        "Consistency Absolute": ["Consistency Count", "Inconsistency Count"],
        "OOV": ["Dataset Token Entropy", "Normalized Token Entropy"]
    }

    def _without_oov(self):
        """Return a copy of the combined data without out-of-vocabulary rows.

        A row is dropped when any of the ``OOV`` entropy columns equals -1.
        """
        df = self.analysis_df.copy()
        oov_variables = self.VARIABLES["OOV"]
        return df[(df[oov_variables] != -1).all(axis=1)]

    def analyze_high_level(self):
        """Plot the consistency ratios aggregated per language."""
        variables = self.VARIABLES["Consistency Variables"]
        df = self._without_oov()
        summary = self.compute_summary(df, variables, groupby_cols=["Language"])
        summary_long = self.format_long_data(summary, ["Language"], "Consistency Metrics")
        self.plot_high_level(summary_long, "Consistency Metrics", "High-Level Consistency Analysis")

    def analyze_entity_level(self, height=800):
        """Plot the consistency ratios per language and entity tag.

        Args:
            height: Figure height in pixels.
        """
        variables = self.VARIABLES["Consistency Variables"]
        df = self.filter_and_standardize(self._without_oov(), variables)

        summary = self.compute_summary(df, variables, groupby_cols=["Language", "True Labels"])
        summary_long = self.format_long_data(summary, ["Language", "True Labels"], "Consistency Metrics")
        self.plot_entity_level(summary_long, "Consistency Metrics", "Entity-Level Consistency Analysis", height)

    def generate_high_level_table(self):
        """Display mean/std of the absolute consistency counts per language."""
        variables = self.VARIABLES["Consistency Absolute"]
        summary = self.compute_summary(self._without_oov(), variables, groupby_cols=["Language"])
        # ``display`` is the IPython rich-display function available in notebooks.
        display(summary.round(2))

    def generate_entity_level_table(self):
        """Display mean/std of the absolute consistency counts per language and entity tag."""
        variables = self.VARIABLES["Consistency Absolute"]
        # Standardise the OOV-filtered frame. Passing ``self.analysis_df`` here
        # instead would silently bring the out-of-vocabulary rows back and make
        # the table disagree with ``analyze_entity_level``.
        df = self.filter_and_standardize(self._without_oov(), variables)
        summary = self.compute_summary(df, variables, groupby_cols=["Language", "True Labels"])
        display(summary.round(2))


class LossAnalysis(BaseAnalysis):
    """Loss values compared across languages and entity tags."""

    VARIABLES = {
        "Loss Metrics": ["Losses"]
    }

    def analyze_high_level(self):
        """Plot mean and standard deviation of the loss per language."""
        loss_columns = self.VARIABLES["Loss Metrics"]
        per_language = self.compute_summary(
            self.analysis_df.copy(), loss_columns, groupby_cols=["Language"]
        )
        self.plot_high_level(
            self.format_long_data(per_language, ["Language"], "Loss Metrics"),
            "Loss Metrics",
            "High-Level Loss Analysis",
        )

    def analyze_entity_level(self, height=500):
        """Plot mean and standard deviation of the loss per language and entity tag."""
        loss_columns = self.VARIABLES["Loss Metrics"]
        tagged_rows = self.filter_and_standardize(self.analysis_df, loss_columns)
        per_entity = self.compute_summary(
            tagged_rows, loss_columns, groupby_cols=["Language", "True Labels"]
        )
        self.plot_entity_level(
            self.format_long_data(per_entity, ["Language", "True Labels"], "Loss Metrics"),
            "Loss Metrics",
            "Entity-Level Loss Analysis",
            height,
        )
        


class TokenizationAnalysis(BaseAnalysis):
    """Tokenization rate compared across languages and entity tags."""

    VARIABLES = {
        "Tokenization Metrics": ["Tokenization Rate"]
    }

    def analyze_high_level(self):
        """Plot mean and standard deviation of the tokenization rate per language."""
        rate_columns = self.VARIABLES["Tokenization Metrics"]
        all_rows = self.analysis_df.copy()

        by_language = self.compute_summary(all_rows, rate_columns, groupby_cols=["Language"])
        chart_data = self.format_long_data(by_language, ["Language"], "Tokenization Metrics")
        self.plot_high_level(
            chart_data, "Tokenization Metrics", "High-Level Tokenization Analysis"
        )

    def analyze_entity_level(self, height=500):
        """Plot mean and standard deviation of the tokenization rate per entity tag."""
        rate_columns = self.VARIABLES["Tokenization Metrics"]
        tagged_rows = self.filter_and_standardize(self.analysis_df, rate_columns)

        by_entity = self.compute_summary(
            tagged_rows, rate_columns, groupby_cols=["Language", "True Labels"]
        )
        chart_data = self.format_long_data(
            by_entity, ["Language", "True Labels"], "Tokenization Metrics"
        )
        self.plot_entity_level(
            chart_data, "Tokenization Metrics", "Entity-Level Tokenization Analysis", height
        )
        


class PredictionAnalysis(BaseAnalysis):
    """Prediction variability and entropy compared across languages and entity tags."""

    VARIABLES = {
        "Prediction Metrics": ["Variability", "Normalized Prediction Entropy"]
    }

    def _entity_level_for(self, agreed, title, height):
        """Plot the prediction metrics per entity tag for one agreement outcome.

        ``agreed`` selects the rows whose ``Agreements`` value equals it
        (True for the correct-prediction view, False for the error view).
        """
        metrics = self.VARIABLES["Prediction Metrics"]
        rows = self.analysis_df.copy()
        rows = rows[rows['Agreements'].eq(agreed)]
        rows = self.filter_and_standardize(rows, metrics)

        per_entity = self.compute_summary(rows, metrics, groupby_cols=["Language", "True Labels"])
        long_form = self.format_long_data(per_entity, ["Language", "True Labels"], "Prediction Metrics")
        self.plot_entity_level(long_form, "Prediction Metrics", title, height)

    def analyze_high_level(self):
        """Plot the prediction metrics aggregated per language."""
        metrics = self.VARIABLES["Prediction Metrics"]
        rows = self.analysis_df.copy()

        per_language = self.compute_summary(rows, metrics, groupby_cols=["Language"])
        long_form = self.format_long_data(per_language, ["Language"], "Prediction Metrics")
        self.plot_high_level(long_form, "Prediction Metrics", "High-Level Prediction Distribution Analysis")

    def analyze_entity_level_correct(self, height=800):
        """Plot the prediction metrics per entity tag for rows with ``Agreements == True``."""
        self._entity_level_for(
            True, "Entity-Level Correct Prediction Distribution Analysis", height
        )

    def analyze_entity_level_error(self, height=800):
        """Plot the prediction metrics per entity tag for rows with ``Agreements == False``."""
        self._entity_level_for(
            False, "Entity-Level Incorrect Prediction Distribution Analysis", height
        )

class SilhouetteAnalysis(BaseAnalysis):
    """Silhouette scores (true and predicted) compared across languages and entity tags."""

    VARIABLES = {
        "Silhouette Scores": ["True Silhouette Score", "Pred Silhouette Score"]
    }

    def analyze_high_level(self):
        """Plot mean and standard deviation of both silhouette scores per language."""
        score_columns = self.VARIABLES["Silhouette Scores"]
        by_language = self.compute_summary(
            self.analysis_df.copy(), score_columns, groupby_cols=["Language"]
        )
        chart_data = self.format_long_data(by_language, ["Language"], "Silhouette Scores")
        self.plot_high_level(
            chart_data, "Silhouette Scores", "High-Level Silhouette Score Analysis"
        )

    def analyze_entity_level(self, height=800):
        """Plot mean and standard deviation of both silhouette scores per entity tag."""
        score_columns = self.VARIABLES["Silhouette Scores"]
        tagged_rows = self.filter_and_standardize(self.analysis_df, score_columns)
        by_entity = self.compute_summary(
            tagged_rows, score_columns, groupby_cols=["Language", "True Labels"]
        )
        chart_data = self.format_long_data(
            by_entity, ["Language", "True Labels"], "Silhouette Scores"
        )
        self.plot_entity_level(
            chart_data, "Silhouette Scores", "Entity-Level Silhouette Score Analysis", height
        )


class ConfidenceAnalysis(BaseAnalysis):
    """Confidence-score analyses, split by prediction correctness and entity tag.

    "Correct" and "error" views select rows by the boolean ``Agreements``
    column (True / False respectively).
    """

    VARIABLES = {
        "High-Level Confidence": [
            'O Confidence',
            'B-LOC Confidence', 'I-LOC Confidence',
            'B-PER Confidence', 'I-PER Confidence',
            'B-ORG Confidence', 'I-ORG Confidence',
            'B-MISC Confidence', 'I-MISC Confidence'
        ],
        "Entity-Level Confidence": ['Token Confidence']
    }

    # ------------------------------------------------------------------ helpers

    def _by_agreement(self, agreed=None):
        """Return a copy of the data, optionally limited to one agreement outcome.

        Args:
            agreed: ``None`` keeps every row; ``True`` / ``False`` keeps the rows
                whose ``Agreements`` value equals it.
        """
        df = self.analysis_df.copy()
        if agreed is None:
            return df
        return df[df['Agreements'] == agreed]

    def _plot_high_level_confidence(self, df, title):
        """Summarise the per-tag confidence columns per language and plot them."""
        variables = self.VARIABLES["High-Level Confidence"]
        summary = self.compute_summary(df, variables, groupby_cols=["Language"])
        summary_long = self.format_long_data(summary, ["Language"], "Confidence Metrics")
        self.plot_high_level(summary_long, "Confidence Metrics", title)

    def _plot_entity_level_confidence(self, agreed, title, height):
        """Summarise ``Token Confidence`` per language and true label and plot it."""
        variables = self.VARIABLES["Entity-Level Confidence"]
        df = self.filter_and_standardize(self._by_agreement(agreed), variables)

        summary = self.compute_summary(df, variables, groupby_cols=["Language", "True Labels"])
        summary_long = self.format_long_data(summary, ["Language", "True Labels"], "Token Confidence")
        self.plot_entity_level(summary_long, "Token Confidence", title, height)

    def _correct_vs_error_summary(self, variables, groupby_cols):
        """Stack the summaries of incorrect and correct predictions.

        Both subsets are cleaned with ``filter_and_standardize`` and summarised
        separately; a ``Type`` column ("Error" / "Correct") tells them apart.
        """
        parts = []
        for outcome, agreed in (("Error", False), ("Correct", True)):
            subset = self.filter_and_standardize(self._by_agreement(agreed), variables)
            part = self.compute_summary(subset, variables, groupby_cols=groupby_cols)
            part["Type"] = outcome
            parts.append(part)
        return pd.concat(parts, ignore_index=True)

    # --------------------------------------------------------- high-level views

    def analyze_high_level(self):
        """Plot the per-tag confidence columns per language, using every row."""
        self._plot_high_level_confidence(
            self._by_agreement(), "High-Level Confidence Analysis"
        )

    def analyze_high_level_error(self):
        """Plot the per-tag confidence columns per language for incorrect predictions."""
        self._plot_high_level_confidence(
            self._by_agreement(False), "High-Level Error Prediction Confidence Analysis"
        )

    def analyze_high_level_correct(self):
        """Plot the per-tag confidence columns per language for correct predictions."""
        self._plot_high_level_confidence(
            self._by_agreement(True), "High-Level Correct Prediction Confidence Analysis"
        )

    def analyze_high_level_summary(self):
        """Compare per-tag confidence of correct and incorrect predictions in one figure."""
        variables = self.VARIABLES["High-Level Confidence"]
        summary = self._correct_vs_error_summary(variables, ["Language"])

        summary_long = self.format_long_data(summary, ["Language", "Type"], "Confidence Metrics")
        self.plot_confidence(
            summary_long,
            "Confidence Metrics",
            "Comparison of Confidence Scores Across Entity Tags in Correct and Incorrect Predictions",
        )

    # -------------------------------------------------------- entity-level views

    def analyze_entity_level_error(self, height=500):
        """Plot ``Token Confidence`` per entity tag for incorrect predictions.

        Args:
            height: Figure height in pixels.
        """
        self._plot_entity_level_confidence(
            False, "Entity-Level Error Prediction Confidence Analysis", height
        )

    def analyze_entity_level_correct(self, height=500):
        """Plot ``Token Confidence`` per entity tag for correct predictions.

        Args:
            height: Figure height in pixels.
        """
        self._plot_entity_level_confidence(
            True, "Entity-Level Correct Prediction Confidence Analysis", height
        )

    def analyze_entity_level_summary(self, height=500):
        """Compare ``Token Confidence`` of correct and incorrect predictions per entity tag.

        Args:
            height: Accepted for interface compatibility but currently not
                applied; the figure height is decided by
                ``plot_token_confidence``.
        """
        variables = self.VARIABLES["Entity-Level Confidence"]
        summary = self._correct_vs_error_summary(variables, ["Language", "True Labels"])

        summary_long = self.format_long_data(
            summary, ["Language", "True Labels", "Type"], "Token Confidence"
        )
        self.plot_token_confidence(
            summary_long,
            "Token Confidence",
            "Comparison of Token Confidence Score in Correct and Incorrect Predictions",
        )

    def detailed_entity_level_errors(self):
        """Show one misclassification heatmap (true tag x predicted tag) per language.

        Each cell holds the summed ``Token Confidence`` of the misclassified
        tokens with that (true, predicted) tag pair, i.e. the mean confidence
        weighted by how often the confusion occurs. Combinations that never
        occur are shown as 0. All heatmaps share one colour scale.
        """
        variables = self.VARIABLES["Entity-Level Confidence"]
        errors = self.filter_and_standardize(self._by_agreement(False), variables + ['Pred Labels'])
        if errors.empty:
            print("No misclassified tokens available for the heatmap.")
            return

        # (x * n).sum() / n is algebraically the plain sum, so aggregate directly.
        # min_count=1 keeps all-NaN groups as NaN so the pivot below ignores them.
        per_cell = errors.groupby(['Language', 'True Labels', 'Pred Labels'])['Token Confidence']
        grouped_df = pd.DataFrame({
            'Weighted Token Confidence': per_cell.sum(min_count=1),
            'Count': per_cell.count(),  # kept for reference
        }).reset_index()

        # Pivot to get heatmap format
        heatmap_data = grouped_df.pivot_table(
            index=['Language', 'True Labels'],
            columns='Pred Labels',
            values='Weighted Token Confidence',
            fill_value=0
        )

        # Plot whichever languages actually have errors instead of assuming
        # that both "Arabic" and "English" are present.
        languages = [str(name) for name in heatmap_data.index.get_level_values("Language").unique()]
        if not languages:
            print("No misclassified tokens available for the heatmap.")
            return

        # Get global min and max values for consistent scaling
        zmin, zmax = heatmap_data.min().min(), heatmap_data.max().max()

        fig = make_subplots(
            rows=1, cols=len(languages),
            subplot_titles=languages,
            shared_yaxes=True,  # Align y-axis labels
            horizontal_spacing=0.15
        )

        for position, language in zip(range(1, len(languages) + 1),
                                      heatmap_data.index.get_level_values("Language").unique()):
            panel = heatmap_data.loc[language]
            fig.add_trace(
                go.Heatmap(
                    z=panel.values,
                    x=panel.columns,
                    y=panel.index,
                    colorscale="RdBu_r",
                    text=panel.round(2).astype(str).values,  # cell annotations
                    texttemplate="%{text}",
                    colorbar=dict(title="Confidence"),
                    zmin=zmin, zmax=zmax  # Set shared scale
                ),
                row=1, col=position
            )

        fig.update_layout(
            title_text="Misclassification Heatmap - " + " vs. ".join(languages),
            width=1200, height=600,
            template="plotly_white",
        )

        # Label and tilt the x-axis of every subplot; the y-axis is shared, so
        # only the first subplot carries its title.
        fig.update_xaxes(title_text="Predicted Entity", tickangle=-45)
        fig.update_yaxes(title_text="True Entity", row=1, col=1)

        fig.show()


In [5]:
# Per-variant analysis frames, used directly by the ad-hoc inspection cells further down.
ar = dash_data['ANERCorp_CamelLab_arabertv02'].analysis_data
en = dash_data['conll2003_bert'].analysis_data

# Analysis

### Ambiguity

In [7]:
# NOTE(review): the saved output of this cell is
#   KeyError: "['Normalized Token Entropy'] not in index"
# so something in this cell selects a column the data does not contain; the
# notebook does not record which statement raised it.
ambiguity_analysis = AmbiguityAnalysis(dash_data)
ambiguity_analysis.analyze_token_high_level()  # Token variant of the high-level ambiguity analysis
ambiguity_analysis.analyze_word_high_level()  # Word variant of the high-level ambiguity analysis
ambiguity_analysis.analyze_token_entity_level()  # Token variant of the entity-level ambiguity analysis
ambiguity_analysis.analyze_word_entity_level()  # Word variant of the entity-level ambiguity analysis


KeyError: "['Normalized Token Entropy'] not in index"

### Tokenisation

In [None]:
# NOTE: `variable_analysis` is re-bound by several cells in this notebook, so the
# object it refers to depends on which of those cells ran last.
variable_analysis = TokenizationAnalysis(dash_data)
variable_analysis.analyze_high_level()  # High-level tokenization analysis
variable_analysis.analyze_entity_level()  # Entity-level tokenization analysis


### Consistency

In [None]:
# Consistency analysis; each analyze_* call is followed by its generate_*_table call.
# NOTE: re-binds `variable_analysis`, which other cells also assign.
variable_analysis = ConsistencyAnalysis(dash_data)
variable_analysis.analyze_high_level()
variable_analysis.generate_high_level_table()
variable_analysis.analyze_entity_level()
variable_analysis.generate_entity_level_table()


### Loss

In [None]:
# Loss analysis: high-level call first, then the entity-level call.
loss_analysis = LossAnalysis(dash_data)
loss_analysis.analyze_high_level()
loss_analysis.analyze_entity_level()


### Confidence

In [None]:
# Confidence analysis: the two summary views, then the misclassification heatmap
# drawn by detailed_entity_level_errors().
confidence_analysis = ConfidenceAnalysis(dash_data)
confidence_analysis.analyze_high_level_summary()
confidence_analysis.analyze_entity_level_summary()
# The error/correct breakdowns below are currently disabled.
# confidence_analysis.analyze_high_level_error()
# confidence_analysis.analyze_high_level_correct()
# confidence_analysis.analyze_entity_level_error()
# confidence_analysis.analyze_entity_level_correct()
confidence_analysis.detailed_entity_level_errors()


### Prediction Entropy

In [None]:
# Prediction analysis: high-level view, then entity-level views for the correct
# and error cases.
# NOTE: re-binds `variable_analysis`, which other cells also assign.
variable_analysis = PredictionAnalysis(dash_data)
variable_analysis.analyze_high_level()
variable_analysis.analyze_entity_level_correct()
variable_analysis.analyze_entity_level_error()


In [None]:
# Descriptive statistics for rows of `ar` with Agreements == False and a true
# label of I-MISC (describe() reports the numeric columns of the selection).
# Optional extras kept from the original cell:
#   row filter:   & (ar['Pred Labels'] == 'O')
#   more columns: confidence_variables + [...]
ar.loc[
    (ar['True Labels'] == 'I-MISC') & (ar['Agreements'] == False),
    [
        'True Labels', 'Pred Labels',
        'Token Confidence', 'Losses',
        'Prediction Max Entropy', 'Prediction Entropy',
        'Normalized Prediction Entropy', 'Variability',
    ],
].describe()


In [None]:
# Descriptive statistics for rows of `en` with Agreements == False and a true
# label of I-MISC (describe() reports the numeric columns of the selection).
# Optional extras kept from the original cell:
#   row filter:   & (en['Pred Labels'] == 'O')
#   more columns: confidence_variables + [...]
en.loc[
    (en['True Labels'] == 'I-MISC') & (en['Agreements'] == False),
    [
        'True Labels', 'Pred Labels',
        'Token Confidence', 'Losses',
        'Prediction Max Entropy', 'Prediction Entropy',
        'Normalized Prediction Entropy', 'Variability',
    ],
].describe()


### Silhouette Score

In [None]:
# Silhouette-score analysis: high-level call first, then the entity-level call.
silhouette_analysis = SilhouetteAnalysis(dash_data)
silhouette_analysis.analyze_high_level()
silhouette_analysis.analyze_entity_level()


# Investigation

In [None]:
class JointAnalysis:
    """Compare per-language summary statistics of metric groups across model variants.

    Every variant in ``dash_data`` is stacked into one DataFrame tagged with a
    ``Language`` column. For a list of metric columns the class computes the
    per-language mean and standard deviation, reshapes them to long format and
    draws a grouped bar chart with standard-deviation error bars.
    """

    # Display name for each known variant key; unknown keys are used unchanged.
    VARIANT_MAPPING = {
        "ANERCorp_CamelLab_arabertv02": "Arabic",
        "conll2003_bert": "English"
    }

    # Suffixes produced when the (variable, statistic) column MultiIndex is
    # flattened in compute_summary.
    _MEAN_SUFFIX = " mean"
    _STD_SUFFIX = " std"

    def __init__(self, dash_data):
        """
        Initialize the analysis.

        Parameters:
        - dash_data (dict): Maps variant name to an object exposing an
          ``analysis_data`` DataFrame.
        """
        self.dash_data = dash_data
        self.analysis_df = self._prepare_data()

    def _prepare_data(self):
        """Stack every variant's ``analysis_data`` into one frame with a ``Language`` column."""
        analysis_data = []
        for data_name, data_content in self.dash_data.items():
            data = data_content.analysis_data.copy()
            data["Language"] = self.VARIANT_MAPPING.get(data_name, data_name)  # Map variant names
            analysis_data.append(data)
        return pd.concat(analysis_data)

    def compute_summary(self, data, variables):
        """Return per-language mean and std of ``variables``.

        Parameters:
        - data (DataFrame): Must contain ``variables`` and a ``Language`` column.
        - variables (list): Column names to summarise.

        Returns a DataFrame with one row per language and flat column names of
        the form ``"<variable> mean"`` / ``"<variable> std"``.
        """
        summary = data[list(variables) + ["Language"]].groupby("Language").agg(["mean", "std"]).reset_index()

        # Flatten ("var", "mean") -> "var mean"; ("Language", "") -> "Language".
        summary.columns = [' '.join(col).strip() if isinstance(col, tuple) else col for col in summary.columns]
        return summary

    def format_long_data(self, summary_df, metric_label):
        """Reshape a ``compute_summary`` result to long format for plotting.

        Returns a DataFrame with columns ``Language``, ``metric_label``,
        ``Mean Value``, ``Std Dev`` (both rounded to 2 decimals) and
        ``Text Label`` (``"<mean><br>±<br><std>"``).
        """
        mean_suffix, std_suffix = self._MEAN_SUFFIX, self._STD_SUFFIX

        # Match on the suffix only, so a variable whose own name contains "mean"
        # is neither mistaken for a statistic column nor mangled when renamed.
        mean_cols = [
            col for col in summary_df.columns
            if isinstance(col, str) and col.endswith(mean_suffix)
        ]
        std_cols = [col[: -len(mean_suffix)] + std_suffix for col in mean_cols]

        mean_long = summary_df.melt(
            id_vars=["Language"],
            value_vars=mean_cols,
            var_name=metric_label,
            value_name="Mean Value"
        )
        std_long = summary_df.melt(
            id_vars=["Language"],
            value_vars=std_cols,
            var_name=metric_label,
            value_name="Std Dev"
        )

        # Drop the statistic suffix so both frames share the bare variable name.
        mean_long[metric_label] = mean_long[metric_label].str[: -len(mean_suffix)]
        std_long[metric_label] = std_long[metric_label].str[: -len(std_suffix)]

        summary_long = mean_long.merge(
            std_long,
            on=["Language", metric_label],
            how="left"
        )

        summary_long["Mean Value"] = summary_long["Mean Value"].round(2)
        summary_long["Std Dev"] = summary_long["Std Dev"].round(2)

        # Bar label: mean ± std, stacked on three lines.
        summary_long["Text Label"] = [
            f"{mean}<br>±<br>{std}"
            for mean, std in zip(summary_long["Mean Value"], summary_long["Std Dev"])
        ]

        return summary_long

    def plot_summary(self, summary_long_df, metric_label, title):
        """Show a grouped bar chart of mean values with std-dev error bars.

        ``summary_long_df`` is expected in the shape returned by
        ``format_long_data`` (columns ``Mean Value``, ``Std Dev``, ``Text Label``,
        ``Language`` and ``metric_label``).
        """
        fig = px.bar(
            summary_long_df,
            x=metric_label,
            y="Mean Value",
            color="Language",
            barmode="group",
            # Let plotly express split the numeric std-dev column per trace; the
            # previous code passed the "Text Label" strings as the error array and
            # assigned the whole, unsplit array to every trace.
            error_y="Std Dev",
            text="Text Label",  # "mean ± std" label on each bar
            title=title,
            labels={"Mean Value": "Score", metric_label: "Metrics"},
            height=800,
            width=1000
        )

        # Style the error bars and bar text; the error values come from px.bar above.
        fig.update_traces(
            error_y=dict(
                width=2,
                thickness=0.5,
                visible=True
            ),
            # textposition="outside",
            textfont=dict(size=12)
        )

        fig.update_layout(
            template="plotly_white",
            showlegend=True,
            # xaxis_tickangle=-45
        )

        fig.show()

    def analyze(self, variables, metric_label="Metric", title="Comparison of Variables Across Model Variants"):
        """Summarise ``variables`` over the full combined data and plot the result."""
        # compute_summary needs the frame as its first argument; it was omitted before.
        summary = self.compute_summary(self.analysis_df, variables)
        summary_long = self.format_long_data(summary, metric_label)
        self.plot_summary(summary_long, metric_label, title)

    def analyse_group(self):
        """Draw one summary plot per predefined metric group.

        Groups none of whose columns exist in the combined data are skipped. For
        the ambiguity group, rows where any of the group's columns equals -1
        are dropped before summarising.
        """
        variable_groups = {
            "Consistency Metrics": ["Consistency Ratio", "Inconsistency Ratio"],
            "Ambiguity Metrics": [
                "Dataset Token Entropy", "Dataset Word Entropy",
                "Normalized Token Entropy", "Normalized Word Entropy"
            ],
            "Tokenization Metrics": ['Tokenization Rate'],
            "Loss Metrics": ['Losses'],
            "Confidence Metrics": [
                'O Confidence', 'B-LOC Confidence', 'I-LOC Confidence',
                'B-PER Confidence', 'I-PER Confidence',
                'B-ORG Confidence', 'I-ORG Confidence',
                'B-MISC Confidence', 'I-MISC Confidence'
            ],
            "Prediction Metrics": [
                'Variability', 'Normalized Prediction Entropy'
            ],
            "Silhouette Scores": ['True Silhouette Score', 'Pred Silhouette Score']
        }

        for group_name, variables in variable_groups.items():
            # Only keep the columns that are actually present.
            relevant_vars = [var for var in variables if var in self.analysis_df.columns]
            if not relevant_vars:
                continue

            if group_name == "Ambiguity Metrics":
                # -1 marks OOV adjustments; keep rows where no group column is -1.
                keep = (self.analysis_df[relevant_vars] != -1).all(axis=1)
                temp_df = self.analysis_df[keep]
            else:
                temp_df = self.analysis_df

            # Kept for backward compatibility: records the last group's columns.
            self.variables = relevant_vars

            summary = self.compute_summary(temp_df, relevant_vars)
            summary_long = self.format_long_data(summary, group_name)

            plot_title = f"Comparison of {group_name} Across Languages"
            self.plot_summary(summary_long, group_name, plot_title)

    # Alias matching the spelling used by EntityJointAnalysis.analyze_group.
    analyze_group = analyse_group



class EntityJointAnalysis:
    """Compare metric groups across model variants, broken down by true entity label.

    Every variant in ``dash_data`` is stacked into one DataFrame tagged with a
    ``Language`` column. For a list of metric columns the class computes the
    mean and standard deviation per (Language, True Labels), reshapes them to
    long format and draws a grouped bar chart with one facet row per metric.
    """

    # Display name for each known variant key; unknown keys are used unchanged.
    VARIANT_MAPPING = {
        "ANERCorp_CamelLab_arabertv02": "Arabic",
        "conll2003_bert": "English"
    }

    # Suffixes produced when the (variable, statistic) column MultiIndex is
    # flattened in compute_summary.
    _MEAN_SUFFIX = " mean"
    _STD_SUFFIX = " std"

    def __init__(self, dash_data):
        """
        Initialize the entity-level analysis.

        Parameters:
        - dash_data (dict): Maps variant name to an object exposing an
          ``analysis_data`` DataFrame.
        """
        self.dash_data = dash_data
        self.analysis_df = self._prepare_data()

    def _prepare_data(self):
        """Stack every variant's ``analysis_data`` into one frame with a ``Language`` column."""
        analysis_data = []
        for data_name, data_content in self.dash_data.items():
            data = data_content.analysis_data.copy()
            data["Language"] = self.VARIANT_MAPPING.get(data_name, data_name)  # Map variant names
            analysis_data.append(data)
        return pd.concat(analysis_data)

    def filter_and_standardize(self, variables):
        """Select ``variables`` plus the id columns, drop special-token rows, unify PER labels.

        Rows whose ``True Labels`` is ``[SEP]``, ``[CLS]`` or ``IGNORED`` are
        removed, and ``B-PERS`` / ``I-PERS`` are renamed to ``B-PER`` / ``I-PER``.
        """
        selected = self.analysis_df[list(variables) + ["Language", "True Labels"]]

        # Copy after filtering so the label assignment below writes to an
        # independent frame.
        is_special = selected["True Labels"].isin(["[SEP]", "[CLS]", "IGNORED"])
        df = selected[~is_special].copy()

        df["True Labels"] = df["True Labels"].replace({"B-PERS": "B-PER", "I-PERS": "I-PER"})

        return df

    def compute_summary(self, df, variables):
        """Return mean and std of ``variables`` per (Language, True Labels).

        Column names are flattened to ``"<variable> mean"`` / ``"<variable> std"``.
        """
        summary = df.groupby(["Language", "True Labels"])[list(variables)].agg(["mean", "std"]).reset_index()

        # Flatten ("var", "mean") -> "var mean"; ("Language", "") -> "Language".
        summary.columns = [' '.join(col).strip() if isinstance(col, tuple) else col for col in summary.columns]
        return summary

    def format_long_data(self, summary_df, metric_label):
        """Reshape a ``compute_summary`` result to long format for plotting.

        Returns a DataFrame with columns ``Language``, ``True Labels``,
        ``metric_label``, ``Average Value``, ``Std Dev`` (both rounded to 3
        decimals) and ``text_label`` (``"<mean> <br>±<br><std>"``).
        """
        mean_suffix, std_suffix = self._MEAN_SUFFIX, self._STD_SUFFIX
        id_columns = ["Language", "True Labels"]

        # Match on the suffix only, so a variable whose own name contains "mean"
        # is neither mistaken for a statistic column nor mangled when renamed.
        mean_cols = [
            col for col in summary_df.columns
            if isinstance(col, str) and col.endswith(mean_suffix)
        ]
        std_cols = [col[: -len(mean_suffix)] + std_suffix for col in mean_cols]

        mean_long = summary_df.melt(
            id_vars=id_columns,
            value_vars=mean_cols,
            var_name=metric_label,
            value_name="Average Value"
        )
        std_long = summary_df.melt(
            id_vars=id_columns,
            value_vars=std_cols,
            var_name=metric_label,
            value_name="Std Dev"
        )

        # Drop the statistic suffix so both frames share the bare variable name.
        mean_long[metric_label] = mean_long[metric_label].str[: -len(mean_suffix)]
        std_long[metric_label] = std_long[metric_label].str[: -len(std_suffix)]

        summary_long = mean_long.merge(
            std_long,
            on=id_columns + [metric_label],
            how="left"
        )

        summary_long["Average Value"] = summary_long["Average Value"].round(3)
        summary_long["Std Dev"] = summary_long["Std Dev"].round(3)

        # Bar label: mean ± std, stacked on three lines.
        summary_long["text_label"] = [
            f"{mean} <br>±<br>{std}"
            for mean, std in zip(summary_long["Average Value"], summary_long["Std Dev"])
        ]

        return summary_long

    def plot_summary(self, summary_long_df, metric_label, title):
        """Show a grouped bar chart per true label, with one facet row per metric.

        The standard deviation appears only in the bar text (``text_label``);
        no error bars are drawn.
        """
        fig = px.bar(
            summary_long_df,
            x="True Labels",
            y="Average Value",
            color="Language",
            facet_row=metric_label,  # One facet row per metric in the group
            barmode="group",
            text="text_label",
            title=title,
            labels={"Average Value": "Average Score", metric_label: "Metrics"},
            height=800,
            width=1000
        )

        # Bar-text styling only.
        fig.update_traces(
            # textposition="outside",
            textfont=dict(size=12),
            # insidetextanchor="start",
        )

        fig.update_layout(
            template="plotly_white",
            showlegend=True,
            margin=dict(t=120, b=80, l=80, r=80),
            xaxis_tickangle=-45
        )

        fig.show()

    def analyze(self, variables, metric_label="Metric", title="Comparison of Consistency Variables Across Model Variants and True Labels"):
        """Filter, summarise and plot ``variables`` per language and true label."""
        filtered_df = self.filter_and_standardize(variables)
        summary = self.compute_summary(filtered_df, variables)
        summary_long = self.format_long_data(summary, metric_label)
        self.plot_summary(summary_long, metric_label, title)

    def analyze_group(self):
        """
        Draw one entity-level plot per predefined metric group.

        Groups none of whose columns exist in the combined data are skipped. For
        groups whose name contains "Ambiguity", rows where any of the group's
        columns equals -1 are dropped before summarising.
        """
        variable_groups = {
            "Consistency Metrics": ["Consistency Ratio", "Inconsistency Ratio"],
            "Ambiguity Metrics - Dataset Level": [
                "Dataset Token Entropy", "Dataset Word Entropy",
            ],
            "Ambiguity Metrics - Token Level": [
                "Normalized Token Entropy", "Normalized Word Entropy"
            ],
            "Tokenization Metrics": ['Tokenization Rate'],
            "Loss Metrics": ['Losses'],
            # "Confidence Metrics": [
            #     'Token Confidence'
            #     # , 'B-LOC Confidence', 'I-LOC Confidence',
            #     # 'B-PER Confidence', 'I-PER Confidence',
            #     # 'B-ORG Confidence', 'I-ORG Confidence',
            #     # 'B-MISC Confidence', 'I-MISC Confidence'
            # ],
            "Prediction Metrics": [
                'Variability', 'Normalized Prediction Entropy'
            ],
            "Silhouette Scores": ['True Silhouette Score', 'Pred Silhouette Score']
        }

        for group_name, variables in variable_groups.items():
            # Only keep the columns that are actually present.
            relevant_vars = [var for var in variables if var in self.analysis_df.columns]
            if not relevant_vars:
                continue

            # Kept for backward compatibility: records the last group's columns.
            self.variables = relevant_vars

            filtered_df = self.filter_and_standardize(relevant_vars)
            if "Ambiguity" in group_name:
                # -1 marks OOV entropy; keep rows where no group column is -1.
                keep = (filtered_df[relevant_vars] != -1).all(axis=1)
                filtered_df = filtered_df[keep]

            summary = self.compute_summary(filtered_df, relevant_vars)
            summary_long = self.format_long_data(summary, group_name)

            plot_title = f"Comparison of {group_name} Across Model Variants and True Labels"
            self.plot_summary(summary_long, group_name, plot_title)



In [None]:
# Column-name groups used by the ad-hoc checks in the cells that follow.
consistency_variables = [
    "Consistency Ratio",
    "Inconsistency Ratio",
]

# "Local Token Entropy" / "Local Word Entropy" are intentionally left out.
ambiguity_variables = [
    "Dataset Token Entropy",
    "Dataset Word Entropy",
    "Normalized Token Entropy",
    "Normalized Word Entropy",
]

tokenization_variables = ['Tokenization Rate']
loss_variables = ['Losses']

# 'O' first, then the B-/I- pair of each entity type.
confidence_variables = ['O Confidence'] + [
    f'{prefix}-{entity} Confidence'
    for entity in ('LOC', 'PER', 'ORG', 'MISC')
    for prefix in ('B', 'I')
]

# 'Prediction Entropy' is intentionally left out.
prediction_variables = [
    'Variability',
    'Normalized Prediction Entropy',
]

silhouette_variables = [
    'True Silhouette Score',
    'Pred Silhouette Score',
]

In [None]:
# Re-binds `ar` / `en` to the same per-variant frames as the earlier cell that
# defines them.
ar = dash_data['ANERCorp_CamelLab_arabertv02'].analysis_data
en = dash_data['conll2003_bert'].analysis_data

In [None]:
# Mean 'Dataset Word Entropy' over the rows of `ar` where it is strictly positive.
ar.loc[ar['Dataset Word Entropy'] > 0, 'Dataset Word Entropy'].mean()

In [None]:
# Mean 'Dataset Word Entropy' over the rows of `en` where it is strictly positive.
en.loc[en['Dataset Word Entropy'] > 0, 'Dataset Word Entropy'].mean()

In [None]:
# Mean 'Dataset Word Entropy' over all rows of `en` (no > 0 filter here).
# The intermediate en[ambiguity_variables] selection requires every column listed
# in ambiguity_variables to be present in `en`.
en[ambiguity_variables]['Dataset Word Entropy'].mean()


In [None]:
# NOTE(review): no cell in the visible part of the notebook assigns a top-level
# `df`, so this appears to rely on leftover kernel state — confirm which frame
# was meant.
df.columns

In [None]:
# Build the combined frame here instead of relying on an `analysis` object that
# is only created by a later cell, so this cell also runs on a fresh kernel.
joint_df = JointAnalysis(dash_data).analysis_df
# Per-language mean 'Dataset Token Entropy' over rows where it is strictly positive.
joint_df.loc[joint_df['Dataset Token Entropy'] > 0].groupby('Language')['Dataset Token Entropy'].mean()

In [None]:
# Build the combined frame here instead of relying on an `analysis` object that
# is only created by a later cell, so this cell also runs on a fresh kernel.
joint_df = JointAnalysis(dash_data).analysis_df
# Per-language mean 'Dataset Word Entropy' over rows where it is strictly positive.
joint_df.loc[joint_df['Dataset Word Entropy'] > 0].groupby('Language')['Dataset Word Entropy'].mean()

In [None]:
# Language-level comparison: one plot per metric group.
# NOTE: `analysis` is re-bound by the EntityJointAnalysis cell as well.
analysis = JointAnalysis(dash_data)
analysis.analyse_group()

In [None]:
# Entity-level comparison: one plot per metric group, split by true label.
# NOTE: re-binds `analysis`, replacing the JointAnalysis instance.
analysis = EntityJointAnalysis(dash_data)
analysis.analyze_group()