In [2]:
import pandas as pd
from experiment_utils.env_setup import init
from experiment_utils.utils import FileHandler
from pathlib import Path


2024-11-19 00:00:25 - INFO - PyTorch version 2.2.2 available.


In [3]:
from experiment_utils.utils import FileHandler
from experiment_utils.env_setup import init
from pathlib import Path
from dataclasses import dataclass, field
from typing import Dict, Any, List
import yaml

@dataclass
class DevelopmentConfig:
    debug: bool = False
    port: int = 8000
    def __post_init__(self):
        if not isinstance(self.debug, bool):
            raise ValueError(f"Expected boolean for debug, got {type(self.debug).__name__}")
        if not (1 <= self.port <= 65535):
            raise ValueError("Port must be between 1 and 65535")
    @staticmethod
    def from_dict(config_dict: Dict[str, Any]):
        return DevelopmentConfig(**config_dict)


@dataclass
class TabConfig:
    tab_value: str
    tab_label: str
    
@dataclass
class AppConfig:
    tabs: List[TabConfig] = field(default_factory=list)
    variants: List[str] = field(default_factory=list)

    def __post_init__(self):
        if not all(isinstance(tab, TabConfig) for tab in self.tabs):
            raise ValueError("Tabs must be a list of TabConfig instances")
        if not all(isinstance(variant, str) for variant in self.variants):
            raise ValueError("Variants must be a list of strings")

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]):
        tabs = [TabConfig(**tab) for tab in config_dict.get('tabs', [])]
        variants = config_dict.get('variants', [])
        return AppConfig(tabs=tabs, variants=variants)

class DashboardConfigManager:
    def __init__(self, config_path: Path):
        self.config_path = config_path
        config_fh = FileHandler(config_path.parent)
        try:
            self.config = config_fh.load_yaml(config_path.name)
        except FileNotFoundError as e:
            raise FileNotFoundError(f"Configuration file not found at {config_path}") from e
        except yaml.YAMLError as e:
            raise ValueError("Error parsing YAML configuration.") from e
        except ValueError as e:
            raise ValueError("Validation error in configuration.") from e

 
    @property
    def development_config(self) -> DevelopmentConfig:
        return DevelopmentConfig.from_dict(
            self.config.get("development", {})
        )

    @property
    def app_config(self) -> AppConfig:
        return AppConfig.from_dict(
            self.config.get("dashboard", {})
        )
    @property
    def data_dir(self) -> Path:
        base_folder = init()
        return base_folder / self.config.get("dashboard", {}).get('data_dir', '')

    @property
    def data_config(self) -> Dict:
        return self.config.get("dashboard", {}).get("dashboard_data", {}).get("data", {})

    @property
    def variants(self) -> Dict:
        return self.config.get("dashboard", {}).get("variants", {})
    
    @property
    def dataset_tab(self) -> Dict:
        return self.config.get("dashboard", {}).get("dataset_tab", {})
    
    @property
    def decision_tab(self) -> Dict:
        return self.config.get("dashboard", {}).get("decision_tab", {})



In [4]:
import logging
from tqdm.autonotebook import tqdm
from experiment_utils.utils import FileHandler
import pandas as pd
from flask_caching import Cache
import numpy as np

from dataclasses import dataclass, field
from typing import Dict, Any
import plotly.graph_objects as go

@dataclass
class DashboardData:
    analysis_data: pd.DataFrame = field(default_factory=pd.DataFrame)
    train_data: pd.DataFrame = field(default_factory=pd.DataFrame)
    kmeans_results: pd.DataFrame = field(default_factory=pd.DataFrame)
    results: pd.DataFrame = field(default_factory=pd.DataFrame)
    entity_report: pd.DataFrame = field(default_factory=pd.DataFrame)
    token_report: pd.DataFrame = field(default_factory=pd.DataFrame)
    entity_confusion_data: pd.DataFrame = field(default_factory=pd.DataFrame)
    centroids_avg_similarity_matrix: pd.DataFrame = field(default_factory=pd.DataFrame)
    attention_weights_similarity_heatmap: go.Figure = field(default_factory=go.Figure)
    attention_weights_similarity_matrix: np.ndarray = field(default_factory=lambda: np.array([]))
    attention_similarity_heatmap: go.Figure = field(default_factory=go.Figure)
    attention_similarity_matrix: np.ndarray = field(default_factory=lambda: np.array([]))

    def __post_init__(self):
        # Round float columns to four decimal places
        self.round_floats(self.analysis_data)
        self.round_floats(self.kmeans_results)
        self.round_floats(self.results)

        # Convert list to string in the 'Word Pieces' column of analysis_data if it exists
        if 'Word Pieces' in self.analysis_data.columns:
            self.analysis_data['Word Pieces'] = self.analysis_data['Word Pieces'].apply(
                lambda x: ', '.join(x) if isinstance(x, list) else x
            )
        
        self.analysis_data['Consistency Ratio'] = np.where(
            self.analysis_data['Total Train Occurrences'] != 0,  # Condition to check for non-zero denominator
            self.analysis_data['Consistency Count'] / self.analysis_data['Total Train Occurrences'],  # Normal calculation if denominator is not zero
            0
        )

        self.analysis_data['Inconsistency Ratio'] = np.where(
            self.analysis_data['Total Train Occurrences'] != 0,
            self.analysis_data['Inconsistency Count'] / self.analysis_data['Total Train Occurrences'],
            0
        )
        
        self.analysis_data['Normalized Token Entropy'] = DashboardData.normalized_entropy(self.analysis_data, 'Local Token Entropy', 'Token Max Entropy')  # filling 0/0 division as it generates Nan
        self.analysis_data['Normalized Word Entropy'] = DashboardData.normalized_entropy(self.analysis_data, 'Local Token Entropy', 'Token Max Entropy')  # filling 0/0 division as it generates Nan
        self.analysis_data['Normalized Prediction Entropy'] = DashboardData.normalized_entropy(self.analysis_data, 'Prediction Entropy', 'Prediction Max Entropy')  # filling 0/0 division as it generates Nan
        self.analysis_data['Error Type'] = self.analysis_data.apply(
            lambda row: DashboardData.annotate_error(
                row['True Labels'], 
                row['Pred Labels']
                ), 
            axis=1
            )
    
    def is_loaded(self, attribute):
        """Checks if the given attribute is loaded based on its type."""
        attr_value = getattr(self, attribute)
        if isinstance(attr_value, pd.DataFrame):
            return not attr_value.empty
        elif isinstance(attr_value, go.Figure):
            return len(attr_value.data) > 0  # Check if the figure has data
        return False  # Default case if the attribute type is unrecognized

    @staticmethod
    def round_floats(df):
        for col in df.select_dtypes(include=['float']).columns:
            df[col] = df[col].round(4)
            
    @staticmethod
    def from_dict(dict_data: Dict[str, Any]):
        return DashboardData(**dict_data)
    
    @staticmethod
    def normalized_entropy(df, raw_entropy, max_entropy):
        result = np.full(df.shape[0], np.nan)
        zero_mask = (df[raw_entropy] == 0) & (df[max_entropy] == 0)
        result[zero_mask] = 0
        negative_one_mask = (df[raw_entropy] == -1) & (df[max_entropy] == -1)
        result[negative_one_mask] = -1
        valid_mask = (df[max_entropy] != 0) & ~zero_mask & ~negative_one_mask
        result[valid_mask] = df[raw_entropy][valid_mask] / df[max_entropy][valid_mask]
        zero_div_mask = (df[max_entropy] == 0) & (df[raw_entropy] != 0)
        result[zero_div_mask] = 0
        return result
    @staticmethod
    def annotate_error(true_label, pred_label):
        # If both are the same, it's correct (no error)
        if true_label == pred_label:
            return "No Errors"
        
        # Handle cases where one or both labels are 'O'
        if true_label == 'O' and pred_label != 'O':
            return "Chunk"  # False entity predicted
        if true_label != 'O' and pred_label == 'O':
            return "Entity and Chunk"  # Missed entity and chunk boundary
        
        # Extract entity types without position tags (like "B-", "I-")
        true_entity = true_label.split("-")[-1] if "-" in true_label else true_label
        pred_entity = pred_label.split("-")[-1] if "-" in pred_label else pred_label

        # If entity types are different (e.g., LOC vs. PER)
        if true_entity != pred_entity:
            # If both entity type and position (B- vs I-) are wrong
            return "Entity and Chunk" if true_label[0] != pred_label[0] else "Entity"

        # If entity types are the same but position tags (B- vs I-) are wrong
        return "Chunk"


class DataLoader:
    def __init__(self, config_manager, variant_name):
        self.data_config = config_manager.data_config
        self.data_dir = config_manager.data_dir / variant_name
        self.dashboard_data = {}

    def load(self, file_name, file_config):
        file_handler = FileHandler(self.data_dir / file_config['folder'])
        file_type = file_config["format"]
        file_path = file_handler.file_path / f"{file_name}.{file_type}"

        try:
            if file_path.exists():
                # Load Plotly figures specifically
                if file_type == "npy":
                    return file_handler.load_numpy(file_path.with_suffix('.npy'))

                # Handle regular JSON data files
                elif file_type == "json":
                    data = file_handler.read_json(file_path)
                    if "column_mappings" in file_config and file_config["column_mappings"]:
                        data = self.apply_column_mappings(data, file_config["column_mappings"])
                    return data
            else:
                logging.warning("File does not exist: %s", file_path)
        except Exception as e:
            logging.error("Failed to load data from %s: %s", file_path, e)
            return None
        
    def apply_column_mappings(self, data: pd.DataFrame, column_mappings: dict) -> pd.DataFrame:
        """ Rename columns in the DataFrame based on provided mappings. """
        return data.rename(columns=column_mappings)


    def load_all(self):
        
        logging.info("Loading Dashboard Data from  %s", self.data_dir)
        for file_name, file_config in tqdm(self.data_config.items()):
            self.dashboard_data[file_name] = self.load(file_name, file_config)
            
            


class DataManager:
    def __init__(self, config_manager, server) -> None:
        self.config_manager = config_manager
        self.variants = config_manager.variants
        self.cache = Cache(server, config={
            'CACHE_TYPE': 'filesystem',
            'CACHE_DIR': 'cache-directory',
            'CACHE_DEFAULT_TIMEOUT': 3600  # Cache timeout of 1 hour
        })
        self.cache.init_app(server)
        self.variants_data = self.load_all_variants_from_cache()
    
    def load_all_variants_from_cache(self):
        data = {}
        for variant in self.variants:
            cached_data = self.cache.get(variant)
            if cached_data:
                data[variant] = cached_data
        return data

    
    def load_variant(self, variant):
        """Loads data for a specific variant, with caching."""
        cached_data = self.cache.get(variant)
        if cached_data is None:
            loader = DataLoader(self.config_manager, variant)
            loader.load_all()
            data = DashboardData.from_dict(loader.dashboard_data)
            self.variants_data[variant] = data
            self.cache.set(variant, data)  # Cache the newly loaded data
            return data  # Return the new data
        self.variants_data[variant] = cached_data
        return cached_data  # Return the cached data if it was already loaded
    
    def load_data(self):
        """Loads data for all variants using the load_variant method for consistency."""
        for variant in self.variants:
            # Delegate the loading and caching to load_variant method
            self.variants_data[variant] = self.load_variant(variant)
        return self.variants_data

    def is_data_loaded(self):
        """Checks if all variants have data loaded in the cache."""
        for variant in self.variants:
            if self.cache.get(variant) is None:
                return False  # Return False if any variant is not loaded
        return True  # Return True if all variants are loaded
    
    def is_any_variant_loaded(self):
        """
        Check if any variant is loaded in the cache.

        Returns:
            bool: True if at least one variant is loaded, False otherwise.
        """
        for variant in self.variants:
            if self.cache.get(variant) is not None:
                return True  # Return True if any variant is loaded
        return False  # Return False if no variants are loaded



In [5]:
CONFIG_PATH = Path("/Users/ay227/Desktop/Final-Year/Thesis-Experiments/Online-Dashboard-Phase/dashboard-config.yaml")
config_manager = DashboardConfigManager(CONFIG_PATH)
dev_config = config_manager.development_config    

In [6]:
import json
with open('/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/ExperimentData/corpora.json', 'r') as file:
    corpora = json.load(file)

In [7]:
for data in corpora['ANERCorp_CamelLab']['splits']['test']:
    for word, tag in zip(data['words'], data['tags']):
        print(word, tag)

 
    break

الصالحية B-LOC
المفرق B-LOC
- O
غيث B-PERS
الطراونة I-PERS
- O
أمر O
جلالة O
الملك O
عبدالله B-PERS
الثاني I-PERS
أمس O
بتنفيذ O
حزمة O
من O
المشاريع O
التعليمية O
والصحية O
والتنموية O
وأخرى O
مرتبطة O
بالأندية O
الشبابية O
و O
27 O
وحدة O
سكنية O
في O
قضاء O
الصالحية B-LOC
ونايفة B-LOC
في O
البادية O
الشرقية O
خلال O
ستة O
اشهر O
بتمويل O
من O
الديوان O
الملكي O
الهاشمي O
. O


In [8]:

df = pd.DataFrame([{'word':w, 'tag':t} for data in corpora['ANERCorp_CamelLab']['splits']['train'] for w, t in zip(data['words'], data['tags'])])
print(len(df))

125102


In [9]:
label_col = 'tag'
word_col = 'word'
counts = df['tag'].value_counts().sort_index()
types = df.groupby('tag')[
	'word'
].nunique()
ratios = types / counts

token_distribution_df = pd.DataFrame(
	{
		'count': counts,
		'types': types,
		'ratio': ratios,
	}
)

totals = df['word'].agg(["size", "nunique"]).tolist()
ne_totals = (
	df[df[label_col] != "O"][word_col]
	.agg(["size", "nunique"])
	.tolist()
)

token_distribution_df.loc["Total"] = totals + [totals[1] / totals[0]]
token_distribution_df.loc["Total NEs"] = ne_totals + [
	ne_totals[1] / ne_totals[0]
]

token_distribution_df['NE Proportions'] = (
		token_distribution_df['count'] / ne_totals[0]
	)
token_distribution_df['NE Proportions'] = token_distribution_df[
	'NE Proportions'
].apply(lambda x: round(x * 100, 2))

In [10]:
import pandas as pd

# Training data
train_data = {
    'Tag': ['B-LOC', 'B-PERS', 'I-PERS', 'B-ORG', 'I-ORG', 'B-MISC', 'I-LOC', 'I-MISC'],
    'Count': [3776, 2721, 2205, 1576, 1115, 888, 525, 375],
    'Types': [905, 1089, 1110, 522, 401, 343, 145, 220],
    'Ratio': [0.239672, 0.400221, 0.503401, 0.331218, 0.359641, 0.386261, 0.276190, 0.586667],
    'NE Proportions': [28.65, 20.64, 16.73, 11.96, 8.46, 6.74, 3.98, 2.85]
}

# Test data - classification report on the test
test_data = {
    'Tag': ['B-LOC', 'B-MISC', 'B-ORG', 'B-PERS', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PERS'],
    'Precision': [0.903272, 0.814433, 0.810875, 0.890346, 0.802326, 0.858823, 0.804688, 0.910798],
    'Recall': [0.950599, 0.672340, 0.762222, 0.870629, 0.831325, 0.442424, 0.749091, 0.907956],
    'F1': [0.926331, 0.736597, 0.785796, 0.880377, 0.816568, 0.584000, 0.775895, 0.909375]
}

# Convert dictionaries to DataFrames
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Merge the two DataFrames on Tag
combined_df = pd.merge(train_df, test_df, on='Tag')

# Remove the 'Tag' column for correlation computation
numeric_df = combined_df.drop('Tag', axis=1)

# Compute the correlation matrix
correlation_matrix = numeric_df.corr()

# Display the correlation matrix
correlation_matrix


Unnamed: 0,Count,Types,Ratio,NE Proportions,Precision,Recall,F1
Count,1.0,0.860958,-0.365725,1.0,0.741621,0.751917,0.802101
Types,0.860958,1.0,0.017137,0.860921,0.836661,0.686173,0.764818
Ratio,-0.365725,0.017137,1.0,-0.365622,0.273484,-0.621199,-0.544003
NE Proportions,1.0,0.860921,-0.365622,1.0,0.741654,0.751765,0.801963
Precision,0.741621,0.836661,0.273484,0.741654,1.0,0.396229,0.490564
Recall,0.751917,0.686173,-0.621199,0.751765,0.396229,1.0,0.992658
F1,0.802101,0.764818,-0.544003,0.801963,0.490564,0.992658,1.0


In [11]:
import plotly.express as px

# Create a scatter plot
fig = px.scatter(combined_df, x='Ratio', y='F1', color='Tag', size_max=60, 
                 title="Scatter Plot of F1 Score vs. Ratio",
                 labels={"Ratio": "Token-Type Ratio", "F1": "F1 Score"})

# Improve plot aesthetics by adding the entity tags as labels
fig.update_traces(textposition='top center')

# Show the plot
fig.show()

In [12]:
token_distribution_df.sort_values('count', ascending=False)

Unnamed: 0_level_0,count,types,ratio,NE Proportions
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total,125102.0,29252.0,0.233825,949.11
O,111921.0,25931.0,0.23169,849.11
Total NEs,13181.0,4069.0,0.308702,100.0
B-LOC,3776.0,905.0,0.239672,28.65
B-PERS,2721.0,1089.0,0.400221,20.64
I-PERS,2205.0,1110.0,0.503401,16.73
B-ORG,1576.0,522.0,0.331218,11.96
I-ORG,1115.0,401.0,0.359641,8.46
B-MISC,888.0,343.0,0.386261,6.74
I-LOC,525.0,145.0,0.27619,3.98


In [13]:
token_distribution_df.sort_values('count', ascending=False)

Unnamed: 0_level_0,count,types,ratio,NE Proportions
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total,125102.0,29252.0,0.233825,949.11
O,111921.0,25931.0,0.23169,849.11
Total NEs,13181.0,4069.0,0.308702,100.0
B-LOC,3776.0,905.0,0.239672,28.65
B-PERS,2721.0,1089.0,0.400221,20.64
I-PERS,2205.0,1110.0,0.503401,16.73
B-ORG,1576.0,522.0,0.331218,11.96
I-ORG,1115.0,401.0,0.359641,8.46
B-MISC,888.0,343.0,0.386261,6.74
I-LOC,525.0,145.0,0.27619,3.98


In [14]:
def create_variability(df):
	import pandas as pd

	# Assuming df is already defined and structured with 'tag' and 'word' columns

	# Calculate counts and types
	counts = df['tag'].value_counts().sort_index()
	types = df.groupby('tag')['word'].nunique()
	ratios = types / counts

	# Create the distribution DataFrame
	token_distribution_df = pd.DataFrame({
		'count': counts,
		'types': types,
		'ratio': ratios
	})

	# Calculate total tokens and unique tags
	total_tokens = df['tag'].count()
	unique_tags = df['tag'].nunique()

	# Calculate total tokens and unique NE tags (not 'O')
	total_ne_tokens = df[df['tag'] != "O"]['tag'].count()
	unique_ne_tags = df[df['tag'] != "O"]['tag'].nunique()

	# Adding totals to the DataFrame
	token_distribution_df.loc['Total'] = [total_tokens, unique_tags, unique_tags / total_tokens]
	token_distribution_df.loc['Total NEs'] = [total_ne_tokens, unique_ne_tags, unique_ne_tags / total_ne_tokens]


	token_distribution_df['NE Proportions'] = (
		token_distribution_df['count'] / total_ne_tokens
	)
	token_distribution_df['NE Proportions'] = token_distribution_df[
		'NE Proportions'
	].apply(lambda x: round(x * 100, 2))

	# This displays the DataFrame to see the result
	return token_distribution_df


In [15]:
types

tag
B-LOC       905
B-MISC      343
B-ORG       522
B-PERS     1089
I-LOC       145
I-MISC      220
I-ORG       401
I-PERS     1110
O         25931
Name: word, dtype: int64

In [16]:
create_variability(df)

Unnamed: 0_level_0,count,types,ratio,NE Proportions
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B-LOC,3776.0,905.0,0.239672,28.65
B-MISC,888.0,343.0,0.386261,6.74
B-ORG,1576.0,522.0,0.331218,11.96
B-PERS,2721.0,1089.0,0.400221,20.64
I-LOC,525.0,145.0,0.27619,3.98
I-MISC,375.0,220.0,0.586667,2.85
I-ORG,1115.0,401.0,0.359641,8.46
I-PERS,2205.0,1110.0,0.503401,16.73
O,111921.0,25931.0,0.23169,849.11
Total,125102.0,9.0,7.2e-05,949.11


In [17]:

from dash import Dash, dcc, html, Output, Input, State
app = Dash(__name__, suppress_callback_exceptions=True)

app_config = config_manager.app_config
server = app.server  # Flask server instance for caching
variants_data = None

data_manager = DataManager(config_manager, server)

In [18]:
data_manager.load_data()

2024-11-19 00:00:30 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com
2024-11-19 00:00:30 - INFO - Loading Dashboard Data from  /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/DashboardTest-v0/ANERCorp_CamelLab_arabertv02


  0%|          | 0/12 [00:00<?, ?it/s]

2024-11-19 00:00:33 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com
2024-11-19 00:00:33 - INFO - Loading Dashboard Data from  /Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/DashboardTest-v0/conll2003_bert


  0%|          | 0/12 [00:00<?, ?it/s]



{'ANERCorp_CamelLab_arabertv02': DashboardData(analysis_data=       Sentence Ids  Token Positions     Words    Tokens Word Pieces  \
 0                 0                0     [CLS]     [CLS]       [CLS]   
 1                 0                1  الصالحية  الصالحية    الصالحية   
 2                 0                2    المفرق    المفرق      المفرق   
 3                 0                3         -         -           -   
 4                 0                4       غيث       غيث         غيث   
 ...             ...              ...       ...       ...         ...   
 29706           960               10    للوليد     ##ليد  للو, ##ليد   
 29707           960               11        بن        بن          بن   
 29708           960               12      طلال      طلال        طلال   
 29709           960               13         .         .           .   
 29710           960               14     [SEP]     [SEP]       [SEP]   
 
       Core Tokens True Labels Token Selector Id Pred Labels  

In [19]:
isinstance(data_manager.variants_data['ANERCorp_CamelLab_arabertv02'].attention_similarity_matrix, np.ndarray)

True

In [20]:
df = data_manager.variants_data['ANERCorp_CamelLab_arabertv02'].analysis_data


In [21]:
df['True Labels'].value_counts()

True Labels
O          21616
IGNORED     2798
[CLS]        961
[SEP]        961
B-PERS       858
B-LOC        668
I-PERS       641
B-ORG        450
I-ORG        275
B-MISC       235
I-MISC       165
I-LOC         83
Name: count, dtype: int64

In [22]:
data_manager.variants_data['ANERCorp_CamelLab_arabertv02'].entity_report

Unnamed: 0,Tag,Precision,Recall,F1,Support
0,LOC,0.889204,0.926036,0.907246,676
1,MISC,0.712195,0.600823,0.651786,243
2,ORG,0.760181,0.732026,0.745838,459
3,PERS,0.880756,0.824309,0.851598,905
4,micro,0.843494,0.812089,0.827494,2283
5,macro,0.810584,0.770799,0.789117,2283
6,weighted,0.841074,0.812089,0.825545,2283


In [27]:
confs = data_manager.variants_data['ANERCorp_CamelLab_arabertv02'].entity_confusion_data

In [29]:
confs[confs['True Entity'] == 'LOC']['Pred Entity'].value_counts()

Pred Entity
LOC     626
O        42
MISC      4
ORG       3
PERS      1
Name: count, dtype: int64

In [24]:
tr_df = data_manager.variants_data['ANERCorp_CamelLab_arabertv02'].train_data

In [25]:
tr_df

Unnamed: 0,Sentence Ids,Token Positions,X,Y,Labels,Losses,Token Ids,Global Id,True Labels
0,0,0,9.446189,19.382036,-100,0.000000,2,2_0_0_-100,IGNORED
1,0,1,13.791077,-3.709414,5,0.006694,19876,19876_0_1_5,B-LOC
2,0,2,12.713058,12.945978,0,0.000059,14,14_0_2_0,O
3,0,3,11.556602,11.655669,-100,0.000000,120,120_0_3_-100,IGNORED
4,0,4,9.195292,14.356680,0,0.000398,113,113_0_4_0,O
...,...,...,...,...,...,...,...,...,...
147077,4148,28,10.635909,3.877772,0,0.000046,1259,1259_4148_28_0,O
147078,4148,29,8.651112,3.159802,0,0.000044,4537,4537_4148_29_0,O
147079,4148,30,8.473076,3.941707,0,0.000104,10776,10776_4148_30_0,O
147080,4148,31,-0.578020,15.568364,0,0.000043,20,20_4148_31_0,O


In [156]:
tr_df

Unnamed: 0,Sentence Ids,Token Positions,X,Y,Labels,Losses,Token Ids,Global Id,True Labels
0,0,0,9.446189,19.382036,-100,0.000000,2,2_0_0_-100,IGNORED
1,0,1,13.791077,-3.709414,5,0.006694,19876,19876_0_1_5,B-LOC
2,0,2,12.713058,12.945978,0,0.000059,14,14_0_2_0,O
3,0,3,11.556602,11.655669,-100,0.000000,120,120_0_3_-100,IGNORED
4,0,4,9.195292,14.356680,0,0.000398,113,113_0_4_0,O
...,...,...,...,...,...,...,...,...,...
147077,4148,28,10.635909,3.877772,0,0.000046,1259,1259_4148_28_0,O
147078,4148,29,8.651112,3.159802,0,0.000044,4537,4537_4148_29_0,O
147079,4148,30,8.473076,3.941707,0,0.000104,10776,10776_4148_30_0,O
147080,4148,31,-0.578020,15.568364,0,0.000043,20,20_4148_31_0,O


In [135]:
from arabert.preprocess import ArabertPreprocessor
from transformers import AutoTokenizer

In [136]:
model_name = 'aubmindlab/bert-base-arabertv02'
pre = ArabertPreprocessor(model_name)
tok = AutoTokenizer.from_pretrained(model_name)
tok.tokenize(pre.preprocess('--'))


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



['-', '-']

In [157]:
label_col = 'True Labels'
word_col = 'Token Ids'
DATA = tr_df[tr_df['Labels'] != -100]
counts = DATA[label_col].value_counts().sort_index()
types = DATA.groupby(label_col)[
	word_col
].nunique()
ratios = types / counts

token_distribution_df = pd.DataFrame(
	{
		'count': counts,
		'types': types,
		'ratio': ratios,
	}
)

totals = DATA[word_col].agg(["size", "nunique"]).tolist()
ne_totals = (
	DATA[DATA[label_col] != "O"][word_col]
	.agg(["size", "nunique"])
	.tolist()
)

token_distribution_df.loc["Total"] = totals + [totals[1] / totals[0]]
token_distribution_df.loc["Total NEs"] = ne_totals + [
	ne_totals[1] / ne_totals[0]
]

token_distribution_df['NE Proportions'] = (
		token_distribution_df['count'] / ne_totals[0]
	)
token_distribution_df['NE Proportions'] = token_distribution_df[
	'NE Proportions'
].apply(lambda x: round(x * 100, 2))

In [160]:
token_distribution_df.sort_values('count', ascending=False)

Unnamed: 0_level_0,count,types,ratio,NE Proportions
True Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total,124659.0,23010.0,0.184584,946.47
O,111488.0,20911.0,0.187563,846.47
Total NEs,13171.0,3445.0,0.261559,100.0
B-LOC,3772.0,861.0,0.228261,28.64
B-PERS,2719.0,1003.0,0.368886,20.64
I-PERS,2202.0,1005.0,0.456403,16.72
B-ORG,1576.0,507.0,0.321701,11.97
I-ORG,1115.0,391.0,0.350673,8.47
B-MISC,887.0,333.0,0.375423,6.73
I-LOC,525.0,141.0,0.268571,3.99


In [143]:
token_distribution_df.sort_values('count', ascending=False)

Unnamed: 0_level_0,count,types,ratio,NE Proportions
True Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total,24991.0,8455.0,0.338322,740.47
O,21616.0,7278.0,0.336695,640.47
Total NEs,3375.0,1480.0,0.438519,100.0
B-PERS,858.0,486.0,0.566434,25.42
B-LOC,668.0,264.0,0.39521,19.79
I-PERS,641.0,400.0,0.624025,18.99
B-ORG,450.0,205.0,0.455556,13.33
I-ORG,275.0,151.0,0.549091,8.15
B-MISC,235.0,120.0,0.510638,6.96
I-MISC,165.0,111.0,0.672727,4.89


In [9]:
df['Normalized Token Entropy'].describe()

count    29711.000000
mean        -0.177495
std          0.515464
min         -1.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: Normalized Token Entropy, dtype: float64

In [12]:
df['True Silhouette Score'].describe()

count    24991.000000
mean         0.606392
std          0.208912
min         -0.737800
25%          0.572700
50%          0.677900
75%          0.720700
max          0.782200
Name: True Silhouette Score, dtype: float64

In [12]:
import pandas as pd
def classify_error(true_label, pred_label):
    # If both are the same, it's correct (no error)
    if true_label == pred_label:
        return "No Errors"
    
    # Handle cases where one or both labels are 'O'
    if true_label == 'O' and pred_label != 'O':
        return "Chunk"  # False entity predicted
    if true_label != 'O' and pred_label == 'O':
        return "Entity and Chunk"  # Missed entity and chunk boundary
    
    # Extract entity types without position tags (like "B-", "I-")
    true_entity = true_label.split("-")[-1] if "-" in true_label else true_label
    pred_entity = pred_label.split("-")[-1] if "-" in pred_label else pred_label

    # If entity types are different (e.g., LOC vs. PER)
    if true_entity != pred_entity:
        # If both entity type and position (B- vs I-) are wrong
        return "Entity and Chunk" if true_label[0] != pred_label[0] else "Entity"

    # If entity types are the same but position tags (B- vs I-) are wrong
    return "Chunk"


# Apply the error classification
df['Modified Error Type'] = df.apply(lambda row: classify_error(row['True Labels'], row['Pred Labels']), axis=1)



In [16]:
df[(df['Token Max Entropy'] == 0)][['Token Max Entropy', 'Local Token Entropy']]

Unnamed: 0,Token Max Entropy,Local Token Entropy
2,0.0,0.0
3,0.0,0.0
8,0.0,0.0
9,0.0,0.0
10,0.0,0.0
...,...,...
29698,0.0,0.0
29699,0.0,0.0
29702,0.0,0.0
29704,0.0,0.0


In [49]:
df[df['Local Token Entropy'] != 0]

Index(['Sentence Ids', 'Token Positions', 'Words', 'Tokens', 'Word Pieces',
       'Core Tokens', 'True Labels', 'Token Selector Id', 'Pred Labels',
       'Agreements', 'X', 'Y', 'Labels', 'Losses', 'Token Ids', 'Global Id',
       'True Token Score', 'Pred Token Score', 'K=3', 'Boundary Clusters',
       'K=4', 'Entity Clusters', 'K=9', 'Token Clusters', 'Consistency Count',
       'Inconsistency Count', 'Total Train Occurrences', 'Local Token Entropy',
       'Token Max Entropy', 'Dataset Token Entropy', 'Local Word Entropy',
       'Word Max Entropy', 'Dataset Word Entropy', 'Tokenization Rate',
       'TR Entity', 'PR Entity', 'Error Type', 'O Confidence',
       'B-PERS Confidence', 'I-PERS Confidence', 'B-ORG Confidence',
       'I-ORG Confidence', 'B-LOC Confidence', 'I-LOC Confidence',
       'B-MISC Confidence', 'I-MISC Confidence', 'Prediction Entropy',
       'Prediction Max Entropy', 'Token Confidence', 'Variability', 'Pre X',
       'Pre Y', 'Consistency Ratio', 'Inconsis

In [14]:
(df['Modified Error Type'] == df['Error Type']).value_counts()

True    29711
Name: count, dtype: int64

In [29]:
df[df['Modified Error Type'] == "Entity and Chunk"][['Modified Error Type', "Error Type", "True Labels", "Pred Labels"]].sample(50)

Unnamed: 0,Modified Error Type,Error Type,True Labels,Pred Labels
8532,Entity and Chunk,Chunk,B-PERS,O
23544,Entity and Chunk,Chunk,B-PERS,O
6008,Entity and Chunk,Chunk,B-MISC,O
1403,Entity and Chunk,Chunk,B-MISC,O
7471,Entity and Chunk,Chunk,B-ORG,O
12039,Entity and Chunk,Chunk,I-ORG,O
24486,Entity and Chunk,Chunk,I-MISC,O
25603,Entity and Chunk,Chunk,I-MISC,O
24637,Entity and Chunk,Chunk,I-ORG,O
23651,Entity and Chunk,Chunk,B-PERS,O


In [49]:
df.columns

Index(['Sentence Ids', 'Token Positions', 'Words', 'Tokens', 'Word Pieces',
       'Core Tokens', 'True Labels', 'Token Selector Id', 'Pred Labels',
       'Agreements', 'X', 'Y', 'Labels', 'Losses', 'Token Ids', 'Global Id',
       'True Token Score', 'Pred Token Score', 'K=3', 'Boundary Clusters',
       'K=4', 'Entity Clusters', 'K=9', 'Token Clusters', 'Consistency Count',
       'Inconsistency Count', 'Total Train Occurrences', 'Local Token Entropy',
       'Token Max Entropy', 'Dataset Token Entropy', 'Local Word Entropy',
       'Word Max Entropy', 'Dataset Word Entropy', 'Tokenization Rate',
       'TR Entity', 'PR Entity', 'Error Type', 'O Confidence',
       'B-PERS Confidence', 'I-PERS Confidence', 'B-ORG Confidence',
       'I-ORG Confidence', 'B-LOC Confidence', 'I-LOC Confidence',
       'B-MISC Confidence', 'I-MISC Confidence', 'Prediction Entropy',
       'Prediction Max Entropy', 'Token Confidence', 'Variability', 'Pre X',
       'Pre Y', 'Consistency Ratio', 'Inconsis

In [51]:
df[df['Losses']>5][['Pred Labels', 'Losses', 'Token Confidence', 'B-PERS Confidence', 'O Confidence', 'B-LOC Confidence', 'B-ORG Confidence', 'Prediction Entropy', 'Variability']]

Unnamed: 0,Pred Labels,Losses,Token Confidence,B-PERS Confidence,O Confidence,B-LOC Confidence,B-ORG Confidence,Prediction Entropy,Variability
50,O,7.6469,0.9986,0.0005,0.9986,0.0001,0.0003,0.0196,0.3138
379,B-PERS,8.1570,0.9981,0.9981,0.0009,0.0002,0.0003,0.0242,0.3136
415,O,10.0408,0.9995,0.0001,0.9995,0.0000,0.0001,0.0068,0.3141
762,I-LOC,6.7695,0.9886,0.0007,0.0006,0.0048,0.0011,0.1154,0.3103
816,O,11.8740,0.9999,0.0000,0.9999,0.0000,0.0000,0.0010,0.3142
...,...,...,...,...,...,...,...,...,...
29010,I-PERS,5.0672,0.9862,0.0023,0.0063,0.0002,0.0002,0.1329,0.3094
29068,O,9.4033,0.9983,0.0007,0.9983,0.0001,0.0001,0.0222,0.3137
29142,O,7.2562,0.9986,0.0000,0.9986,0.0003,0.0001,0.0185,0.3138
29413,I-PERS,5.9066,0.9935,0.0027,0.0008,0.0002,0.0002,0.0706,0.3120


In [72]:
df[(df['B-LOC Confidence']>0.6)&(df['B-LOC Confidence']<0.8)][['Variability', 'Normalized Prediction Entropy', 'Prediction Entropy', 'B-PERS Confidence', 'B-LOC Confidence', 'B-ORG Confidence', 'B-MISC Confidence', 'O Confidence', 'I-PERS Confidence', 'I-LOC Confidence', 'I-ORG Confidence', 'I-MISC Confidence',]]

Unnamed: 0,Variability,Normalized Prediction Entropy,Prediction Entropy,B-PERS Confidence,B-LOC Confidence,B-ORG Confidence,B-MISC Confidence,O Confidence,I-PERS Confidence,I-LOC Confidence,I-ORG Confidence,I-MISC Confidence
35,0.2287,0.354869,1.1249,0.0011,0.7347,0.0019,0.0009,0.0553,0.0021,0.1978,0.0054,0.0008
4313,0.2038,0.446607,1.4157,0.0014,0.6587,0.005,0.0054,0.1719,0.0015,0.1468,0.0074,0.002
4448,0.2343,0.305372,0.968,0.0016,0.74,0.0011,0.0007,0.014,0.0009,0.239,0.002,0.0007
4493,0.2178,0.444115,1.4078,0.0067,0.7153,0.0387,0.125,0.0959,0.0013,0.0117,0.0038,0.0015
6138,0.2206,0.329632,1.0449,0.0024,0.6732,0.0096,0.0009,0.3094,0.0012,0.0026,0.0007,0.0002
6167,0.2237,0.355595,1.1272,0.0104,0.7039,0.0202,0.0023,0.2558,0.0022,0.0034,0.0013,0.0005
6336,0.2309,0.32796,1.0396,0.0099,0.7302,0.2395,0.01,0.0074,0.0003,0.0011,0.0013,0.0002
8844,0.2277,0.385123,1.2208,0.0123,0.7332,0.1978,0.0122,0.0279,0.0056,0.0029,0.0041,0.004
10147,0.2199,0.330988,1.0492,0.0007,0.6712,0.0148,0.0013,0.3089,0.0002,0.002,0.0008,0.0001
10148,0.2445,0.311903,0.9887,0.0006,0.7899,0.0096,0.0006,0.1522,0.0007,0.0436,0.0023,0.0003


In [58]:
df.groupby('True Labels')['Dataset Token Entropy'].mean()

True Labels
B-LOC     -0.121848
B-MISC    -0.214932
B-ORG     -0.139246
B-PERS    -0.238405
I-LOC     -0.048953
I-MISC    -0.137872
I-ORG     -0.214284
I-PERS    -0.211615
IGNORED   -0.998909
O         -0.067060
[CLS]     -1.000000
[SEP]     -1.000000
Name: Dataset Token Entropy, dtype: float64

In [37]:
df[df['Tokens'] == 'بن'][['Losses', 'Tokens', 'True Labels', 'Consistency Count', 'Inconsistency Count']]

Unnamed: 0,Losses,Tokens,True Labels,Consistency Count,Inconsistency Count
683,0.0056,بن,I-PERS,103,11
845,0.0114,بن,B-PERS,8,106
860,0.0182,بن,B-PERS,8,106
4905,0.0011,بن,O,1,113
7459,0.0043,بن,I-PERS,103,11
7505,0.0049,بن,I-PERS,103,11
7665,0.005,بن,I-PERS,103,11
7684,0.0077,بن,I-PERS,103,11
7696,0.1508,بن,I-PERS,103,11
7809,0.0115,بن,I-PERS,103,11


In [13]:
df.columns

Index(['Sentence Ids', 'Token Positions', 'Words', 'Tokens', 'Word Pieces',
       'Core Tokens', 'True Labels', 'Token Selector Id', 'Pred Labels',
       'Agreements', 'X', 'Y', 'Labels', 'Losses', 'Token Ids', 'Global Id',
       'True Token Score', 'Pred Token Score', 'K=3', 'Boundary Clusters',
       'K=4', 'Entity Clusters', 'K=9', 'Token Clusters', 'Consistency Count',
       'Inconsistency Count', 'Total Train Occurrences', 'Local Token Entropy',
       'Token Max Entropy', 'Dataset Token Entropy', 'Local Word Entropy',
       'Word Max Entropy', 'Dataset Word Entropy', 'Tokenization Rate',
       'TR Entity', 'PR Entity', 'Error Type', 'O Confidence',
       'B-PERS Confidence', 'I-PERS Confidence', 'B-ORG Confidence',
       'I-ORG Confidence', 'B-LOC Confidence', 'I-LOC Confidence',
       'B-MISC Confidence', 'I-MISC Confidence', 'Prediction Entropy',
       'Prediction Max Entropy', 'Token Confidence', 'Variability', 'Pre X',
       'Pre Y', 'Consistency Ratio', 'Inconsis

In [14]:
df['Word Pieces'].apply(
                lambda x: ', '.join(x) if isinstance(x, list) else x
            )

0             [CLS]
1          الصالحية
2            المفرق
3                 -
4               غيث
            ...    
29706    للو, ##ليد
29707            بن
29708          طلال
29709             .
29710         [SEP]
Name: Word Pieces, Length: 29711, dtype: object

In [75]:
# Example DataFrame
data = {
    'Raw Entropy': [0, -1, 5, 2.5, 3, 7],
    'Max Entropy': [0, -1, 10, 4, 0, 0]  # Including zero and non-zero cases
}

df = pd.DataFrame(data)

# Apply the function
df['Normalized Entropy'] = normalized_entropy(df, 'Raw Entropy', 'Max Entropy')

print(df)

   Raw Entropy  Max Entropy  Normalized Entropy
0          0.0            0               0.000
1         -1.0           -1              -1.000
2          5.0           10               0.500
3          2.5            4               0.625
4          3.0            0                 inf
5          7.0            0                 inf


In [62]:
df['Inconsistency Ratio'].nunique()

334

In [74]:
df[df['Token Max Entropy']==0]['Local Token Entropy'].isna().value_counts()

Local Token Entropy
False    16035
Name: count, dtype: int64