In [2]:
import sys
import os
from pathlib import Path
# This appends the directory one level up (the root of your project) to the sys.path.
# Modify the path depending on the location of modules you want to import.
sys.path.append(os.path.abspath('../../'))

from config.config_managers import DashboardConfigManager
from dataManager import DataManager
from dash import Dash
import pandas as pd
import numpy as np
import plotly.express as px
from abc import ABC, abstractmethod
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter

2025-09-19 23:00:23 - INFO - PyTorch version 2.2.2 available.


In [3]:

class CorpusProcessor:
    """Builds a ``{dataset name: {'splits': {'train': df, 'test': df}}}`` mapping from raw dashboard data."""

    def __init__(self, dash_data, datasets):
        """
        Keeps the raw inputs and starts with an empty corpora mapping.

        Parameters:
        - dash_data (dict): Maps dataset keys to objects exposing `train_data` and
          `analysis_data` dataframes.
        - datasets (dict): Maps keys of `dash_data` to the standardized dataset names.
        """
        self.dash_data = dash_data
        self.datasets = datasets
        self.corpora = {}

    def filter_data(self, df):
        """
        Drops rows labelled -100 and rewrites PERS tags to their PER spelling.

        Parameters:
        - df (pd.DataFrame): Frame with a 'Labels' and a 'True Labels' column.

        Returns:
        - pd.DataFrame: A filtered copy; the input frame is left untouched.
        """
        keep_rows = df['Labels'] != -100
        cleaned = df[keep_rows].copy()  # copy so the column assignment below is safe
        cleaned['True Labels'] = cleaned['True Labels'].replace(
            {'B-PERS': 'B-PER', 'I-PERS': 'I-PER'}
        )
        return cleaned

    def process(self):
        """
        Filters the train and analysis frames of every configured dataset.

        The train frame additionally has its 'core_tokens' column renamed to
        'Core Tokens'; the test frame is stored as filtered.

        Returns:
        - dict: `self.corpora`, keyed by standardized dataset name.
        """
        for source_key, dataset_name in self.datasets.items():
            print(f'Processing {dataset_name}...')

            train_split = self.filter_data(self.dash_data[source_key].train_data)
            print(f'{dataset_name} Train Size: {len(train_split)}')

            test_split = self.filter_data(self.dash_data[source_key].analysis_data)
            print(f'{dataset_name} Test Size: {len(test_split)}')

            splits = {
                'train': train_split.rename(columns={"core_tokens": "Core Tokens"}),
                'test': test_split,
            }
            self.corpora[dataset_name] = {'splits': splits}

        return self.corpora

    def calculate_tokens_and_uniques(self, df):
        """
        Counts the rows of a split and the distinct values of its 'Core Tokens' column.

        Parameters:
        - df (pd.DataFrame): A processed split.

        Returns:
        - tuple: (total tokens, unique words)
        """
        # dropna=False keeps a missing value counted as one distinct entry.
        distinct_words = df['Core Tokens'].nunique(dropna=False)
        return len(df), distinct_words

    def compute_token_statistics(self):
        """
        Summarizes every stored split as one row of token counts.

        Returns:
        - pd.DataFrame: Columns 'Dataset' (e.g. "Name (Train)"), 'Total Tokens', 'Unique Words'.
        """
        rows = [
            (f"{dataset_name} ({split_name.capitalize()})",
             *self.calculate_tokens_and_uniques(frame))
            for dataset_name, entry in self.corpora.items()
            for split_name, frame in entry['splits'].items()
        ]
        return pd.DataFrame({
            "Dataset": [row[0] for row in rows],
            "Total Tokens": [row[1] for row in rows],
            "Unique Words": [row[2] for row in rows],
        })



class TokenTypeAnalyzer:
    """Per-tag token counts, type counts and type-token ratio (TTR) for every dataset split."""

    def __init__(self, datasets):
        """
        Store the corpora to analyze.

        Parameters:
            datasets (dict): Maps dataset name to ``{'splits': {split name: DataFrame}}``.
        """
        self.datasets = datasets

    def extract_tag_token_type(self, df):
        """
        Summarize one split per entity tag, ignoring rows tagged 'O'.

        Parameters:
            df (DataFrame): Frame with 'True Labels' and 'Token Ids' columns.

        Returns:
            DataFrame: Columns 'Tag', 'Total Tokens' (row count), 'Tag Types'
            (distinct token ids) and 'TTR' (types / tokens, 4 decimals).
        """
        entity_rows = df[df['True Labels'] != 'O']
        summary = (
            entity_rows
            .groupby('True Labels')['Token Ids']
            .agg(['count', 'nunique'])
            .reset_index()
        )
        summary.columns = ['Tag', 'Total Tokens', 'Tag Types']
        summary['TTR'] = (summary['Tag Types'] / summary['Total Tokens']).round(4)
        return summary

    def analyze_splits(self):
        """
        Run `extract_tag_token_type` on every split of every dataset.

        Returns:
            DataFrame: The per-split summaries stacked together, with 'Dataset'
            and 'Split' columns identifying where each row came from.
        """
        per_split = [
            self.extract_tag_token_type(frame).assign(Dataset=dataset_name, Split=split_name)
            for dataset_name, entry in self.datasets.items()
            for split_name, frame in entry['splits'].items()
        ]
        return pd.concat(per_split, ignore_index=True)

class EntityTagVisualizer:
    """Faceted bar charts of a per-tag metric, one facet per dataset and split."""

    def __init__(self, df):
        """
        Store the frame to plot and the display mappings.

        Parameters:
            df (pd.DataFrame): Frame with 'Dataset', 'Tag' and 'Split' columns plus the metrics to plot.
        """
        self.df = df
        # Bar colour per tag; the PERS spellings reuse the colours of their PER forms.
        self.color_map = {
            "B-LOC": "darkgreen", "B-PERS": "deepskyblue", "B-PER": "deepskyblue",
            "B-ORG": "darkcyan", "B-MISC": "palevioletred",
            "I-LOC": "yellowgreen", "I-PERS": "lightblue", "I-PER": "lightblue",
            "I-ORG": "cyan", "I-MISC": "violet",
            "O": "saddlebrown",
        }
        self.data_map = {"ANERCorp_CamelLab": "ANERCorp"}
        self.label_map = {"B-PERS": "B-PER", "I-PERS": "I-PER"}
        self.split_map = {"train": "Train", "test": "Test"}

        # Order in which tags appear on the x axis.
        self.tag_order = ["B-LOC", "I-LOC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-MISC", "I-MISC"]

    def preprocess_data(self):
        """
        Rewrite dataset names, tag labels and split names in place on `self.df`,
        then make 'Tag' an ordered categorical following `self.tag_order`.
        """
        frame = self.df
        replacements = (
            ('Dataset', self.data_map),
            ('Tag', self.label_map),
            ('Split', self.split_map),
        )
        for column, mapping in replacements:
            frame[column] = frame[column].replace(mapping)

        # Tags outside tag_order become missing values in the categorical.
        frame['Tag'] = pd.Categorical(frame['Tag'], categories=self.tag_order, ordered=True)

    def plot_faceted_bar_chart(self, metric, text, title):
        """
        Draw and show a bar chart of `metric` per tag, faceted by dataset (columns) and split (rows).

        Parameters:
            metric (str): Column to plot on the y axis (e.g., "TTR").
            text: Value handed to plotly's `text` argument (bar labels).
            title (str): Title of the plot.
        """
        # Sorting by the 'Tag' column fixes the bar order; the sorted frame is kept on self.
        self.df = self.df.sort_values(by=["Tag"])

        bar_options = dict(
            x="Tag",
            y=metric,
            color="Tag",
            facet_col="Dataset",
            facet_row="Split",
            color_discrete_map=self.color_map,
            text=text,
            title=title,
            labels={"Tag": "Entity Tag", metric: metric},
        )
        fig = px.bar(self.df, **bar_options)

        fig.update_layout(
            template="plotly_white",
            height=600,
            width=1200,
            margin={"t": 60, "l": 20, "r": 20, "b": 20},
        )
        fig.show()


class OverlapMatrixAnalyzer:
    """Counts the word types shared between entity tags and plots them as heatmaps."""

    def __init__(self, corpora, tags):
        """
        Initialize the OverlapMatrixAnalyzer.

        Parameters:
            corpora (dict): Maps dataset name to ``{'splits': {split name: DataFrame}}``.
            tags (list): Entity tags to include, in the order they should appear on the axes.
        """
        self.corpora = corpora
        self.tags = tags

    def extract_overlap_matrix(self, data):
        """
        Compute the word type overlap matrix across entity tags.

        Parameters:
            data (pd.DataFrame): Frame with 'Core Tokens' and 'True Labels' columns.
                'PERS' in a label is rewritten to 'PER' before matching against `self.tags`.

        Returns:
            pd.DataFrame: Square integer matrix indexed by tag on both axes. Cell (a, b)
            is the number of distinct core tokens seen under both tags; the diagonal is 0.
        """
        tags = list(self.tags)
        tag_types = {tag: set() for tag in tags}

        for token, tag in zip(data['Core Tokens'], data['True Labels']):
            normalised_tag = tag.replace('PERS', 'PER')
            if normalised_tag in tag_types:
                tag_types[normalised_tag].add(token)

        # Build the counts directly as integers instead of filling an empty frame,
        # whose dtype is not guaranteed to stay integral.
        counts = [
            [0 if tag1 == tag2 else len(tag_types[tag1] & tag_types[tag2]) for tag2 in tags]
            for tag1 in tags
        ]
        return pd.DataFrame(counts, index=tags, columns=tags, dtype=int)

    def _split_frames(self, split):
        """Return the ANERCorp and CoNLL-2003 frames of `split`, keyed by display name."""
        return {
            'ANERCorp': self.corpora['ANERCorp']['splits'][split],
            'CoNLL-2003': self.corpora['CoNLL-2003']['splits'][split],
        }

    @staticmethod
    def _lower_triangle_with_text(matrix):
        """Blank out everything above the diagonal and build the matching cell labels."""
        mask_lower = np.tril(np.ones(matrix.shape, dtype=bool))
        lower_triangle = matrix.mask(~mask_lower)
        # Masked cells get an empty label; kept cells show their integer count.
        text_data = np.where(
            lower_triangle.isnull(), '', lower_triangle.fillna(0).astype(int).astype(str)
        )
        return lower_triangle, text_data

    def visualize_overlap(self, split):
        """
        Show side-by-side overlap heatmaps for ANERCorp and CoNLL-2003 that share one colour axis.

        Parameters:
            split (str): Dataset split to analyze ('train' or 'test').
        """
        datasets = self._split_frames(split)

        fig = make_subplots(
            rows=1, cols=len(datasets),
            subplot_titles=list(datasets.keys()),
            horizontal_spacing=0.1
        )

        # Compute every matrix first so the colour range can cover all of them.
        matrices = [(name, self.extract_overlap_matrix(frame)) for name, frame in datasets.items()]
        max_value = 0
        for _, matrix in matrices:
            max_value = max(max_value, matrix.to_numpy().max())

        for col, (_, matrix) in enumerate(matrices, start=1):
            lower_triangle, text_data = self._lower_triangle_with_text(matrix)
            fig.add_trace(
                go.Heatmap(
                    z=lower_triangle,
                    x=lower_triangle.columns,
                    y=lower_triangle.index,
                    coloraxis="coloraxis",  # Shared color scale
                    showscale=True,
                    text=text_data,
                    texttemplate="%{text}",
                    hoverinfo="text+z"
                ),
                row=1, col=col
            )

        split_title = "Training Split" if split == "train" else "Testing Split"
        fig.update_layout(
            title_text=f"Token Type Overlap Across Entity Tags ({split_title})",
            template="plotly_white",
            height=600,
            width=1200,
            coloraxis=dict(
                colorscale='RdBu_r',
                cmin=0,
                cmax=max_value,  # same upper bound for every heatmap
                colorbar=dict(title="Counts")
            ),
            xaxis=dict(showgrid=False),
            xaxis2=dict(showgrid=False),
            yaxis=dict(showgrid=False),
            yaxis2=dict(showgrid=False)
        )

        fig.show()

    def visualize_overlap_one_scale(self, split):
        """
        Show overlap heatmaps for ANERCorp and CoNLL-2003 with a single visible colour bar.

        Only the second heatmap displays its colour bar. Both heatmaps are pinned to
        the same `zmin`/`zmax`, so that one bar is valid for both panels.

        Parameters:
            split (str): Dataset split to analyze ('train' or 'test').
        """
        datasets = self._split_frames(split)

        anercorp_matrix = self.extract_overlap_matrix(datasets['ANERCorp'])
        conll_matrix = self.extract_overlap_matrix(datasets['CoNLL-2003'])

        lower_triangle_anercorp, anercorp_text = self._lower_triangle_with_text(anercorp_matrix)
        lower_triangle_conll, conll_text = self._lower_triangle_with_text(conll_matrix)

        # Without an explicit range each trace auto-scales to its own data and the
        # single colour bar would only describe the second panel.
        shared_max = max(anercorp_matrix.to_numpy().max(), conll_matrix.to_numpy().max())
        shared_style = dict(
            colorscale="RdBu_r",
            zmin=0,
            zmax=shared_max,
            texttemplate="%{text}",
            hoverinfo="text+z",
        )

        fig = make_subplots(
            rows=1, cols=2,
            subplot_titles=('ANERCorp', 'CoNLL-2003'),
            horizontal_spacing=0.1
        )

        # ANERCorp heatmap: colour bar hidden.
        fig.add_trace(
            go.Heatmap(
                z=lower_triangle_anercorp,
                x=lower_triangle_anercorp.columns,
                y=lower_triangle_anercorp.index,
                showscale=False,
                text=anercorp_text,
                **shared_style
            ),
            row=1, col=1
        )

        # CoNLL-2003 heatmap: carries the colour bar for both panels.
        fig.add_trace(
            go.Heatmap(
                z=lower_triangle_conll,
                x=lower_triangle_conll.columns,
                y=lower_triangle_conll.index,
                showscale=True,
                text=conll_text,
                **shared_style
            ),
            row=1, col=2
        )

        fig.update_layout(
            title_text=f"Token Type Overlap Across Entity Tags ({split.capitalize()} Split)",
            template="plotly_white",
            height=600,
            width=1200,
            xaxis=dict(showgrid=False),
            xaxis2=dict(showgrid=False),
            yaxis=dict(showgrid=False),
            yaxis2=dict(showgrid=False)
        )

        fig.show()



class EntityTagTTRVisualizer:
    def __init__(self, df):
        """
        Initialize the EntityTagTTRVisualizer.
        
        Parameters:
            df (pd.DataFrame): The DataFrame containing entity tag statistics, including TTR.
        """
        self.df = df
        self.color_map = {
            "B-LOC": "darkgreen",
            "B-PERS": "deepskyblue",
            "B-PER": "deepskyblue",
            "B-ORG": "darkcyan",
            "B-MISC": "palevioletred",
            "I-LOC": "yellowgreen",
            "I-PERS": "lightblue",
            "I-PER": "lightblue",
            "I-ORG": "cyan",
            "I-MISC": "violet",
            "O": "saddlebrown",
        }
        self.data_map = {"ANERCorp_CamelLab": "ANERCorp"}
        self.label_map = {"B-PERS": "B-PER", "I-PERS": "I-PER"}
        self.split_map = {"train": "Train", "test": "Test"}
        self.tag_order = ["B-LOC", "I-LOC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-MISC", "I-MISC"]

    def preprocess_data(self):
        """
        Standardizes dataset names, tag labels, and split names according to the mappings.
        Also, rounds TTR values for better readability.
        """
        self.df['Dataset'] = self.df['Dataset'].replace(self.data_map)
        self.df['Tag'] = self.df['Tag'].replace(self.label_map)
        self.df['Split'] = self.df['Split'].replace(self.split_map)
        self.df['TTR'] = self.df['TTR'].round(4)  # Ensure 4 decimal places
        
        # Enforce consistent ordering of tags
        self.df['Tag'] = pd.Categorical(self.df['Tag'], categories=self.tag_order, ordered=True)
        
        # Sort the DataFrame by Tag to ensure consistent plotting order
        self.df = self.df.sort_values(by=['Tag'])

    def plot_bar_chart(self, split):
        """
        Creates a grouped bar chart for TTR across entity tags for the specified split.
        
        Parameters:
            split (str): The data split to visualize ('Train' or 'Test').
        """
        filtered_df = self.df[self.df['Split'] == split]

        fig = go.Figure()

        for dataset in filtered_df['Dataset'].unique():
            subset = filtered_df[filtered_df['Dataset'] == dataset]
            fig.add_trace(go.Bar(
                x=subset['Tag'],
                y=subset['TTR'],
                name=dataset,
                text=subset['TTR'],
                textposition='auto',
                marker=dict(color=[self.color_map[tag] for tag in subset['Tag']])
            ))

        # Update layout
        fig.update_layout(
            title=f'Type-to-Token Ratio (TTR) Across Entity Tags in ANERCorp and CoNLL-2003 ({split} Split)',
            template="plotly_white",
            xaxis_title='Entity Tag',
            yaxis_title='TTR',
            barmode='group',
            legend_title="Dataset",
            height=500,
            width=1300
        )

        fig.show()

    def plot_faceted_bar_chart(self):
        """
        Plots a faceted bar chart with datasets and splits displayed separately.
        """
        fig = px.bar(
            self.df,
            x='Tag',
            y='TTR',
            color='Tag',
            facet_col='Dataset',
            facet_row='Split',
            color_discrete_map=self.color_map,
            text='TTR',
            title="Type-to-Token Ratio (TTR) Across Entity Tags in ANERCorp and CoNLL-2003",
            labels={"Tag": "Entity Tag"}
        )

        # Update layout for better readability
        fig.update_layout(
            template="plotly_white",
            height=700,
            width=1200,
            margin=dict(t=60, l=20, r=20, b=20),
            title_x=0.5
        )

        # Update text formatting
        fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')

        fig.show()


class EntityTagDistributionAnalyzer:
    """Frequency statistics (mean, median, standard deviation) of token ids per entity tag."""

    # Column layout of the statistics frame; also used when there is nothing to report.
    _STAT_COLUMNS = [
        'Dataset', 'Split', 'Tag',
        'Mean Frequency', 'Median Frequency', 'Standard Deviation',
    ]

    def __init__(self, corpora, datasets_of_interest, tags_of_interest):
        """
        Initialize the analyzer with corpora, datasets of interest, and tags of interest.

        Parameters:
            corpora (dict): Maps dataset name to ``{'splits': {split name: DataFrame}}``.
            datasets_of_interest (list): Keys of `corpora` to analyze.
            tags_of_interest (list): Entity tags to report on.
        """
        self.corpora = corpora
        self.datasets_of_interest = datasets_of_interest
        self.tags_of_interest = tags_of_interest
        self.dataset_mapping = {'ANERCorp_CamelLab': 'ANERCorp'}
        self.split_mapping = {'train': 'Train', 'test': 'Test'}
        self.label_map = {"B-PERS": "B-PER", "I-PERS": "I-PER"}

    def preprocess_data(self, df):
        """
        Apply the dataset, split and tag mappings to `df` in place and return it.
        """
        df['Dataset'] = df['Dataset'].replace(self.dataset_mapping)
        df['Split'] = df['Split'].replace(self.split_mapping)
        df['Tag'] = df['Tag'].replace(self.label_map)
        return df

    def _count_tokens_per_tag(self, data):
        """Count token id occurrences per tag of interest in a single pass over `data`."""
        tag_word_counts = {tag: Counter() for tag in self.tags_of_interest}
        for token, entity in zip(data['Token Ids'], data['True Labels']):
            # 'PERS' labels are matched against their 'PER' spelling.
            counter = tag_word_counts.get(entity.replace('PERS', 'PER'))
            if counter is not None:
                counter[token] += 1
        return tag_word_counts

    def calculate_distributions(self, splits):
        """
        Compute frequency statistics of token ids for each (split, dataset, tag).

        For every tag the occurrences of each distinct token id are counted; the
        mean, median and (population) standard deviation of those counts are
        reported, rounded to 2 decimals. A tag with no tokens gets zeros.

        Parameters:
            splits (list): The data splits to analyze (e.g., ['train', 'test']).

        Returns:
            pd.DataFrame: One row per (split, dataset, tag) with standardized names.
            Empty, but with the usual columns, when there is nothing to analyze.
        """
        all_data = []

        for split in splits:
            for dataset in self.datasets_of_interest:
                data = self.corpora[dataset]['splits'][split]

                for tag, counter in self._count_tokens_per_tag(data).items():
                    frequencies = np.array(list(counter.values()))
                    if frequencies.size > 0:
                        mean_freq = round(np.mean(frequencies), 2)
                        median_freq = round(np.median(frequencies), 2)
                        std_dev = round(np.std(frequencies), 2)
                    else:
                        mean_freq, median_freq, std_dev = 0, 0, 0

                    all_data.append({
                        'Dataset': dataset,
                        'Split': split,
                        'Tag': tag,
                        'Mean Frequency': mean_freq,
                        'Median Frequency': median_freq,
                        'Standard Deviation': std_dev
                    })

        # Passing the columns keeps them present even when no rows were produced.
        df = pd.DataFrame(all_data, columns=self._STAT_COLUMNS)
        return self.preprocess_data(df)

    def visualize_distributions(self, df, metric, title):
        """
        Show a grouped bar chart of `metric` per tag, coloured by dataset, one row per split.

        Parameters:
            df (pd.DataFrame): Output of `calculate_distributions`.
            metric (str): The column to plot (e.g., 'Standard Deviation').
            title (str): The title of the plot.
        """
        fig = px.bar(
            df,
            x='Tag',
            y=metric,
            color='Dataset',
            text=metric,
            barmode='group',
            facet_row='Split',
            title=title,
            template="plotly_white"
        )

        fig.update_layout(
            xaxis_title='Entity Tag',
            yaxis_title=metric,
            height=800,
            width=1200,
            legend_title="Dataset"
        )

        fig.show()


class OOVAnalyzer:
    """Per-tag out-of-vocabulary (OOV) rates of test core tokens relative to the train split."""

    # Column layout of the OOV frame; also used when there is nothing to report.
    _OOV_COLUMNS = [
        "Dataset", "Tag", "OOV Core Tokens Count",
        "Total Unique Core Tokens in Test", "OOV Rate",
    ]

    def __init__(self, corpora, datasets_of_interest, tags_of_interest):
        """
        Initialize the OOV Analyzer.

        Parameters:
            corpora (dict): Maps dataset name to ``{'splits': {split name: DataFrame}}``.
            datasets_of_interest (list): Keys of `corpora` to analyze.
            tags_of_interest (list): List of entity tags to analyze.
        """
        self.corpora = corpora
        self.datasets_of_interest = datasets_of_interest
        self.tags_of_interest = tags_of_interest
        self.dataset_mapping = {"ANERCorp_CamelLab": "ANERCorp"}  # Standardized dataset names

    def _core_tokens_per_tag(self, frame):
        """Collect the distinct 'Core Tokens' seen under each tag of interest."""
        tokens_by_tag = {tag: set() for tag in self.tags_of_interest}
        # Iterating the two columns directly avoids building a Series per row.
        for token, label in zip(frame["Core Tokens"], frame["True Labels"]):
            bucket = tokens_by_tag.get(label.replace("PERS", "PER"))
            if bucket is not None:
                bucket.add(token)
        return tokens_by_tag

    def calculate_oov_rates(self):
        """
        Computes OOV rates for each entity tag in the test split using core tokens.

        A test core token is OOV for a tag when it never occurs under that same tag
        in the train split. Datasets lacking a 'train' or 'test' split are skipped.

        Returns:
            pd.DataFrame: One row per (dataset, tag) with the OOV count, the number of
            unique core tokens in test, and the OOV rate as a fraction in [0, 1]
            rounded to 4 decimals (0 when the tag has no test tokens).
        """
        all_data = []

        for dataset in self.datasets_of_interest:
            splits = self.corpora[dataset]["splits"]
            if "train" not in splits or "test" not in splits:
                continue

            train_tokens = self._core_tokens_per_tag(splits["train"])
            test_tokens = self._core_tokens_per_tag(splits["test"])

            for tag in self.tags_of_interest:
                oov_tokens = test_tokens[tag] - train_tokens[tag]
                oov_rate = len(oov_tokens) / len(test_tokens[tag]) if test_tokens[tag] else 0

                all_data.append({
                    "Dataset": dataset,
                    "Tag": tag,
                    "OOV Core Tokens Count": len(oov_tokens),
                    "Total Unique Core Tokens in Test": len(test_tokens[tag]),
                    "OOV Rate": round(oov_rate, 4)
                })

        # Passing the columns keeps them present even when no rows were produced.
        df = pd.DataFrame(all_data, columns=self._OOV_COLUMNS)
        return df.replace({"Dataset": self.dataset_mapping})  # Standardize dataset names

    def visualize_oov_rates(self, df):
        """
        Generates a grouped bar chart comparing OOV rates between datasets.

        Parameters:
            df (pd.DataFrame): Output of `calculate_oov_rates`.
        """
        fig = px.bar(
            df,
            x="Tag",
            y="OOV Rate",
            color="Dataset",
            text=df["OOV Rate"].round(3),
            barmode="group",
            title="Comparison of OOV Rates Across Entity Tags in ANERCorp and CoNLL-2003",
            # The rate is a fraction in [0, 1], so the axis is not labelled as a percentage.
            labels={"OOV Rate": "OOV Rate", "Tag": "Entity Tag"}
        )

        fig.update_traces(textposition="outside")
        fig.update_layout(
            template="plotly_white",
            height=500,
            width=1200,
            legend_title="Dataset"
        )

        fig.show()

import pandas as pd
import plotly.express as px

class TokenizationRateAnalyzer:
    """Per-tag tokenization-rate statistics for the train and test split of each dataset."""

    # Column layout of the metrics frame; used when there is nothing to report.
    _METRIC_COLUMNS = [
        "True Labels", "mean", "std", "total_subwords", "total_words", "Dataset", "Split",
    ]

    def __init__(self, corpora, datasets_of_interest, tags_of_interest):
        """
        Initialize the Tokenization Rate Analyzer.

        Parameters:
            corpora (dict): Maps dataset name to ``{'splits': {split name: DataFrame}}``.
            datasets_of_interest (list): Keys of `corpora` to analyze.
            tags_of_interest (list): List of entity tags to analyze (stored; the
                metrics below are computed for every non-'O' label).
        """
        self.corpora = corpora
        self.datasets_of_interest = datasets_of_interest
        self.tags_of_interest = tags_of_interest
        self.dataset_mapping = {"ANERCorp_CamelLab": "ANERCorp"}  # Standardized dataset names

        # Define a fixed order for entity tags
        self.tag_order = [
            "B-LOC", "I-LOC",
            "B-PER", "I-PER",
            "B-ORG", "I-ORG",
            "B-MISC", "I-MISC"
        ]

    def calculate_tokenization_metrics(self):
        """
        Computes tokenization metrics for each entity tag.

        When a split has a 'word_pieces' column, the rate of a row is the length of
        that list (0 for non-list values); otherwise the existing 'Tokenization Rate'
        column is used.

        Returns:
            pd.DataFrame: One row per (dataset, split, tag) with the mean and standard
            deviation of the rate, 'total_subwords' (sum) and 'total_words' (row
            count), sorted by `self.tag_order`. Empty when no dataset is configured.

        Raises:
            KeyError: If a split provides neither 'word_pieces' nor 'Tokenization Rate'.
        """
        all_data = []

        for dataset in self.datasets_of_interest:
            for split in ["train", "test"]:
                df = self.corpora[dataset]["splits"][split]

                # Filter out non-entity tokens; copy so the column assignment is safe.
                df = df[df["True Labels"] != "O"].copy()

                if 'word_pieces' in df.columns:
                    df["Tokenization Rate"] = df["word_pieces"].apply(
                        lambda x: len(x) if isinstance(x, list) else 0
                    )
                elif "Tokenization Rate" not in df.columns:
                    raise KeyError(
                        f"{dataset} ({split}) has neither a 'word_pieces' nor a "
                        "'Tokenization Rate' column"
                    )

                tag_metrics = (
                    df.groupby("True Labels")["Tokenization Rate"]
                    .agg(mean="mean", std="std", sum="sum", count="count")
                    .reset_index()
                    .rename(columns={"sum": "total_subwords", "count": "total_words"})
                )

                tag_metrics["Dataset"] = dataset
                tag_metrics["Split"] = split
                all_data.append(tag_metrics)

        if not all_data:
            # pd.concat refuses an empty list; report "nothing to analyze" as an empty frame.
            return pd.DataFrame(columns=self._METRIC_COLUMNS)

        df = pd.concat(all_data, ignore_index=True)

        # Standardize dataset names
        df = df.replace({"Dataset": self.dataset_mapping})

        # Ordered categorical so sorting (and plotting) follows tag_order;
        # labels outside tag_order become missing values.
        df["True Labels"] = pd.Categorical(df["True Labels"], categories=self.tag_order, ordered=True)

        return df.sort_values(by=["True Labels"])

    def visualize_tokenization_rates(self, df):
        """
        Generates a faceted bar chart showing tokenization rates across datasets and splits.

        The input frame is not modified; display tweaks are applied to a copy.

        Parameters:
            df (pd.DataFrame): Output of `calculate_tokenization_metrics`.
        """
        split_mapping = {"train": "Train", "test": "Test"}
        plot_df = df.assign(
            Split=df["Split"].replace(split_mapping),
            mean=df["mean"].round(2),
        )

        fig = px.bar(
            plot_df,
            x="True Labels",
            y="mean",
            color="Dataset",
            # error_y="std",
            facet_row="Split",
            barmode="group",
            text='mean',
            title="Tokenization Rates Across Entity Tags in ANERCorp and CoNLL-2003 for both Training and Testing Splits",
            labels={"True Labels": "Entity Tag", "mean": "Average Tokenization Rate"},
            height=700,
            width=1200
        )

        fig.update_layout(
            template="plotly_white",
            legend_title="Dataset",  # colour encodes the dataset, not the entity type
            margin=dict(t=60, l=20, r=20, b=20),
            title_x=0.5
        )

        fig.show()


In [4]:
# Location of the dashboard/analysis config file. Set the DASHBOARD_CONFIG_PATH
# environment variable to run this notebook on another machine; the fallback is
# the path this notebook was originally run with.
CONFIG_PATH = Path(
    os.environ.get(
        "DASHBOARD_CONFIG_PATH",
        "/Users/ahmed/Desktop/Dashboard/analysis-config.yaml",
    )
)

config_manager = DashboardConfigManager(CONFIG_PATH)
dev_config = config_manager.development_config

# The Dash app is created only to obtain its Flask server, which DataManager takes below.
app = Dash(__name__, suppress_callback_exceptions=True)

app_config = config_manager.app_config
server = app.server  # Flask server instance for caching
variants_data = None

# Load every configured dataset; the result is indexed by dataset key in later cells.
data_manager = DataManager(config_manager, server)
dash_data = data_manager.load_data()

2025-09-19 23:00:25 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ahmed/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com
2025-09-19 23:00:25 - INFO - Loading Dashboard Data from  /Users/ahmed/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/ANERCorp_CamelLab_arabertv02


  0%|          | 0/18 [00:00<?, ?it/s]

2025-09-19 23:00:28 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ahmed/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com
2025-09-19 23:00:28 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ahmed/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com
2025-09-19 23:00:29 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ahmed/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com
2025-09-19 23:00:29 - INFO - Loading Dashboard Data from  /Users/ahmed/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/BaseLineExperiment/conll2003_bert


  0%|          | 0/18 [00:00<?, ?it/s]

2025-09-19 23:00:39 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ahmed/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com
2025-09-19 23:00:39 - INFO - Found Google Drive directory for account ahmed.younes.sam@gmail.com: /Users/ahmed/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com


In [5]:
# ANERCorp analysis frame; count the 'Strict True Entities' values of the rows
# whose 'True Aligned Scheme' flag is False.
df = dash_data['ANERCorp_CamelLab_arabertv02'].analysis_data
df.loc[~df['True Aligned Scheme'], 'Strict True Entities'].value_counts()

Strict True Entities
O    84
Name: count, dtype: int64

In [6]:
# Same rows ('True Aligned Scheme' is False), broken down by their 'True Labels' tag.
df.loc[~df['True Aligned Scheme'], 'True Labels'].value_counts()

True Labels
I-PER     57
I-MISC    10
I-ORG      9
I-LOC      8
Name: count, dtype: int64

In [7]:
# List every column of `df` (assigned from the ANERCorp analysis data in an earlier cell).
df.columns

Index(['Sentence Ids', 'Token Positions', 'Words', 'Tokens', 'Word Pieces',
       'Core Tokens', 'True Labels', 'Token Selector Id', 'Pred Labels',
       'Agreements', 'X', 'Y', 'Labels', 'Loss Values', 'Token Ids',
       'Global Id', 'True Silhouette', 'Pred Silhouette', 'K=3',
       'Boundary Clusters', 'K=4', 'Entity Clusters', 'K=9', 'Token Clusters',
       'Consistency Count', 'Inconsistency Count', 'Total Train Occurrences',
       'Local Token Entropy', 'Token Max Entropy', 'Dataset Token Entropy',
       'Local Word Entropy', 'Word Max Entropy', 'Dataset Word Entropy',
       'Tokenization Rate', 'Error Type', 'O Confidence', 'B-PER Confidence',
       'I-PER Confidence', 'B-ORG Confidence', 'I-ORG Confidence',
       'B-LOC Confidence', 'I-LOC Confidence', 'B-MISC Confidence',
       'I-MISC Confidence', 'Prediction Entropy', 'Prediction Max Entropy',
       'Token Confidence', 'Variability', 'Pre X', 'Pre Y',
       'Strict True Entities', 'Strict Pred Entities', 'True E

In [8]:
# Count the values of the 'True Aligned Scheme' column in the CoNLL-2003 analysis data.
dash_data['conll2003_bert'].analysis_data['True Aligned Scheme'].value_counts()

True Aligned Scheme
True    70367
Name: count, dtype: int64

In [9]:
# Usage Example
# Keys are the dataset keys used to index `dash_data`; values are the
# standardized names under which the processed corpora are stored.
datasets_mapping = {
    'ANERCorp_CamelLab_arabertv02': 'ANERCorp',
    'conll2003_bert': 'CoNLL-2003'
}

# Filter both splits of each dataset; `corpora` maps each standardized name
# to {'splits': {'train': ..., 'test': ...}}.
processor = CorpusProcessor(dash_data, datasets_mapping)
corpora = processor.process()



Processing ANERCorp...
ANERCorp Train Size: 124659
ANERCorp Test Size: 24991
Processing CoNLL-2003...
CoNLL-2003 Train Size: 203621
CoNLL-2003 Test Size: 46435


In [76]:
from typing import Dict, Iterator, Iterable, Tuple, List, Optional
import pandas as pd
import numpy as np

from seqeval.metrics.sequence_labeling import get_entities
from seqeval.scheme import Entities, auto_detect
from collections import Counter, defaultdict


# Raw split keys -> display labels ("dev" is shown as "Validation").
_SPLIT_LABEL = {
    "train": "Train",
    "test": "Test",
    "validation": "Validation",
    "dev": "Validation",
}

# Raw corpus keys -> display labels.
_DATASET_LABEL = {
    "ANERCorp_CamelLab": "ANERCorp",
    "conll2003": "CoNLL-2003",
}

# pretty/variant names -> raw corpus keys (extend as needed)
_DATASET_MAP = {
    "ANERCorp": "ANERCorp_CamelLab",
    "CoNLL-2003": "conll2003",
    "ANERCorp_CamelLab_arabertv02": "ANERCorp_CamelLab",
    "conll2003_bert": "conll2003",
}

# Raw corpus keys treated as the base datasets.
BASE_DATASETS = {"ANERCorp_CamelLab", "conll2003"}

# Entity span types, and the PERS -> PER unification applied to ANER's naming.
_DEFAULT_ENTITY_SPANS = ["LOC", "PER", "ORG", "MISC"]
_ENTITY_NORMALIZE = {"PERS": "PER"}

# Tag-level counterpart of the PERS -> PER unification.
_TAG_NORMALIZE = {"B-PERS": "B-PER", "I-PERS": "I-PER"}

# B-/I- tags in span order: B-LOC, I-LOC, B-PER, I-PER, B-ORG, I-ORG, B-MISC, I-MISC.
_DEFAULT_TAGS = [
    f"{prefix}-{span}"
    for span in _DEFAULT_ENTITY_SPANS
    for prefix in ("B", "I")
]

class BaseDashDataProcessor:
    """
    Common base for processors that work on `dash_data`.

    `dash_data` maps a variant key to an object exposing `.train_data` and
    `.analysis_data`. `build_corpora()` turns those into
    `self.corpora[<dataset label>]['splits'] = {'Train': df, 'Test': df}`.
    """

    def __init__(self, dash_data: Dict[str, "DashboardData"]):
        self.corpora = {}           # populated by build_corpora()
        self.dash_data = dash_data  # raw objects with .train_data / .analysis_data

    def ds_label(self, variant_key: str) -> str:
        """
        Translate a variant key into its display label.

        Example:
          variant_key='ANERCorp_CamelLab_arabertv02'
          _DATASET_MAP[variant_key] -> 'ANERCorp_CamelLab'
          _DATASET_LABEL['ANERCorp_CamelLab'] -> 'ANERCorp'

        A key missing from either mapping is passed through unchanged.
        """
        base_key = _DATASET_MAP.get(variant_key, variant_key)
        return _DATASET_LABEL.get(base_key, base_key)

    def normalise_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Return a copy of `df` without the rows whose 'Labels' value is -100 and,
        when a 'True Labels' column exists, with B-PERS/I-PERS rewritten to
        B-PER/I-PER. The input frame is left untouched.
        """
        kept = df.loc[df['Labels'] != -100].copy()
        if 'True Labels' not in kept.columns:
            return kept
        kept['True Labels'] = kept['True Labels'].replace({'B-PERS': 'B-PER', 'I-PERS': 'I-PER'})
        return kept

    def _resolve_keys_for_variant(self, variant: str) -> list[str]:
        """Map a variant to concrete keys in dash_data ('combined' -> every key)."""
        return list(self.dash_data.keys()) if variant == "combined" else [variant]

    @staticmethod
    def _canonical_core_tokens(frame: pd.DataFrame) -> pd.DataFrame:
        """Rename 'core_tokens' to 'Core Tokens' when only the former is present."""
        if "Core Tokens" in frame.columns or "core_tokens" not in frame.columns:
            return frame
        return frame.rename(columns={"core_tokens": "Core Tokens"})

    def build_corpora(self, variant: str) -> dict:
        """
        Rebuild `self.corpora` for the requested variant and return it.

        variant:
          - '<variant_name>' e.g. 'ANERCorp_CamelLab_arabertv02'
          - 'combined'       -> include all variants present in dash_data

        Output shape:
          self.corpora[<Dataset Label>]['splits'] = {'Train': df, 'Test': df}

        'Train' comes from `.train_data` and 'Test' from `.analysis_data`; both
        are passed through normalise_data().
        """
        self.corpora = {}

        for key in self._resolve_keys_for_variant(variant):
            label = self.ds_label(key)
            source = self.dash_data[key]

            train_split = self._canonical_core_tokens(self.normalise_data(source.train_data))
            test_split = self._canonical_core_tokens(self.normalise_data(source.analysis_data))

            self.corpora[label] = {"splits": {"Train": train_split, "Test": test_split}}

        return self.corpora

    def iter_splits(self, ds_key: str) -> Iterator[Tuple[str, str, pd.DataFrame]]:
        """Yield (split key, split label, frame) for 'Train' then 'Test', skipping absent splits."""
        available = self.corpora[ds_key]["splits"]
        for name in ("Train", "Test"):
            if name in available:
                # key and label are the same string here
                yield name, name, available[name]



class TokenTypeOverlap(BaseDashDataProcessor):
    """
    Overlap of unique token *types* between entity tags, computed on the
    DataFrame corpora produced by BaseDashDataProcessor.build_corpora().

    A token type is a distinct 'Core Tokens' value; two tags overlap on every
    type that occurs under both of them in 'True Labels'.

    Options:
      - include_O: include the 'O' tag as one bucket
      - tag_set:   fixed tag inventory; if None, derive from data per dataset (union of splits)
      - fill_diagonal: set diagonal to 0 (typical for overlap visuals)
      - tag_order: explicit ordering; else the tag inventory as chosen above
    """

    def _tag_sets_for_split(
        self, df: pd.DataFrame, *, include_O: bool, allowed_tags: Optional[set]
    ) -> dict[str, set]:
        """
        Map each tag found in `df` to the set of its token types (as strings).

        Rows with a missing token or tag are ignored. 'O' rows are dropped
        unless `include_O`; when `allowed_tags` is given, only those tags stay.

        Raises:
          KeyError: if 'Core Tokens' or 'True Labels' is missing from `df`.
        """
        if any(col not in df.columns for col in ("Core Tokens", "True Labels")):
            raise KeyError("Expected columns 'Core Tokens' and 'True Labels'.")

        pairs = df[["Core Tokens", "True Labels"]].dropna().copy()
        pairs["True Labels"] = pairs["True Labels"].astype(str)

        if not include_O:
            pairs = pairs[pairs["True Labels"] != "O"]

        if allowed_tags is not None:
            pairs = pairs[pairs["True Labels"].isin(allowed_tags)]

        return {
            tag: set(group["Core Tokens"].astype(str).unique())
            for tag, group in pairs.groupby("True Labels", dropna=False)
        }

    @staticmethod
    def _overlap_matrix(buckets: dict[str, set], *, order: list[str], fill_diagonal: bool) -> pd.DataFrame:
        """
        Square integer matrix (index == columns == `order`) of shared-type counts.

        Tags absent from `buckets` count as empty sets. With `fill_diagonal`
        the diagonal stays 0; otherwise it holds each tag's own type count.
        """
        mat = pd.DataFrame(0, index=order, columns=order, dtype=int)
        nothing: set = set()
        for row_tag in order:
            row_types = buckets.get(row_tag, nothing)
            for col_tag in order:
                if fill_diagonal and row_tag == col_tag:
                    continue  # cell was initialised to 0
                mat.loc[row_tag, col_tag] = len(row_types & buckets.get(col_tag, nothing))
        return mat

    def _discover_tags_for_dataset(self, ds_lbl: str, include_O: bool) -> list[str]:
        """Sorted union of the 'True Labels' values over all splits of one dataset."""
        found: set = set()
        for _, _, frame in self.iter_splits(ds_lbl):
            found.update(frame["True Labels"].dropna().astype(str).unique())
        if not include_O:
            found.discard("O")
        return sorted(found)

    def generate_matrices(
        self,
        selected_variant: str,
        *,
        include_O: bool,
        tag_set: Optional[list[str]] = None,
        fill_diagonal: bool = True,
        tag_order: Optional[list[str]] = None,
    ) -> dict[str, dict[str, pd.DataFrame]]:
        """
        Rebuild the corpora for `selected_variant` and compute one overlap
        matrix per dataset and split.

        Returns:
          {Dataset -> {Split -> pd.DataFrame(matrix)}}
        """
        self.build_corpora(selected_variant)

        matrices: dict[str, dict[str, pd.DataFrame]] = {}
        for ds_lbl in self.corpora:
            # choose tag inventory
            if tag_set is None:
                tags = self._discover_tags_for_dataset(ds_lbl, include_O=include_O)
            else:
                tags = [tag for tag in tag_set if include_O or tag != "O"]

            # a non-empty tag_order defines the axes; otherwise use the inventory
            order = tag_order if tag_order else tags
            allowed = set(order)

            matrices[ds_lbl] = {
                split_key: self._overlap_matrix(
                    self._tag_sets_for_split(frame, include_O=include_O, allowed_tags=allowed),
                    order=order,
                    fill_diagonal=fill_diagonal,
                )
                for split_key, _, frame in self.iter_splits(ds_lbl)
            }

        return matrices

    def generate_df(
        self,
        selected_variant: str,
        *,
        include_O: bool = True,
        tag_set: Optional[list[str]] = None,
        fill_diagonal: bool = True,
        tag_order: Optional[list[str]] = None,
    ) -> pd.DataFrame:
        """
        Flatten the matrices from generate_matrices() into tidy long form:
          columns = [Dataset, Split, Tag1, Tag2, Overlap Count]
        with one row per (dataset, split, row tag, column tag).
        """
        matrices = self.generate_matrices(
            selected_variant,
            include_O=include_O,
            tag_set=tag_set,
            fill_diagonal=fill_diagonal,
            tag_order=tag_order,
        )
        records = [
            {
                "Dataset": ds_lbl,
                "Split": split_lbl,
                "Tag1": row_tag,
                "Tag2": col_tag,
                "Overlap Count": int(matrix.loc[row_tag, col_tag]),
            }
            for ds_lbl, per_split in matrices.items()
            for split_lbl, matrix in per_split.items()
            for row_tag in matrix.index
            for col_tag in matrix.columns
        ]
        return pd.DataFrame(records, columns=["Dataset","Split","Tag1","Tag2","Overlap Count"])


In [77]:
# Tidy overlap table (Dataset, Split, Tag1, Tag2, Overlap Count) over every variant in dash_data.
helper = TokenTypeOverlap(dash_data)
# NOTE: rebinds the notebook-global `df`, which other cells also assign.
df = helper.generate_df("combined")
df.head(50)

Unnamed: 0,Dataset,Split,Tag1,Tag2,Overlap Count
0,ANERCorp,Train,B-LOC,B-LOC,0
1,ANERCorp,Train,B-LOC,B-MISC,28
2,ANERCorp,Train,B-LOC,B-ORG,66
3,ANERCorp,Train,B-LOC,B-PER,71
4,ANERCorp,Train,B-LOC,I-LOC,33
5,ANERCorp,Train,B-LOC,I-MISC,20
6,ANERCorp,Train,B-LOC,I-ORG,44
7,ANERCorp,Train,B-LOC,I-PER,69
8,ANERCorp,Train,B-LOC,O,299
9,ANERCorp,Train,B-MISC,B-LOC,28


In [None]:
import numpy as np
import pandas as pd
from dash import html, dcc
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_overlap_heatmaps(
    df: pd.DataFrame,
    *,
    title: str = "Type Overlap Across Entity Tags",
    panel_by: str = "Dataset",        # or "Split" — which column to split into panels
    filter_equals: dict | None = None, # e.g. {"Split": "Test"} to show only Test across datasets
    tag_order: list[str] | None = None,
    lower_triangle_only: bool = True,
    colorscale: str = "RdBu_r",
) -> html.Div:
    """
    Render a row of heatmaps from a tidy long DataFrame with columns:
      ['Dataset','Split','Tag1','Tag2','Overlap Count'].

    Parameters:
      - df: tidy overlap table; it is never modified.
      - title: figure title.
      - panel_by: column whose distinct values become the panels. If the column
        is absent, everything is drawn in a single panel named "All".
      - filter_equals: {column: value} equality filters applied before
        plotting; columns not present in `df` are ignored.
      - tag_order: axis order; tags not present in the data are dropped. When
        None, the tags found in the data are sorted.
      - lower_triangle_only: blank out the cells above the diagonal.
      - colorscale: Plotly colorscale name shared by all panels.

    Returns:
      html.Div wrapping a single dcc.Graph.

    Raises:
      KeyError: if 'Tag1', 'Tag2' or 'Overlap Count' is missing.
      ValueError: if no rows are left to plot.

    Examples:
      plot_overlap_heatmaps(df, panel_by="Dataset", filter_equals={"Split":"Test"})
      plot_overlap_heatmaps(df, panel_by="Split")  # one panel per split
    """
    needed = {"Tag1", "Tag2", "Overlap Count"}
    if not needed.issubset(df.columns):
        missing = ", ".join(sorted(needed - set(df.columns)))
        raise KeyError(f"Missing required columns: {missing}")

    # Optional filtering (e.g., keep only Split=='Test')
    if filter_equals:
        for column, wanted in filter_equals.items():
            if column in df.columns:
                df = df[df[column] == wanted]

    # Without rows there are no panels; fail with a clear message instead of
    # an opaque error from max()/make_subplots further down.
    if df.empty:
        raise ValueError("No rows to plot: the input is empty or filter_equals removed every row.")

    # Choose panels. The check must look at the local frame (not at some
    # notebook global), and the fallback column is added to a copy so that the
    # caller's DataFrame is never written to.
    if panel_by in df.columns:
        panel_col = panel_by
    else:
        df = df.assign(_panel="All")
        panel_col = "_panel"

    # Order of tags (matrix axes)
    present_tags = set(df["Tag1"]).union(df["Tag2"])
    if tag_order is not None:
        tags = [t for t in tag_order if t in present_tags]
    else:
        tags = sorted(present_tags)

    # Build a matrix per panel
    panels = {}
    for panel_value, group in df.groupby(panel_col, dropna=False):
        # pivot to square matrix (fill missing with 0)
        mat = (
            group.pivot_table(index="Tag1", columns="Tag2", values="Overlap Count", aggfunc="sum", fill_value=0)
            .reindex(index=tags, columns=tags, fill_value=0)
        )
        if lower_triangle_only:
            mask = np.tril(np.ones_like(mat, dtype=bool))
            mat = mat.where(mask)  # keep lower triangle (diagonal kept as-is)
        panels[str(panel_value)] = mat

    # Shared max for consistent color scale (masked cells are NaN and ignored)
    max_value = max(
        (int(np.nanmax(m.to_numpy(dtype=float))) if m.size else 0) for m in panels.values()
    )

    # Compose subplots
    fig = make_subplots(
        rows=1,
        cols=len(panels),
        subplot_titles=list(panels.keys()),
        horizontal_spacing=0.1,
    )

    # Add each panel's heatmap
    for col_idx, mat in enumerate(panels.values(), start=1):
        # text labels (blank for NaNs from masking)
        text_data = np.where(mat.isnull(), "", mat.fillna(0).astype(int).astype(str))
        fig.add_trace(
            go.Heatmap(
                z=mat,
                x=mat.columns,
                y=mat.index,
                coloraxis="coloraxis",
                text=text_data,
                texttemplate="%{text}",
                hoverinfo="text+z",
                showscale=False,  # we'll show a single shared colorbar
            ),
            row=1, col=col_idx
        )

    # Layout & shared color scale
    fig.update_layout(
        title=title,
        template="plotly_white",
        height=600,
        width=1200,
        coloraxis=dict(
            colorscale=colorscale,
            cmin=0,
            cmax=max_value,
            colorbar=dict(title="Counts", len=0.9),
        ),
        margin=dict(t=60, l=20, r=20, b=20),
    )
    # show colorbar on the last trace
    if fig.data:
        fig.data[-1].update(showscale=True)

    # Clean grids on every subplot axis
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False)

    return html.Div([dcc.Graph(figure=fig)])


In [82]:
# One heatmap panel per dataset, restricted to the Test split.
# `df` must be the tidy overlap table from helper.generate_df(); other cells rebind `df`.
# NOTE(review): the function returns an html.Div wrapping a dcc.Graph — confirm this
# renders as a figure in the notebook output rather than as an object repr.
plot_overlap_heatmaps(
    df=df,
    title="Token Type Overlap Across Entity Tags (Test)",
    panel_by="Dataset",
    filter_equals={"Split": "Test"},
    tag_order=_DEFAULT_TAGS + ["O"],   # optional
)

In [50]:
# Bar chart of the 'Tokens Proportion' metric, bars annotated with 'Tag Types'.
plot_faceted_bar_chart(
    df=df,
    metric="Tokens Proportion",  # or "TTR" depending on the figure
    text="Tag Types",
    title="Entity Tag Token-Type Distribution (Tokens, Types, TTR)",
    tag_order=_DEFAULT_TAGS,
)

In [51]:
# Bar chart of the 'TTR' metric, bars annotated with the TTR value itself.
plot_faceted_bar_chart(
    df=df,
    metric="TTR",
    text="TTR",
    title="Type-to-Token Ratio (TTR) per Entity Tag",
    tag_order=_DEFAULT_TAGS,
)

In [36]:
# Check which split names the helper's corpora hold for ANERCorp.
helper.corpora['ANERCorp']['splits'].keys()



dict_keys(['Train', 'Test'])

# Distributions

In [16]:
# Token statistics table computed by the CorpusProcessor instance (`processor`).
token_stats_df = processor.compute_token_statistics()
token_stats_df

Unnamed: 0,Dataset,Total Tokens,Unique Words
0,ANERCorp (Train),124659,23010
1,ANERCorp (Test),24991,8455
2,CoNLL-2003 (Train),203621,11243
3,CoNLL-2003 (Test),46435,5883


In [None]:
# Run the token/type analysis over `corpora` (built by CorpusProcessor.process()).
# NOTE(review): TokenTypeAnalyzer is not defined in the visible part of the notebook —
# presumably an earlier cell; confirm it survives Restart & Run All.
analyzer = TokenTypeAnalyzer(corpora)
token_type_df = analyzer.analyze_splits()


In [32]:
# Step 1: total number of tag types per (Dataset, Split) — the denominator below.
total_tag_types = (
    token_type_df
    .groupby(['Dataset', 'Split'])['Tag Types']
    .sum()
    .reset_index()
    .rename(columns={'Tag Types': 'Total Tag Types'})
)

# Steps 2-3: attach the totals to every row, then derive each tag's share of its
# group (the small epsilon avoids division by zero).
df = (
    token_type_df
    .merge(total_tag_types, on=['Dataset', 'Split'])
    .assign(**{
        'Tag Type Proportion': lambda merged: merged['Tag Types'] / (merged['Total Tag Types'] + 1e-10)
    })
)

# Visualize: initialize with the DataFrame, preprocess, then plot.
visualizer = EntityTagVisualizer(df)
visualizer.preprocess_data()

visualizer.plot_faceted_bar_chart('Tag Type Proportion', 'Tag Types', "Entity Tag Token Types Distribution Across Training and Testing Splits")



# Overlap

In [19]:
# Entity tags to analyze (the eight B-/I- tags plus 'O').
entity_tags = ['B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC', 'O']

# Initialize the OverlapMatrixAnalyzer on the CorpusProcessor output.
# NOTE(review): OverlapMatrixAnalyzer is not defined in the visible part of the notebook.
overlap_analyzer = OverlapMatrixAnalyzer(corpora, entity_tags)

# Visualize the word type overlap for the training split
overlap_analyzer.visualize_overlap_one_scale('train')

# Visualize the word type overlap for the testing split
overlap_analyzer.visualize_overlap_one_scale('test')


# Lexical Diversity

In [14]:
# Initialize the TTR visualizer on the token/type table from analyzer.analyze_splits().
ttr_visualizer = EntityTagTTRVisualizer(token_type_df)

# Preprocess the data
ttr_visualizer.preprocess_data()

# Disabled alternatives: one bar chart per split.
# # Plot TTR bar chart for Train split
# ttr_visualizer.plot_bar_chart('Train')

# # Plot TTR bar chart for Test split
# ttr_visualizer.plot_bar_chart('Test')

# Plot faceted bar chart (Train/Test together)
ttr_visualizer.plot_faceted_bar_chart()


In [15]:
# Define the datasets, entity tags and splits to analyze
datasets_of_interest = ['ANERCorp', 'CoNLL-2003']
tags_of_interest = ['B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC']
splits = ['train', 'test']

# Initialize the analyzer
# NOTE: rebinds the global `analyzer`, which another cell uses for TokenTypeAnalyzer.
analyzer = EntityTagDistributionAnalyzer(corpora, datasets_of_interest, tags_of_interest)

# Compute entity tag distributions
distribution_df = analyzer.calculate_distributions(splits)

# Visualize the 'Standard Deviation' column across entity tags
analyzer.visualize_distributions(distribution_df, 'Standard Deviation', 
    title="Standard Deviation of Token Type Frequencies Across Entity Tags")


In [16]:
# ANERCorp test split from the CorpusProcessor output.
data = corpora['ANERCorp']['splits']['test']
# The 50 most frequent token ids among rows whose gold tag is B-LOC.
(
    data.loc[data['True Labels'] == 'B-LOC', 'Token Ids']
    .value_counts()
    .sort_values(ascending=False)
    .head(50)
)

Token Ids
1091     32
1405     24
1514     22
2536     16
6116     11
5894     10
2878     10
2774      9
4064      9
5141      9
911       9
39855     8
5943      8
1213      8
11338     8
3018      7
2235      6
2646      6
2922      6
4763      6
1142      6
17977     6
3378      6
9142      6
7010      5
4229      5
13840     5
7681      5
5338      5
17493     5
4574      5
3899      5
10897     5
9334      5
3112      4
10289     4
11792     4
3548      4
876       4
6479      4
4667      4
8383      4
7549      4
28873     4
10693     4
3867      4
3647      4
6602      4
8979      4
17338     3
Name: count, dtype: int64

# OOV

In [17]:
# Define the datasets and entity tags to analyze
datasets_of_interest = ["ANERCorp", "CoNLL-2003"]
tags_of_interest = ["B-LOC", "I-LOC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-MISC", "I-MISC"]

# Initialize the analyzer
# NOTE(review): OOVAnalyzer is not defined in the visible part of the notebook.
oov_analyzer = OOVAnalyzer(corpora, datasets_of_interest, tags_of_interest)

# Compute OOV rates (one row per dataset and tag)
oov_df = oov_analyzer.calculate_oov_rates()

# Visualize OOV rates
oov_analyzer.visualize_oov_rates(oov_df)


In [18]:
dataset = 'CoNLL-2003'
# Token ids whose gold tag is B-LOC, in the train split (atr) and the test split (ate).
atr, ate = (
    set(split_df.loc[split_df['True Labels'] == 'B-LOC', 'Token Ids'])
    for split_df in (corpora[dataset]['splits'][name] for name in ('train', 'test'))
)

In [132]:
# Number of B-LOC token ids that occur in the test split but not in the train split.
len(ate - atr)

74

In [120]:
# Show the table returned by oov_analyzer.calculate_oov_rates().
oov_df

Unnamed: 0,Dataset,Tag,OOV Core Tokens Count,Total Unique Core Tokens in Test,OOV Rate
0,ANERCorp,B-LOC,90,264,0.3409
1,ANERCorp,I-LOC,18,34,0.5294
2,ANERCorp,B-PER,269,486,0.5535
3,ANERCorp,I-PER,225,400,0.5625
4,ANERCorp,B-ORG,117,205,0.5707
5,ANERCorp,I-ORG,105,151,0.6954
6,ANERCorp,B-MISC,88,120,0.7333
7,ANERCorp,I-MISC,91,111,0.8198
8,CoNLL-2003,B-LOC,74,351,0.2108
9,CoNLL-2003,I-LOC,38,96,0.3958


# Tokenisation

In [30]:
# Define datasets and entity tags of interest
# NOTE: re-declares the same two lists as the OOV cell above.
datasets_of_interest = ["ANERCorp", "CoNLL-2003"]
tags_of_interest = ["B-LOC", "I-LOC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-MISC", "I-MISC"]

# Initialize the analyzer
# NOTE(review): TokenizationRateAnalyzer is not defined in the visible part of the notebook.
tokenization_analyzer = TokenizationRateAnalyzer(corpora, datasets_of_interest, tags_of_interest)

# Compute tokenization rates
tokenization_df = tokenization_analyzer.calculate_tokenization_metrics()

# Visualize tokenization rates
tokenization_analyzer.visualize_tokenization_rates(tokenization_df)
