In [1]:
import sys
import os
from pathlib import Path
# This appends the directory one level up (the root of your project) to the sys.path.
# Modify the path depending on the location of modules you want to import.
sys.path.append(os.path.abspath('../../'))

from config.config_managers import DashboardConfigManager
from dataManager import DataManager
from dash import Dash
import pandas as pd
import plotly.express as px
from abc import ABC, abstractmethod
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

2025-04-07 21:22:26 - INFO - PyTorch version 2.2.2 available.


In [2]:
# Location of the dashboard analysis config.
# The default is the original author's local file; set the ANALYSIS_CONFIG_PATH
# environment variable to run the notebook on another machine without editing this cell.
DEFAULT_CONFIG_PATH = "/Users/ay227/Desktop/Final-Year/Thesis-Experiments/Online-Dashboard-Phase/analysis-config.yaml"
CONFIG_PATH = Path(os.environ.get("ANALYSIS_CONFIG_PATH", DEFAULT_CONFIG_PATH))

config_manager = DashboardConfigManager(CONFIG_PATH)
dev_config = config_manager.development_config
app_config = config_manager.app_config

# The Dash app is created only so that its underlying Flask server can be
# handed to the DataManager (used for caching); no layout is served here.
app = Dash(__name__, suppress_callback_exceptions=True)
server = app.server  # Flask server instance for caching
variants_data = None

# Load every configured variant; `dash_data` is indexed by variant name below.
data_manager = DataManager(config_manager, server)
dash_data = data_manager.load_data()

In [15]:
import torch
import torch.nn.functional as F
import pandas as pd
import plotly.express as px

class TokenEmbeddingComparer:
    """Compare per-token embeddings of a pretrained and a fine-tuned model.

    For one sentence of the variant's test dataset, the cosine similarity
    between the two models is computed at three depths:

    * ``"Raw"``        - output of ``model.bert.embeddings``
    * ``"Mid"``        - an intermediate entry of ``hidden_states``
    * ``"Contextual"`` - the last entry of ``hidden_states``

    Both models are expected to expose a ``.bert`` sub-module that provides
    ``.embeddings`` and accepts ``output_hidden_states=True``.
    """

    # Tokens excluded from the comparison even though the attention mask keeps them.
    _SPECIAL_TOKENS = ("[CLS]", "[SEP]")

    # Fixed colours so every depth keeps the same colour across figures.
    _TYPE_COLORS = {
        "Raw": "#636EFA",         # Plotly default blue
        "Mid": "#00CC96",         # Plotly default green
        "Contextual": "#EF553B",  # Plotly default red
    }

    def __init__(self, dash_data, variant_name):
        """Select one variant and keep handles to its dataset and models.

        Args:
            dash_data: Container indexed by variant name.
            variant_name: Key of the variant to analyse.
        """
        self.lang = dash_data[variant_name]
        # NOTE(review): these are read as attributes, not called - presumably
        # properties on the variant object; confirm against its definition.
        self.dataset = self.lang.get_test_dataset
        self.pretrained = self.lang.get_pretrained_model
        self.finetuned = self.lang.get_fine_tuned_model

    def _get_raw_embeddings(self, model, input_ids):
        """Return ``model.bert.embeddings(input_ids)`` with the batch axis removed.

        Args:
            model: Object exposing ``.bert.embeddings``.
            input_ids: Token ids with a leading batch axis of size 1.

        Returns:
            Tensor with the leading axis squeezed out; computed without gradients.
        """
        with torch.no_grad():
            return model.bert.embeddings(input_ids).squeeze(0)

    def _get_contextual_embeddings(self, model, input_ids, attention_mask, mid_layer=6):
        """Run ``model`` and return an intermediate and the final hidden state.

        Args:
            model: Callable encoder accepting ``input_ids``, ``attention_mask``,
                ``output_hidden_states`` and ``return_dict``.
            input_ids: Token ids with a leading batch axis of size 1.
            attention_mask: Attention mask matching ``input_ids``.
            mid_layer: Index into ``output.hidden_states`` used as the "mid"
                representation. Defaults to 6, the value previously hard-coded.
                NOTE(review): for Hugging Face BERT models index 0 is the
                embedding output, so 6 would be the 6th encoder layer - confirm
                for other model types.

        Returns:
            ``(mid, final)`` hidden states, each with the batch axis removed.

        Raises:
            IndexError: If ``mid_layer`` is outside ``output.hidden_states``.
        """
        with torch.no_grad():
            output = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=True,
                return_dict=True
            )
            hidden_states = output.hidden_states
            return hidden_states[mid_layer].squeeze(0), hidden_states[-1].squeeze(0)

    def compare(self, sentence_id=0, top_k=15, mid_layer=6):
        """Compare both models on one sentence and plot the most changed tokens.

        Args:
            sentence_id: Index of the example in the test dataset.
            top_k: Maximum number of tokens shown in the figure.
            mid_layer: Index into ``hidden_states`` used for the "Mid" series.

        Returns:
            ``(df_raw, fig)`` where ``df_raw`` has the columns ``Token``,
            ``Similarity`` and ``Type`` with one row per kept token occurrence
            and type (not aggregated), and ``fig`` is a grouped bar chart of the
            ``top_k`` tokens with the lowest "Contextual" similarity, using the
            mean similarity when a token occurs more than once.
        """
        example = self.dataset[sentence_id]
        input_ids = example['input_ids'].unsqueeze(0)
        attention_mask = example['attention_mask'].unsqueeze(0)
        mask = attention_mask.squeeze(0).bool()

        # Keep only positions the attention mask marks as real tokens.
        tokens = self.dataset.tokenizer.convert_ids_to_tokens(example['input_ids'])
        tokens = [tok for tok, keep in zip(tokens, mask.tolist()) if keep]

        # Among those, keep word-initial pieces and drop [CLS]/[SEP].
        core_mask = [
            not tok.startswith("##") and tok not in self._SPECIAL_TOKENS
            for tok in tokens
        ]
        core_tokens = [tok for tok, keep in zip(tokens, core_mask) if keep]
        core_mask_tensor = torch.tensor(core_mask, dtype=torch.bool)

        def select_core(states):
            # Attention-mask filter first, then the core-token filter, so the
            # rows line up one-to-one with `core_tokens`.
            return states[mask][core_mask_tensor]

        raw_pre = self._get_raw_embeddings(self.pretrained, input_ids)
        raw_fin = self._get_raw_embeddings(self.finetuned, input_ids)
        ctx_pre_mid, ctx_pre_final = self._get_contextual_embeddings(
            self.pretrained.bert, input_ids, attention_mask, mid_layer
        )
        ctx_fin_mid, ctx_fin_final = self._get_contextual_embeddings(
            self.finetuned.bert, input_ids, attention_mask, mid_layer
        )

        raw_sim = F.cosine_similarity(select_core(raw_pre), select_core(raw_fin), dim=1)
        mid_sim = F.cosine_similarity(select_core(ctx_pre_mid), select_core(ctx_fin_mid), dim=1)
        ctx_sim = F.cosine_similarity(select_core(ctx_pre_final), select_core(ctx_fin_final), dim=1)

        # Long format: the three similarity vectors are stacked in the same
        # order as the "Type" labels below.
        n_core = len(core_tokens)
        df_raw = pd.DataFrame({
            "Token": core_tokens * 3,
            "Similarity": torch.cat([raw_sim, mid_sim, ctx_sim]).tolist(),
            "Type": ["Raw"] * n_core + ["Mid"] * n_core + ["Contextual"] * n_core
        })
        # A token can occur several times in a sentence; average per (Token, Type).
        df_plot = df_raw.groupby(["Token", "Type"], as_index=False).mean()

        # Rank tokens by how far the final-layer representation moved.
        ctx_df = df_plot[df_plot["Type"] == "Contextual"].copy()
        ctx_df["Similarity Drop"] = 1.0 - ctx_df["Similarity"]

        top_tokens = ctx_df.sort_values("Similarity Drop", ascending=False).head(top_k)["Token"].tolist()
        top_tokens = list(dict.fromkeys(top_tokens))  # Ensure uniqueness

        # Ordered categorical so the x-axis follows the ranking, not the alphabet.
        df_top = df_plot[df_plot["Token"].isin(top_tokens)].copy()
        df_top["Token"] = pd.Categorical(df_top["Token"], categories=top_tokens, ordered=True)
        df_top = df_top.sort_values("Token")

        fig = px.bar(
            df_top,
            x="Token",
            y="Similarity",
            color="Type",
            barmode="group",
            color_discrete_map=self._TYPE_COLORS,
            title=f"Top {top_k} Most Affected Tokens (Pretrained vs Finetuned)",
            labels={"Similarity": "Cosine Similarity"},
            height=600
        )
        fig.update_traces(texttemplate="%{y:.2f}", textposition="outside", textfont=dict(size=12))
        fig.update_layout(
            template="plotly_white",
            bargap=0.15,
            bargroupgap=0.05,
            showlegend=True,
            margin=dict(t=50, b=50, l=50, r=50),
            yaxis=dict(range=[0, 1.0]),
        )

        return df_raw, fig

# Exploration

In [18]:
# Compare pretrained vs fine-tuned embeddings for test example 0 of the
# "ANERCorp_CamelLab_arabertv02" variant.
# NOTE: `comparer`, `df` and `fig` are notebook globals that later cells read,
# so their contents depend on which compare cell was executed last.
comparer = TokenEmbeddingComparer(dash_data, "ANERCorp_CamelLab_arabertv02")
df, fig = comparer.compare(sentence_id=0, top_k=15)

# Show manually if needed
# `df` is the unaggregated long-format table (Token, Similarity, Type);
# `fig` is the grouped bar chart limited to `top_k` tokens.
display(df)
fig.show()


Unnamed: 0,Token,Similarity,Type
0,الصالحية,0.999828,Raw
1,المفرق,0.999687,Raw
2,-,0.999354,Raw
3,غيث,0.999812,Raw
4,الطر,0.999689,Raw
...,...,...,...
124,من,0.103962,Contextual
125,الديوان,0.373552,Contextual
126,الملكي,0.326255,Contextual
127,الهاشمي,0.437851,Contextual


In [None]:
# Re-display the current `df` (cell not executed in the saved notebook).
# Which table appears depends on the compare cell that ran most recently.
df

In [17]:
# Same comparison for test example 0 of the "conll2003_bert" variant.
# NOTE: this rebinds the globals `comparer`, `df` and `fig`, replacing the
# results of any previously executed compare cell.
comparer = TokenEmbeddingComparer(dash_data, "conll2003_bert")
df, fig = comparer.compare(sentence_id=0, top_k=15)

# Show manually if needed
# `df` is the unaggregated long-format table (Token, Similarity, Type);
# `fig` is the grouped bar chart limited to `top_k` tokens.
display(df)
fig.show()


Unnamed: 0,Token,Similarity,Type
0,S,0.99752,Raw
1,-,0.996783,Raw
2,J,0.997584,Raw
3,GE,0.998197,Raw
4,L,0.997178,Raw
5,W,0.997182,Raw
6,",",0.995411,Raw
7,CH,0.997913,Raw
8,IN,0.997735,Raw
9,S,0.997069,Raw


In [19]:
# Display the current `df`.
# NOTE(review): the saved output holds the ANERCorp tokens although the cell
# directly above is the conll2003 run - `df` reflects execution order
# (In [17] then In [18]), not cell order. Use distinct names per variant so
# the notebook survives Restart & Run All with the same output.
df

Unnamed: 0,Token,Similarity,Type
0,الصالحية,0.999828,Raw
1,المفرق,0.999687,Raw
2,-,0.999354,Raw
3,غيث,0.999812,Raw
4,الطر,0.999689,Raw
...,...,...,...
124,من,0.103962,Contextual
125,الديوان,0.373552,Contextual
126,الملكي,0.326255,Contextual
127,الهاشمي,0.437851,Contextual


In [11]:
# List the columns of the analysis table attached to the variant selected by `comparer`.
# NOTE(review): executed as In [11], i.e. before the compare cells above were
# last run, so `comparer` may have pointed at a different variant at that time.
comparer.lang.analysis_data.columns

Index(['Sentence Ids', 'Token Positions', 'Words', 'Tokens', 'Word Pieces',
       'Core Tokens', 'True Labels', 'Token Selector Id', 'Pred Labels',
       'Agreements', 'X', 'Y', 'Labels', 'Loss Values', 'Token Ids',
       'Global Id', 'True Silhouette', 'Pred Silhouette', 'K=3',
       'Boundary Clusters', 'K=4', 'Entity Clusters', 'K=9', 'Token Clusters',
       'Consistency Count', 'Inconsistency Count', 'Total Train Occurrences',
       'Local Token Entropy', 'Token Max Entropy', 'Dataset Token Entropy',
       'Local Word Entropy', 'Word Max Entropy', 'Dataset Word Entropy',
       'Tokenization Rate', 'Error Type', 'O Confidence', 'B-PER Confidence',
       'I-PER Confidence', 'B-ORG Confidence', 'I-ORG Confidence',
       'B-LOC Confidence', 'I-LOC Confidence', 'B-MISC Confidence',
       'I-MISC Confidence', 'Prediction Entropy', 'Prediction Max Entropy',
       'Token Confidence', 'Variability', 'Pre X', 'Pre Y',
       'Strict True Entities', 'Strict Pred Entities', 'True E

In [64]:
# Frequency of each distinct 'Token Ambiguity' value.
# NOTE(review): `ar` is not defined in any visible cell - presumably one of the
# variant objects from `dash_data`; define it explicitly so the notebook runs
# from a fresh kernel.
# The value -1 looks like a sentinel rather than a real score (a later cell
# labels it separately) - confirm its meaning.
ar.analysis_data['Token Ambiguity'].value_counts()

Token Ambiguity
 0.000000    16035
-1.000000     7371
 0.010800      823
 0.009250      517
 0.008300      288
             ...  
 0.512700        1
 0.313691        1
 0.870900        1
 0.597098        1
 0.945700        1
Name: count, Length: 298, dtype: int64

In [None]:
# Map every 'Token Ambiguity' value to "OOV" when it equals -1, otherwise "IV".
# The result is a per-row Series; nothing is stored or counted here.
# NOTE(review): the saved output shows IN/OUT value counts, which this code
# cannot produce (different labels, no value_counts) - the output is stale
# from an earlier version of the cell; re-run or clear it.
# NOTE(review): `ar` is not defined in any visible cell.
ar.analysis_data['Token Ambiguity'].apply(lambda x: "OOV" if x == -1 else "IV")

Token Ambiguity
IN     22340
OUT     7371
Name: count, dtype: int64

In [None]:
# Same value-count inspection as the earlier 'Token Ambiguity' cell (not executed).
# NOTE(review): duplicate cell relying on the undefined global `ar`; candidate for removal.
ar.analysis_data['Token Ambiguity'].value_counts()

In [None]:
# Add an "In/Out" column: "OUT" where "Labels" equals -1, otherwise "IN".
# NOTE(review): the `df` returned by TokenEmbeddingComparer.compare only has the
# columns Token, Similarity and Type, so df["Labels"] would raise KeyError on
# that frame. A 'Labels' column is listed for `analysis_data`, so this cell
# presumably expects `df` to be that table - confirm and bind it explicitly.
# NOTE(review): assigns into `df` in place, changing what earlier display
# cells would show on re-run.
df["In/Out"] = df["Labels"].apply(lambda x: "OUT" if x == -1 else "IN")