In [2]:
import json
from pathlib import Path
corpus_name = 'conll2003'
model_name = 'bert'
variant = f'{corpus_name}_{model_name}'

corpus_file = '/Users/ay227/Library/CloudStorage/GoogleDrive-ahmed.younes.sam@gmail.com/My Drive/Final Year Experiments/Thesis-Experiments/Experiments/ExperimentData/corpora.json'
with open(corpus_file, 'r') as file:
    corpora = json.load(file)  # Use json.load() to read file, not json.loads()




In [3]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import plotly.express as px
from seqeval.metrics.sequence_labeling import get_entities
from seqeval.scheme import Entities, auto_detect

from collections import Counter, defaultdict

def calculate_general_stats(corpora_splits):
    stats = []
    all_tokens = []
    all_entity_tokens = []  # Collect tokens that are part of entities
    all_unique_entity_words = set()

    for split_name, split_data in corpora_splits.items():
        tokens = []
        entity_tokens = []  # Tokens tagged as entities
        unique_entity_words = set()

        # Extract tokens and entity tokens
        for sentence in split_data:
            tokens.extend(sentence['words'])
            for word, tag in zip(sentence['words'], sentence['tags']):
                if tag != "O":  # If not "O", it's part of an entity
                    entity_tokens.append(word)
                    unique_entity_words.add(word)

        total_tokens = len(tokens)
        unique_words = len(set(tokens))
        ttr = unique_words / total_tokens if total_tokens else 0
        ne_proportion = len(entity_tokens) / total_tokens if total_tokens else 0
        entity_type_proportion = len(unique_entity_words) / len(entity_tokens) if len(entity_tokens) else 0


        # Collect all tokens and entity tokens for overall calculation
        all_tokens.extend(tokens)
        all_entity_tokens.extend(entity_tokens)
        all_unique_entity_words.update(unique_entity_words)

        # Add split-level stats
        stats.append({
            'Split': split_name,
            'Total Tokens': total_tokens,
            'Unique Words': unique_words,
            'TTR': round(ttr, 4),
            'Entity Words': len(entity_tokens),
            'NE Proportion': round(ne_proportion, 4),
            'Entity Types': len(unique_entity_words),
            'ETTR': round(entity_type_proportion, 4),
        })

    # Overall dataset stats
    total_tokens = len(all_tokens)
    unique_words = len(set(all_tokens))
    ttr = unique_words / total_tokens if total_tokens else 0
    ne_proportion = len(all_entity_tokens) / total_tokens if total_tokens else 0
    entity_type_proportion = len(all_unique_entity_words) / len(all_entity_tokens) if len(all_entity_tokens) else 0

    stats.append({
        'Split': 'Total',
        'Total Tokens': total_tokens,
        'Unique Words': unique_words,
        'TTR': round(ttr, 4),
        'Entity Words': len(all_entity_tokens),
        'NE Proportion': round(ne_proportion, 4),
        'Entity Types': len(all_unique_entity_words),
        'ETTR': round(entity_type_proportion, 4),
    })

    # Convert to DataFrame
    df = pd.DataFrame(stats)
    return df



from collections import Counter
import pandas as pd

def calculate_tag_stats(corpora_splits, tag_set):
    stats = []
    tag_totals = Counter({tag: 0 for tag in tag_set})
    all_unique_tag_words = {tag: set() for tag in tag_set}

    # Process splits
    for split_name, split_data in corpora_splits.items():
        tag_counts = Counter({tag: 0 for tag in tag_set})
        unique_tag_words = {tag: set() for tag in tag_set}

        total_ne_tokens = 0  # Total NE tokens for proportions
        total_ne_unique_words = set()

        # Process each sentence
        for sentence in split_data:
            for word, tag in zip(sentence['words'], sentence['tags']):
                if tag in tag_counts:
                    tag_counts[tag] += 1
                    unique_tag_words[tag].add(word)
                    if tag != "O":  # Count only NE tokens for proportions
                        total_ne_tokens += 1
                        total_ne_unique_words.add(word)

        # Calculate stats for each tag
        for tag, count in tag_counts.items():
            tag_token_proportion = count / total_ne_tokens if total_ne_tokens else 0
            tag_type_proportion = len(unique_tag_words[tag]) / len(total_ne_unique_words) if total_ne_unique_words else 0
            ttr = len(unique_tag_words[tag]) / count if count else 0

            stats.append({
                'Split': split_name,
                'Tag': tag,
                'Tag Words': count,
                'Tag Types': len(unique_tag_words[tag]),
                'TTR': round(ttr, 4),
                'Words Proportion': round(tag_token_proportion, 4),
                'Type Proportion': round(tag_type_proportion, 4),
            })

        # Update totals for overall stats
        tag_totals.update(tag_counts)
        for tag in tag_set:
            all_unique_tag_words[tag].update(unique_tag_words[tag])

    # Overall dataset stats
    total_ne_tokens = sum(tag_totals[tag] for tag in tag_set if tag != "O")
    total_ne_unique_words = set(word for tag in tag_set if tag != "O" for word in all_unique_tag_words[tag])

    for tag, count in tag_totals.items():
        tag_token_proportion = count / total_ne_tokens if total_ne_tokens else 0
        tag_type_proportion = len(all_unique_tag_words[tag]) / len(total_ne_unique_words) if total_ne_unique_words else 0
        ttr = len(all_unique_tag_words[tag]) / count if count else 0

        # stats.append({
        #     'Split': 'Total',
        #     'Tag': tag,
        #     'Tag Tokens': count,
        #     'Tag Types': len(all_unique_tag_words[tag]),
        #     'TTR': round(ttr, 4),
        #     'Token Proportion': round(tag_token_proportion, 4),
        #     'Type Proportion': round(tag_type_proportion, 4),
        # })

    # Convert to DataFrame
    df = pd.DataFrame(stats)
    return df


# Combine stats for both ANERCorp and CoNLL
def calculate_combined_stats(corpora):
    anercorp_splits = corpora['ANERCorp_CamelLab']['splits']
    conll_splits_no_validation = {key: value for key, value in corpora['conll2003']['splits'].items() if key != 'validation'}

    # Calculate general stats
    anercorp_general = calculate_general_stats(anercorp_splits)
    conll_general = calculate_general_stats(conll_splits_no_validation)

    # Add dataset source
    anercorp_general['Dataset'] = 'ANERCorp_CamelLab'
    conll_general['Dataset'] = 'CoNLL-2003'

    # Combine general stats
    combined_general = pd.concat([anercorp_general, conll_general], ignore_index=True)

    # Calculate tag stats
    anercorp_tag_set = ['B-LOC', 'I-LOC', 'B-PERS', 'I-PERS', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC']
    anercorp_tags = calculate_tag_stats(anercorp_splits, anercorp_tag_set)
    conll_tag_set = ['B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC']
    conll_tags = calculate_tag_stats(conll_splits_no_validation, conll_tag_set)

    # Add dataset source
    anercorp_tags['Dataset'] = 'ANERCorp_CamelLab'
    conll_tags['Dataset'] = 'CoNLL-2003'

    # Combine tag stats
    combined_tags = pd.concat([anercorp_tags, conll_tags], ignore_index=True)

    return combined_general, combined_tags

def calculate_oov_rates(corpora):
    results = {}

    for corpus_name, corpus_data in corpora.items():
        if corpus_name != 'ANERCorp_CamelLab-validation':
            # Ensure that both 'train' and 'test' splits are available
            if 'train' in corpus_data['splits'] and 'test' in corpus_data['splits']:
                # Extract unique words from training and testing sets
                train_words = set(
                    w for sentence in corpus_data['splits']['train'] for w in sentence['words']
                )
                test_words = set(
                    w for sentence in corpus_data['splits']['test'] for w in sentence['words']
                )

                # Find the OOV words
                oov_words = test_words - train_words

                # Calculate OOV rate
                oov_rate = len(oov_words) / len(test_words) if test_words else 0

                # Store results for this corpus
                results[corpus_name] = {
                    'OOV Words Count': len(oov_words),
                    'Total Unique Words in Test': len(test_words),
                    'OOV Rate': round(oov_rate, 4)
                }
            else:
                results[corpus_name] = "Missing 'train' or 'test' split data"

    return results


def calculate_oov_rates_per_tag(corpora):
    results = {}

    for corpus_name, corpus_data in corpora.items():
        if corpus_name != 'ANERCorp_CamelLab-validation':
            if 'train' in corpus_data['splits'] and 'test' in corpus_data['splits']:
                train_words = {}
                test_words = {}
                
                # Collect words and their associated tags from training set
                for sentence in corpus_data['splits']['train']:
                    for word, tag in zip(sentence['words'], sentence['tags']):
                        normalised_tag = tag.replace('PERS', 'PER')
                        if normalised_tag not in train_words:
                            train_words[normalised_tag] = set()
                        train_words[normalised_tag].add(word)
                
                # Collect words and their associated tags from test set
                for sentence in corpus_data['splits']['test']:
                    for word, tag in zip(sentence['words'], sentence['tags']):
                        normalised_tag = tag.replace('PERS', 'PER')
                        if normalised_tag not in test_words:
                            test_words[normalised_tag] = set()
                        test_words[normalised_tag].add(word)
                
                # Calculate OOV rate per tag
                oov_stats = {}
                for tag in test_words:
                    if tag != 'O':
                        oov_words = test_words[tag] - train_words.get(tag, set())
                        oov_rate = len(oov_words) / len(test_words[tag]) if test_words[tag] else 0
                        oov_stats[tag] = {
                            'OOV Words Count': len(oov_words),
                            'Total Unique Words in Test': len(test_words[tag]),
                            'OOV Rate': round(oov_rate, 4)
                        }
                
                results[corpus_name] = oov_stats
            else:
                results[corpus_name] = "Missing 'train' or 'test' split data"

    return results



def extract_overlap_matrix(data):
    # Extracting unique types for each tag
    tags = ['B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC']
    tag_types = {tag: set() for tag in tags}
    
    for sentence in data:
        for word, tag in zip(sentence['words'], sentence['tags']):
            normalised_tag = tag.replace('PERS', 'PER')
            if normalised_tag in tag_types:
                tag_types[normalised_tag].add(word)
    # Create an empty DataFrame for the overlap matrix
    overlap_matrix = pd.DataFrame(index=tags, columns=tags, dtype=int)

    # Compute the overlap for each pair of tags
    for tag1 in tags:
        for tag2 in tags:
            if tag1 == tag2:
                # Ignore diagonal by setting it to zero
                overlap_matrix.loc[tag1, tag2] = 0
            else:
                # Intersection count for tag pairs
                overlap_matrix.loc[tag1, tag2] = len(tag_types[tag1].intersection(tag_types[tag2]))
                
    # overlap_matrix = np.log(overlap_matrix.replace(0, 0.01))
    return overlap_matrix


def extract_O_overlap_matrix(data):
    """
    Extracts an overlap matrix between entities and the O (Outside) tag.

    Parameters:
        data (list of dicts): List of sentences, where each sentence is a dictionary 
                              containing 'words' and 'tags'.

    Returns:
        overlap_matrix (DataFrame): A table showing word overlaps between entities and O.
    """
    # Extracting unique types for each tag, adding 'O' for the outside tag
    tags = ['B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC', 'O']
    tag_types = {tag: set() for tag in tags}

    # Step 1: Collect words for each tag type
    for sentence in data:
        for word, tag in zip(sentence['words'], sentence['tags']):
            normalised_tag = tag.replace('PERS', 'PER')  # Normalizing tags like 'PERS'
            if normalised_tag in tag_types:
                tag_types[normalised_tag].add(word)
            elif tag == 'O':  # Adding words for the outside tag
                tag_types['O'].add(word)

    # Step 2: Create an empty DataFrame for the overlap matrix
    overlap_matrix = pd.DataFrame(index=tags, columns=tags, dtype=int).fillna(0)

    # Step 3: Compute the overlap for each pair of tags
    for tag1 in tags:
        for tag2 in tags:
            if tag1 == tag2:
                # Set diagonal to zero
                overlap_matrix.loc[tag1, tag2] = 0
            else:
                # Intersection count for tag pairs
                overlap_matrix.loc[tag1, tag2] = len(tag_types[tag1].intersection(tag_types[tag2]))

    # Return the overlap matrix
    return overlap_matrix


def type_distribution(corpora, split):
    # List the datasets you are interested in analyzing
    datasets_of_interest = ['ANERCorp_CamelLab', 'conll2003']
    tags_of_interest = ['B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC']
    
    # Prepare data for DataFrame
    all_data = []
    
    # Process each dataset
    for dataset in datasets_of_interest:
        data = corpora[dataset]['splits'][split]
        tag_word_counts = {tag: Counter() for tag in tags_of_interest}
        
        # Fill the dictionary with counts
        for tag in tags_of_interest:
            tag_word_counts[tag].update(w for sen in data for w, t in zip(sen['words'], sen['tags']) if t.replace('PERS', 'PER') == tag)

        # Calculate statistical measures for each tag
        for tag, counter in tag_word_counts.items():
            frequencies = np.array(list(counter.values()))
            if frequencies.size > 0:  # Check if there are any frequencies to calculate stats
                mean_freq = round(np.mean(frequencies), 4)
                median_freq = round(np.median(frequencies), 4)
                std_dev = round(np.std(frequencies), 2)
            else:
                mean_freq, median_freq, std_dev = 0, 0, 0

            all_data.append({
                'Dataset': dataset,
                'Tag': tag,
                'Mean Frequency': mean_freq,
                'Median Frequency': median_freq,
                'Standard Deviation': std_dev
            })

    # Create DataFrame
    df = pd.DataFrame(all_data)

    # Return the DataFrame
    return df


def get_entity_distribution(corpora, split):
    data = []
    datasets_of_interest = ['ANERCorp_CamelLab', 'conll2003']
    versions = ['IOB', 'IOB2']

    for dataset in datasets_of_interest:
        for version in versions:
            y_true = [sen['tags'] for sen in corpora[dataset]['splits'][split]]
            if version == 'IOB':
                true_entities = get_entities(y_true)
            else:
                scheme = auto_detect(y_true, False)
                entities = Entities(y_true, scheme, False)
                true_entities = [entity.to_tuple()[1:] for sen in entities.entities for entity in sen]
            raw_counts = Counter([entity[0].replace('PERS', 'PER') for entity in true_entities])

            total = sum(raw_counts.values())
            for entity_type, count in raw_counts.items():
                percentage = round((count / total * 100), 2) if total > 0 else 0
                data.append({
                    'Dataset': dataset,
                    'Scheme Version': version,
                    'Entity Type': entity_type,
                    'Raw Count': count,
                    'Proportion': percentage
                })

    df = pd.DataFrame(data)
    return df

def calculate_entity_lengths(entities, strict=False):
    # Initialize a dictionary to store lengths of each entity type
    entity_lengths = defaultdict(list)

    # Calculate the length of each entity and store it in the dictionary
    for entity_type, start, end in entities:
        
        if strict:
            length = end - start
        else:
            length = end - start + 1  # +1 because end index is inclusive
        
        entity_lengths[entity_type.replace('PERS', 'PER')].append(length)

    # Compute average length and other statistics for each entity type
    entity_stats = {}
    for entity_type, lengths in entity_lengths.items():
        average_length = round(np.mean(lengths), 4)
        std = round(np.std(lengths), 4)
        min_length = min(lengths)
        max_length = max(lengths)
        entity_stats[entity_type] = {
            'Average Length': average_length,
            'Min Length': min_length,
            'Max Length': max_length,
            'STD': std
        }

    return entity_stats



def process_datasets(corpora, split):
    results = []
    datasets_of_interest = ['ANERCorp_CamelLab', 'conll2003']
    for dataset_name, dataset in corpora.items():
        if dataset_name in datasets_of_interest:
            for version in ['IOB', 'IOB2']:
                y_true = [sen['tags'] for sen in dataset['splits'][split]]
                strict = version == 'IOB2'
                if strict:
                    scheme = auto_detect(y_true, False)
                    true_entities = Entities(y_true, scheme, False)
                    entity_stats = calculate_entity_lengths([entity.to_tuple()[1:] for sen in true_entities.entities for entity in sen], True)
                else:
                    true_entities = get_entities(y_true)
                    entity_stats = calculate_entity_lengths(true_entities)
                    
                # Append results for DataFrame conversion
                for entity_type, stats in entity_stats.items():
                    stats.update({
						'Dataset': dataset_name,
						'Scheme': version,
						'Entity Type': entity_type
					})
                    results.append(stats)

    return pd.DataFrame(results)


In [4]:
combined_general_stats, combined_tag_stats = calculate_combined_stats(corpora)


In [5]:
oov_stats = calculate_oov_rates(corpora)
pd.DataFrame(oov_stats)


Unnamed: 0,ANERCorp_CamelLab,conll2003
OOV Words Count,3462.0,3693.0
Total Unique Words in Test,9075.0,9488.0
OOV Rate,0.3815,0.3892


In [6]:


# Define your custom color map
color_map = {
    "B-LOC": "darkgreen",
    "B-PERS": "deepskyblue",
    "B-PER": "deepskyblue",
    "B-ORG": "darkcyan",
    "B-MISC": "palevioletred",
    "I-LOC": "yellowgreen",
    "I-PERS": "lightblue",
    "I-PER": "lightblue",
    "I-ORG": "cyan",
    "I-MISC": "violet",
    "O": "saddlebrown",
}

# Example data for ANERCorp and CoNLL-2003 training splits
# This would usually be filtered from a larger DataFrame as needed

pie_tags = ['B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC']
train_split = combined_tag_stats[combined_tag_stats['Split'] == 'train'].copy()
anercorp_train = train_split[train_split['Dataset'] == 'ANERCorp_CamelLab']
conll_train = train_split[train_split['Dataset'] == 'CoNLL-2003']
anercorp_wp = anercorp_train['Tag Words'].tolist()
anercorp_ttr = anercorp_train['Tag Types'].tolist()
conll_wp = conll_train['Tag Words'].tolist()
conll_ttr = conll_train['Tag Types'].tolist()

colors = [color_map[tag] for tag in pie_tags]

# Create subplot for ANERCorp
fig_twr = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]],
                             subplot_titles=("Words Proportion", "TTR"))
fig_twr.add_trace(go.Pie(labels=pie_tags, values=anercorp_ttr, name="TWR", marker_colors=colors), 1, 1)
fig_twr.add_trace(go.Pie(labels=pie_tags, values=conll_ttr, name="TWR", marker_colors=colors), 1, 2)

# Update layout for ANERCorp with a custom title
fig_twr.update_layout(
    title_text="Entity Tag Word Type Distribution in the Training Split",
    annotations=[
        dict(text='ANERCorp', font_size=20, showarrow=False),
        dict(text='CoNLL-2003',font_size=20, showarrow=False)

    ],
    height=500, width=800
)
fig_twr.show()

# Create subplot for CoNLL-2003
fig_wp = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]],
                          subplot_titles=("Words Proportion", "TTR"))
fig_wp.add_trace(go.Pie(labels=pie_tags, values=anercorp_wp, name="NE Proportion", marker_colors=colors), 1, 1)
fig_wp.add_trace(go.Pie(labels=pie_tags, values=conll_wp, name="NE Proportion", marker_colors=colors), 1, 2)

# Update layout for CoNLL-2003 with a custom title
fig_wp.update_layout(
    title_text="Entity Tag Distribution in the Training Split",
    annotations=[
        dict(text='ANERCorp', font_size=20, showarrow=False),
        dict(text='CoNLL-2003',font_size=20, showarrow=False)
    ],
    height=500, width=800
)
fig_wp.show()


In [7]:
train_split

Unnamed: 0,Split,Tag,Tag Words,Tag Types,TTR,Words Proportion,Type Proportion,Dataset
0,train,B-LOC,3776,905,0.2397,0.2865,0.2224,ANERCorp_CamelLab
1,train,I-LOC,525,145,0.2762,0.0398,0.0356,ANERCorp_CamelLab
2,train,B-PERS,2721,1089,0.4002,0.2064,0.2676,ANERCorp_CamelLab
3,train,I-PERS,2205,1110,0.5034,0.1673,0.2728,ANERCorp_CamelLab
4,train,B-ORG,1576,522,0.3312,0.1196,0.1283,ANERCorp_CamelLab
5,train,I-ORG,1115,401,0.3596,0.0846,0.0986,ANERCorp_CamelLab
6,train,B-MISC,888,343,0.3863,0.0674,0.0843,ANERCorp_CamelLab
7,train,I-MISC,375,220,0.5867,0.0285,0.0541,ANERCorp_CamelLab
16,train,B-LOC,7140,1223,0.1713,0.2097,0.1476,CoNLL-2003
17,train,I-LOC,1157,294,0.2541,0.034,0.0355,CoNLL-2003


In [21]:
train_split

Unnamed: 0,Split,Tag,Tag Words,Tag Types,TTR,Words Proportion,Type Proportion,Dataset
0,train,B-LOC,3776,905,0.2397,0.2865,0.2224,ANERCorp_CamelLab
1,train,I-LOC,525,145,0.2762,0.0398,0.0356,ANERCorp_CamelLab
2,train,B-PERS,2721,1089,0.4002,0.2064,0.2676,ANERCorp_CamelLab
3,train,I-PERS,2205,1110,0.5034,0.1673,0.2728,ANERCorp_CamelLab
4,train,B-ORG,1576,522,0.3312,0.1196,0.1283,ANERCorp_CamelLab
5,train,I-ORG,1115,401,0.3596,0.0846,0.0986,ANERCorp_CamelLab
6,train,B-MISC,888,343,0.3863,0.0674,0.0843,ANERCorp_CamelLab
7,train,I-MISC,375,220,0.5867,0.0285,0.0541,ANERCorp_CamelLab
16,train,B-LOC,7140,1223,0.1713,0.2097,0.1476,CoNLL-2003
17,train,I-LOC,1157,294,0.2541,0.034,0.0355,CoNLL-2003


In [22]:
train_split

Unnamed: 0,Split,Tag,Tag Words,Tag Types,TTR,Words Proportion,Type Proportion,Dataset
0,train,B-LOC,3776,905,0.2397,0.2865,0.2224,ANERCorp_CamelLab
1,train,I-LOC,525,145,0.2762,0.0398,0.0356,ANERCorp_CamelLab
2,train,B-PERS,2721,1089,0.4002,0.2064,0.2676,ANERCorp_CamelLab
3,train,I-PERS,2205,1110,0.5034,0.1673,0.2728,ANERCorp_CamelLab
4,train,B-ORG,1576,522,0.3312,0.1196,0.1283,ANERCorp_CamelLab
5,train,I-ORG,1115,401,0.3596,0.0846,0.0986,ANERCorp_CamelLab
6,train,B-MISC,888,343,0.3863,0.0674,0.0843,ANERCorp_CamelLab
7,train,I-MISC,375,220,0.5867,0.0285,0.0541,ANERCorp_CamelLab
16,train,B-LOC,7140,1223,0.1713,0.2097,0.1476,CoNLL-2003
17,train,I-LOC,1157,294,0.2541,0.034,0.0355,CoNLL-2003


In [23]:
matrix_tags = ['B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC']
anercorp_train = corpora['ANERCorp_CamelLab']['splits']['train']
conll_train = corpora['conll2003']['splits']['train']

anercorp_overlap_matrix = extract_overlap_matrix(anercorp_train)
conll_overlap_matrix = extract_overlap_matrix(conll_train)


# Convert dictionaries to DataFrames
matrix1 = pd.DataFrame(anercorp_overlap_matrix, index=matrix_tags)
matrix2 = pd.DataFrame(conll_overlap_matrix, index=matrix_tags)

# # # # Create masks for upper and lower triangles
# mask_upper = np.triu(np.ones_like(matrix1, dtype=bool))
# mask_lower = np.tril(np.ones_like(matrix1, dtype=bool))

# Create masks for lower triangles
mask_lower = np.tril(np.ones_like(matrix1, dtype=bool))

# Mask both matrices with the lower triangle
lower_triangle_anercorp = matrix1.mask(~mask_lower)
lower_triangle_conll = matrix2.mask(~mask_lower)

# Mask the appropriate halves
# upper_triangle = matrix1.mask(mask_upper)
# upper_triangle = matrix1
# lower_triangle = matrix2.mask(mask_lower)
# lower_triangle = matrix2

# Prepare text for display: replace NaN with blank and non-NaN with integer
upper_text = np.where(lower_triangle_anercorp.isnull(), '', lower_triangle_anercorp.fillna(0).astype(int).astype(str))
lower_text = np.where(lower_triangle_conll.isnull(), '', lower_triangle_conll.fillna(0).astype(int).astype(str))

# Create subplot with two heatmaps
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('ANERCorp', 'CoNLL-2003'),
    horizontal_spacing=0.1  # Adjust spacing between subplots
)

# Add the upper triangle of the first matrix
fig.add_trace(
    go.Heatmap(
        z=lower_triangle_anercorp,
        x=lower_triangle_anercorp.columns,
        y=lower_triangle_anercorp.index,
        colorscale="RdBu_r",
        showscale=False,  # Optionally turn off the color scale for each subplot
        text=upper_text.astype(str),
        texttemplate="%{text}",  # Text template to display
        hoverinfo="text+z"  # Display text and z-value on hover
    ),
    row=1, col=1
)

# Add the lower triangle of the second matrix
fig.add_trace(
    go.Heatmap(
        z=lower_triangle_conll,
        x=lower_triangle_conll.columns,
        y=lower_triangle_conll.index,
        colorscale="RdBu_r",
        showscale=True,  # Optionally show the color scale for reference
        text=lower_text.astype(str),
        texttemplate="%{text}",  # Text template to display
        hoverinfo="text+z"  # Display text and z-value on hover
    ),
    row=1, col=2
)

# Update the layout for better readability
fig.update_layout(
    title_text='Word Type Overlap Across Entity Tags in Training Split',
    template="plotly_white",
    xaxis_title="Tags",
    yaxis_title="Tags",
    xaxis=dict(showgrid=False),  # Hide gridlines for x-axis
    xaxis2=dict(showgrid=False), # Hide gridlines for x-axis in second subplot
    yaxis=dict(showgrid=False),  # Hide gridlines for y-axis
    yaxis2=dict(showgrid=False), # Hide gridlines for y-axis in second subplot
    height=600,  # Set height of the figure canvas in pixels
    width=1200   # Set width of the figure canvas in pixels
)

# Show the figure
fig.show()


In [42]:
matrix_O_tags = ['B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC', 'O']
anercorp_train = corpora['ANERCorp_CamelLab']['splits']['train']
conll_train = corpora['conll2003']['splits']['train']

anercorp_overlap_matrix = extract_O_overlap_matrix(anercorp_train)
conll_overlap_matrix = extract_O_overlap_matrix(conll_train)


# Convert dictionaries to DataFrames
matrix1 = pd.DataFrame(anercorp_overlap_matrix, index=matrix_O_tags)
matrix2 = pd.DataFrame(conll_overlap_matrix, index=matrix_O_tags)

# # # # Create masks for upper and lower triangles
# mask_upper = np.triu(np.ones_like(matrix1, dtype=bool))
# mask_lower = np.tril(np.ones_like(matrix1, dtype=bool))

# Create masks for lower triangles
mask_lower = np.tril(np.ones_like(matrix1, dtype=bool))

# Mask both matrices with the lower triangle
lower_triangle_anercorp = matrix1.mask(~mask_lower)
lower_triangle_conll = matrix2.mask(~mask_lower)

# Mask the appropriate halves
# upper_triangle = matrix1.mask(mask_upper)
# upper_triangle = matrix1
# lower_triangle = matrix2.mask(mask_lower)
# lower_triangle = matrix2

# Prepare text for display: replace NaN with blank and non-NaN with integer
upper_text = np.where(lower_triangle_anercorp.isnull(), '', lower_triangle_anercorp.fillna(0).astype(int).astype(str))
lower_text = np.where(lower_triangle_conll.isnull(), '', lower_triangle_conll.fillna(0).astype(int).astype(str))

# Create subplot with two heatmaps
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('ANERCorp', 'CoNLL-2003'),
    horizontal_spacing=0.1  # Adjust spacing between subplots
)

# Add the upper triangle of the first matrix
fig.add_trace(
    go.Heatmap(
        z=lower_triangle_anercorp,
        x=lower_triangle_anercorp.columns,
        y=lower_triangle_anercorp.index,
        colorscale="RdBu_r",
        showscale=False,  # Optionally turn off the color scale for each subplot
        text=upper_text.astype(str),
        texttemplate="%{text}",  # Text template to display
        hoverinfo="text+z"  # Display text and z-value on hover
    ),
    row=1, col=1
)

# Add the lower triangle of the second matrix
fig.add_trace(
    go.Heatmap(
        z=lower_triangle_conll,
        x=lower_triangle_conll.columns,
        y=lower_triangle_conll.index,
        colorscale="RdBu_r",
        showscale=True,  # Optionally show the color scale for reference
        text=lower_text.astype(str),
        texttemplate="%{text}",  # Text template to display
        hoverinfo="text+z"  # Display text and z-value on hover
    ),
    row=1, col=2
)

# Update the layout for better readability
fig.update_layout(
    title_text='Word Type Overlap Across Entity Tags in Training Split and O',
    template="plotly_white",
    xaxis_title="Tags",
    yaxis_title="Tags",
    xaxis=dict(showgrid=False),  # Hide gridlines for x-axis
    xaxis2=dict(showgrid=False), # Hide gridlines for x-axis in second subplot
    yaxis=dict(showgrid=False),  # Hide gridlines for y-axis
    yaxis2=dict(showgrid=False), # Hide gridlines for y-axis in second subplot
    height=600,  # Set height of the figure canvas in pixels
    width=1200   # Set width of the figure canvas in pixels
)

# Show the figure
fig.show()


In [40]:
matrix1

Unnamed: 0,B-LOC,I-LOC,B-PER,I-PER,B-ORG,I-ORG,B-MISC,I-MISC,O
B-LOC,0.0,28.0,14.0,17.0,47.0,31.0,16.0,12.0,108.0
I-LOC,28.0,0.0,6.0,13.0,5.0,14.0,3.0,9.0,49.0
B-PER,14.0,6.0,0.0,283.0,17.0,19.0,15.0,13.0,109.0
I-PER,17.0,13.0,283.0,0.0,11.0,24.0,17.0,18.0,145.0
B-ORG,47.0,5.0,17.0,11.0,0.0,56.0,21.0,14.0,138.0
I-ORG,31.0,14.0,19.0,24.0,56.0,0.0,11.0,35.0,184.0
B-MISC,16.0,3.0,15.0,17.0,21.0,11.0,0.0,15.0,99.0
I-MISC,12.0,9.0,13.0,18.0,14.0,35.0,15.0,0.0,117.0


In [39]:
anercorp_overlap_matrix

Unnamed: 0,B-LOC,I-LOC,B-PER,I-PER,B-ORG,I-ORG,B-MISC,I-MISC,O
B-LOC,0.0,28.0,14.0,17.0,47.0,31.0,16.0,12.0,108.0
I-LOC,28.0,0.0,6.0,13.0,5.0,14.0,3.0,9.0,49.0
B-PER,14.0,6.0,0.0,283.0,17.0,19.0,15.0,13.0,109.0
I-PER,17.0,13.0,283.0,0.0,11.0,24.0,17.0,18.0,145.0
B-ORG,47.0,5.0,17.0,11.0,0.0,56.0,21.0,14.0,138.0
I-ORG,31.0,14.0,19.0,24.0,56.0,0.0,11.0,35.0,184.0
B-MISC,16.0,3.0,15.0,17.0,21.0,11.0,0.0,15.0,99.0
I-MISC,12.0,9.0,13.0,18.0,14.0,35.0,15.0,0.0,117.0
O,108.0,49.0,109.0,145.0,138.0,184.0,99.0,117.0,0.0


In [25]:
extract_O_overlap_matrix(anercorp_train)


Unnamed: 0,B-LOC,I-LOC,B-PER,I-PER,B-ORG,I-ORG,B-MISC,I-MISC,O
B-LOC,0.0,28.0,14.0,17.0,47.0,31.0,16.0,12.0,108.0
I-LOC,28.0,0.0,6.0,13.0,5.0,14.0,3.0,9.0,49.0
B-PER,14.0,6.0,0.0,283.0,17.0,19.0,15.0,13.0,109.0
I-PER,17.0,13.0,283.0,0.0,11.0,24.0,17.0,18.0,145.0
B-ORG,47.0,5.0,17.0,11.0,0.0,56.0,21.0,14.0,138.0
I-ORG,31.0,14.0,19.0,24.0,56.0,0.0,11.0,35.0,184.0
B-MISC,16.0,3.0,15.0,17.0,21.0,11.0,0.0,15.0,99.0
I-MISC,12.0,9.0,13.0,18.0,14.0,35.0,15.0,0.0,117.0
O,108.0,49.0,109.0,145.0,138.0,184.0,99.0,117.0,0.0


In [26]:
# Creating the bar chart
fig = go.Figure()
tag_mapping = {
    'B-PERS': 'B-PER', 'I-PERS': 'I-PER',
    'B-PERS': 'B-PER', 'I-PERS': 'I-PER'
}
dataset_mapping = {
    'ANERCorp_CamelLab': 'ANERCorp'
}
train_split['Tag'] = train_split['Tag'].replace(tag_mapping)
train_split['Dataset'] = train_split['Dataset'].replace(dataset_mapping)
# Add bars for each dataset
for dataset in train_split['Dataset'].unique():
    subset = train_split[train_split['Dataset'] == dataset]
    fig.add_trace(go.Bar(
        x=subset['Tag'],
        y=subset['TTR'],
        name=dataset,
        text=subset['TTR'],
        textposition='auto'
    ))

# Update the layout
fig.update_layout(
    title='Type-to-Word Ratio (TWR) Across Entity Tags in ANERCorp and CoNLL-2003 Training Data',
    template="plotly_white",
    xaxis_title='Entity Tag',
    yaxis_title='TWR',
    barmode='group',
    legend_title="Dataset",
    height=500,
    width=1300
)

# Show the plot
fig.show()

In [27]:
train_split

Unnamed: 0,Split,Tag,Tag Words,Tag Types,TTR,Words Proportion,Type Proportion,Dataset
0,train,B-LOC,3776,905,0.2397,0.2865,0.2224,ANERCorp
1,train,I-LOC,525,145,0.2762,0.0398,0.0356,ANERCorp
2,train,B-PER,2721,1089,0.4002,0.2064,0.2676,ANERCorp
3,train,I-PER,2205,1110,0.5034,0.1673,0.2728,ANERCorp
4,train,B-ORG,1576,522,0.3312,0.1196,0.1283,ANERCorp
5,train,I-ORG,1115,401,0.3596,0.0846,0.0986,ANERCorp
6,train,B-MISC,888,343,0.3863,0.0674,0.0843,ANERCorp
7,train,I-MISC,375,220,0.5867,0.0285,0.0541,ANERCorp
16,train,B-LOC,7140,1223,0.1713,0.2097,0.1476,CoNLL-2003
17,train,I-LOC,1157,294,0.2541,0.034,0.0355,CoNLL-2003


In [28]:

type_distribution_df = type_distribution(corpora, 'train')
dataset_mapping = {
    'ANERCorp_CamelLab': 'ANERCorp'
}
type_distribution_df['Dataset'] = type_distribution_df['Dataset'].replace(dataset_mapping)
fig = px.bar(type_distribution_df, x='Tag', y='Standard Deviation', color='Dataset', text='Standard Deviation')
fig.update_layout(
    title_text="Standard Deviation of Word Type Frequencies Across Entity Tags in ANERCorp and CoNLL-2003",
    template="plotly_white",
    xaxis_title='Entity Tag',
    yaxis_title='TWR',
    barmode='group',
    legend_title="Dataset",
    height=500,
    width=1300
)
fig.show()


In [29]:
pd.DataFrame(Counter([w for sen in corpora['conll2003']['splits']['train'] for w,t in zip(sen['words'], sen['tags']) if t =='B-LOC']).items()).sort_values(by=1, ascending=False).head(50)

Unnamed: 0,0,1
7,U.S.,309
1,Germany,141
2,Britain,133
96,Australia,130
103,England,123
3,France,122
108,Spain,110
98,Italy,98
6,LONDON,93
110,New,93


In [30]:
y_true = [sen['tags'] for sen in corpora['conll2003']['splits']['train']]
true_entities = get_entities(y_true)
        
scheme = auto_detect(y_true, False)
strict_true_entities = Entities(y_true, scheme, False)
entity_stats = calculate_entity_lengths(true_entities)
strict_entity_stats = calculate_entity_lengths([entity.to_tuple()[1:] for sen in strict_true_entities.entities for entity in sen], True)
display(pd.DataFrame(entity_stats))
display(pd.DataFrame(strict_entity_stats))


Unnamed: 0,ORG,MISC,PER,LOC
Average Length,1.586,1.336,1.6861,1.162
Min Length,1.0,1.0,1.0,1.0
Max Length,10.0,7.0,6.0,10.0
STD,0.9225,0.6773,0.5474,0.4411


Unnamed: 0,ORG,MISC,PER,LOC
Average Length,1.586,1.336,1.6861,1.162
Min Length,1.0,1.0,1.0,1.0
Max Length,10.0,7.0,6.0,10.0
STD,0.9225,0.6773,0.5474,0.4411


In [31]:
Counter([entity[0] for entity in true_entities])

Counter({'LOC': 7140, 'PER': 6600, 'ORG': 6321, 'MISC': 3438})

In [15]:
entity_distribution = get_entity_distribution(corpora, 'train')
dataset_mapping = {
    'ANERCorp_CamelLab': 'ANERCorp'
}
entity_distribution['Dataset'] = entity_distribution['Dataset'].replace(dataset_mapping)
fig = px.bar(entity_distribution, x='Entity Type', y='Proportion', color='Dataset', text='Raw Count', barmode='group',
             facet_col='Scheme Version', category_orders={"Scheme Version": ["IOB", "IOB2"]},
             labels={"Dataset": "Dataset", "Proportion": "Proportion"})

# Update layout for better visual distinction
fig.update_layout(
    title_text="Proportion of Entity Types Across Different Schemes and Datasets",
    template="plotly_white",
    xaxis_title='Entity Type',
    yaxis_title='Proportion',
    legend_title="Dataset",
    height=500,
    width=1300,
    plot_bgcolor='rgba(0,0,0,0)'  # Set background color to transparent
)
# fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
# Add a vertical line to separate the schemes
# You might need to adjust the x-position based on your actual data layout
fig.add_shape(
    type="line",
    x0=4, y0=0, x1=4, y1=41,  # Adjust these values based on your axis and needs
    line=dict(color="Black", width=3)
)

fig.show()

In [33]:
entity_length_distribution = process_datasets(corpora, 'train')
dataset_mapping = {
    'ANERCorp_CamelLab': 'ANERCorp'
}
entity_length_distribution['Dataset'] = entity_length_distribution['Dataset'].replace(dataset_mapping)
# Create the bar graph with facets
fig = px.bar(entity_length_distribution, x='Entity Type', y='Average Length', color='Dataset', text='Average Length', barmode='group',
             facet_col='Scheme', category_orders={"Scheme": ["IOB", "IOB2"]},
             labels={"Dataset": "Dataset", "Average Length": "Average Length"})

# Update layout for better visual distinction
fig.update_layout(
    title_text="Average Length of Entities Across Different Schemes and Datasets",
    template="plotly_white",
    xaxis_title='Entity Type',
    yaxis_title='Average Length',
    legend_title="Dataset",
    height=500,
    width=1400,
    plot_bgcolor='rgba(0,0,0,0)'  # Set background color to transparent
)

# Add a vertical line to separate the schemes
# You might need to adjust the x-position based on your actual data layout
fig.add_shape(
    type="line",
    x0=4, y0=0, x1=4, y1=2,  # Adjust these values based on your axis and needs
    line=dict(color="Black", width=3)
)

fig.show()

In [34]:
# Tags and their corresponding TWR data for the test split
test_split = combined_tag_stats[combined_tag_stats['Split'] == 'test'].copy()
anercorp_test = test_split[test_split['Dataset'] == 'ANERCorp_CamelLab']
conll_test = test_split[test_split['Dataset'] == 'CoNLL-2003']
anercorp_ttr = anercorp_test['Tag Types'].tolist()
conll_ttr = conll_test['Tag Types'].tolist()

anercorp_wp = anercorp_test['Tag Words'].tolist()
conll_wp = conll_test['Tag Words'].tolist()
colors = [color_map[tag] for tag in pie_tags]
# Create subplot
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]],
                    subplot_titles=['ANERCorp', 'CoNLL-2003'])

# Add pie charts to the subplot
fig.add_trace(go.Pie(labels=pie_tags, values=anercorp_ttr, name='ANERCorp', marker_colors=colors), row=1, col=1)
fig.add_trace(go.Pie(labels=pie_tags, values=conll_ttr, name='CoNLL-2003', marker_colors=colors), row=1, col=2)

# Update layout for clarity and better viewing
fig.update_layout(
    title_text='Comparison of Tag Type Distribution in Test Split between ANERCorp and CoNLL-2003',
    # Adjust the layout to make the charts bigger
    height=600, width=900
)

# Show the plot
fig.show()


fig_wp = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]],
                          subplot_titles=("Words Proportion", "TTR"))
fig_wp.add_trace(go.Pie(labels=pie_tags, values=anercorp_wp, name="NE Proportion", marker_colors=colors), 1, 1)
fig_wp.add_trace(go.Pie(labels=pie_tags, values=conll_wp, name="NE Proportion", marker_colors=colors), 1, 2)

# Update layout for CoNLL-2003 with a custom title
fig_wp.update_layout(
    title_text="Entity Tag Distribution in the Testing Split",
    annotations=[
        dict(text='ANERCorp', font_size=20, showarrow=False),
        dict(text='CoNLL-2003',font_size=20, showarrow=False)
    ],
    height=600, width=900
)
fig_wp.show()


In [35]:
matrix_tags = ['B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC']
anercorp_test = corpora['ANERCorp_CamelLab']['splits']['test']
conll_test = corpora['conll2003']['splits']['test']

anercorp_overlap_matrix = extract_overlap_matrix(anercorp_test)
conll_overlap_matrix = extract_overlap_matrix(conll_test)


# Convert dictionaries to DataFrames
matrix1 = pd.DataFrame(anercorp_overlap_matrix, index=matrix_tags)
matrix2 = pd.DataFrame(conll_overlap_matrix, index=matrix_tags)

# # # # Create masks for upper and lower triangles
# mask_upper = np.triu(np.ones_like(matrix1, dtype=bool))
# mask_lower = np.tril(np.ones_like(matrix1, dtype=bool))

# Create masks for lower triangles
mask_lower = np.tril(np.ones_like(matrix1, dtype=bool))

# Mask both matrices with the lower triangle
lower_triangle_anercorp = matrix1.mask(~mask_lower)
lower_triangle_conll = matrix2.mask(~mask_lower)

# Mask the appropriate halves
# upper_triangle = matrix1.mask(mask_upper)
# upper_triangle = matrix1
# lower_triangle = matrix2.mask(mask_lower)
# lower_triangle = matrix2

# Prepare text for display: replace NaN with blank and non-NaN with integer
upper_text = np.where(lower_triangle_anercorp.isnull(), '', lower_triangle_anercorp.fillna(0).astype(int).astype(str))
lower_text = np.where(lower_triangle_conll.isnull(), '', lower_triangle_conll.fillna(0).astype(int).astype(str))

# Create subplot with two heatmaps
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('ANERCorp', 'CoNLL-2003'),
    horizontal_spacing=0.1  # Adjust spacing between subplots
)

# Add the upper triangle of the first matrix
fig.add_trace(
    go.Heatmap(
        z=lower_triangle_anercorp,
        x=lower_triangle_anercorp.columns,
        y=lower_triangle_anercorp.index,
        colorscale="RdBu_r",
        showscale=False,  # Optionally turn off the color scale for each subplot
        text=upper_text.astype(str),
        texttemplate="%{text}",  # Text template to display
        hoverinfo="text+z"  # Display text and z-value on hover
    ),
    row=1, col=1
)

# Add the lower triangle of the second matrix
fig.add_trace(
    go.Heatmap(
        z=lower_triangle_conll,
        x=lower_triangle_conll.columns,
        y=lower_triangle_conll.index,
        colorscale="RdBu_r",
        showscale=True,  # Optionally show the color scale for reference
        text=lower_text.astype(str),
        texttemplate="%{text}",  # Text template to display
        hoverinfo="text+z"  # Display text and z-value on hover
    ),
    row=1, col=2
)

# Update the layout for better readability
fig.update_layout(
    title_text='Word Type Overlap Across Entity Tags in Testing Split',
    template="plotly_white",
    xaxis_title="Tags",
    yaxis_title="Tags",
    xaxis=dict(showgrid=False),  # Hide gridlines for x-axis
    xaxis2=dict(showgrid=False), # Hide gridlines for x-axis in second subplot
    yaxis=dict(showgrid=False),  # Hide gridlines for y-axis
    yaxis2=dict(showgrid=False), # Hide gridlines for y-axis in second subplot
    height=600,  # Set height of the figure canvas in pixels
    width=1200   # Set width of the figure canvas in pixels
)

# Show the figure
fig.show()


In [43]:
matrix_O_tags = ['B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC', 'O']
anercorp_train = corpora['ANERCorp_CamelLab']['splits']['train']
conll_train = corpora['conll2003']['splits']['train']

anercorp_overlap_matrix = extract_O_overlap_matrix(anercorp_test)
conll_overlap_matrix = extract_O_overlap_matrix(conll_test)


# Convert dictionaries to DataFrames
matrix1 = pd.DataFrame(anercorp_overlap_matrix, index=matrix_O_tags)
matrix2 = pd.DataFrame(conll_overlap_matrix, index=matrix_O_tags)

# # # # Create masks for upper and lower triangles
# mask_upper = np.triu(np.ones_like(matrix1, dtype=bool))
# mask_lower = np.tril(np.ones_like(matrix1, dtype=bool))

# Create masks for lower triangles
mask_lower = np.tril(np.ones_like(matrix1, dtype=bool))

# Mask both matrices with the lower triangle
lower_triangle_anercorp = matrix1.mask(~mask_lower)
lower_triangle_conll = matrix2.mask(~mask_lower)

# Mask the appropriate halves
# upper_triangle = matrix1.mask(mask_upper)
# upper_triangle = matrix1
# lower_triangle = matrix2.mask(mask_lower)
# lower_triangle = matrix2

# Prepare text for display: replace NaN with blank and non-NaN with integer
upper_text = np.where(lower_triangle_anercorp.isnull(), '', lower_triangle_anercorp.fillna(0).astype(int).astype(str))
lower_text = np.where(lower_triangle_conll.isnull(), '', lower_triangle_conll.fillna(0).astype(int).astype(str))

# Create subplot with two heatmaps
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('ANERCorp', 'CoNLL-2003'),
    horizontal_spacing=0.1  # Adjust spacing between subplots
)

# Add the upper triangle of the first matrix
fig.add_trace(
    go.Heatmap(
        z=lower_triangle_anercorp,
        x=lower_triangle_anercorp.columns,
        y=lower_triangle_anercorp.index,
        colorscale="RdBu_r",
        showscale=False,  # Optionally turn off the color scale for each subplot
        text=upper_text.astype(str),
        texttemplate="%{text}",  # Text template to display
        hoverinfo="text+z"  # Display text and z-value on hover
    ),
    row=1, col=1
)

# Add the lower triangle of the second matrix
fig.add_trace(
    go.Heatmap(
        z=lower_triangle_conll,
        x=lower_triangle_conll.columns,
        y=lower_triangle_conll.index,
        colorscale="RdBu_r",
        showscale=True,  # Optionally show the color scale for reference
        text=lower_text.astype(str),
        texttemplate="%{text}",  # Text template to display
        hoverinfo="text+z"  # Display text and z-value on hover
    ),
    row=1, col=2
)

# Update the layout for better readability
fig.update_layout(
    title_text='Word Type Overlap Across Entity Tags in Testing Split and O',
    template="plotly_white",
    xaxis_title="Tags",
    yaxis_title="Tags",
    xaxis=dict(showgrid=False),  # Hide gridlines for x-axis
    xaxis2=dict(showgrid=False), # Hide gridlines for x-axis in second subplot
    yaxis=dict(showgrid=False),  # Hide gridlines for y-axis
    yaxis2=dict(showgrid=False), # Hide gridlines for y-axis in second subplot
    height=600,  # Set height of the figure canvas in pixels
    width=1200   # Set width of the figure canvas in pixels
)

# Show the figure
fig.show()


In [36]:
# Creating the bar chart
fig = go.Figure()
tag_mapping = {
    'B-PERS': 'B-PER', 'I-PERS': 'I-PER',
    'B-PERS': 'B-PER', 'I-PERS': 'I-PER'
}
test_split['Tag'] = test_split['Tag'].replace(tag_mapping)
# Add bars for each dataset
for dataset in test_split['Dataset'].unique():
    subset = test_split[test_split['Dataset'] == dataset]
    fig.add_trace(go.Bar(
        x=subset['Tag'],
        y=subset['TTR'],
        name=dataset,
        text=subset['TTR'],
        textposition='auto'
    ))

# Update the layout
fig.update_layout(
    title='Type-to-Word Ratio (TWR) Across Entity Tags in ANERCorp and CoNLL-2003 Testing Data',
    template="plotly_white",
    xaxis_title='Tag',
    yaxis_title='Tag Words Ratio (TWR)',
    barmode='group',
    legend_title="Dataset",
    height=500,
    width=1300
)

# Show the plot
fig.show()

In [21]:

type_distribution_df = type_distribution(corpora, 'test')
dataset_mapping = {
    'ANERCorp_CamelLab': 'ANERCorp'
}
type_distribution_df['Dataset'] = type_distribution_df['Dataset'].replace(dataset_mapping)
fig = px.bar(type_distribution_df, x='Tag', y='Standard Deviation', color='Dataset', text='Standard Deviation')
fig.update_layout(
    title_text="Standard Deviation of Word Type Frequencies Across Entity Tags in ANERCorp and CoNLL-2003",
    template="plotly_white",
    xaxis_title='Entity Tag',
    yaxis_title='TWR',
    barmode='group',
    legend_title="Dataset",
    height=500,
    width=1300
)
fig.show()


In [22]:
test_split

Unnamed: 0,Split,Tag,Tag Words,Tag Types,TTR,Words Proportion,Type Proportion,Dataset
8,test,B-LOC,668,268,0.4012,0.1979,0.1672,ANERCorp_CamelLab
9,test,I-LOC,83,34,0.4096,0.0246,0.0212,ANERCorp_CamelLab
10,test,B-PER,858,506,0.5897,0.2542,0.3157,ANERCorp_CamelLab
11,test,I-PER,641,419,0.6537,0.1899,0.2614,ANERCorp_CamelLab
12,test,B-ORG,450,210,0.4667,0.1333,0.131,ANERCorp_CamelLab
13,test,I-ORG,275,151,0.5491,0.0815,0.0942,ANERCorp_CamelLab
14,test,B-MISC,235,123,0.5234,0.0696,0.0767,ANERCorp_CamelLab
15,test,I-MISC,165,111,0.6727,0.0489,0.0692,ANERCorp_CamelLab
24,test,B-LOC,1668,475,0.2848,0.2056,0.1518,CoNLL-2003
25,test,I-LOC,257,102,0.3969,0.0317,0.0326,CoNLL-2003


In [13]:
entity_distribution = get_entity_distribution(corpora, 'test')
dataset_mapping = {
    'ANERCorp_CamelLab': 'ANERCorp'
}
entity_distribution['Dataset'] = entity_distribution['Dataset'].replace(dataset_mapping)
fig = px.bar(entity_distribution, x='Entity Type', y='Proportion', color='Dataset', text='Raw Count', barmode='group',
             facet_col='Scheme Version', category_orders={"Scheme Version": ["IOB", "IOB2"]},
             labels={"Dataset": "Dataset", "Proportion": "Proportion"})

# Update layout for better visual distinction
fig.update_layout(
    title_text="Proportion of Entity Types Across Different Schemes and Datasets",
    template="plotly_white",
    xaxis_title='Entity Type',
    yaxis_title='Raw Count',
    legend_title="Dataset",
    height=500,
    width=1400,
    plot_bgcolor='rgba(0,0,0,0)'  # Set background color to transparent
)
# fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
# Add a vertical line to separate the schemes
# You might need to adjust the x-position based on your actual data layout
fig.add_shape(
    type="line",
    x0=4, y0=0, x1=4, y1=41,  # Adjust these values based on your axis and needs
    line=dict(color="Black", width=3)
)

fig.show()


In [12]:
entity_distribution

Unnamed: 0,Dataset,Scheme Version,Entity Type,Raw Count,Proportion
0,ANERCorp,IOB,LOC,676,29.58
1,ANERCorp,IOB,PER,907,39.69
2,ANERCorp,IOB,ORG,459,20.09
3,ANERCorp,IOB,MISC,243,10.63
4,ANERCorp,IOB2,LOC,668,30.21
5,ANERCorp,IOB2,PER,858,38.81
6,ANERCorp,IOB2,ORG,450,20.35
7,ANERCorp,IOB2,MISC,235,10.63
8,conll2003,IOB,LOC,1668,29.53
9,conll2003,IOB,PER,1617,28.63


In [None]:
Counter({'PERS': 905, 'LOC': 676, 'ORG': 459, 'MISC': 243})
Counter({'PERS': 858, 'LOC': 668, 'ORG': 450, 'MISC': 235})

In [24]:
entity_length_distribution = process_datasets(corpora, 'test')
dataset_mapping = {
    'ANERCorp_CamelLab': 'ANERCorp'
}
entity_length_distribution['Dataset'] = entity_length_distribution['Dataset'].replace(dataset_mapping)
# Create the bar graph with facets
fig = px.bar(entity_length_distribution, x='Entity Type', y='Average Length', color='Dataset', text='Average Length', barmode='group',
             facet_col='Scheme', category_orders={"Scheme": ["IOB", "IOB2"]},
             labels={"Dataset": "Dataset", "Average Length": "Average Length"})

# Update layout for better visual distinction
fig.update_layout(
    title_text="Average Length of Entities Across Different Schemes and Datasets",
    template="plotly_white",
    xaxis_title='Entity Type',
    yaxis_title='Average Length',
    legend_title="Dataset",
    height=500,
    width=1400,
    plot_bgcolor='rgba(0,0,0,0)'  # Set background color to transparent
)

# Add a vertical line to separate the schemes
# You might need to adjust the x-position based on your actual data layout
fig.add_shape(
    type="line",
    x0=4, y0=0, x1=4, y1=2,  # Adjust these values based on your axis and needs
    line=dict(color="Black", width=3)
)

fig.show()

In [25]:
oov_stats_per_tag = calculate_oov_rates_per_tag(corpora)

# Convert results to a suitable format for display, such as a DataFrame
import pandas as pd

# Transform the dictionary into a DataFrame for easier viewing and analysis
data_frames = {}
for corpus, tags_data in oov_stats_per_tag.items():
    if isinstance(tags_data, dict):  # Ensure there's data to process
        df = pd.DataFrame.from_dict(tags_data, orient='index')
        data_frames[corpus] = df

In [26]:
# Creating a single DataFrame from the dictionary
all_data = pd.concat({
    k: v for k, v in data_frames.items()
}, axis=0)

# Rename the levels for clarity
all_data.index.names = ['Corpus', 'Tag']
all_data

Unnamed: 0_level_0,Unnamed: 1_level_0,OOV Words Count,Total Unique Words in Test,OOV Rate
Corpus,Tag,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ANERCorp_CamelLab,B-LOC,98,268,0.3657
ANERCorp_CamelLab,B-PER,347,506,0.6858
ANERCorp_CamelLab,I-PER,288,419,0.6874
ANERCorp_CamelLab,B-ORG,131,210,0.6238
ANERCorp_CamelLab,I-LOC,18,34,0.5294
ANERCorp_CamelLab,I-ORG,106,151,0.702
ANERCorp_CamelLab,B-MISC,93,123,0.7561
ANERCorp_CamelLab,I-MISC,93,111,0.8378
conll2003,B-LOC,187,475,0.3937
conll2003,B-PER,519,857,0.6056


In [27]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Tags and their corresponding OOV rates for both datasets
COL = 'OOV Words Count'

anercorp_oov_rates = all_data.loc['ANERCorp_CamelLab'][COL].tolist()
anercorp_tags = all_data.loc['ANERCorp_CamelLab'].index.tolist()
anercorp_colors = [color_map[tag] for tag in anercorp_tags]
conll_oov_rates = all_data.loc['conll2003'][COL].tolist()
conll_tags = all_data.loc['conll2003'].index.tolist()
conll_colors = [color_map[tag] for tag in conll_tags]

# Create subplot
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]],
                    subplot_titles=['ANERCorp', 'CoNLL-2003'])

# Add pie charts to the subplot
fig.add_trace(go.Pie(labels=anercorp_tags, values=anercorp_oov_rates, name='ANERCorp', marker_colors=anercorp_colors), row=1, col=1)
fig.add_trace(go.Pie(labels=conll_tags, values=conll_oov_rates, name='CoNLL-2003', marker_colors=conll_colors), row=1, col=2)

# Update layout for clarity and better viewing
fig.update_layout(
    title_text='Out-of-Vocabulary (OOV) Word Distribution by Entity Type in ANERCorp and CoNLL-2003',
    # Adjust the layout to make the charts bigger
    width=1200, height=600
)

# Show the plot
fig.show()


In [28]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Tags and their corresponding OOV rates for both datasets
COL = 'OOV Words Count'

anercorp_oov_rates = all_data.loc['ANERCorp_CamelLab'][COL].tolist()
anercorp_tags = all_data.loc['ANERCorp_CamelLab'].index.tolist()
anercorp_colors = [color_map[tag] for tag in anercorp_tags]
conll_oov_rates = all_data.loc['conll2003'][COL].tolist()
conll_tags = all_data.loc['conll2003'].index.tolist()
conll_colors = [color_map[tag] for tag in conll_tags]

# Create subplot
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]],
                    subplot_titles=['ANERCorp', 'CoNLL-2003'])

# Add pie charts to the subplot
fig.add_trace(go.Pie(labels=anercorp_tags, values=anercorp_oov_rates, name='ANERCorp', marker_colors=anercorp_colors), row=1, col=1)
fig.add_trace(go.Pie(labels=conll_tags, values=conll_oov_rates, name='CoNLL-2003', marker_colors=conll_colors), row=1, col=2)

# Update layout for clarity and better viewing
fig.update_layout(
    title_text='Out-of-Vocabulary (OOV) Word Distribution by Entity Type in ANERCorp and CoNLL-2003',
    # Adjust the layout to make the charts bigger
    width=1200, height=600
)

# Show the plot
fig.show()


In [29]:
all_data

Unnamed: 0_level_0,Unnamed: 1_level_0,OOV Words Count,Total Unique Words in Test,OOV Rate
Corpus,Tag,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ANERCorp_CamelLab,B-LOC,98,268,0.3657
ANERCorp_CamelLab,B-PER,347,506,0.6858
ANERCorp_CamelLab,I-PER,288,419,0.6874
ANERCorp_CamelLab,B-ORG,131,210,0.6238
ANERCorp_CamelLab,I-LOC,18,34,0.5294
ANERCorp_CamelLab,I-ORG,106,151,0.702
ANERCorp_CamelLab,B-MISC,93,123,0.7561
ANERCorp_CamelLab,I-MISC,93,111,0.8378
conll2003,B-LOC,187,475,0.3937
conll2003,B-PER,519,857,0.6056
