In [1]:
import json
import pandas as pd

# load the dataset
# file structure is a dictionary of dictionaries ( {"0": {...}, "1": {...}})
# The encoding is given explicitly: JSON text is UTF-8, and without it open()
# falls back to the platform's locale encoding, which can mis-decode or fail
# on non-ASCII characters.
with open('/content/train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# convert values to a DataFrame: one row per inner dictionary,
# the outer keys ("0", "1", ...) are not carried over
df = pd.DataFrame(data.values())

# --- STATISTIC 1: Avg Annotator Standard Deviation ---
# mean over all rows of the per-row 'stdev' value
# (per the dataset notes, the stdev of the 5 annotators for that row)
avg_std_dev = df['stdev'].mean()
print(f"Avg Annotator Std Dev: {avg_std_dev:.2f}")

# --- STATISTIC 2: High Ambiguity % ---
# share of rows whose 'average' score lies in the closed range [2.0, 4.0],
# i.e. the "gray area" cases that are hardest to classify
total_rows = len(df)
# between() is inclusive on both ends by default; int() keeps the count a plain Python int
high_ambiguity_count = int(df['average'].between(2.0, 4.0).sum())
high_ambiguity_pct = (high_ambiguity_count / total_rows) * 100

print(f"High Ambiguity Percentage: {high_ambiguity_pct:.1f}%")

Avg Annotator Std Dev: 0.95
High Ambiguity Percentage: 53.6%


In [2]:
# --- STATISTIC 3: Average Narrative Length ---
# combine the three text parts: precontext + sentence + ending
# parts that are None or empty (e.g. a missing 'ending') are left out
def calculate_length(row):
    """Return the whitespace-token count of a row's combined narrative text.

    The 'precontext', 'sentence' and 'ending' fields are read with
    ``row.get``, the usable ones are joined with single spaces, and the
    result is split on whitespace.

    Args:
        row: A mapping-like object with a ``get`` method (a dict, or the
            pandas Series that ``DataFrame.apply(..., axis=1)`` passes in).
            Field values are expected to be scalars.

    Returns:
        int: Number of whitespace-separated tokens; 0 if no field has text.
    """
    texts = []
    for key in ('precontext', 'sentence', 'ending'):
        value = row.get(key)
        # Missing values must be checked explicitly: float NaN is truthy, so a
        # bare truthiness test would let it through and count "nan" as a token.
        # pd.isna also covers None and pd.NA.
        if pd.isna(value) or not value:
            continue
        texts.append(str(value))

    # Split by whitespace to get a simple token count
    return len(" ".join(texts).split())

# Token count for every row (stored in 'total_tokens'), then the dataset mean
df['total_tokens'] = df.apply(calculate_length, axis='columns')
avg_narrative_length = df['total_tokens'].mean()


# --- STATISTIC 4: Average Senses per Target ---
# for each target word ('homonym'), count its distinct 'judged_meaning'
# values, then average those counts across all target words
senses_per_target = df.groupby('homonym')['judged_meaning'].agg('nunique')
avg_senses = senses_per_target.mean()

# --- PRINT RESULTS ---
print(f"Avg Narrative Length: {avg_narrative_length:.2f} tokens")
print(f"Avg Senses per Target: {avg_senses:.2f}")

Avg Narrative Length: 49.99 tokens
Avg Senses per Target: 2.13


## Analysis of Synthetic + Gold Data

In [6]:
# 1. load the merged_updated.json file
file_path = '/content/merged_updated.json'

# The encoding is given explicitly: JSON text is UTF-8, and without it open()
# falls back to the platform's locale encoding, which can mis-decode or fail
# on non-ASCII characters.
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# the rows are read from the top-level 'results' entry of the file
df = pd.DataFrame(data['results'])

# --- STATISTIC 1: Avg Annotator Standard Deviation ---
# mean of the per-row 'stdev' column
avg_std_dev = df['stdev'].mean()

# --- STATISTIC 2: High Ambiguity % ---
# share of rows whose 'average' score lies in the closed range [2.0, 4.0]
total_rows = len(df)
# between() is inclusive on both ends by default; int() keeps the count a plain Python int
high_ambiguity_count = int(df['average'].between(2.0, 4.0).sum())
high_ambiguity_pct = (high_ambiguity_count / total_rows) * 100

# --- STATISTIC 3: Average Narrative Length ---
def calculate_length(row):
    """Return the whitespace-token count of a row's combined narrative text.

    The 'precontext', 'sentence' and 'ending' fields are read with
    ``row.get``, the usable ones are joined with single spaces, and the
    result is split on whitespace.

    NOTE(review): a helper with this same name is also defined in an earlier
    cell; consider defining it once and reusing it.

    Args:
        row: A mapping-like object with a ``get`` method (a dict, or the
            pandas Series that ``DataFrame.apply(..., axis=1)`` passes in).
            Field values are expected to be scalars.

    Returns:
        int: Number of whitespace-separated tokens; 0 if no field has text.
    """
    texts = []
    for key in ('precontext', 'sentence', 'ending'):
        value = row.get(key)
        # Missing values must be checked explicitly: float NaN is truthy, so a
        # bare truthiness test would let it through and count "nan" as a token.
        # pd.isna also covers None and pd.NA.
        if pd.isna(value) or not value:
            continue
        texts.append(str(value))

    # split by whitespace to get token count
    return len(" ".join(texts).split())

# token count for every row (stored in 'total_tokens'), then the dataset mean
df['total_tokens'] = df.apply(calculate_length, axis='columns')
avg_narrative_length = df['total_tokens'].mean()

# --- STATISTIC 4: Average Senses per Target ---
# for each 'homonym', count its distinct 'judged_meaning' values,
# then average those counts across all homonyms
senses_per_target = df.groupby('homonym')['judged_meaning'].agg('nunique')
avg_senses = senses_per_target.mean()

# --- PRINT ALL RESULTS ---
# total_rows, avg_std_dev and high_ambiguity_pct come from the statistics above
print("--- DATASET STATISTICS ---")
print(f"Total Instances: {total_rows}")
print(f"Avg Annotator Std Dev: {avg_std_dev:.2f}")
print(f"High Ambiguity %: {high_ambiguity_pct:.1f}%")
print(f"Avg Narrative Length: {avg_narrative_length:.2f} tokens")
print(f"Avg Senses per Target: {avg_senses:.2f}")

--- DATASET STATISTICS ---
Total Instances: 792
Avg Annotator Std Dev: 0.47
High Ambiguity %: 71.3%
Avg Narrative Length: 51.49 tokens
Avg Senses per Target: 2.02
