## Note

This notebook analyzes the CEA (Cell-Entity Annotation, i.e. entity linking) dataset created by the TURL authors.

In [44]:
from __future__ import annotations

import ast
import json
import os
import pickle
import unicodedata
import urllib.parse
import urllib.request
from multiprocessing import Pool
from typing import List

import numpy as np
import pandas as pd
import requests

In [45]:
# Location of the entity-linking JSON file; "~" is expanded to the
# current user's home directory.
dataset_path = os.path.expanduser("~/turl-data/t2d.table_entity_linking.json")

In [46]:
# Read the dataset explicitly as UTF-8. JSON text is UTF-8 encoded, and
# without an `encoding` argument open() falls back to the platform's locale
# encoding, which can fail on (or mis-decode) non-ASCII content on systems
# whose default encoding is not UTF-8.
with open(dataset_path, "r", encoding="utf-8") as f:
    tables = json.load(f)

In [47]:
# Number of top-level records (tables) in the loaded JSON.
len(tables)

809

In [49]:
# Unpack the first record to inspect its fields. The unpacking requires the
# record to have exactly nine elements; the last one is not used here.
(
    table_id,
    pgTitle,
    secTitle,
    caption,
    headers,
    entities,
    candidate_entities,
    labels,
    _,
) = tables[0]

In [50]:
# Display the `entities` field of the first table. Judging by the output,
# each item is [[i, j], text]; the [i, j] pair is presumably the cell's
# (row, column) position -- TODO confirm against the dataset documentation.
entities

[[[3, 0], 'australia'],
 [[3, 1], 'AUD'],
 [[3, 2], 'Australian Dollars'],
 [[24, 0], 'israel'],
 [[24, 1], 'ILS'],
 [[24, 2], 'Israeli New Shekels'],
 [[52, 0], 'zambia'],
 [[52, 1], 'ZMK'],
 [[52, 2], 'Zambian Kwacha'],
 [[6, 0], 'bermuda'],
 [[6, 1], 'BMD'],
 [[6, 2], 'Bermudan Dollars'],
 [[36, 0], 'poland'],
 [[36, 1], 'PLZ'],
 [[36, 2], 'Polish Zloty'],
 [[43, 0], 'south korea'],
 [[43, 1], 'KRW'],
 [[43, 2], 'South Korean Won'],
 [[17, 0], 'fiji'],
 [[17, 1], 'FJD'],
 [[17, 2], 'Fiji Dollars'],
 [[13, 0], 'czech republic'],
 [[13, 1], 'CZK'],
 [[13, 2], 'Czech Koruna'],
 [[29, 0], 'lebanon'],
 [[29, 1], 'LBP'],
 [[29, 2], 'Lebanese Pounds'],
 [[26, 0], 'japan'],
 [[26, 1], 'JPY'],
 [[26, 2], 'Japanese Yen'],
 [[32, 0], 'new zealand'],
 [[32, 1], 'NZD'],
 [[32, 2], 'New Zealand Dollars'],
 [[46, 0], 'switzerland'],
 [[46, 1], 'CHF'],
 [[46, 2], 'Swiss Francs'],
 [[48, 0], 'thailand'],
 [[48, 1], 'THB'],
 [[48, 2], 'Thai Baht'],
 [[22, 0], 'india'],
 [[22, 1], 'INR'],
 [[22, 2], '

In [51]:
# Display the `candidate_entities` field of the first table. Judging by the
# output, each item is [name, description, [type, ...]] -- the exact schema
# should be confirmed against the dataset documentation.
candidate_entities

[['Australia', 'island country in the Southern hemisphere', ['Country']],
 ['Australia', "continent on the Earth's Southern Hemisphere", ['Continent']],
 ['Australia', 'Wikimedia disambiguation page', []],
 ['Australia', '2008 film by Baz Luhrmann', []],
 ['8088 Australia', 'main-belt asteroid', []],
 ['Australian rules football', 'sport', ['Sport']],
 ['National Library of Australia',
  'national reference library in Canberra, Australia',
  ['Library']],
 ['Australia', 'Racehorse', ['RaceHorse']],
 ['Australia', '1989 film by Jean-Jacques Andrien', ['Film']],
 ['Australia', 'Manic Street Preachers song', ['Song']],
 ['Australian Capital Territory',
  'federal territory of Australia, containing the capital city, Canberra',
  ['AdministrativeRegion']],
 ['Australia', 'board game', ['Game']],
 ['Australian National University',
  'national research university in Canberra, Australian Capital Territory, Australia',
  []],
 ['Australia', 'The Shins song', ['Song']],
 ['Australia', 'fourth s

In [53]:
# Gather, for every table, the length of its `entities` list and the length
# of its `candidate_entities` list.
# NOTE: the loop rebinds the notebook-level names `table`, `entities` and
# `candidate_entities`; after this cell they refer to the LAST table in
# `tables`, not to the first table inspected earlier.
table_sizes = []
for table in tables:
    _, _, _, _, _, entities, candidate_entities, _, _ = table
    table_sizes.append((len(entities), len(candidate_entities)))

# Split the (entity count, candidate count) pairs into two parallel lists.
num_entities = [n_ent for n_ent, _n_cand in table_sizes]
num_candidates = [n_cand for _n_ent, n_cand in table_sizes]

In [54]:
# Sanity check: both lists should contain one entry per table.
len(num_entities), len(num_candidates)

(809, 809)

In [55]:
# Mean and standard deviation of the per-table entity counts.
# np.std uses ddof=0 by default, i.e. the population standard deviation.
np.mean(num_entities), np.std(num_entities)

(76.81087762669964, 18.49117199173145)

In [56]:
# Mean and standard deviation of the per-table candidate counts.
# np.std uses ddof=0 by default, i.e. the population standard deviation.
np.mean(num_candidates), np.std(num_candidates)

(552.9060568603214, 500.2470871065791)

In [57]:
# Largest entity count and largest candidate count over all tables.
# The two maxima are computed independently and may come from different tables.
np.max(num_entities), np.max(num_candidates)

(147, 2146)

In [58]:
# Position of the table with the most candidates (np.argmax returns the first
# such position if several tables tie for the maximum).
idx_max_num_candidates = np.argmax(num_candidates)
# Entity count of that same table, for comparison with its candidate count.
num_entities[idx_max_num_candidates]

132

In [60]:
# Column labels for the nine positional fields of each table record. The last
# field is not used in this notebook and keeps an empty-string label.
table_columns = [
    "table_id",
    "pgTitle",
    "secTitle",
    "caption",
    "headers",
    "entities",
    "candidate_entities",
    "labels",
    "",
]
# One row per table, one column per record field.
tables_df = pd.DataFrame(data=tables, columns=table_columns)

In [61]:
# Number of distinct page titles that are shared by more than one table.
tables_df["pgTitle"].value_counts().gt(1).sum()

124