# T2D Evaluation

This notebook evaluates the fine-tuned Entity-Linking (Cell Entity Linking, CEA) model, which is fine-tuned by running the `./tasks/cea/fine_tune_EL.sh` script and evaluated with the `./notebooks/evaluate_task.ipynb` notebook.

The data needed for this notebook are:

* T2D `tables_instance_context` folder, which can be downloaded from https://webdatacommons.org/webtables/tables_instance_context.tar.gz
* T2D `entities_instance` folder, which can be downloaded from https://webdatacommons.org/webtables/entities_instance.tar.gz
* The English DBpedia instance types, which can be downloaded from https://databus.dbpedia.org/dbpedia/mappings/instance-types/2019.08.30/instance-types_lang=en.ttl.bz2
* The results pickle obtained from the evaluation notebook. 

In [1]:
import csv
import json
import os
import pickle
import time
import urllib.error
import urllib.parse
import urllib.request
from operator import add
from urllib.parse import unquote

import findspark
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import Row

In [2]:
# Locate the local Spark installation and start the Spark context/session used by the whole notebook.
findspark.init()
conf = pyspark.SparkConf().setAll(
    [
        ("spark.executor.memory", "8g"),
        ("spark.executor.cores", "2"),
        ("spark.executor.instances", "7"),
        ("spark.driver.memory", "150g"),
        ("spark.driver.maxResultSize", "100g"),
        # The JVM does not expand "~" in class-path entries, so resolve the home directory here.
        ("spark.driver.extraClassPath", os.path.expanduser("~/Downloads/sqlite-jdbc-3.36.0.3.jar")),
    ]
)
# getOrCreate keeps the cell re-runnable: constructing a second SparkContext in the same
# kernel raises "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

24/02/22 14:49:41 WARN Utils: Your hostname, chronos-gpu1 resolves to a loopback address: 127.0.1.1; using 10.0.0.113 instead (on interface ens18)
24/02/22 14:49:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/22 14:49:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [12]:
def wikidata_lookup(query, retry: int = 3):
    """Look up candidate Wikidata entities for a mention via ``wbsearchentities``.

    Args:
        query: Mention text to search for (it is URL-quoted before the request).
        retry: Maximum number of attempts. Only HTTP 429/503 answers are retried,
            each after a one-second pause.

    Returns:
        ``[query, response]`` where ``response`` is
        * a list with the ``id`` of every returned search hit (at most 50) on success,
        * the HTTP status code (``int``) if the last attempt ended with an HTTP error,
        * ``None`` if the URL could not be opened or no attempt was made (``retry <= 0``).
    """
    service_url = (
        "https://www.wikidata.org/w/api.php?action=wbsearchentities&search={}&language=en&limit=50&format=json"
    )
    url = service_url.format(urllib.parse.quote(query))
    # Initialised up front so that a non-positive `retry` yields None instead of an
    # UnboundLocalError on the `isinstance` check below.
    response = None
    for _ in range(retry):
        try:
            # The context manager closes the HTTP connection once the body has been read.
            with urllib.request.urlopen(url) as reply:
                response = json.loads(reply.read())
            break
        except urllib.error.HTTPError as e:
            response = e.code
            if e.code == 429 or e.code == 503:
                # Rate limited / temporarily unavailable: wait and try again.
                time.sleep(1)
                continue
            break
        except urllib.error.URLError:
            response = None
            break
    if isinstance(response, dict):
        response = [z.get("id") for z in response.get("search", [])]
    return [query, response]

In [13]:
# The index_enwiki SQLite database (Wikipedia title -> Wikidata id) can be created from an
# enwiki dump with this library: https://github.com/jcklie/wikimapper
wikipedia_wikidata_mapping = (
    spark.read.format("jdbc")
    .options(
        # The path inside a JDBC URL is not shell-expanded, so resolve "~" on the Python side.
        url="jdbc:sqlite:" + os.path.expanduser("~/turl-data/index_enwiki-20190420.db"),
        driver="org.sqlite.JDBC",
        dbtable="mapping",
    )
    .load()
)
wikipedia_wikidata_mapping.show()

+------------+--------------------+-----------+
|wikipedia_id|     wikipedia_title|wikidata_id|
+------------+--------------------+-----------+
|          10| AccessibleComputing|   Q3097841|
|          12|           Anarchism|      Q6199|
|          13|  AfghanistanHistory|    Q188872|
|          14|AfghanistanGeography|   Q1637198|
|          15|   AfghanistanPeople|   Q1075999|
|          18|AfghanistanCommun...|   Q2658920|
|          19|AfghanistanTransp...|    Q509443|
|          20| AfghanistanMilitary|  Q11062919|
|          21|AfghanistanTransn...|   Q4113710|
|          23| AssistiveTechnology|    Q688498|
|          24|        AmoeboidTaxa|    Q506524|
|          25|              Autism|     Q38404|
|          27|      AlbaniaHistory|    Q213833|
|          29|       AlbaniaPeople|    Q583150|
|          30|        AsWeMayThink|    Q610709|
|          35|   AlbaniaGovernment|    Q917351|
|          36|      AlbaniaEconomy|      Q8055|
|          39|              Albedo|    Q

In [None]:
# Build {wikidata_id: [DBpedia type, ...]} by parsing the instance-types triples and joining
# them with the Wikipedia-title -> Wikidata-id mapping.
dbpedia_types = dict(
    spark.createDataFrame(
        # Hadoop paths do not expand "~"; resolve the home directory explicitly.
        sc.textFile(os.path.expanduser("~/turl-data/dbpedia_types/2022_12_01/instance_type_en.ttl"))
        .map(lambda x: x.split())
        # Skip blank lines, "#" comment lines and truncated triples: they would raise
        # IndexError on x[2] or yield meaningless rows.
        .filter(lambda x: len(x) >= 3 and not x[0].startswith("#"))
        .map(
            lambda x: Row(
                # Subject IRI "<http://dbpedia.org/resource/Title>" -> "Title"
                wikipedia_title=unquote(x[0][1:-1]).replace("http://dbpedia.org/resource/", ""),
                # Object IRI "<.../Type>" -> "Type"
                type=x[2][1:-1].split("/")[-1],
            )
        )
    )
    .join(wikipedia_wikidata_mapping, "wikipedia_title", "inner")
    .rdd.map(lambda x: (x["wikidata_id"], [x["type"]]))
    .reduceByKey(add)
    .collect()
)
print(len(dbpedia_types))

In [None]:
# One record per table file: (file name without its 5-character extension, parsed JSON table).
# "~" is not expanded by Hadoop, so the directory is resolved with os.path.expanduser.
t2d_tables = sc.wholeTextFiles(os.path.expanduser("~/turl-data/efthymiou/t2d/tables_instance_context")).map(
    lambda x: (x[0].split("/")[-1][:-5], json.loads(x[1]))
)

In [None]:
t2d_tables.take(1)

In [None]:
# Gold annotations: one Row per annotated cell of every `entities_instance` CSV file,
# joined with the Wikipedia-title -> Wikidata-id mapping.
t2d_entities = spark.createDataFrame(
    # "~" is not expanded by Hadoop, so the directory is resolved with os.path.expanduser.
    sc.wholeTextFiles(os.path.expanduser("~/turl-data/efthymiou/t2d/entities_instance"))
    # (file name without its 4-character extension, parsed CSV rows)
    .map(lambda x: (x[0].split("/")[-1][:-4], list(csv.reader(x[1].split("\n")))))
    .flatMap(
        lambda x: [
            Row(
                table_id=x[0],
                wikipedia_title=y[0].split("/")[-1],
                # The column index is always stored as 0 for these annotations.
                j=0,
                i=int(y[2]),
                mention=y[1].replace("&nbsp;", "").replace("&nbsp", ""),
            )
            for y in x[1]
            # Ignore empty or malformed CSV rows.
            if len(y) == 3
        ]
    )
).join(wikipedia_wikidata_mapping, "wikipedia_title", "inner")

In [None]:
t2d_entities.show()

In [None]:
t2d_entity_mentions = list(set(t2d_entities.rdd.map(lambda x: x["mention"]).collect()))
print(len(t2d_entity_mentions))

In [None]:
t2d_entity_mentions[:10]

In [None]:
wikidata_lookup("Barack Obama")

In [None]:
from tqdm.contrib.concurrent import process_map


# Query the Wikidata search API once per distinct mention, spread over worker processes.
# Each item of the result is the `[mention, response]` pair returned by `wikidata_lookup`.
# NOTE(review): 16 concurrent workers may trigger HTTP 429 answers from the API - confirm
# that the retry budget of `wikidata_lookup` is sufficient.
num_processes = 16
entity_t2d_candidates = process_map(wikidata_lookup, t2d_entity_mentions, max_workers=num_processes)

In [None]:
entity_t2d_candidates[:2]

In [None]:
# Wrap every `[mention, candidates]` pair returned by `wikidata_lookup` in a Spark Row.
entity_t2d_candidates_rows = [
    Row(mention=looked_up[0], candidates=looked_up[1]) for looked_up in entity_t2d_candidates
]

In [None]:
entity_t2d_candidates_df = spark.createDataFrame(entity_t2d_candidates_rows)

In [None]:
t2d_entities_with_candidates = t2d_entities.join(entity_t2d_candidates_df, "mention", "left")

In [None]:
t2d_entities_with_candidates.show()

## Build dataset for testing with TURL EL model

In [None]:
# Preview one joined record: (table_id, ([[i, j, mention, gold_id, candidates], ...], table JSON)).
sample = (
    t2d_entities_with_candidates.select("table_id", "wikidata_id", "candidates", "i", "j", "mention")
    .where(F.col("candidates").isNotNull())
    .rdd.map(
        lambda row: [row["table_id"], row["i"], row["j"], row["mention"], row["wikidata_id"], row["candidates"]]
    )
    # Keep only the mentions whose gold entity is among the retrieved candidates.
    .filter(lambda rec: rec[4] in rec[5])
    .map(lambda rec: (rec[0], [rec[1:]]))
    .reduceByKey(add)
    .join(t2d_tables)
    .take(1)
)

In [None]:
sample[0][1][0]

In [None]:
def build_for_own(x):
    """Convert one joined T2D table into entity-linking samples.

    Args:
        x: ``(table_id, (entities, table))``. Each item of ``entities`` is read as
            ``[row, col, mention, gold_id, candidates]`` and ``table`` is the table
            JSON, from which ``pageTitle``, ``title``, ``headerRowIndex``,
            ``keyColumnIndex`` and ``relation`` (a list of columns) are used.

    Returns:
        A list with one sample per chunk of entities:
        ``[table_id, pgTitle, secTitle, caption, headers, entities,
        candidate_entities, labels, cand_for_each]``.

    NOTE(review): every candidate is indexed as ``cand[0]`` / ``cand[1]`` / ``cand[2]``
    (presumably id, name, description), whereas ``wikidata_lookup`` in this notebook
    returns bare id strings - confirm the candidate format before running this cell.
    The function also reads the module-level ``dbpedia_types`` dictionary.
    """
    all_processed = []
    table_id = x[0]
    pgTitle = x[1][1]["pageTitle"]
    secTitle = ""
    caption = x[1][1]["title"]
    header_i = x[1][1]["headerRowIndex"]
    subject_j = x[1][1]["keyColumnIndex"]
    # Headers of the subject column and of every column after it.
    headers = [column[header_i] for column in x[1][1]["relation"][subject_j:]]
    all_entities = x[1][0]
    total_num = len(all_entities)
    # Chunk size: the entities are divided over int(total_num / 25) chunks (at least one);
    # the "+ 1" rounds the chunk size up.
    chunck_num = int(total_num / max([1, int(total_num / 25)])) + 1
    while len(all_entities) > 0:
        entities = []
        # candidate key -> [index in the merged candidate list, cand[1], cand[2], DBpedia types]
        candidate_entities = {}
        labels = []
        cand_for_each = []
        for e in all_entities[:chunck_num]:
            row_i = e[0]
            e_mention = e[2]
            # The annotated cell is always emitted with column index 0.
            entities.append([[row_i, 0], e_mention])
            for cand in e[4]:
                if cand[0] not in candidate_entities:
                    candidate_entities[cand[0]] = [
                        len(candidate_entities),
                        cand[1],
                        cand[2],
                        dbpedia_types.get(cand[0], []),
                    ]
            # Label = position of the gold entity in the merged candidate list.
            labels.append(candidate_entities[e[3]][0])
            cand_for_each.append([candidate_entities[cand[0]][0] for cand in e[4]])
            # Add the cells of up to two columns right of the subject column as extra
            # mentions; they get label 0 and no candidates.
            for p, column in enumerate(x[1][1]["relation"][subject_j + 1 : subject_j + 3]):
                if len(column) > row_i:
                    e_mention = column[row_i].replace("&nbsp;", "").replace("&nbsp", "")
                    entities.append([[row_i, p + 1], e_mention])
                    labels.append(0)
                    cand_for_each.append([])
                # Disabled earlier variant that processed the first 50 entities at once:
                # entities = [[[z[0],0],z[2]] for z in all_entities[:50]]
                # candidate_entities = {}
                # for z in all_entities[:50]:
                #     for cand in z[4]:
                #         if cand[0] not in candidate_entities:
                #             candidate_entities[cand[0]] = [len(candidate_entities),cand[1],cand[2],dbpedia_types.get(cand[0],[])]
                # labels = [candidate_entities[z[3]][0]  for z in all_entities[:50]]
                # cand_for_each = [[candidate_entities[cand[0]][0] for cand in z[4]] for z in all_entities[:50]]
        # Turn the dictionary into a list ordered by candidate index, dropping the index itself.
        tmp_candidate_entities = [0] * len(candidate_entities)
        for k, v in candidate_entities.items():
            tmp_candidate_entities[v[0]] = v[1:]
        all_processed.append(
            [table_id, pgTitle, secTitle, caption, headers, entities, tmp_candidate_entities, labels, cand_for_each]
        )
        # Continue with the next chunk.
        all_entities = all_entities[chunck_num:]
    return all_processed

In [None]:
# Build the complete entity-linking dataset: group the linkable mentions per table, attach
# the table JSON and expand every table into its samples with `build_for_own`.
t2d_local = (
    t2d_entities_with_candidates.select("table_id", "wikidata_id", "candidates", "i", "j", "mention")
    .where(F.col("candidates").isNotNull())
    .rdd.map(
        lambda row: [row["table_id"], row["i"], row["j"], row["mention"], row["wikidata_id"], row["candidates"]]
    )
    # Keep only the mentions whose gold entity is among the retrieved candidates.
    .filter(lambda rec: rec[4] in rec[5])
    .map(lambda rec: (rec[0], [rec[1:]]))
    .reduceByKey(add)
    .join(t2d_tables)
    .flatMap(build_for_own)
    .collect()
)

In [None]:
t2d_local

In [None]:
# Persist the generated samples as JSON.
# NOTE(review): hard-coded absolute path of a specific workstation - adjust before running.
data_dir = "/srv/samba/group_workspace_1/deng.595/workspace/table_transformer/data/wikitable_entity/v2/"
with open(f"{data_dir}t2d.table_entity_linking.json", "w") as f:
    json.dump(t2d_local, f)

## Open an already created T2D dataset (this comes from the authors' data)

In [2]:
def get_labels_and_candidate(tables):
    """Flatten one dataset sample into per-mention gold/candidate records.

    Mentions whose candidate list (``tables[8][i]``) is empty are left out.

    Returns:
        A list of ``((table_id, row, col), [label, candidate indexes, candidate entities])``
        where the candidate entities are the merged list of the whole sample (``tables[6]``).
    """
    return [
        ((tables[0], cell[0][0], cell[0][1]), [tables[7][pos], tables[8][pos], tables[6]])
        for pos, cell in enumerate(tables[5])
        if len(tables[8][pos]) != 0
    ]

In [5]:
# Load the pre-built dataset. Python's open() does not expand "~", so the home
# directory is resolved explicitly.
with open(os.path.expanduser("~/turl-data/round1_t2d.table_entity_linking.json"), "rb") as f:
    t2d_local = json.load(f)

In [6]:
t2d_local[0][6][19]

['2001: A Space Odyssey – Music from the Motion Picture Sound Track',
 '1968 compilation soundtrack album; various artists',
 ['Album']]

## Example of a single table contained in `t2d_local`

* **Table ID**: '23235546-1'
* **Page title**: 'Ivan Lendl career statistics'
* **Section title**: 'Singles: 19 finals (8 titles, 11 runner-ups)'
* **Caption**: ''
* **Headers**: ['outcome', 'year', ...]
* **Entity mentions**: [[[0, 4], 'Björn Borg'], [[9, 2], 'Wimbledon'], ...], with [[`row`, `col`], entity mention text]. `row` and `col` both start from 0
* **Candidate entities**: [['Björn Borg', 'Swedish tennis player', []], ['Björn Borg', 'Swedish swimmer', ['Swimmer']], ...], this is the merged set for all cells. [entity name, entity description, entity types]
* **Labels**: [0, 12, ...], this is the index of the gold entity in the candidate entities
* **Candidate indexes**: [[0, 1, ...], [11, 12, 13, ...], ...], candidates for each mention 

In [7]:
def classify(x):
    """Return 1 if the gold ``wikidata_id`` is the top-ranked lookup candidate, else 0."""
    top_candidates = x["candidates"][:1]
    return int(x["wikidata_id"] in top_candidates)

In [None]:
t2d_entities_with_atleast_one_candidate = t2d_entities_with_candidates.where(F.size("candidates") >= 1)

In [None]:
t2d_all_predicted = t2d_entities_with_atleast_one_candidate.count()

In [None]:
# True positives of the plain lookup baseline: the gold entity is the first candidate.
t2d_TP = t2d_entities_with_atleast_one_candidate.rdd.map(classify).sum()

In [None]:
t2d_P = t2d_entities_with_candidates.count()

In [None]:
# Upper bound for any re-ranker: the gold entity appears anywhere in the candidate list.
t2d_best_TP = t2d_entities_with_atleast_one_candidate.rdd.map(
    lambda row: int(row["wikidata_id"] in row["candidates"])
).sum()

In [None]:
t2d_best_TP, t2d_all_predicted, t2d_TP

In [None]:
# Precision / recall / F1 of the lookup baseline (top-1 candidate).
precision = t2d_TP / t2d_all_predicted
recall = t2d_TP / t2d_P
# With zero true positives precision + recall is 0; report F1 = 0 instead of raising
# ZeroDivisionError.
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
print(f1, precision, recall)

In [None]:
# Precision / recall / F1 upper bound (gold entity anywhere among the candidates).
precision = t2d_best_TP / t2d_all_predicted
recall = t2d_best_TP / t2d_P
# With zero true positives precision + recall is 0; report F1 = 0 instead of raising
# ZeroDivisionError.
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
print(f1, precision, recall)

In [25]:
# Load the model predictions produced by the evaluation notebook.
# SECURITY: pickle.load can execute arbitrary code - only open result files you generated yourself.
# Python's open() does not expand "~", so the home directory is resolved explicitly.
with open(
    os.path.expanduser(
        "~/projects/TURL/output/logs/turl/fine-tuning-el/2024-02-14_11-01-08/version_0/test/round1_t2d_entity_linking_results_dedup.pkl"
    ),
    "rb",
) as f:
    test_results = pickle.load(f)

In [26]:
def get_tp(result, score_ratio: float = 0.8):
    """Return 1 if the final entity choice for one mention equals the gold label, else 0.

    Args:
        result: ``(key, (gold, prediction))`` where ``gold`` is
            ``[label index, candidate indexes of the mention, candidate entities]`` and
            ``prediction`` is ``(sorted predicted indexes, sorted predicted scores)``.
        score_ratio: The model's choice replaces the lookup's first candidate only when
            ``model score * score_ratio`` exceeds the score of that first candidate.

    The lookup choice is the first candidate of the mention; the model choice is the
    first predicted index that belongs to the mention's candidates. When no predicted
    index belongs to the candidates, the lookup choice is used (this case previously
    raised ``IndexError``).
    """
    payload = result[1]
    gold_label = payload[0][0]
    candidate_span = payload[0][1]
    ranked_indexes = payload[1][0]
    ranked_scores = payload[1][1]

    lookup_choice = candidate_span[0]
    lookup_score = 0  # stays 0 when the lookup choice is absent from the predictions

    # First predicted index that is a candidate of this mention, if any.
    model_choice = None
    model_score = None
    for idx, score in zip(ranked_indexes, ranked_scores):
        if idx in candidate_span:
            model_choice, model_score = idx, score
            break

    # Score the model assigned to the lookup's first candidate.
    for idx, score in zip(ranked_indexes, ranked_scores):
        if idx == lookup_choice:
            lookup_score = score
            break

    if model_choice is None:
        final = lookup_choice
    elif model_choice == lookup_choice or (model_score * score_ratio) > lookup_score:
        final = model_choice
    else:
        final = lookup_choice

    if final == gold_label:
        return 1
    else:
        return 0

In [27]:
# One record per predicted cell: ((table_id, row, col), (ranked indexes, ranked scores)).
test_results_sample = sc.parallelize(test_results).flatMap(
    lambda res: [((res[0], cell[0], cell[1]), (res[2][pos], res[3][pos])) for pos, cell in enumerate(res[1])]
)

In [29]:
# Join gold labels/candidates with the model predictions on (table_id, row, col) and
# fetch a single record for inspection.
sample_result = (
    sc.parallelize(t2d_local)
    .flatMap(get_labels_and_candidate)
    .join(
        sc.parallelize(test_results).flatMap(
            lambda res: [
                ((res[0], cell[0], cell[1]), (res[2][pos], res[3][pos])) for pos, cell in enumerate(res[1])
            ]
        )
    )
    .take(1)
)

24/02/20 09:47:12 WARN TaskSetManager: Stage 3 contains a task of very large size (6045 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [30]:
sample_result = sample_result[0]

In [31]:
sample_result[0]  # (table_id, entity row, entity col)

('50245608_0_871275842592178099', 146, 0)

In [32]:
result = sample_result[1]

In [33]:
result[1][1][0]

20.76685333251953

In [34]:
print("lookup:", [result[0][1][0], 0])

lookup: [1867, 0]


In [35]:
get_tp(sample_result)

1

In [36]:
# Number of mentions for which the final choice made by `get_tp` equals the gold entity.
our_tp = (
    sc.parallelize(t2d_local)
    .flatMap(get_labels_and_candidate)
    .join(
        sc.parallelize(test_results).flatMap(
            lambda res: [
                ((res[0], cell[0], cell[1]), (res[2][pos], res[3][pos])) for pos, cell in enumerate(res[1])
            ]
        )
    )
    .map(get_tp)
    .sum()
)

24/02/20 09:47:54 WARN TaskSetManager: Stage 5 contains a task of very large size (6045 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [37]:
our_tp

6261

In [None]:
# Precision / recall / F1 of the fine-tuned model combined with the lookup fallback.
precision = our_tp / t2d_all_predicted
recall = our_tp / t2d_P
# With zero true positives precision + recall is 0; report F1 = 0 instead of raising
# ZeroDivisionError.
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
print(f1, precision, recall)

In [None]:
def get_tp(result, score_ratio: float = 0.8):
    """Return ``(is_correct, chosen entity)`` for one mention.

    NOTE: this definition shadows the earlier ``get_tp`` of this notebook, which returned
    only the 0/1 flag.

    Args:
        result: ``(key, (gold, prediction))`` where ``gold`` is
            ``[label index, candidate indexes of the mention, candidate entities]`` and
            ``prediction`` is ``(sorted predicted indexes, sorted predicted scores)``.
        score_ratio: The model's choice replaces the lookup's first candidate only when
            ``model score * score_ratio`` exceeds the score of that first candidate.

    Returns:
        ``(1, entity)`` if the chosen index equals the gold label, else ``(0, entity)``,
        where ``entity`` is the chosen entry of the candidate-entity list. When no
        predicted index belongs to the mention's candidates, the lookup's first candidate
        is chosen (this case previously raised ``IndexError``).
    """
    payload = result[1]
    gold_label = payload[0][0]
    candidate_span = payload[0][1]
    candidate_entities = payload[0][2]
    ranked_indexes = payload[1][0]
    ranked_scores = payload[1][1]

    lookup_choice = candidate_span[0]
    lookup_score = 0  # stays 0 when the lookup choice is absent from the predictions

    # First predicted index that is a candidate of this mention, if any.
    model_choice = None
    model_score = None
    for idx, score in zip(ranked_indexes, ranked_scores):
        if idx in candidate_span:
            model_choice, model_score = idx, score
            break

    # Score the model assigned to the lookup's first candidate.
    for idx, score in zip(ranked_indexes, ranked_scores):
        if idx == lookup_choice:
            lookup_score = score
            break

    if model_choice is None:
        final = lookup_choice
    elif model_choice == lookup_choice or (model_score * score_ratio) > lookup_score:
        final = model_choice
    else:
        final = lookup_choice

    if final == gold_label:
        return (1, candidate_entities[final])
    else:
        return (0, candidate_entities[final])

In [None]:
# Fetch one joined (gold, prediction) record to inspect its candidate-entity list.
sample = (
    sc.parallelize(t2d_local)
    .flatMap(get_labels_and_candidate)
    .join(
        sc.parallelize(test_results).flatMap(
            lambda res: [
                ((res[0], cell[0], cell[1]), (res[2][pos], res[3][pos])) for pos, cell in enumerate(res[1])
            ]
        )
    )
    .take(1)
)

In [None]:
sample[0][1][0][2]

In [None]:
# Per-mention outcome of the model: ((table_id, row, col), get_tp(...)).
our_results = (
    sc.parallelize(t2d_local)
    .flatMap(get_labels_and_candidate)
    .join(
        sc.parallelize(test_results).flatMap(
            lambda res: [
                ((res[0], cell[0], cell[1]), (res[2][pos], res[3][pos])) for pos, cell in enumerate(res[1])
            ]
        )
    )
    .map(lambda joined: (joined[0], get_tp(joined)))
)

In [None]:
# Per-mention outcome of the plain lookup baseline:
# ((table_id, row, col), (mention, candidates, 1 if the gold entity is the first candidate else 0)).
lookup_results = t2d_entities_with_candidates.where(F.size("candidates") >= 1).rdd.map(
    lambda row: (
        (row["table_id"], row["i"], row["j"]),
        (row["mention"], row["candidates"], int(row["wikidata_id"] in row["candidates"][:1])),
    )
)

In [None]:
all_results = our_results.join(lookup_results)

In [None]:
all_results.take(1)[0][1][1][-1]

In [None]:
# Mentions the lookup baseline got right but the model got wrong.
# x[1][0] is the (is_correct, entity) tuple returned by get_tp, so the flag is x[1][0][0];
# comparing the tuple itself with 0 never matched and always produced an empty list.
errors = all_results.filter(lambda x: x[1][0][0] == 0 and x[1][1][-1] == 1).collect()

In [None]:
# Mentions the model got right but the lookup baseline got wrong.
# x[1][0] is the (is_correct, entity) tuple returned by get_tp, so the flag is x[1][0][0];
# comparing the tuple itself with 1 never matched and always produced an empty list.
correct = all_results.filter(lambda x: x[1][0][0] == 1 and x[1][1][-1] == 0).collect()

In [None]:
print(len(errors))
print(len(correct))

In [None]:
correct[0]

In [None]:
errors[60]

In [None]:
len(set([x[0][0] for x in errors]))

In [None]:
len(set([x[0][0] for x in correct]))

In [None]:
set([x[0][0] for x in errors])

In [None]:
# `t2d_tables_local` is otherwise only created in a later cell, so on a fresh kernel
# (Restart & Run All) this cell raised NameError; materialise the table lookup here.
t2d_tables_local = dict(t2d_tables.collect())
# Page title of every table that contains at least one model error.
[[table_id, t2d_tables_local[table_id]["pageTitle"]] for table_id in {err[0][0] for err in errors}]

In [None]:
[x for x in errors if x[0][0] == "41194422_0_7231546114369966811"]

In [None]:
t2d_tables_local = dict(t2d_tables.collect())

In [None]:
t2d_tables_local["71137051_0_8039724067857124984"]