In [None]:
from gensim.models import Word2Vec
import json
import numpy as np

In [None]:
import findspark

# findspark.init() is called before the pyspark imports below; keep this ordering
# (presumably it is what makes the local Spark installation importable — verify for your setup).
findspark.init()
from pyspark import SparkContext
import pyspark

# Resource settings for the Spark job; tune these to the machine/cluster at hand.
conf = pyspark.SparkConf().setAll(
    [
        ("spark.executor.memory", "8g"),
        ("spark.executor.cores", "2"),
        ("spark.executor.instances", "7"),
        ("spark.driver.memory", "32g"),
        ("spark.driver.maxResultSize", "10g"),
    ]
)
# getOrCreate() reuses a context that is already running in this kernel, so re-running
# this cell no longer fails with "Cannot run multiple SparkContexts at once".
# Note: when an existing context is reused, the settings above are not re-applied;
# restart the kernel to change them.
sc = SparkContext.getOrCreate(conf=conf)

In [None]:
def convert_ndarray_back(x):
    """Turn the ``"entityCell"`` entry of a table record into a numpy array.

    The record is modified in place and the same object is returned, which makes
    the function convenient to use inside a ``map`` call.

    Args:
        x: Mapping with an ``"entityCell"`` entry that ``np.array`` can convert
            (e.g. the nested lists produced by ``json.loads``).
    Returns:
        The same mapping ``x`` with ``x["entityCell"]`` replaced by an ndarray.
    """
    cell_matrix = np.array(x["entityCell"])
    x["entityCell"] = cell_matrix
    return x


data_dir = "/srv/samba/group_workspace_1/deng.595/workspace/table_transformer/data/wikitable_entity/"


def _parse_table_line(line):
    """Decode one JSONL line into a table record with an ndarray ``entityCell``."""
    return convert_ndarray_back(json.loads(line.strip()))


# Both splits are lazy RDDs: one JSON document per line of the corresponding file.
train_tables = sc.textFile(data_dir + "train_tables.jsonl").map(_parse_table_line)
dev_tables = sc.textFile(data_dir + "dev_tables.jsonl").map(_parse_table_line)

In [None]:
def collect_core_entities_simple(x):
    """Return the ids of the linked entities found in column 0 of a table.

    Every nonzero position of ``x["entityCell"]`` is visited; a position is kept
    only when its column index is 0 and that index is listed in
    ``x["entityColumn"]``. For each kept cell the id of the first surface link's
    target is read from ``x["tableData"]`` and converted to ``str``.

    Args:
        x: Table record with ``"entityCell"`` (2-D ndarray), ``"entityColumn"``
            and ``"tableData"`` entries.
    Returns:
        List of entity ids (as strings) in the iteration order of ``nonzero()``.
    """
    return [
        str(x["tableData"][row][col]["surfaceLinks"][0]["target"]["id"])
        for row, col in zip(*x["entityCell"].nonzero())
        if col == 0 and col in x["entityColumn"]
    ]

In [None]:
# One "sentence" per table: its column-0 entity ids. Tables with fewer than two
# such entities are dropped before the results are pulled to the driver.
train_core_entities = (
    train_tables.map(collect_core_entities_simple)
    .filter(lambda entities: len(entities) >= 2)
    .collect()
)
dev_core_entities = (
    dev_tables.map(collect_core_entities_simple)
    .filter(lambda entities: len(entities) >= 2)
    .collect()
)

In [None]:
# Untrained Word2Vec model for entity ids (gensim 3.x keyword names).
model = Word2Vec(
    # Embedding size and context window.
    size=312,
    window=50,
    # Objective: skip-gram (sg=1) with negative sampling (hs=0, negative > 0).
    sg=1,
    hs=0,
    negative=25,
    ns_exponent=0.75,
    # Learning-rate schedule: decays from alpha to min_alpha.
    alpha=0.025,
    min_alpha=0.0005,
    # Vocabulary handling: keep every token, no frequent-token downsampling.
    min_count=1,
    max_vocab_size=None,
    sample=0,
    trim_rule=None,
    sorted_vocab=1,
    null_word=0,
    # Execution.
    seed=1,
    workers=4,
    batch_words=100000,
)

In [None]:
# Build the vocabulary from the training-split entity lists only; tokens that
# appear only in other data will not be in model.wv.vocab.
model.build_vocab(train_core_entities)

In [None]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code — only load this file if
# it comes from a trusted source.
with open("../../data/dev_result.pkl", "rb") as f:
    dev_result = pickle.load(f)


def _to_dev_example(item):
    """Build one ``[seed, targets, candidates]`` example, all values as strings.

    Positions 0, 1 and 7 of ``item`` are used; ``item[0].pop()`` removes the
    element it returns from the loaded object.
    """
    seed = str(item[0].pop())
    targets = {str(z) for z in item[1]}
    candidates = [str(z) for z in item[7]]
    return [seed, targets, candidates]


dev_dataset = [_to_dev_example(item) for item in dev_result.values()]

In [None]:
# Sanity check: number of evaluation examples built from dev_result.
len(dev_dataset)

In [None]:
def precision_at_k(r, k):
    """Precision over the first ``k`` ranked items.

    Relevance is treated as binary: any nonzero score counts as relevant.

    >>> float(precision_at_k([0, 0, 1], 1))
    0.0
    >>> float(precision_at_k([0, 0, 1], 2))
    0.0
    >>> round(float(precision_at_k([0, 0, 1], 3)), 4)
    0.3333

    Args:
        r: Relevance scores (list or numpy array) in rank order, best rank first.
        k: Cut-off rank; must be at least 1.
    Returns:
        Fraction of the first ``k`` items that are relevant.
    Raises:
        ValueError: if ``r`` holds fewer than ``k`` scores.
    """
    assert k >= 1
    top_k_relevant = np.not_equal(np.asarray(r)[:k], 0)
    # Slicing silently truncates, so a short input shows up as a size mismatch.
    if top_k_relevant.size != k:
        raise ValueError("Relevance score length < k")
    return top_k_relevant.mean()


def average_precision(r):
    """Average precision of a ranked list with binary relevance.

    Precision@k is computed at every rank ``k`` that holds a relevant (nonzero)
    item, and those values are averaged. For example,
    ``[1, 1, 0, 1, 0, 1, 0, 0, 0, 1]`` gives approximately ``0.7833``.

    Args:
        r: Relevance scores (list or numpy array) in rank order, best rank first.
    Returns:
        The mean of precision@k over the relevant ranks, or ``0.0`` when no
        item is relevant.
    """
    relevant = np.asarray(r) != 0
    precisions = []
    for rank in range(1, relevant.size + 1):
        if relevant[rank - 1]:
            precisions.append(precision_at_k(relevant, rank))
    if len(precisions) == 0:
        return 0.0
    return np.mean(precisions)

In [None]:
from gensim.models.callbacks import CallbackAny2Vec

In [None]:
class EpochLogger(CallbackAny2Vec):
    """Training callback that prints dev-set mean average precision per epoch.

    Each entry of ``dev_dataset`` is unpacked as ``(seed, target, cand)``:
    ``seed`` is the query token, ``target`` the collection of relevant tokens
    and ``cand`` the sequence of candidate tokens to rank.
    """

    def __init__(self, dev_dataset):
        """Store the evaluation examples and start the epoch counter at 0."""
        self.epoch = 0
        self.dev_dataset = dev_dataset

    def on_epoch_begin(self, model):
        """Announce the start of the current epoch."""
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        """Rank every example's candidates by distance to its seed and print MAP.

        An example whose seed is not in the vocabulary contributes an AP of 0.
        """
        print("Epoch #{} end".format(self.epoch))
        maps = []
        for seed, target, cand in self.dev_dataset:
            if seed not in model.wv.vocab:
                maps.append(0)
            else:
                # Candidates are sorted by ascending distance (closest first), so
                # out-of-vocabulary candidates must default to +inf to end up
                # last. The previous -100.0 placeholder ranked them first.
                scores = np.full(len(cand), np.inf)
                for i, e in enumerate(cand):
                    if e in model.wv.vocab:
                        scores[i] = model.wv.distance(seed, e)
                # Stable sort keeps tied (e.g. all-unknown) candidates in input order.
                sorted_scores = scores.argsort(kind="stable")
                sorted_labels = [1 if cand[i] in target else 0 for i in sorted_scores]
                ap = average_precision(sorted_labels)
                maps.append(ap)
        print("map@dev", np.mean(maps))
        self.epoch += 1

In [None]:
epoch_logger = EpochLogger(dev_dataset)
# Train on the same training-split corpus the vocabulary was built from. The dev
# split is only used by the callback for evaluation; training on
# dev_core_entities (as before) leaks the evaluation data into the model.
model.train(
    sentences=train_core_entities,
    total_examples=len(train_core_entities),
    epochs=100,
    callbacks=[epoch_logger],
)

# Header embeddings (Word2Vec over table headers)

In [None]:
# One "sentence" per table: its processed header list, pulled to the driver.
train_headers = train_tables.map(lambda table: table["processed_tableHeaders"]).collect()
dev_headers = dev_tables.map(lambda table: table["processed_tableHeaders"]).collect()

In [None]:
# Fresh, untrained Word2Vec model for table headers (gensim 3.x keyword names).
# Note: this rebinds `model`, replacing the entity model created earlier.
model = Word2Vec(
    # Embedding size and context window.
    size=312,
    window=50,
    # Objective: skip-gram (sg=1) with negative sampling (hs=0, negative > 0).
    sg=1,
    hs=0,
    negative=25,
    ns_exponent=0.75,
    # Learning-rate schedule: decays from alpha to min_alpha.
    alpha=0.025,
    min_alpha=0.0005,
    # Vocabulary handling: keep every token, no frequent-token downsampling.
    min_count=1,
    max_vocab_size=None,
    sample=0,
    trim_rule=None,
    sorted_vocab=1,
    null_word=0,
    # Execution.
    seed=1,
    workers=4,
    batch_words=100000,
)

In [None]:
# Build the vocabulary from the training-split header lists only.
model.build_vocab(train_headers)

In [None]:
class HeaderEpochLogger(CallbackAny2Vec):
    """Training callback that prints a header-retrieval MAP on the dev set per epoch.

    ``dev_dataset`` is a sequence of header lists (one list per table). For each
    list the first header is the query and the remaining headers of the same
    table are the relevant items.
    """

    def __init__(self, dev_dataset):
        """Store the evaluation header lists and start the epoch counter at 0."""
        self.epoch = 0
        self.dev_dataset = dev_dataset

    def on_epoch_begin(self, model):
        """Announce the start of the current epoch."""
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        """Score the seed header's nearest neighbours against its co-occurring headers.

        A table whose seed header is not in the vocabulary contributes an AP of 0.
        """
        print("Epoch #{} end".format(self.epoch))
        maps = []
        for headers in self.dev_dataset:
            # A seed plus at least one other header is needed; this also skips
            # empty header lists, which would otherwise fail on headers[0].
            if len(headers) < 2:
                continue
            seed = headers[0]
            # Relevant items are the *other* headers of the table. The previous
            # headers[:1] made the seed its own (only) target, and since
            # most_similar never returns the query itself every AP was 0.
            target = set(headers[1:])
            if seed not in model.wv.vocab:
                maps.append(0)
            else:
                cand = model.wv.most_similar(seed, topn=1000)
                sorted_labels = [1 if z[0] in target else 0 for z in cand]
                ap = average_precision(sorted_labels)
                maps.append(ap)
        print("map@dev", np.mean(maps))
        self.epoch += 1

In [None]:
# Train the header model on the training split; the callback evaluates on the
# dev-split headers after each epoch.
epoch_logger = HeaderEpochLogger(dev_headers)
model.train(
    sentences=train_headers,
    total_examples=len(train_headers),
    epochs=10,
    callbacks=[epoch_logger],
)

In [None]:
from gensim.models import KeyedVectors

# Persist only the trained header vectors (model.wv), written through an open
# binary file handle rather than a path.
with open("../../data/header_vectors.kv", "wb") as vectors_file:
    model.wv.save(vectors_file)