In [31]:
from nltk.corpus import wordnet as wn
import csv, random, time
from collections import deque
import numpy as np
from enum import Enum
from nltk.corpus.reader.wordnet import Synset,Lemma

In [64]:
class PropertyType(Enum):
    """Kind of WordNet object a CONFIG relation is extracted from."""
    # Relations whose extractor is applied to a Lemma (build_wordnet_triplets
    # passes each lemma of a synset together with LEX).
    LEX = 'lexical'
    # Relations whose extractor is applied to a Synset (build_wordnet_triplets
    # passes each synset together with SEM).
    SEM = 'semantic'


# Relation table driving triplet extraction. Each active entry is a 4-tuple:
#   (relation name, symbol stored in the triplets, extractor, PropertyType)
# The extractor is either a single callable or a list/tuple of callables; each
# one is called with the element (a Lemma for LEX, a Synset for SEM) and must
# return an iterable of objects exposing `name()`.
# Entries that are commented out are disabled: they produce no triplets and
# get no relation id.
CONFIG = [
    # Synonyms: the other lemmas of the same synset; the lemma itself is
    # excluded by identity (`is not`).
    ('synonym', 'syn', lambda elem: [l for l in elem.synset().lemmas() if elem is not l], PropertyType.LEX),
    ('antonym', 'ant', Lemma.antonyms, PropertyType.LEX),
#     ('hypernym', 'hyper', Synset.hypernyms, PropertyType.SEM),
    #('hyponym', 'hypo', Synset.hyponyms, PropertyType.SEM),
#     ('holonym', 'holo', [Synset.member_holonyms, Synset.substance_holonyms, Synset.part_holonyms], PropertyType.SEM),
    #('meronym', 'mero', [Synset.member_meronyms, Synset.substance_meronyms, Synset.part_meronyms], PropertyType.SEM),
    #('causes', 'causes', Synset.causes, PropertyType.SEM),
    #('entailment', 'entail', Synset.entailments, PropertyType.SEM),
]

In [65]:
class RouletteData:
    """Weighted ("roulette wheel") sampler over the keys of a mapping.

    ``data`` maps every element to its weight. On construction the keys are
    copied into ``elems`` and the weights plus their running totals into
    ``weights`` and ``cums_weights`` (same order as ``elems``).
    """

    def __init__(self, data):
        self.data = data
        self.elems = list(data)
        self.weights = []
        self.cums_weights = []
        self._normalize_data()

    def _normalize_data(self):
        """Append each weight of ``self.data`` and its cumulative sum to the lists."""
        running_total = 0
        for key in self.data:
            weight = self.data[key]
            self.weights.append(weight)
            running_total += weight
            self.cums_weights.append(running_total)

    def get_random_elements(self, k=1):
        """Return a list of ``k`` elements drawn with replacement.

        Each element is chosen by ``random.choices`` using the cumulative
        weights, i.e. with probability proportional to its weight.
        """
        picks = random.choices(self.elems, cum_weights=self.cums_weights, k=k)
        return picks
    

In [66]:
def _get_word(item):
    temp = item.name().split('.')
    return temp[1] if temp[0] is "" else temp[0]


def _save_data(data, file='data', opener=None):
    print("Saving in " + file + " file")
    with (open(file, 'w') if not opener else open(file, 'w', opener=opener)) as fd:
        for t in data:
            fd.write(" ".join(map(lambda x: str(x), t)) + "\n")
    print("Done!")


def _get_relative_triplets(prop_type, elem):
    """Collect ``(word, symbol, related_word)`` triplets for one element.

    Every CONFIG entry whose PropertyType is ``prop_type`` is applied to
    ``elem``. An entry's extractor may be one callable or a list/tuple of
    callables; anything else contributes nothing. Related objects whose word
    equals the element's own word are skipped.

    Args:
        prop_type: PropertyType selecting which CONFIG entries to use.
        elem: Object passed to the extractors and to ``_get_word``.

    Returns:
        set: Triplets ``(elem_word, symbol, related_word)``.
    """
    source_word = _get_word(elem)
    found = set()
    for _name, symbol, extractor, kind in CONFIG:
        if kind is not prop_type:
            continue
        # Normalise the extractor field into a sequence of callables.
        if isinstance(extractor, (list, tuple)):
            extractors = extractor
        elif callable(extractor):
            extractors = [extractor]
        else:
            extractors = []
        for extract in extractors:
            for related in extract(elem):
                target_word = _get_word(related)
                if target_word != source_word:
                    found.add((source_word, symbol, target_word))
    return found


def build_wordnet_triplets(save=False):
    """Extract the triplets of every relation enabled in CONFIG from WordNet.

    Each synset is processed with the SEM relations and each of its lemmas
    with the LEX relations.

    Args:
        save: When true, the result is also written with ``_save_data`` using
            its default file name.

    Returns:
        set: All collected ``(word, symbol, word)`` triplets.
    """
    print("Extracting data")
    collected = set()
    for synset in wn.all_synsets():
        collected.update(_get_relative_triplets(PropertyType.SEM, synset))
        for lemma in synset.lemmas():
            collected.update(_get_relative_triplets(PropertyType.LEX, lemma))
    print("Done!")

    if save:
        _save_data(collected)
    return collected


def _init_word_stats():
    """Return a fresh dict mapping every relation symbol in CONFIG to 0."""
    return {symbol: 0 for _, symbol, _, _ in CONFIG}


def _extract_stats(triplets):
    """Count word and per-word relation occurrences in ``triplets``.

    Args:
        triplets: Iterable of ``(w1, relation_symbol, w2)`` tuples.

    Returns:
        tuple: ``(word_stats, rel_stats)`` where ``word_stats[word]`` is the
        number of triplet slots the word fills and ``rel_stats[word][symbol]``
        is how often the word takes part in a triplet of that relation.
    """
    per_word_relations = {}
    occurrences = {}

    for first, relation, second in triplets:
        # Both ends of a triplet are counted the same way, first word first.
        for word in (first, second):
            if word not in per_word_relations:
                per_word_relations[word] = _init_word_stats()
        for word in (first, second):
            occurrences[word] = occurrences.get(word, 0) + 1
        for word in (first, second):
            per_word_relations[word][relation] += 1

    return occurrences, per_word_relations

def build_noise(triplets, samples_pc=1, save=False):
    """Generate false ("noise") triplets that do not occur in ``triplets``.

    Word pairs are drawn at random, weighted by how often each word occurs in
    ``triplets``; the relation is drawn weighted by the first word's relation
    counts. A candidate is accepted when the two words differ and neither
    ``(w1, rel, w2)`` nor ``(w2, rel, w1)`` is a real triplet.

    Notes:
        * An accepted candidate that is already in the noise set still counts
          toward the quota, so the result can hold fewer than the requested
          number of distinct triplets.
        * The loop ends only after enough candidates were accepted; if no
          valid candidate exists it does not terminate.

    Args:
        triplets: Collection of real ``(w1, relation_symbol, w2)`` triplets;
            used for ``len`` and membership tests.
        samples_pc: Noise quota as a fraction of ``len(triplets)``.
        save: When true, the noise is also written to the ``'data-noise'`` file.

    Returns:
        tuple: ``(noise, word_count)`` with the set of noise triplets and the
        number of distinct words found in ``triplets``.
    """
    word_stats, rel_stats = _extract_stats(triplets)
    print("Building noise")
    noise = set()
    samples_count = int(samples_pc * len(triplets))

    words_generator = RouletteData(word_stats)
    queue = None

    while samples_count > 0:
        # Truthiness covers both the initial None and an exhausted deque; the
        # former `len(queue) is 0` compared identity against an int literal.
        if not queue:
            # Always an even number of words, so pairs can be popped safely.
            queue = deque(words_generator.get_random_elements(samples_count*2), maxlen=samples_count*2)
        w1, w2 = queue.popleft(), queue.popleft()
        rel = RouletteData(rel_stats[w1]).get_random_elements()[0]
        if w1 != w2 and (w1, rel, w2) not in triplets and (w2, rel, w1) not in triplets:
            noise.add((w1, rel, w2))
            samples_count -= 1
    print("Done!")
    if save:
        _save_data(noise, 'data-noise')
    return noise, len(word_stats)


def _split_data(data, split):
    print("Splitting data")
    rel_data = {}
    for t in data:
        if t[1] not in rel_data:
            rel_data[t[1]] = []
        rel_data[t[1]].append(t)
    result = set()
    for rel in rel_data:
        random.shuffle(rel_data[rel])
        count = int(split * len(rel_data[rel]))
        result |= set(rel_data[rel][:count])
    print('Done!')
    return result


def _get_checked_items(data, val):
    return list(map(lambda x: x+(val,), data))


def _dic_to_sorted_tuple(dic):
    result = [(dic[k], k) for k in dic]
    result.sort()
    return result

def _save_csv_data(data, file):
    with open(file, 'w', newline='') as fd:
        writer = csv.writer(fd)
        for t in data:
            writer.writerow(t)


def _get_final_items(data, file_prefix=None):
    """Replace words and relation symbols by integer ids and save the id tables.

    Relation ids follow the order of CONFIG; word ids are assigned in order of
    first appearance in ``data``. Two CSV files of ``(id, name)`` rows sorted
    by id are written: ``<prefix>.rels.csv`` and ``<prefix>.words.csv``.

    Args:
        data: Iterable of ``(w1, relation_symbol, w2, label)`` tuples.
        file_prefix: Value used (via ``str``) as the prefix of both file
            names. When omitted, the notebook-level global ``total_words`` is
            used, which must then already be defined (NameError otherwise).

    Returns:
        list: ``(w1_id, relation_id, w2_id, label)`` tuples in input order.
    """
    print("Getting final items")
    if file_prefix is None:
        # Backward-compatible fallback to the hidden global this function used
        # to depend on; pass `file_prefix` explicitly to avoid that coupling.
        file_prefix = total_words
    prefix = str(file_prefix)

    rels_ids = {sym: idx for idx, (_, sym, _, _) in enumerate(CONFIG)}
    _save_csv_data(_dic_to_sorted_tuple(rels_ids), prefix + '.rels.csv')

    words_ids = {}
    triplets = []
    for w1, r, w2, v in data:
        # setdefault keeps an existing id, otherwise assigns the next free one
        # (len is evaluated before the insertion).
        w1_id = words_ids.setdefault(w1, len(words_ids))
        w2_id = words_ids.setdefault(w2, len(words_ids))
        triplets.append((w1_id, rels_ids[r], w2_id, v))
    _save_csv_data(_dic_to_sorted_tuple(words_ids), prefix + '.words.csv')
    print('Done!')
    return triplets

In [67]:
# Build all WordNet triplets for the relations enabled in CONFIG.
# `temp` holds the complete set; the following cells split it and derive noise from it.
temp = build_wordnet_triplets()
# Optional: uncomment to dump the complete set to the 'real-data' file.
# _save_data(temp, 'real-data')

Extracting data
Done!


In [73]:
# Split data: keep a random SPLIT fraction of the triplets of each relation.
# The selection is random, so re-running this cell yields a different subset.
SPLIT=.5
real_data = _split_data(temp, SPLIT)

Splitting data
Done!


In [74]:
# Build noise (false triplets) from the statistics of the split data.
# NOISE_PC is the noise quota as a fraction of len(real_data).
# `total_words` is the number of distinct words in real_data; later cells use
# it as the prefix of the output file names.
NOISE_PC=1
false_data, total_words = build_noise(real_data, NOISE_PC)

Building noise
Done!


In [75]:
# Items to numbers: label real triplets with 1 and noise with 0, then replace
# words and relation symbols by integer ids. _get_final_items also writes the
# '<total_words>.rels.csv' and '<total_words>.words.csv' id tables, so the
# cell defining `total_words` must have been run first.
data = _get_final_items(_get_checked_items(real_data, 1) + _get_checked_items(false_data, 0))

Getting final items
Done!


In [76]:
# Shuffle final data in place so real and noise items are interleaved.
# NOTE(review): no random seed is set in the visible cells, so the order
# differs between runs, and re-running this cell reshuffles `data` again.
print("Shuffling data")
random.shuffle(data)
print("Done!")

Shuffling data
Done!


In [77]:
# Save data to the '<total_words>.set' file, one space-separated item per line.
# Nothing is written when total_words is 0 (no words were found).
if total_words:
    _save_data(data, str(total_words) + '.set')

Saving in 98987.set file
Done!
