In [1]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
import numpy as np

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def get_hyponyms(word):
    word = wn.synsets(word)
    if len(word) == 0:
        return []
    word = word[0]
    hyponyms = list(set([w for s in word.closure(lambda s:s.hyponyms()) for w in s.lemma_names()]))
    
    return hyponyms

In [3]:
def get_hypernyms(word):
    word = wn.synsets(word)
    if len(word) == 0:
        return []
    word = word[0]
    hypernyms = list(set([w for s in word.closure(lambda s:s.hypernyms()) for w in s.lemma_names()]))
    
    return hypernyms

In [4]:
def get_synonyms(word):
    word = wn.synsets(word)
    if len(word) == 0:
        return []
    synonyms = list(set([w for s in word for w in s.lemma_names()]))
    
    return synonyms

In [5]:
get_synonyms('apple_tree')

['apple_tree']

In [8]:
def get_vector(word1, word2):
    
    output = np.zeros(3)
    
    # check hyper/hypo-nym and synonyms
    synonyms = get_synonyms(word1)
    hypers = get_hypernyms(word1)
    hypos = get_hyponyms(word1)
    
    if word2 in synonyms:
        output[0] = 1
    if word2 in hypers:
        output[1] = 1
    if word2 in hypos:
        output[2] = 1
    
    return output

In [13]:
get_vector('bad', 'good')

array([0., 0., 0.])

In [15]:
from misc.tokenization import *
import torch
import torch.optim as optim
import numpy as np

import pickle
import argparse
from collections import Counter

from torchnlp.datasets import snli_dataset

In [16]:
train_data = snli_dataset(train=True)

In [29]:
import re
import nltk
nltk.download('punkt')

from tqdm import tqdm

class Cencept(object):
    """Cencept relation between words."""
    def __init__(self):
        self.word2rel = {}
        self.idx = 0

    def add_word(self, word, other=None, vec=None):
        if not other:
            self.word2rel[word] = np.zeros(3)
        else:
            if not word in self.word2rel:
                self.word2rel[word] = {other: vec}
                self.idx += 1
            else:
                self.word2rel[word][other] = vec

    def __call__(self, word, other):
        if not word in self.word2rel:
            return self.word2rel['<unk>']
        return self.word2rel[word][other]
    
    def __len__(self):
        return len(self.word2rel)
    
def prepare_vocab(dataset, threshold):
    
    counter = Counter()
    
    for t in tqdm(dataset):
        
        premise = t['premise']
        hypothesis = t['hypothesis']
        premise_tokens = nltk.word_tokenize(premise)
        hypothesis_tokens = nltk.word_tokenize(hypothesis)
        tokens = premise_tokens + hypothesis_tokens
        counter.update(tokens)
           
    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]
        
    # Create a vocab wrapper and add some special tokens.
    vocab = Concept()
    vocab.add_word('<unk>')
    
    # Add the words to the vocabulary.
    for word in words:  
        for other in words:
            vec = get_vector(word, other)
            vocab.add_word(word, other, vec)
            
    return vocab

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [30]:
vocab = prepare_vocab(train_data, 2)

100%|██████████| 550152/550152 [02:46<00:00, 3294.56it/s]


TypeError: add_word() takes 2 positional arguments but 4 were given

In [25]:
# prepare wordnet relation vocab
len(vocab)

26715

In [27]:
vocab('ff')

3