# Relation Matrix from Wordnet

- For each word w1 in the text, generate a matrix relating it to every other word, with shape (5, len(vocab)): one row per relation (syn, hyper, hypo, holo, mero) and one column per vocabulary word
- Initialize all with zeros
- For every pair of vocabulary words (w1, w2), check WordNet for each relation (syn, hyper, hypo, holo, mero)
- Mark with 1 every relation that w2 has with w1
- Methods to add relations as extra knowledge:
    - add R in loss sum over all relations


Reference on the methods used for extraction of relations:
- npit (2019) in Github - https://github.com/npit/nlp-semantic-augmentation/tree/jnle
- Thibault Cordier & Antoine Tadros (2019) Project: Learning Word Representations by Embedding the WordNet Graph in Github (https://github.com/ShiroCupz/Embedding-WordNet)


In [None]:
import pickle
import numpy as np
import nltk
from nltk.corpus import wordnet as wn, stopwords
from keras.preprocessing.text import text_to_word_sequence
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Upload the data to google cloud in case the drag-drop upload is not working.
from google.colab import files
dataset_file_dict = files.upload()

Saving small_corpus_graph_no_stopwords.gpickle to small_corpus_graph_no_stopwords.gpickle


I have already constructed the graph and uploaded it to sciebo, so you can use that one, or you can construct a new one by following the HOWTO.

In [None]:
# Load the graph
import networkx as nx
import random

def load_graph(path='/content/corpus_graph.gpickle'):
    '''
    Load a pickled NetworkX graph from ``path``, print a summary and return it.

    ``networkx.read_gpickle`` was removed in NetworkX 3.0, so it is used only
    when the installed version still provides it; otherwise the file is read
    with the standard ``pickle`` module.

    SECURITY NOTE: unpickling executes arbitrary code embedded in the file.
    Only load graph files that come from a trusted source.
    '''
    if hasattr(nx, 'read_gpickle'):
        # Older NetworkX: keep the original loader.
        G = nx.read_gpickle(path)
    else:
        # NetworkX >= 3.0: the gpickle helpers are gone, use pickle directly.
        import pickle
        with open(path, 'rb') as handle:
            G = pickle.load(handle)
    print('>> Load Graph: ', G)
    return G

In [None]:
G = load_graph('/content/small_corpus_graph_no_stopwords.gpickle')

>> Load Graph:  DiGraph with 61167 nodes and 12902 edges


In [None]:
# Keep only the first 20000 graph nodes (presumably to bound the size of the relation matrix).
# NOTE(review): a later cell reassigns `nodes` to the full node view, so this cap only
# takes effect if that later cell is skipped.
nodes = list(G.nodes)[:20000]

In [None]:
# Get unique words in the corpus
def get_nodes(doc):
    '''
    Return the set of distinct whitespace-separated tokens found in ``doc``.

    An empty or whitespace-only string yields an empty set.
    '''
    return set(doc.split())

# Get unique words from the Graph
def get_graph_nodes(G):
    '''
    Print the graph ``G`` and return its ``nodes`` attribute unchanged.
    '''
    print(G)
    node_view = G.nodes
    return node_view

In [None]:
# Use every node of the graph as the vocabulary.
# NOTE(review): this replaces the 20000-node list built in the earlier cell.
nodes = get_graph_nodes(G)

DiGraph with 61167 nodes and 12902 edges


In [None]:
def checkHolonym(target, context):
    '''
    Return 1 if some synset of ``target`` is a direct holonym of some synset
    of ``context``, otherwise 0.

    Member, substance and part holonyms are all considered.
    '''
    target_senses = wn.synsets(target)
    for sense in wn.synsets(context):
        wholes = (
            sense.member_holonyms()
            + sense.substance_holonyms()
            + sense.part_holonyms()
        )
        if any(whole in target_senses for whole in wholes):
            return 1
    return 0


def checkSynonym(target, context):
    '''
    Return 1 if ``target`` and ``context`` share at least one WordNet synset,
    otherwise 0.

    Idea for later: a cosine similarity with a threshold might also work.
    '''
    shared_senses = set(wn.synsets(target)) & set(wn.synsets(context))
    return 1 if shared_senses else 0


def checkMeronym(target, context):
    '''
    Return 1 if ``target`` is a meronym of ``context``, otherwise 0.

    Meronymy is the inverse of holonymy, so the holonym check is reused with
    the two words swapped.
    '''
    is_meronym = checkHolonym(context, target)
    return is_meronym


def checkHypernym(target, context):
    '''
    Return 1 if some synset of ``target`` appears among the transitive
    hypernyms of some synset of ``context``, otherwise 0.
    '''
    target_senses = wn.synsets(target)
    for sense in wn.synsets(context):
        # closure() walks the hypernym chain upwards lazily.
        ancestors = sense.closure(lambda s: s.hypernyms())
        if any(ancestor in target_senses for ancestor in ancestors):
            return 1
    return 0


def checkHyponym(target, context):
    '''
    Return 1 if ``target`` is a hyponym of ``context``, otherwise 0.

    Hyponymy is the inverse of hypernymy, so the hypernym check is reused
    with the two words swapped.
    '''
    is_hyponym = checkHypernym(context, target)
    return is_hyponym


def checkAntonym(target, context):
    '''
    Return 1 if any antonym of ``target`` shares a synset with ``context``,
    otherwise 0.

    Antonyms are collected from every lemma of every synset of ``target``
    (WordNet exposes antonyms on lemmas only). Each antonym is then looked up
    by name and its synsets are compared with those of ``context``.
    '''
    target_senses = wn.synsets(target)
    context_senses = set(wn.synsets(context))
    opposite_names = [
        opposite.name()
        for sense in target_senses
        for lemma in sense.lemmas()
        for opposite in lemma.antonyms()
    ]
    for name in opposite_names:
        if context_senses.intersection(wn.synsets(name)):
            return 1
    return 0

Generating a relation matrix. For the lab we were only focused on synonymy and hypernymy.

In [None]:
from tqdm import tqdm
def generateRelationMatrix(nodes, relations=('synonyms',)):
    '''
    Build a 0/1 WordNet relation tensor over every ordered pair of words.

    Parameters
    ----------
    nodes : iterable of str
        Vocabulary words (e.g. the graph nodes). Materialised into a list
        once so that row/column indices are stable.
    relations : str or sequence of str, optional
        Relation names to compute, in row order. Supported names are
        'synonyms', 'hypernyms', 'hyponyms', 'meronyms', 'holonyms' and
        'antonyms'. The default ``('synonyms',)`` keeps the historical
        output shape ``(len(nodes), 1, len(nodes))``; pass e.g.
        ``('synonyms', 'hypernyms')`` to add a hypernym row.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(nodes), len(relations), len(nodes))``. Entry
        ``[i, r, j]`` is 1 when the checker for ``relations[r]`` returns 1
        for ``(nodes[i], nodes[j])`` and 0 otherwise. A word is marked as
        related to itself under every relation.

    Raises
    ------
    ValueError
        If ``relations`` contains an unsupported relation name.
    '''
    nodes = list(nodes)
    if isinstance(relations, str):
        # A bare string would otherwise be split into single characters.
        relations = (relations,)
    relations = tuple(relations)

    # One source of truth for "relation name -> checker". The number of rows
    # and the values written per column are both derived from it, so they can
    # no longer disagree (the cause of the former broadcast error).
    checkers = {
        'synonyms': checkSynonym,
        'hypernyms': checkHypernym,
        'hyponyms': checkHyponym,
        'meronyms': checkMeronym,
        'holonyms': checkHolonym,
        'antonyms': checkAntonym,
    }
    unknown = [name for name in relations if name not in checkers]
    if unknown:
        raise ValueError('Unsupported relation(s): %s' % ', '.join(map(str, unknown)))
    active_checks = [checkers[name] for name in relations]
    num_relations = len(active_checks)

    def getRelations(target, nodes):
        # Returns a (num_relations, len(nodes)) array for a single target word.
        relationMatrix = np.zeros((num_relations, len(nodes)))
        for i, word in enumerate(nodes):
            if target == word:
                # A word is trivially related to itself under every relation.
                relationMatrix[:, i] = 1
                continue
            # Only the requested relations are evaluated; each check is costly.
            relationMatrix[:, i] = [check(target, word) for check in active_checks]
        return relationMatrix

    wordsRelations = np.zeros((len(nodes), num_relations, len(nodes)))
    for index, word in enumerate(nodes):
        wordsRelations[index, :, :] = getRelations(word, nodes)
    return wordsRelations

In [None]:
# Compute the pairwise relation tensor for all words in `nodes`.
# NOTE(review): every ordered word pair triggers WordNet lookups and the result is a dense
# (len(nodes), 1, len(nodes)) float array, so cost and memory grow quadratically with the
# vocabulary — expect this to be very heavy for the full 61167-node graph.
relMatrix = generateRelationMatrix(nodes)

In [None]:
np.save('relationMatrix.npy', relMatrix)

In [None]:
relMatrix

array([[[1., 0., 0., ..., 0., 0., 0.]],

       [[0., 1., 0., ..., 0., 0., 0.]],

       [[0., 0., 1., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 1., 0., 0.]],

       [[0., 0., 0., ..., 0., 1., 0.]],

       [[0., 0., 0., ..., 0., 0., 1.]]])