In [52]:
import json
from nltk import wordpunct_tokenize
from bs4 import BeautifulSoup as Soup
import networkx as nx
from collections import defaultdict
import copy
import numpy as np
import torch

In [None]:
!pip3 install gensim
!pip3 install node2vec

In [53]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def parse_synset(file, all_synsets=None):
    handler = open(file).read()
    soup = Soup(handler)
    if all_synsets is None:
        all_synsets = {}
    for element in soup.findAll('synset'):
        all_synsets[element.attrs['id']] = {'name': element.attrs['ruthes_name'], 'definition': element.attrs['definition']}
    return all_synsets

In [None]:
def parse_senses(file):
    handler = open(file).read()
    soup = Soup(handler)
    all_senses = defaultdict(list)
    for element in soup.findAll('sense'):
        all_senses[element.attrs['synset_id']].append(element.attrs['name'])
    return all_senses

In [None]:
def parse_wordnet(file, synsets, senses=None, G=None, directed=False):
    if G is None:
        if directed:
            G = nx.DiGraph()
        else:
            G = nx.Graph()
    if directed and type(G) != nx.classes.digraph.DiGraph:
        raise Exception('Graph is not directed')
    if not directed and type(G) != nx.classes.digraph.Graph:
        raise Exception('Graph should not be directed')
    
    print('Input graph: {} nodes, {} edges'.format(len(G.nodes), len(G.edges)))
    handler = open(file).read()
    soup = Soup(handler)
    for element in soup.findAll('relation'):
        relation = element.attrs
        parent_id = relation['parent_id']
        child_id = relation['child_id']
        if relation['name'] in ['hyponym', 'instance hyponym']:
            if parent_id not in G.nodes:
                G.add_node(parent_id, in_edges=[], out_edges=[])
            if child_id not in G.nodes:
                G.add_node(child_id, in_edges=[], out_edges=[])
            parent = G.nodes[parent_id]
            child = G.nodes[child_id]
            G.add_edge(parent_id, child_id)
            if senses is not None:
                parent_txt = copy.deepcopy(senses[parent_id])
                child_txt = copy.deepcopy(senses[child_id])
            else:
                parent_txt = [synsets[parent_id]['name']]
                child_txt = [synsets[child_id]['name']]
            new_attr = {parent_id: {'out_edges': parent['out_edges'] + [child_id], 'text': parent_txt, 'definition': synsets[parent_id]['definition']},
                        child_id: {'in_edges': child['in_edges'] + [parent_id], 'text': child_txt, 'definition': synsets[child_id]['definition']}}
            nx.set_node_attributes(G, new_attr)
    print('Updated graph: {} nodes, {} edges'.format(len(G.nodes), len(G.edges)))
    for syn in synsets:
        if syn not in G.nodes:
            G.add_node(syn)
            txt = senses[syn]
            defn = synsets[syn]['definition']
            nx.set_node_attributes(G, {syn: {'out_edges': [], 'in_edges': [], 'text': txt, 'definition': defn}})
    print('Graph with orphan nodes: {} nodes, {} edges'.format(len(G.nodes), len(G.edges)))
    return G

In [None]:
data_dir = '/content/drive/MyDrive/Study/NLP/ruwordnet/'

In [None]:
all_senses_noun = parse_senses(data_dir + 'ruwordnet/senses.N.xml')
all_senses_verb = parse_senses(data_dir + 'ruwordnet/senses.V.xml')
all_synsets_noun = parse_synset(data_dir + 'ruwordnet/synsets.N.xml')
all_synsets_verb = parse_synset(data_dir + 'ruwordnet/synsets.V.xml')

In [None]:
# wordnet graphs - directed
G_full_dir_noun = parse_wordnet(data_dir + 'ruwordnet/synset_relations.N.xml', all_synsets_noun, all_senses_noun, directed=True)
G_full_dir_verb = parse_wordnet(data_dir + 'ruwordnet/synset_relations.V.xml', all_synsets_verb, all_senses_verb, directed=True)

Input graph: 0 nodes, 0 edges
Updated graph: 29295 nodes, 39110 edges
Graph with orphan nodes: 29296 nodes, 39110 edges
Input graph: 0 nodes, 0 edges
Updated graph: 7408 nodes, 10317 edges
Graph with orphan nodes: 7521 nodes, 10317 edges


# My fitting code

In [None]:
from node2vec.node2vec import Node2Vec
EMBEDDING_FILENAME = './embeddings.emb'
EMBEDDING_MODEL_FILENAME = './embeddings.model'

node2vec = Node2Vec(G_full_dir_noun, dimensions=100, walk_length=30, num_walks=200, workers=4)
model = node2vec.fit(window=20, min_count=1, batch_words=4)

HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', max=29296.0, style=P…




In [None]:
import gensim
ft_model = gensim.models.KeyedVectors.load('/content/drive/MyDrive/Study/NLP/models/araneum_none_fasttextcbow_300_5_2018.model')

In [None]:
len(G_full_dir_noun.nodes)

29296

In [None]:
len(all_senses_noun)

29296

And then we can get word by id from the database and do whatever we want

In [None]:
def build_id_to_noun_map(all_sences):
    result = {}
    for key, value_arr in all_sences.items():
        for v in calue_arr:
            result[v] = key
    return result

In [None]:
from tqdm.notebook import tqdm
import numpy as np

In [None]:
train_from = []
train_to = []

def get_ft_embedding(ft_model, word):
    emb = 0
    i = 0
    for w in word.lower().split(' '):
        emb += ft_model.wv[w]
        i += 1
    return emb / i

for ident in tqdm(G_full_dir_noun.nodes):
    words = all_senses_noun[ident]
    n2v_vec = model.wv[ident]
    for w in words:
        ft_vec = get_ft_embedding(ft_model, w)
        train_from.append(ft_vec)
        train_to.append(n2v_vec)
        
train_from = np.vstack(train_from)
train_to = np.vstack(train_to)

print(train_from.shape)
print(train_to.shape)

np.save('../train_from.npy', train_from)
np.save('../train_to.npy', train_to)

HBox(children=(FloatProgress(value=0.0, max=29296.0), HTML(value='')))




  


AttributeError: ignored

In [6]:
import numpy as np
train_from = np.load('/content/drive/MyDrive/Study/NLP/train_data/train_from.npy')
train_to = np.load('/content/drive/MyDrive/Study/NLP/train_data/train_to.npy')

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
indexes = np.arange(len(train_from))
np.random.shuffle(indexes)

train_source = train_from[indexes[:-5000]]
train_target = train_to[indexes[:-5000]]
test_source = train_from[indexes[-5000:]]
test_target = train_to[indexes[-5000:]]

# Baseline

In [None]:
import torch

torch.cuda.is_available()

False

In [None]:
import torch
import torch.nn as nn

enc_dec_model = nn.Sequential(
    nn.Linear(300, 300)
).to(device)

In [None]:
EPOCHS = 200
batch_size = 500
lr = 0.01

optimizer = torch.optim.Adam(enc_dec_model.parameters(), lr=lr)
criterion = torch.nn.MSELoss()

cos = torch.nn.CosineSimilarity()
def loss_func(outputs, true):
    return (1 - cos(outputs, true)).mean()

In [None]:
def cross_cosine_similarity(vec_batch_1, vec_batch_2):
    vec_batch_1 = vec_batch_1 / torch.norm(vec_batch_1, dim=-1, keepdim=True)
    vec_batch_2 = vec_batch_2 / torch.norm(vec_batch_2, dim=-1, keepdim=True)
    return torch.mm(vec_batch_1, vec_batch_2.T)    

def evaluate_ranking_score(true_test_vectors, predicted_vectors, N, soft=False):
    cos_sim = cross_cosine_similarity(predicted_vectors, true_test_vectors)
    indexes = torch.argsort(cos_sim, axis=1, descending=True)
    true = torch.from_numpy(np.arange(len(indexes))).type(torch.IntTensor).T.to(device)
    positions = (indexes == true.reshape(-1, 1)).nonzero()[:, 1].type(torch.FloatTensor).to(device)
    score = torch.zeros(len(indexes)).type(torch.FloatTensor).to(device)
    if not soft:
        score[positions < N] = 1 / (positions[positions < N] + 1)
    else:
        score[positions < N] = 1
    return score.mean()

In [None]:
import time

num_batch = len(train_source) // batch_size if len(train_source) % batch_size == 0 else len(train_source) // batch_size + 1
num_batch_test = len(test_source) // batch_size if len(test_source) % batch_size == 0 else len(test_source) // batch_size + 1

for epoch in range(EPOCHS):
    t = time.time()
    train_loss = 0
    for b in range(num_batch):
        optimizer.zero_grad()
        source = torch.from_numpy(train_source[b*batch_size:(b+1)*batch_size]).to(device)
        target = torch.from_numpy(train_target[b*batch_size:(b+1)*batch_size]).to(device)
        outputs = enc_dec_model(source)
        loss = loss_func(outputs, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.detach().cpu().numpy()
    train_loss = train_loss / num_batch
    
    with torch.no_grad():
        test_loss = 0
        rank_score_1 = 0
        rank_score_5 = 0
        rank_score_10 = 0
        soft_rank_score_1 = 0
        soft_rank_score_5 = 0
        soft_rank_score_10 = 0
        for b in range(num_batch_test):
            source = torch.from_numpy(test_source[b*batch_size:(b+1)*batch_size]).to(device)
            target = torch.from_numpy(test_target[b*batch_size:(b+1)*batch_size]).to(device)
            outputs = enc_dec_model(source)
            loss = loss_func(outputs, target)
            
            test_loss += loss.detach().cpu().numpy()

            rank_score_1 += evaluate_ranking_score(target, outputs, 1)
            rank_score_5 += evaluate_ranking_score(target, outputs, 5)
            rank_score_10 += evaluate_ranking_score(target, outputs, 10)

            soft_rank_score_1 += evaluate_ranking_score(target, outputs, 1, True)
            soft_rank_score_5 += evaluate_ranking_score(target, outputs, 5, True)
            soft_rank_score_10 += evaluate_ranking_score(target, outputs, 10, True)
            
        test_loss = test_loss / num_batch_test
        rank_score_1 /= num_batch_test
        rank_score_5 /= num_batch_test
        rank_score_10 /= num_batch_test
        soft_rank_score_1 /= num_batch_test
        soft_rank_score_5 /= num_batch_test
        soft_rank_score_10 /= num_batch_test
        
        print(f'Epoch {epoch} \t Mean train loss {train_loss} \t Mean test loss {test_loss} \t Time {time.time() - t}')
        print(f'Rank score 1 {rank_score_1} \t Rank_score 5 {rank_score_5} \t Rank score 10 {rank_score_10}')
        print(f'Soft rank score 1 {soft_rank_score_1} \t Soft rank_score 5 {soft_rank_score_5} \t Soft rank score 10 {soft_rank_score_10}')
        print('------------------------------------------------------------------------------------------------------------')

# Model 2

In [None]:
import torch
import torch.nn as nn

enc_dec_model = nn.Sequential(
    nn.Linear(300, 400),
    nn.ReLU(),
    nn.Linear(400, 500),
    nn.ReLU(),
    nn.Linear(500, 500),
    nn.ReLU(),
    nn.Linear(500, 500),
    nn.ReLU(),
    nn.Linear(500, 500),
    nn.ReLU(),
    nn.Linear(500, 400),
    nn.ReLU(),
    nn.Linear(400, 300)
).to(device)

In [None]:
EPOCHS = 200
batch_size = 500
lr = 0.01

optimizer = torch.optim.Adam(enc_dec_model.parameters(), lr=lr)
criterion = torch.nn.MSELoss()

cos = torch.nn.CosineSimilarity()
def loss_func(outputs, true):
    return (1 - cos(outputs, true)).mean()

In [None]:
def cross_cosine_similarity(vec_batch_1, vec_batch_2):
    vec_batch_1 = vec_batch_1 / torch.norm(vec_batch_1, dim=-1, keepdim=True)
    vec_batch_2 = vec_batch_2 / torch.norm(vec_batch_2, dim=-1, keepdim=True)
    return torch.mm(vec_batch_1, vec_batch_2.T)    

def evaluate_ranking_score(true_test_vectors, predicted_vectors, N, soft=False):
    cos_sim = cross_cosine_similarity(predicted_vectors, true_test_vectors)
    indexes = torch.argsort(cos_sim, axis=1, descending=True)
    true = torch.from_numpy(np.arange(len(indexes))).type(torch.IntTensor).T.to(device)
    positions = (indexes == true.reshape(-1, 1)).nonzero()[:, 1].type(torch.FloatTensor).to(device)
    score = torch.zeros(len(indexes)).type(torch.FloatTensor).to(device)
    if not soft:
        score[positions < N] = 1 / (positions[positions < N] + 1)
    else:
        score[positions < N] = 1
    return score.mean()

In [None]:
import time

num_batch = len(train_source) // batch_size if len(train_source) % batch_size == 0 else len(train_source) // batch_size + 1
num_batch_test = len(test_source) // batch_size if len(test_source) % batch_size == 0 else len(test_source) // batch_size + 1

for epoch in range(EPOCHS):
    t = time.time()
    train_loss = 0
    for b in range(num_batch):
        optimizer.zero_grad()
        source = torch.from_numpy(train_source[b*batch_size:(b+1)*batch_size]).to(device)
        target = torch.from_numpy(train_target[b*batch_size:(b+1)*batch_size]).to(device)
        outputs = enc_dec_model(source)
        loss = loss_func(outputs, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.detach().cpu().numpy()
    train_loss = train_loss / num_batch
    
    with torch.no_grad():
        test_loss = 0
        rank_score_1 = 0
        rank_score_5 = 0
        rank_score_10 = 0
        soft_rank_score_1 = 0
        soft_rank_score_5 = 0
        soft_rank_score_10 = 0
        for b in range(num_batch_test):
            source = torch.from_numpy(test_source[b*batch_size:(b+1)*batch_size]).to(device)
            target = torch.from_numpy(test_target[b*batch_size:(b+1)*batch_size]).to(device)
            outputs = enc_dec_model(source)
            loss = loss_func(outputs, target)
            
            test_loss += loss.detach().cpu().numpy()

            rank_score_1 += evaluate_ranking_score(target, outputs, 1)
            rank_score_5 += evaluate_ranking_score(target, outputs, 5)
            rank_score_10 += evaluate_ranking_score(target, outputs, 10)

            soft_rank_score_1 += evaluate_ranking_score(target, outputs, 1, True)
            soft_rank_score_5 += evaluate_ranking_score(target, outputs, 5, True)
            soft_rank_score_10 += evaluate_ranking_score(target, outputs, 10, True)
            
        test_loss = test_loss / num_batch_test
        rank_score_1 /= num_batch_test
        rank_score_5 /= num_batch_test
        rank_score_10 /= num_batch_test
        soft_rank_score_1 /= num_batch_test
        soft_rank_score_5 /= num_batch_test
        soft_rank_score_10 /= num_batch_test
        
        print(f'Epoch {epoch} \t Mean train loss {train_loss} \t Mean test loss {test_loss} \t Time {time.time() - t}')
        print(f'Rank score 1 {rank_score_1} \t Rank_score 5 {rank_score_5} \t Rank score 10 {rank_score_10}')
        print(f'Soft rank score 1 {soft_rank_score_1} \t Soft rank_score 5 {soft_rank_score_5} \t Soft rank score 10 {soft_rank_score_10}')
        print('------------------------------------------------------------------------------------------------------------')

# Model 3

In [None]:
import torch.nn as nn

EPOCHS = 200
batch_size = 500
lr = 0.01

enc_ft_model = nn.Sequential(
    nn.Linear(300, 400),
    nn.ReLU(),
    nn.Linear(400, 500)#,
    #nn.ReLU(),
    #nn.Linear(500, 500)
).to(device)

enc_node_model = nn.Sequential(
    nn.Linear(300, 400),
    nn.ReLU(),
    nn.Linear(400, 500)#,
    #nn.ReLU(),
    #nn.Linear(500, 500)
).to(device)

In [None]:
optimizer = torch.optim.Adam([i for i in enc_ft_model.parameters()] + [i for i in enc_node_model.parameters()], lr=lr)

cos = torch.nn.CosineSimilarity()
def loss_func(outputs, true, labels):
    return ((labels - cos(outputs, true)) ** 2).mean()

def cross_cosine_similarity(vec_batch_1, vec_batch_2):
    vec_batch_1 = vec_batch_1 / torch.norm(vec_batch_1, dim=-1, keepdim=True)
    vec_batch_2 = vec_batch_2 / torch.norm(vec_batch_2, dim=-1, keepdim=True)
    return torch.mm(vec_batch_1, vec_batch_2.T)    

def evaluate_ranking_score(true_test_vectors, predicted_vectors, N, soft=False):
    cos_sim = cross_cosine_similarity(predicted_vectors, true_test_vectors)
    indexes = torch.argsort(cos_sim, axis=1, descending=True)
    true = torch.from_numpy(np.arange(len(indexes))).type(torch.IntTensor).T.to(device)
    positions = (indexes == true.reshape(-1, 1)).nonzero()[:, 1].type(torch.FloatTensor).to(device)
    score = torch.zeros(len(indexes)).type(torch.FloatTensor).to(device)
    if not soft:
        score[positions < N] = 1 / (positions[positions < N] + 1)
    else:
        score[positions < N] = 1
    return score.mean()

In [None]:
def generate_batch(source, target, batch_size, positive_part=0.5):
    indexes = np.arange(len(source))
    pos_ind = np.random.choice(indexes, int(batch_size * positive_part))
    p_source = source[pos_ind]
    p_target = target[pos_ind]

    neg_size = batch_size - int(batch_size * positive_part)
    available_neg_inds = np.setdiff1d(indexes, pos_ind)
    neg_ind_s = np.random.choice(available_neg_inds, neg_size)
    neg_ind_t = np.random.choice(np.setdiff1d(available_neg_inds, neg_ind_s), neg_size)
    neg_source = source[neg_ind_s]
    neg_target = target[neg_ind_t]

    result_source = np.concatenate((p_source, neg_source), axis=0)
    result_target = np.concatenate((p_target, neg_target), axis=0)
    labels = np.array([1 for i in range(len(p_source))] + [0 for i in range(len(neg_source))])

    inds = np.arange(batch_size)
    np.random.shuffle(inds)
    return result_source[inds], result_target[inds], labels[inds]

In [None]:
import time

num_batch = len(train_source) // batch_size if len(train_source) % batch_size == 0 else len(train_source) // batch_size + 1
num_batch_test = len(test_source) // batch_size if len(test_source) % batch_size == 0 else len(test_source) // batch_size + 1

for epoch in range(EPOCHS):
    t = time.time()
    train_loss = 0
    for b in range(num_batch):
        optimizer.zero_grad()
        source, target, labels = generate_batch(train_source, train_target, batch_size)
        source = torch.from_numpy(source).to(device)
        target = torch.from_numpy(target).to(device)
        labels = torch.from_numpy(labels).to(device)
        outputs_ft = enc_ft_model(source)
        outputs_node = enc_node_model(target)
        loss = loss_func(outputs_ft, outputs_node, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.detach().cpu().numpy()
    train_loss = train_loss / num_batch
    
    with torch.no_grad():
        test_loss = 0

        rank_score_1 = 0
        rank_score_5 = 0
        rank_score_10 = 0

        soft_rank_score_1 = 0
        soft_rank_score_5 = 0
        soft_rank_score_10 = 0
        for b in range(num_batch_test):
            source = torch.from_numpy(test_source[b*batch_size:(b+1)*batch_size]).to(device)
            target = torch.from_numpy(test_target[b*batch_size:(b+1)*batch_size]).to(device)
            outputs_ft = enc_ft_model(source)
            outputs_node = enc_node_model(target)
            loss = loss_func(outputs_ft, outputs_node, torch.ones(batch_size).to(device))
            
            test_loss += loss.detach().cpu().numpy()

            rank_score_1 += evaluate_ranking_score(outputs_node, outputs_ft, 1)
            rank_score_5 += evaluate_ranking_score(outputs_node, outputs_ft, 5)
            rank_score_10 += evaluate_ranking_score(outputs_node, outputs_ft, 10)

            soft_rank_score_1 += evaluate_ranking_score(outputs_node, outputs_ft, 1, True)
            soft_rank_score_5 += evaluate_ranking_score(outputs_node, outputs_ft, 5, True)
            soft_rank_score_10 += evaluate_ranking_score(outputs_node, outputs_ft, 10, True)
            
        test_loss = test_loss / num_batch_test
        rank_score_1 /= num_batch_test
        rank_score_5 /= num_batch_test
        rank_score_10 /= num_batch_test
        soft_rank_score_1 /= num_batch_test
        soft_rank_score_5 /= num_batch_test
        soft_rank_score_10 /= num_batch_test
        
        print(f'Epoch {epoch} \t Mean train loss {train_loss} \t Mean test loss {test_loss} \t Time {time.time() - t}')
        print(f'Rank score 1 {rank_score_1} \t Rank_score 5 {rank_score_5} \t Rank score 10 {rank_score_10}')
        print(f'Soft rank score 1 {soft_rank_score_1} \t Soft rank_score 5 {soft_rank_score_5} \t Soft rank score 10 {soft_rank_score_10}')
        print('------------------------------------------------------------------------------------------------------------')