In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import precision_recall_fscore_support
from gensim.models import KeyedVectors
import numpy as np
import os
import sys
sys.path.insert(0, '../src/') 
import evaluate

In [7]:
clusters_path = "../data/terms.csv"
clusters, concepts = evaluate.load_clusters(clusters_path)


cluster_A = ['abstract_singular_term', 'abstract_term', 'adjective', 'article', 'definite_article', 'indefinite_article', 'mass_term', 'demonstrative', 'description', 'general_term', 'singular_term', 'definite_singular_term', 'indefinite_singular_term', 'eternal_sentence', 'indicator_word', 'name', 'noun', 'relative_term', 'substantive', 'observation_sentence', 'occasion_sentence', 'open_sentence', 'pronoun', 'relative_clause', 'relative_pronoun', 'one-word_sentence', 'word', 'verb']
cluster_B = ['abstract_object', 'class', 'concrete_object', 'physical_object', 'ideal_object', 'geometrical_object', 'material', 'object', 'particle', 'particular', 'physical_thing', 'scattered_object']
cluster_C = ['context', 'modulus', 'operant_behavior', 'phoneme', 'stimulus', 'stimulation']
cluster_D = ['conceptual_scheme', 'prelinguistic_quality_space']
cluster_E = ['canonical_notation', 'paraphrase', 'concatenation', 'concretion', 'conditional', 'conjunction', 'connective', 'construction', 'contradiction', 'copula', 'form', 'function', 'quantification', 'quantifier', 'quotational', 'predication', 'plural', 'regimentation', 'elimination', 'explication', 'linguistic_form', 'logic', 'syntax', 'variables']

In [8]:
from collections import defaultdict

input_words = []
gold_labels = []

def map_words(words, label, word_label_dict):
    for word in words:
        word_label_dict[word] = label

word_label_dict = dict()
map_words(cluster_A, 'language', word_label_dict)
map_words(cluster_B, 'ontology', word_label_dict)
map_words(cluster_C, 'reality', word_label_dict)
map_words(cluster_D, 'mind', word_label_dict)
map_words(cluster_E, 'metalinguistic', word_label_dict)

for label, words in word_label_dict.items(): 
    input_words.append(label)
    gold_labels.append(words)
    

gold_dict = defaultdict(set)

for word, label in zip(input_words, gold_labels):
    gold_dict[label].add(word)

#input_words = ['dog', 'cat', 'orange', 'lemon', 'table', 'cup', 'frog']
#gold_labels = ['animal', 'animal', 'fruit', 'fruit', 'furniture', 'furniture', 'animal']

In [38]:
# model
# change model path to your model
#model_path = '~/Data/dsm/word2vec/movies.bin'
#embeddings = KeyedVectors.load_word2vec_format(model_path, binary=True)

embeddings_path = "../data/hw_svd_w5_s0.0001_thr0_n15.txt"
embeddings = evaluate.load_embeddings(embeddings_path)


def get_embeddings(words, labels, embeddings):
    words_in_vocab = []
    labels_in_vocab = []
    vecs = []
    for w, l in zip(words, labels):
        if w in embeddings.vocab:
            words_in_vocab.append(w)
            labels_in_vocab.append(l)
            vecs.append(embeddings[w])
    return np.matrix(vecs), words_in_vocab, labels_in_vocab


def run_nearest_neighbors(input_vecs, input_labels, n_neighbors):
    X = input_vecs
    y = input_labels
    loo = LeaveOneOut()

    neigh = KNeighborsClassifier(n_neighbors=n_neighbors)

    predictions = []

    for train_index, test_index in loo.split(X):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train = [y[i] for i in train_index]
        y_test = y[test_index[0]]
        neigh.fit(X_train, y_train)
        pred = neigh.predict(X_test)
        predictions.append(pred[0])

    p, r, f1, s = precision_recall_fscore_support(predictions, y, average='macro')
    print(p, r, f1)
    print('Precision:', p)
    print('Recall', r)
    print('F1 macro', f1)

input_vecs, input_words_vocab, input_labels = get_embeddings(input_words, gold_labels, embeddings)
# we can modify the number of nearest neighbors
n_neighbors = 1
run_nearest_neighbors(input_vecs, input_labels, n_neighbors)

0.4523809523809524 0.43847238288027757 0.4426022659511032
Precision: 0.4523809523809524
Recall 0.43847238288027757
F1 macro 0.4426022659511032


