# Load & Prep

In [1]:
import os
import sys
sys.path.insert(0, '../src/') 
import evaluate
import nltk
import math
import numpy as np
import sklearn
from sklearn import metrics


embeddings_path = "../data/hw_svd_w5_s0.0001_thr0_n15.txt"
embeddings = evaluate.load_embeddings(embeddings_path)

centroid_vector_path = "../data/CENTROIDS_hw_svd_w5_s0.0001_thr0_n15.txt"
centroid_embeddings = evaluate.load_embeddings(centroid_vector_path)

termlist_path = "../data/quine_terms.txt"
termlist = []

with open(termlist_path, "r") as f:
    for line in f:
        termlist.extend(line.split())
        
f.close()



In [2]:
clusters_path = "../data/terms.csv"
clusters, concepts = evaluate.load_clusters(clusters_path)


cluster_A = ['abstract_singular_term', 'abstract_term', 'adjective', 'article', 'definite_article', 'indefinite_article', 'mass_term', 'demonstrative', 'description', 'general_term', 'singular_term', 'definite_singular_term', 'indefinite_singular_term', 'eternal_sentence', 'indicator_word', 'name', 'noun', 'relative_term', 'substantive', 'observation_sentence', 'occasion_sentence', 'open_sentence', 'pronoun', 'relative_clause', 'relative_pronoun', 'one-word_sentence', 'word', 'verb']
cluster_B = ['abstract_object', 'class', 'concrete_object', 'physical_object', 'ideal_object', 'geometrical_object', 'material', 'object', 'ordinary_enduring_middle-sized_physical_object', 'particle', 'particular', 'physical_thing', 'scattered_object']
cluster_C = ['context', 'modulus', 'operant_behavior', 'phoneme', 'stimulus', 'stimulation']
cluster_D = ['conceptual_scheme', 'prelinguistic_quality_space']
cluster_E = ['canonical_notation', 'paraphrase', 'concatenation', 'concretion', 'conditional', 'conjunction', 'connective', 'construction', 'contradiction', 'copula', 'form', 'function', 'quantification', 'quantifier', 'quotational', 'predication', 'plural', 'regimentation', 'elimination', 'explication', 'linguistic_form', 'logic', 'syntax', 'variables']

the_clusters = [cluster_A, cluster_B, cluster_C, cluster_D, cluster_E]

In [3]:
from collections import defaultdict

input_words = []
gold_labels = []

def map_words(words, label, word_label_dict):
    for word in words:
        word_label_dict[word] = label

word_label_dict = dict()
map_words(cluster_A, 'language', word_label_dict)
map_words(cluster_B, 'ontology', word_label_dict)
map_words(cluster_C, 'reality', word_label_dict)
map_words(cluster_D, 'mind', word_label_dict)
map_words(cluster_E, 'metalinguistic', word_label_dict)

for label, words in word_label_dict.items(): 
    input_words.append(label)
    gold_labels.append(words)
    

gold_dict = defaultdict(set)

for word, label in zip(input_words, gold_labels):
    gold_dict[label].add(word)
    
#for label, words in word_label_dict.items():
#    print(label, words)
print(input_words)
print(gold_labels)
print(gold_dict)

['abstract_singular_term', 'abstract_term', 'adjective', 'article', 'definite_article', 'indefinite_article', 'mass_term', 'demonstrative', 'description', 'general_term', 'singular_term', 'definite_singular_term', 'indefinite_singular_term', 'eternal_sentence', 'indicator_word', 'name', 'noun', 'relative_term', 'substantive', 'observation_sentence', 'occasion_sentence', 'open_sentence', 'pronoun', 'relative_clause', 'relative_pronoun', 'one-word_sentence', 'word', 'verb', 'abstract_object', 'class', 'concrete_object', 'physical_object', 'ideal_object', 'geometrical_object', 'material', 'object', 'ordinary_enduring_middle-sized_physical_object', 'particle', 'particular', 'physical_thing', 'scattered_object', 'context', 'modulus', 'operant_behavior', 'phoneme', 'stimulus', 'stimulation', 'conceptual_scheme', 'prelinguistic_quality_space', 'canonical_notation', 'paraphrase', 'concatenation', 'concretion', 'conditional', 'conjunction', 'connective', 'construction', 'contradiction', 'copula

# Centroids

In [4]:
def get_all_vectors(word_label_dict, model):
    
    vecs = []
    words_in_vocab = []
    
    for word in word_label_dict.keys():
        if word in model.vocab:
            vec = model[word]
            vecs.append(vec)
            words_in_vocab.append(word)
        else:
            print(word, 'oov')
    
    return np.array(vecs), words_in_vocab


vecs, words_in_vocab = get_all_vectors(word_label_dict, embeddings)

In [5]:
def normalize_vector(vec):
    mag = math.sqrt(sum([pow(value, 2) for value in vec]))

    unit_vec = []

    for value in vec:
        unit_vec.append(value/mag)
    unit_vec = np.array(unit_vec)
    return unit_vec

In [6]:
def centroid(cluster):
    for term in cluster:
        if term not in embeddings.vocab:
            continue
        vec = embeddings[term]
        norm_vec = normalize_vector(vec)
        vec_list.append(norm_vec)
    
    a_matrix = np.array(vec_list)
        
    centroid = np.mean(a_matrix, axis=0)
    centroids = centroid.tolist()
    print(cluster)
    print(centroids)
    
    with open("../data/centroids.txt", 'a') as output_file:
            output_file.write(f"cluster: {cluster}")
            output_file.write(str(centroids))
            output_file.write("\n")
    return centroids 

In [7]:
vec_list = []

for cluster in the_clusters:
    centroid(cluster)

['abstract_singular_term', 'abstract_term', 'adjective', 'article', 'definite_article', 'indefinite_article', 'mass_term', 'demonstrative', 'description', 'general_term', 'singular_term', 'definite_singular_term', 'indefinite_singular_term', 'eternal_sentence', 'indicator_word', 'name', 'noun', 'relative_term', 'substantive', 'observation_sentence', 'occasion_sentence', 'open_sentence', 'pronoun', 'relative_clause', 'relative_pronoun', 'one-word_sentence', 'word', 'verb']
[0.07867384027989947, -0.02980187276530675, 0.013852143408065997, 0.009830331142334394, 0.016890998220321966, 0.013443490977013925, -0.020277958237035165, -0.10633722169323168, -0.007684595676834669, -0.03258465316064445, 0.102628564291663, 0.060988035732807935, -0.0021381882241524775, 0.042082789332092474, 0.025405541247936713, 0.006837741404177226, -0.013733885690857634, -0.05595917572799114, -0.031194036258047533, 0.017592569002451052, -0.005059046139193237, 0.013968951103151135, -0.03277123748936662, -0.0080252154

['canonical_notation', 'paraphrase', 'concatenation', 'concretion', 'conditional', 'conjunction', 'connective', 'construction', 'contradiction', 'copula', 'form', 'function', 'quantification', 'quantifier', 'quotational', 'predication', 'plural', 'regimentation', 'elimination', 'explication', 'linguistic_form', 'logic', 'syntax', 'variables']
[0.08411398446981133, -0.028036653995076852, 0.029103211786059506, 0.01000016886415388, 0.005671577859591901, 0.007076043748984675, -0.004084132492438381, -0.03226899531677417, 0.007645223373140936, -0.02817003857744631, 0.08049377861445278, 0.03953320550901653, -0.0025111975485183568, 0.02727429606772627, 0.020552378920870074, -0.008557204184661734, -0.007432334841576979, -0.03307067532207378, -0.028777548250838895, 0.005231601783486662, -0.012677262006466674, 0.012107311376228459, -0.031589381584452604, -0.008822299664250306, 0.0018326349648184956, -0.019325160846694875, 0.0038528472520002756, -0.016806574383236125, -0.009901283955105271, 0.0005

In [8]:
vecA = centroid_embeddings["centroid_a"]
vecB = centroid_embeddings["centroid_b"]
vecC = centroid_embeddings["centroid_c"]
vecD = centroid_embeddings["centroid_d"]
vecE = centroid_embeddings["centroid_e"]


In [9]:
clusteredA = list()
clusteredB = list()
clusteredC = list()
clusteredD = list()
clusteredE = list()

In [10]:
def cos_per_cluster(term):
    vec = embeddings[term]
    
    cosA = sklearn.metrics.pairwise.cosine_similarity([vec], [vecA])
    cosB = sklearn.metrics.pairwise.cosine_similarity([vec], [vecB])
    cosC = sklearn.metrics.pairwise.cosine_similarity([vec], [vecC])
    cosD = sklearn.metrics.pairwise.cosine_similarity([vec], [vecD])
    cosE = sklearn.metrics.pairwise.cosine_similarity([vec], [vecE])
    
    cos_list = [cosA, cosB, cosC, cosD, cosE]
    closest = min(cos_list)
    
    if closest == cosA:
        clusteredA.append(term)
    elif closest == cosB:
        clusteredB.append(term)
    elif closest == cosC:
        clusteredC.append(term)
    elif closest == cosD:
        clusteredD.append(term)
    elif closest == cosE:
        clusteredE.append(term)

In [11]:
for cluster in the_clusters:
    for term in cluster:
        if term not in embeddings.vocab:
            continue
        cos_per_cluster(term)

In [12]:
print(clusteredA)
print(clusteredB)
print(clusteredC)
print(clusteredD)
print(clusteredE)

['name', 'abstract_object', 'class', 'concrete_object', 'physical_object', 'ideal_object', 'geometrical_object', 'material', 'object', 'ordinary_enduring_middle-sized_physical_object', 'particle', 'particular', 'physical_thing', 'context', 'modulus', 'operant_behavior', 'phoneme', 'stimulus', 'conceptual_scheme', 'prelinguistic_quality_space', 'paraphrase', 'concretion', 'function', 'explication', 'logic', 'variables']
['occasion_sentence', 'stimulation', 'concatenation', 'connective', 'quantifier', 'quotational', 'linguistic_form', 'syntax']
['canonical_notation', 'conjunction', 'copula']
['article', 'singular_term', 'indefinite_singular_term', 'open_sentence', 'pronoun', 'relative_clause', 'relative_pronoun', 'verb', 'conditional', 'construction', 'contradiction', 'form', 'quantification', 'predication', 'plural', 'regimentation', 'elimination']
['abstract_singular_term', 'abstract_term', 'adjective', 'definite_article', 'indefinite_article', 'mass_term', 'demonstrative', 'descriptio

In [13]:
TP = 0
FP = 0 
FN = 0 

for term in cluster_A:
    if term in clusteredA:
        TP += 1 
    else:
        FN += 1
        
for term in cluster_B:
    if term in clusteredB:
        TP += 1 
    else:
        FN += 1
        
for term in cluster_C:
    if term in clusteredC:
        TP += 1 
    else:
        FN += 1
        
for term in cluster_D:
    if term in clusteredD:
        TP += 1 
    else:
        FN += 1
        
for term in cluster_E:
    if term in clusteredE:
        TP += 1 
    else:
        FN += 1
        
for term in clusteredA:
    if term not in cluster_A:
        FP += 1 

for term in clusteredB:
    if term not in cluster_B:
        FP += 1 
        
for term in clusteredC:
    if term not in cluster_C:
        FP += 1 
        
for term in clusteredD:
    if term not in cluster_D:
        FP += 1 
        
for term in clusteredE:
    if term not in cluster_E:
        FP += 1   
        
        
precision = TP/ (TP + FP)
recall = TP / (TP + FN)

fscore = (2 * precision * recall) / (precision + recall)
 
print(f"TP: {TP}")
print(f"FP: {FP}")
print(f"FN: {FN}")
print()    
print(f"precision: {precision}")
print(f"recall: {recall}")
print(f"fscore: {fscore}")

TP: 1
FP: 72
FN: 72

precision: 0.0136986301369863
recall: 0.0136986301369863
fscore: 0.0136986301369863


In [14]:
y_pred = [5, 5, 5, 4, 5, 5, 5, 5, 4, 4, 4, 5, 5, 1, 5, 4, 5, 4, 5, 1, 5, 1, 4, 3, 4, 5, 4, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 4, 4, 1, 1, 1, 1, 1, 1, 1]

y_true = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]


fscore_metriced = sklearn.metrics.f1_score(y_true, y_pred, average='micro')

print(fscore_metriced)

0.0410958904109589
