# Load & Prep

In [1]:
import os
import sys
sys.path.insert(0, '../src/') 
import evaluate
import nltk

embeddings_path = "../data/n2v_rand_a0.1_n15_s10000_w5_l150_sd1_wd1.txt"
embeddings = evaluate.load_embeddings(embeddings_path)

termlist_path = "../data/quine_terms.txt"
termlist = []

with open(termlist_path, "r") as f:
    for line in f:
        termlist.extend(line.split())
        
f.close()



In [2]:
clusters_path = "../data/terms.csv"
clusters, concepts = evaluate.load_clusters(clusters_path)


cluster_A = ['abstract_singular_term', 'abstract_term', 'adjective', 'article', 'definite_article', 'indefinite_article', 'mass_term', 'demonstrative', 'description', 'general_term', 'singular_term', 'definite_singular_term', 'indefinite_singular_term', 'eternal_sentence', 'indicator_word', 'name', 'noun', 'relative_term', 'substantive', 'observation_sentence', 'occasion_sentence', 'open_sentence', 'pronoun', 'relative_clause', 'relative_pronoun', 'one-word_sentence', 'word', 'verb']
cluster_B = ['abstract_object', 'class', 'concrete_object', 'physical_object', 'ideal_object', 'geometrical_object', 'material', 'object', 'ordinary_enduring_middle-sized_physical_object', 'particle', 'particular', 'physical_thing', 'scattered_object']
cluster_C = ['context', 'modulus', 'operant_behavior', 'phoneme', 'stimulus', 'stimulation']
cluster_D = ['conceptual_scheme', 'prelinguistic_quality_space']
cluster_E = ['canonical_notation', 'paraphrase', 'concatenation', 'concretion', 'conditional', 'conjunction', 'connective', 'construction', 'contradiction', 'copula', 'form', 'function', 'quantification', 'quantifier', 'quotational', 'predication', 'plural', 'regimentation', 'elimination', 'explication', 'linguistic_form', 'logic', 'syntax', 'variables']

# K-Means Clustering 

In [3]:
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from gensim.models import KeyedVectors
import numpy as np

In [4]:
from collections import defaultdict

input_words = []
gold_labels = []

def map_words(words, label, word_label_dict):
    for word in words:
        word_label_dict[word] = label

word_label_dict = dict()
map_words(cluster_A, 'language', word_label_dict)
map_words(cluster_B, 'ontology', word_label_dict)
map_words(cluster_C, 'reality', word_label_dict)
map_words(cluster_D, 'mind', word_label_dict)
map_words(cluster_E, 'metalinguistic', word_label_dict)

for label, words in word_label_dict.items(): 
    input_words.append(label)
    gold_labels.append(words)
    

gold_dict = defaultdict(set)

for word, label in zip(input_words, gold_labels):
    gold_dict[label].add(word)
    
#for label, words in word_label_dict.items():
#    print(label, words)
print(input_words)
print(gold_labels)
print(gold_dict)

['abstract_singular_term', 'abstract_term', 'adjective', 'article', 'definite_article', 'indefinite_article', 'mass_term', 'demonstrative', 'description', 'general_term', 'singular_term', 'definite_singular_term', 'indefinite_singular_term', 'eternal_sentence', 'indicator_word', 'name', 'noun', 'relative_term', 'substantive', 'observation_sentence', 'occasion_sentence', 'open_sentence', 'pronoun', 'relative_clause', 'relative_pronoun', 'one-word_sentence', 'word', 'verb', 'abstract_object', 'class', 'concrete_object', 'physical_object', 'ideal_object', 'geometrical_object', 'material', 'object', 'ordinary_enduring_middle-sized_physical_object', 'particle', 'particular', 'physical_thing', 'scattered_object', 'context', 'modulus', 'operant_behavior', 'phoneme', 'stimulus', 'stimulation', 'conceptual_scheme', 'prelinguistic_quality_space', 'canonical_notation', 'paraphrase', 'concatenation', 'concretion', 'conditional', 'conjunction', 'connective', 'construction', 'contradiction', 'copula

In [5]:
def get_all_vectors(word_label_dict, model):
    
    vecs = []
    words_in_vocab = []
    
    for word in word_label_dict.keys():
        if word in model.vocab:
            vec = model[word]
            vecs.append(vec)
            words_in_vocab.append(word)
        else:
            print(word, 'oov')
    
    return np.array(vecs), words_in_vocab


vecs, words_in_vocab = get_all_vectors(word_label_dict, embeddings)

In [6]:
y_pred = KMeans(n_clusters=5, init='random').fit_predict(vecs)
print(y_pred)

[3 1 4 3 1 1 1 4 4 4 4 1 3 3 1 0 4 3 3 3 3 3 4 3 3 1 2 4 3 0 3 3 1 1 0 2 1
 3 0 1 1 3 1 1 3 3 3 3 1 3 3 3 3 0 0 3 0 3 4 2 2 0 0 1 3 4 3 3 3 1 2 3 2]


In [7]:
clustering_output = []
predicted_clusters = defaultdict(list)
for word, pred_label in zip(words_in_vocab, y_pred):
    predicted_clusters[pred_label].append(word)
    clustering_output.append(pred_label)
    
for label, words in predicted_clusters.items():
    print(label, words)
    
print(clustering_output)

3 ['abstract_singular_term', 'article', 'indefinite_singular_term', 'eternal_sentence', 'relative_term', 'substantive', 'observation_sentence', 'occasion_sentence', 'open_sentence', 'relative_clause', 'relative_pronoun', 'abstract_object', 'concrete_object', 'physical_object', 'particle', 'context', 'phoneme', 'stimulus', 'stimulation', 'conceptual_scheme', 'canonical_notation', 'paraphrase', 'concatenation', 'concretion', 'connective', 'contradiction', 'predication', 'regimentation', 'elimination', 'explication', 'syntax']
1 ['abstract_term', 'definite_article', 'indefinite_article', 'mass_term', 'definite_singular_term', 'indicator_word', 'one-word_sentence', 'ideal_object', 'geometrical_object', 'ordinary_enduring_middle-sized_physical_object', 'physical_thing', 'scattered_object', 'modulus', 'operant_behavior', 'prelinguistic_quality_space', 'quotational', 'linguistic_form']
4 ['adjective', 'demonstrative', 'description', 'general_term', 'singular_term', 'noun', 'pronoun', 'verb', 

In [8]:
for word, label in zip(input_words, gold_labels):
    gold_dict[label].add(word)
 
cluster_dict = defaultdict(set)
for word, cluster_label in zip(input_words, clustering_output):
    cluster_dict[cluster_label].add(word)
    
print(cluster_dict)

defaultdict(<class 'set'>, {3: {'canonical_notation', 'regimentation', 'abstract_object', 'article', 'explication', 'substantive', 'observation_sentence', 'contradiction', 'indefinite_singular_term', 'relative_term', 'connective', 'conceptual_scheme', 'particle', 'syntax', 'open_sentence', 'stimulus', 'occasion_sentence', 'paraphrase', 'relative_pronoun', 'phoneme', 'eternal_sentence', 'relative_clause', 'concatenation', 'context', 'concretion', 'predication', 'elimination', 'stimulation', 'concrete_object', 'abstract_singular_term', 'physical_object'}, 1: {'definite_article', 'indicator_word', 'ideal_object', 'physical_thing', 'prelinguistic_quality_space', 'mass_term', 'abstract_term', 'quotational', 'definite_singular_term', 'scattered_object', 'modulus', 'linguistic_form', 'one-word_sentence', 'geometrical_object', 'ordinary_enduring_middle-sized_physical_object', 'indefinite_article', 'operant_behavior'}, 4: {'demonstrative', 'verb', 'plural', 'description', 'pronoun', 'copula', '

In [9]:
mapping_dict = dict()
    
for gold_label, gold_words in gold_dict.items():
    gold_labels_clusters = []
    for cluster_label, cluster_words in cluster_dict.items():
        pair = (gold_label, cluster_label)
        overlap = gold_words.intersection(cluster_words)
        gold_labels_clusters.append((len(overlap), pair))
    max_match, best_match = max(gold_labels_clusters)
    mapping_dict[best_match[1]] =  best_match[0]
            
print(mapping_dict)  

{3: 'metalinguistic', 1: 'ontology'}


In [10]:
#gold_labels_mapped = 

#for gold_label, gold_words in gold_dict.items():
    

In [11]:
n_correct = 0
total = 0

for gold_label, pred_label in zip(gold_labels, clustering_output):
    print(gold_label, pred_label)
    total += 1
    if gold_label == pred_label:
        n_correct += 1
        
print(n_correct)
print(total)

language 3
language 1
language 4
language 3
language 1
language 1
language 1
language 4
language 4
language 4
language 4
language 1
language 3
language 3
language 1
language 0
language 4
language 3
language 3
language 3
language 3
language 3
language 4
language 3
language 3
language 1
language 2
language 4
ontology 3
ontology 0
ontology 3
ontology 3
ontology 1
ontology 1
ontology 0
ontology 2
ontology 1
ontology 3
ontology 0
ontology 1
ontology 1
reality 3
reality 1
reality 1
reality 3
reality 3
reality 3
mind 3
mind 1
metalinguistic 3
metalinguistic 3
metalinguistic 3
metalinguistic 3
metalinguistic 0
metalinguistic 0
metalinguistic 3
metalinguistic 0
metalinguistic 3
metalinguistic 4
metalinguistic 2
metalinguistic 2
metalinguistic 0
metalinguistic 0
metalinguistic 1
metalinguistic 3
metalinguistic 4
metalinguistic 3
metalinguistic 3
metalinguistic 3
metalinguistic 1
metalinguistic 2
metalinguistic 3
metalinguistic 2
0
73


In [12]:
count_gold = 0
count_cluster = 0

for gold_label in gold_labels:
    count_gold += 1
    
for pred_label in clustering_output:
    count_cluster += 1
    
print(gold_labels)
print(count_gold)
print()
print(clustering_output)
print(count_cluster)

['language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'language', 'ontology', 'ontology', 'ontology', 'ontology', 'ontology', 'ontology', 'ontology', 'ontology', 'ontology', 'ontology', 'ontology', 'ontology', 'ontology', 'reality', 'reality', 'reality', 'reality', 'reality', 'reality', 'mind', 'mind', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalinguistic', 'metalingui

In [13]:
from sklearn.metrics.cluster import adjusted_rand_score 

score = adjusted_rand_score(gold_labels, clustering_output)
print(score)

0.008805161125445965


# Centroids 

In [14]:
# Normalize each vector
import math

def normalize_vector(vec):
    mag = math.sqrt(sum([pow(value, 2) for value in vec]))

    unit_vec = []

    for value in vec:
        unit_vec.append(value/mag)
    unit_vec = np.array(unit_vec)
    
vec_list = []

#make this into a function, then loop through the different clusters
for word in termlist: #do this for word in cluster
    if word not in embeddings.vocab:
        continue
    vec = embeddings[word]
    norm_vec = normalize_vector(vec)
    vec_list.append(norm_vec)


    

# Create list of all normalized vectors, then turn into np.array    
    
a_matrix = np.array(vec_list)

# get mean using axis = 0

centroid = np.mean(a_matrix, axis=0)
#print(centroid)

# get cosines between centroid and words


TypeError: unsupported operand type(s) for +: 'NoneType' and 'NoneType'