In [1]:
from glob import glob

import pickle

from nltk.tokenize import RegexpTokenizer
from collections import defaultdict
from typing import List, Dict

import random
import numpy as np

def seed(seed = 1810):
    """Seed the global `random` and NumPy random number generators.

    Args:
        seed: Value handed to both ``random.seed`` and ``np.random.seed``
            (1810 when omitted).
    """
    # Same order as always: Python's generator first, then NumPy's.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    # PyTorch seeding is currently switched off; the calls are kept for reference.
    #torch.manual_seed(seed)
    #torch.cuda.manual_seed(seed)
    #torch.cuda.manual_seed_all(seed)
    #torch.backends.cudnn.deterministic = True
    #torch.backends.cudnn.benchmark = False

# Fix the random generators once, up front, so repeated runs are comparable.
SEED: int = 13
seed(seed = SEED)


In [4]:
from griffon.preprocessing import Tokenizer, CorpusLoader, Vocab
from gensim.models import Word2Vec

# Train a Word2Vec embedding on the corpus streamed by CorpusLoader.
corpus_loader = CorpusLoader("../data/intermediary/corpus", Tokenizer())
word2vec_embedding_dim = 64

# NOTE(review): a fixed seed together with workers = 1 is what gensim documents
# as necessary for repeatable training; keep both if reproducibility matters.
model = Word2Vec(corpus_loader,
                 vector_size = word2vec_embedding_dim,
                 window = 3,
                 min_count = 3,
                 seed = SEED,
                 workers = 1)

# The analysis cells further down all operate on `word_vectors`, which was
# never assigned anywhere in the notebook (it only existed as leftover kernel
# state). Bind it here so that Restart & Run All works.
word_vectors = model.wv

vocab = Vocab(word_vectors)


In [None]:
# Smoke test: tokenize one Coq statement and convert it to a tensor.
tokenizer = Tokenizer()
input_str = (
    "forall (x y : Carrier (cart E F)) (_ : @Equal (cart E F) x y), "
    "@Equal E (proj1 x) (proj1 y) somerandomword"
)
tokenized = tokenizer(input_str)

# Token count, raw input and token list, one per line.
print(len(tokenized), input_str, tokenized, sep="\n")

# NOTE(review): the recorded output shows 22 rows for 20 tokens - presumably
# Vocab adds two boundary tokens; confirm in griffon.preprocessing.
print(vocab.sentence_to_tensor(tokenized).shape)

20
forall (x y : Carrier (cart E F)) (_ : @Equal (cart E F) x y), @Equal E (proj1 x) (proj1 y) somerandomword
['forall', 'x', 'y', 'Carrier', 'cart', 'E', 'F', 'Equal', 'cart', 'E', 'F', 'x', 'y', 'Equal', 'E', 'proj1', 'x', 'proj1', 'y', 'somerandomword']
(22, 64)


In [5]:
# Serialize the `vocab` object to ../models/vocab.pickle.
with open("../models/vocab.pickle", "wb") as vocab_file:
    pickle.dump(vocab, vocab_file)

In [None]:
def three_cos_add(word_vec, init, sub, add):
    """Print the five best matches for the analogy ``init - sub + add``.

    Args:
        word_vec: Object exposing
            ``most_similar(positive=..., negative=..., topn=...)``.
        init: Word the analogy starts from (counted positively).
        sub: Word that is subtracted (counted negatively).
        add: Word that is added (counted positively).

    Returns:
        None; the query and its result are printed.
    """
    neighbours = word_vec.most_similar(
        topn = 5,
        negative = [sub],
        positive = [init, add],
    )
    print(f"{init} - {sub} + {add} = {neighbours}")

In [None]:
#FROM DL4NLP CLASS

# import libraries
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

#----------------------------------------------------------------------------------------------------

# perform PCA
def fit_pca(model):
    """Fit a two-component PCA on the model's embedding matrix.

    Args:
        model: Object exposing the embedding matrix as ``model.vectors``.

    Returns:
        The fitted ``PCA`` instance (first two principal components only).
    """
    # ``fit`` hands back the estimator itself, so the fitted PCA is returned.
    return PCA(n_components = 2).fit(model.vectors)

#---------------------------------------------------------------------------------------------------

# plot words
def wordlist_2dplot_pca(model, pca_model, word_list):
    """Scatter-plot the PCA projection of the given words and label each point.

    Args:
        model: Mapping-like embedding container; ``model[word]`` yields the
            word's vector.
        pca_model: Fitted projector exposing ``transform``.
        word_list: Words to project and annotate.
    """
    # Debug aid: show which kind of embedding container was passed in.
    print(type(model))

    # One row per word, in the order of `word_list`.
    stacked = np.vstack([model[word] for word in word_list])

    # Map the stacked vectors into the 2D PCA subspace.
    coords = pca_model.transform(stacked)

    # Draw the points, then attach each word to its own point.
    _, ax = plt.subplots()
    xs, ys = coords[:, 0], coords[:, 1]
    ax.scatter(xs, ys)
    for word, px, py in zip(word_list, xs, ys):
        ax.annotate(word, (px, py))

In [None]:
# Sanity check: display the type of the `word_vectors` object used by the cells below.
type(word_vectors)

In [None]:

from sklearn.manifold import TSNE
# Project the full embedding matrix to 2D with t-SNE (random initialisation).
embeddings = word_vectors.vectors

embedded_embeddings = TSNE(
    n_components = 2,
    init = 'random',
    learning_rate = 'auto',
).fit_transform(embeddings)

# Display the shape of the projected matrix.
embedded_embeddings.shape

In [None]:
from tqdm import tqdm
from adjustText import adjust_text
%matplotlib notebook


def visualize(model, word_list:List[str], perplexities=(10, 30, 50), nr_samples=3):
    """Draw a grid of t-SNE projections with the given words highlighted.

    Every panel fits a fresh t-SNE (random initialisation) on all of
    ``model.vectors`` and then plots and labels only the words in
    ``word_list``. Columns vary the perplexity, rows are repeated attempts.

    Args:
        model: Embedding container exposing ``vectors`` (matrix handed to
            t-SNE) and ``key_to_index`` (word -> row index).
        word_list: Words to plot and label; must not be empty.
        perplexities: One t-SNE perplexity per column. Defaults to the values
            that used to be hard-coded, ``(10, 30, 50)``.
        nr_samples: Number of repeated runs (rows). Defaults to 3.

    Raises:
        ValueError: If ``word_list`` or ``perplexities`` is empty, or
            ``nr_samples`` is smaller than 1.
        KeyError: If a word is missing from ``model.key_to_index``.
    """
    def get_tsne_embeddings(model, perplexity):
        # t-SNE is fitted on every vector, not just the highlighted words.
        return TSNE(n_components=2, learning_rate='auto',
                    init='random', perplexity=perplexity).fit_transform(model.vectors)

    def wordlist_2dplot_tSNE(ax, sne_embeddings, word_rows, word_list):
        # Pick the 2D coordinates belonging to the highlighted words.
        reduced_wordembs = sne_embeddings[word_rows]

        ax.plot(reduced_wordembs[:, 0], reduced_wordembs[:, 1], 'bo')
        texts = [ax.text(px, py, word)
                 for word, (px, py) in zip(word_list, reduced_wordembs)]

        # Nudge the labels apart so they stay readable.
        adjust_text(texts, ax=ax)

    perplexities = list(perplexities)
    if len(word_list) == 0:
        raise ValueError("word_list must contain at least one word")
    if not perplexities or nr_samples < 1:
        raise ValueError("need at least one perplexity and one sample")

    # Resolve the row indices once, before any (expensive) t-SNE run, so that
    # an unknown word fails immediately instead of after the first fit.
    word_rows = np.array([model.key_to_index[w] for w in word_list], dtype=int)

    # squeeze=False keeps `axs` two-dimensional even for a single row/column.
    f, axs = plt.subplots(nr_samples, len(perplexities), figsize=(15,15), squeeze=False)
    for row in tqdm(range(nr_samples)):
        for col, perplexity in enumerate(perplexities):
            if row == 0:
                # Only the top row carries the column heading.
                axs[row, col].set_title(f"perplexity : {perplexity}")
            embeddings = get_tsne_embeddings(model, perplexity)
            wordlist_2dplot_tSNE(axs[row, col], embeddings, word_rows, word_list)

    # Set common labels
    f.text(0.5, 0.91, 'Perplexity', ha='center', va='center', fontsize="xx-large")
    f.text(0.06, 0.5, 'Attempts', ha='center', va='center', rotation='vertical', fontsize="xx-large")
    
# Words highlighted in the t-SNE grid, grouped by theme.
word_list = [
    # nat ops
    "add", "sub", "mult", "div",
    # bool ops
    "andb", "orb", "negb", "eqb",
    # list
    "app", "cons", "list", "nat", "set",
    # properties
    "distributive", "commutative", "transitive", "reflexive",
    # relations
    "eq", "symmetric",
]

visualize(word_vectors, word_list)


In [None]:
%matplotlib inline
from tqdm import tqdm

# Module-level grid settings for the dummy plot loop in this cell: the loop
# visits one column per entry of PERPLEXITIES and NR_SAMPLES rows.
PERPLEXITIES = [50]
NR_SAMPLES = 3
    
def dummy_plot(ax):
    """Plot ten random, labelled points on ``ax`` and de-overlap the labels.

    Args:
        ax: Axes-like object providing ``plot`` and ``text``.
    """
    x, y = np.random.random((2,10))
    ax.plot(x, y, 'bo')
    texts = [ax.text(x[i], y[i], 'Text%s' %i) for i in range(len(x))]
    # Only the function is imported (``from adjustText import adjust_text``);
    # the module name `adjustText` is never bound, so calling
    # ``adjustText.adjust_text`` raised NameError.
    adjust_text(texts, ax=ax)

# Size the grid from the same constants that drive the loops instead of a
# hard-coded 3x3: no empty panels are left over and longer PERPLEXITIES lists
# cannot index past the grid. squeeze=False keeps `axs` two-dimensional even
# when there is a single row or column.
f, axs = plt.subplots(NR_SAMPLES, len(PERPLEXITIES), figsize=(15,15), squeeze=False)
for y in tqdm(range(NR_SAMPLES)):
    for x, perplexity in enumerate(PERPLEXITIES):
        if y == 0:
            # Only the top row carries the column heading.
            axs[y,x].title.set_text(f"perplexity : {perplexity}")
        dummy_plot(axs[y,x])
# Set common labels
f.text(0.5, 0.91, 'Perplexity', ha='center', va='center', fontsize="xx-large")
f.text(0.06, 0.5, 'Attempts', ha='center', va='center', rotation='vertical', fontsize="xx-large")



In [None]:
# Analogy probes, each read as "init - sub + add".
analogy_queries = [
    ("plus", "nat", "bool"),
    ("mult", "nat", "bool"),
    ("add", "0", "mult"),
    ("andb", "bool", "prop"),
    ("0", "add", "mult"),
]

for analogy_init, analogy_sub, analogy_add in analogy_queries:
    three_cos_add(word_vectors, init=analogy_init, sub=analogy_sub, add=analogy_add)

In [None]:
# list of words
word_list = [
            #nat ops
            "add", "sub",
             "mult", "div",
            # bool ops
            "andb", "orb", "negb", "eqb",
            #list
             "app", "cons", "list",
            # relations
             "eq", "sym", "trans"]

# perform PCA
pca_model = fit_pca(word_vectors)

# plot the list of words
# The plotting helper is defined as `wordlist_2dplot_pca`; the previous call
# to `wordlist_2dplot` referred to a name that does not exist in the notebook.
wordlist_2dplot_pca(word_vectors, pca_model, word_list)

In [None]:
import pickle

# Serialize `word_vectors` to vocab.pickle in the current working directory.
# NOTE(review): despite the file name this writes `word_vectors`, while an
# earlier cell writes `vocab` to ../models/vocab.pickle - confirm which is meant.
with open("vocab.pickle", "wb") as word_vectors_file:
    pickle.dump(word_vectors, word_vectors_file)

In [None]:
# Shell check: list the entries of the working directory whose names contain "pickle".
!ls -a | grep pickle