In [1]:
from glob import glob

import pickle

from nltk.tokenize import RegexpTokenizer
from collections import defaultdict
from typing import List, Dict

from griffon.coq_dataclasses import Stage1Sample, Stage1Token

import random
import numpy as np
import os

def seed(seed = 1810):
    """Seed Python's ``random`` module and NumPy's global RNG with one value.

    Args:
        seed: Value passed to both ``random.seed`` and ``np.random.seed``
            (1810 when omitted).

    The torch / cuDNN seeding steps listed at the end are commented out and
    therefore not executed.
    """
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)
    #torch.manual_seed(seed)
    #torch.cuda.manual_seed(seed)
    #torch.cuda.manual_seed_all(seed)
    #torch.backends.cudnn.deterministic = True
    #torch.backends.cudnn.benchmark = False

# Set the seed once for reproducibility. SEED (13) overrides the helper's
# default of 1810 and is also the value handed to Word2Vec further down.
SEED = 13
seed(SEED)



In [5]:
"test".split("_")

['test']

In [3]:
class Stage1Iterator():
    """Re-iterable stream of sub-token sentences read from pickled stage-1 samples.

    Every ``*.pickle`` file found recursively below ``stage1_root`` is
    unpickled on each pass. For each sample the stream yields, in this order,
    one sentence per hypothesis, one for the goal and one for ``lemma_used``.
    A sentence is the flat list of the sub-tokens of its tokens.

    ``__iter__`` builds a fresh generator on every call, so the same object
    can be iterated more than once.
    """

    def __init__(self, stage1_root:str):
        """Collect the sample files below ``stage1_root``.

        Raises:
            AssertionError: if no ``*.pickle`` file is found.
        """
        pattern = os.path.join(stage1_root, "**", "*.pickle")
        # glob returns paths in arbitrary (filesystem-dependent) order; sorting
        # makes the sentence stream identical from run to run.
        self.files = sorted(glob(pattern, recursive=True))
        assert len(self.files) != 0, f"no *.pickle files found under {stage1_root!r}"

    @staticmethod
    def _flatten(tokens : List["Stage1Token"])->List[str]:
        """Concatenate the ``subtokens`` of every token into one flat list."""
        return [subtoken for token in tokens for subtoken in token.subtokens]

    def generator(self):
        """Yield one flat sub-token list per hypothesis, goal and used lemma."""
        for file in self.files:
            # NOTE: unpickling can execute arbitrary code; only point this at
            # trusted data. The context manager closes each handle promptly
            # instead of leaving it to the garbage collector.
            with open(file, "rb") as handle:
                sample: "Stage1Sample" = pickle.load(handle)
            for hypothesis in sample.hypotheses:
                yield self._flatten(hypothesis.tokens)
            yield self._flatten(sample.goal.tokens)
            # lemma_used is flattened directly (no ``.tokens`` access), unlike
            # the hypotheses and the goal.
            yield self._flatten(sample.lemma_used)

    def __iter__(self):
        """Return a new generator so that the stream can be restarted."""
        return self.generator()

In [4]:
from gensim.models import Word2Vec

# Stream sub-token sentences from the pickled stage-1 training samples.
stage1_iterator = Stage1Iterator("../data/small/data/base/stage1/train")

word2vec_embedding_dim = 64

# A fixed seed together with workers=1 removes thread-scheduling jitter.
# NOTE(review): the gensim docs also require a fixed PYTHONHASHSEED for
# reproducibility across interpreter launches -- confirm the environment.
model = Word2Vec(stage1_iterator,
                 vector_size = word2vec_embedding_dim,
                 window = 3,
                 min_count = 3,
                 seed = SEED,
                 workers = 1)

# The visualisation and analogy cells further down read `word_vectors`
# (.vectors, .key_to_index, .most_similar, item access), but no visible cell
# defines it. Bind it to the trained keyed vectors so the notebook survives
# Restart & Run All.
word_vectors = model.wv


INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 254937 words, keeping 818 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 455982 words, keeping 1112 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 737772 words, keeping 1462 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 942204 words, keeping 1734 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 1187752 words, keeping 1895 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 1418828 words, keeping 2058 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 1610339 words, keeping 2128 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 1832406 words, keeping 2287 

In [8]:
from pprint import pprint
pprint(vars(model))

{'alpha': 0.025,
 'batch_words': 10000,
 'cbow_mean': 1,
 'comment': None,
 'compute_loss': False,
 'corpus_count': 83147,
 'corpus_total_words': 1921442,
 'cum_table': array([ 121667823,  179733941,  236488238, ..., 2147430505, 2147457076,
       2147483647], dtype=uint32),
 'effective_min_count': 3,
 'epochs': 5,
 'hashfxn': <built-in function hash>,
 'hs': 0,
 'layer1_size': 64,
 'lifecycle_events': [{'datetime': '2021-12-20T17:41:26.416084',
                       'event': 'prepare_vocab',
                       'gensim': '4.0.1',
                       'msg': 'effective_min_count=3 retains 1926 unique words '
                              '(83.81201044386422%% of original 2298, drops '
                              '372)',
                       'platform': 'Linux-5.11.0-41-generic-x86_64-with-debian-bullseye-sid',
                       'python': '3.7.1 (default, Dec 14 2018, 19:28:38) \n'
                                 '[GCC 7.3.0]'},
                      {'datetime': '2021-1

In [None]:
# Tokenise one Coq statement and convert it to a tensor via the vocabulary.
# NOTE(review): neither `Tokenizer` nor `vocab` is defined or imported in any
# cell visible in this notebook -- confirm where they come from, otherwise this
# cell fails on a fresh kernel.
tokenizer = Tokenizer()
input_str = "forall (x y : Carrier (cart E F)) (_ : @Equal (cart E F) x y), @Equal E (proj1 x) (proj1 y) somerandomword"
tokenized = tokenizer(input_str)
print(len(tokenized))
print(input_str)
print(tokenized)
# The recorded output shows 20 tokens but a (22, 64) tensor; presumably
# sentence_to_tensor adds two special tokens -- TODO confirm.
print(vocab.sentence_to_tensor(tokenized).shape)

20
forall (x y : Carrier (cart E F)) (_ : @Equal (cart E F) x y), @Equal E (proj1 x) (proj1 y) somerandomword
['forall', 'x', 'y', 'Carrier', 'cart', 'E', 'F', 'Equal', 'cart', 'E', 'F', 'x', 'y', 'Equal', 'E', 'proj1', 'x', 'proj1', 'y', 'somerandomword']
(22, 64)


In [5]:
# Persist the vocabulary object for later use.
# NOTE(review): `vocab` is not defined in any cell visible in this notebook --
# confirm its origin before relying on this cell.
with open("../models/vocab.pickle", "wb") as f:
    pickle.dump(vocab, f)

In [None]:
def three_cos_add(word_vec, init, sub, add):
    """Print the five nearest neighbours of the analogy ``init - sub + add``.

    Args:
        word_vec: Object exposing ``most_similar(positive=, negative=, topn=)``.
        init: Word the analogy starts from (a positive term).
        sub: Word to subtract (the negative term).
        add: Word to add (a positive term).
    """
    query = dict(positive = [init, add], negative = [sub], topn = 5)
    neighbours = word_vec.most_similar(**query)
    print(f"{init} - {sub} + {add} = {neighbours}")

In [None]:
#FROM DL4NLP CLASS

# import libraries
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

#----------------------------------------------------------------------------------------------------

# perform PCA
def fit_pca(model):
    """Fit a two-component PCA on ``model.vectors`` and return the fitted estimator.

    Args:
        model: Object exposing its embedding matrix as ``model.vectors``.

    Returns:
        The PCA instance after fitting; only the first two principal
        components are kept.
    """
    # fit() hands back the estimator itself, so construction and fitting can
    # be written as a single expression.
    return PCA(n_components = 2).fit(model.vectors)

#---------------------------------------------------------------------------------------------------

# plot words
def wordlist_2dplot_pca(model, pca_model, word_list):
    """Scatter-plot the 2-D projection of the given words and label each point.

    Args:
        model: Embedding container; ``model[w]`` must return the vector of word ``w``.
        pca_model: Fitted projector whose ``transform`` maps the stacked word
            vectors to at least two columns (e.g. the result of ``fit_pca``).
        word_list: Words to plot; must not be empty.

    Raises:
        ValueError: if ``word_list`` is empty.
    """
    # np.vstack would raise a less helpful ValueError on an empty list; fail
    # early, before any figure is created.
    if len(word_list) == 0:
        raise ValueError("word_list must contain at least one word")

    # convert the list of words to their respective word embeddings
    word_vecs = np.vstack([model[w] for w in word_list])

    # project the word embeddings to the 2D subspace
    reduced_wordembs = pca_model.transform(word_vecs)

    # plot each projected word embedding and annotate it with its word
    _fig, ax = plt.subplots()
    ax.scatter(reduced_wordembs[:, 0], reduced_wordembs[:, 1])
    for word, point in zip(word_list, reduced_wordembs):
        ax.annotate(word, (point[0], point[1]))
    ax.set_xlabel("PC 1")
    ax.set_ylabel("PC 2")

In [None]:
type(word_vectors)

In [None]:

from sklearn.manifold import TSNE
# Project every embedding vector to 2-D with t-SNE.
# NOTE(review): no random_state is passed and init='random', so the layout
# presumably differs between runs -- confirm whether that is intended.
embeddings = word_vectors.vectors

embedded_embeddings = TSNE(n_components=2, learning_rate='auto',
                  init='random').fit_transform(embeddings)

# Trailing expression: displays the shape of the projected array.
embedded_embeddings.shape

In [None]:
from tqdm import tqdm
from adjustText import adjust_text
%matplotlib notebook


def visualize(model, word_list:List[str], perplexities=(10, 30, 50), nr_samples=3):
    """Plot t-SNE projections of selected words on a grid of subplots.

    Each column uses one perplexity value and each row is an independent
    t-SNE run ("attempt") with that perplexity, so the stability of the
    layout can be compared by eye.

    Args:
        model: Embedding container exposing ``vectors`` (the embedding matrix)
            and ``key_to_index`` (word -> row index).
        word_list: Words to draw and label; must not be empty.
        perplexities: Perplexity values, one subplot column each.
        nr_samples: Number of independent t-SNE runs per perplexity (rows).

    Raises:
        ValueError: if ``word_list`` is empty.
        KeyError: if any word is missing from ``model.key_to_index``; the
            message lists every missing word.
    """
    if len(word_list) == 0:
        raise ValueError("word_list must contain at least one word")

    # Resolve all words up front so that a typo is reported (with the full list
    # of offenders) before any expensive t-SNE run starts.
    word_rows, missing = [], []
    for word in word_list:
        try:
            word_rows.append(model.key_to_index[word])
        except KeyError:
            missing.append(word)
    if missing:
        raise KeyError(f"words not in the model vocabulary: {missing}")
    word_rows = np.array(word_rows, dtype=int)

    def get_tsne_embeddings(perplexity):
        """Run t-SNE once over the whole embedding matrix."""
        return TSNE(n_components=2, learning_rate='auto',
                    init='random', perplexity=perplexity).fit_transform(model.vectors)

    def wordlist_2dplot_tSNE(ax, sne_embeddings):
        """Draw and label the selected words on ``ax``."""
        # pick the projected rows that belong to the requested words
        points = sne_embeddings[word_rows]
        ax.plot(points[:, 0], points[:, 1], 'bo')
        labels = [ax.text(point[0], point[1], word)
                  for point, word in zip(points, word_list)]
        # nudge the labels apart so that they stay readable
        adjust_text(labels, ax=ax)

    perplexities = list(perplexities)
    # squeeze=False keeps `axs` two-dimensional even for one row or one column.
    f, axs = plt.subplots(nr_samples, len(perplexities), figsize=(15,15), squeeze=False)
    for y in tqdm(range(nr_samples)):
        for x, perplexity in enumerate(perplexities):
            if y == 0:
                axs[y,x].title.set_text(f"perplexity : {perplexity}")
            wordlist_2dplot_tSNE(axs[y,x], get_tsne_embeddings(perplexity))

    # Set common labels
    f.text(0.5, 0.91, 'Perplexity', ha='center', va='center', fontsize="xx-large")
    f.text(0.06, 0.5, 'Attempts', ha='center', va='center', rotation='vertical', fontsize="xx-large")
    
# Words to highlight in the t-SNE panels, grouped by theme.
word_list = [
    # nat ops
    "add", "sub", "mult", "div",
    # bool ops
    "andb", "orb", "negb", "eqb",
    # lists and base types
    "app", "cons", "list", "nat", "set",
    # algebraic / relational properties
    "distributive", "commutative", "transitive", "reflexive",
    # relations
    "eq", "symmetric",
]

visualize(word_vectors, word_list)


In [None]:
%matplotlib inline
from tqdm import tqdm

# Loop bounds for the dummy label-layout test below: one perplexity value
# (used only for the panel title) and three rows.
PERPLEXITIES = [50]
NR_SAMPLES = 3
    
def dummy_plot(ax):
    """Draw ten random labelled points on ``ax`` and spread the labels apart.

    Used to try out the label-adjustment layout without running t-SNE.

    Args:
        ax: Axes-like object providing ``plot`` and ``text``.
    """
    x, y = np.random.random((2,10))
    ax.plot(x, y, 'bo')
    texts = [ax.text(xi, yi, f"Text{i}") for i, (xi, yi) in enumerate(zip(x, y))]
    # The notebook imports the function (`from adjustText import adjust_text`);
    # the module name `adjustText` is never bound, so the previous
    # `adjustText.adjust_text(...)` call raised NameError.
    adjust_text(texts, ax=ax)

# Size the grid from the constants that drive the loops (instead of a fixed
# 3x3, which left every column beyond len(PERPLEXITIES) empty). squeeze=False
# keeps `axs` two-dimensional even when there is a single row or column.
f, axs = plt.subplots(NR_SAMPLES, len(PERPLEXITIES), figsize=(15,15), squeeze=False)
for y in tqdm(range(NR_SAMPLES)):
    for x, perplexity in enumerate(PERPLEXITIES):
        if y == 0:
            axs[y,x].title.set_text(f"perplexity : {perplexity}")
        dummy_plot(axs[y,x])
# Set common labels (the trailing ';' suppresses the Text repr in the output)
f.text(0.5, 0.91, 'Perplexity', ha='center', va='center', fontsize="xx-large")
f.text(0.06, 0.5, 'Attempts', ha='center', va='center', rotation='vertical', fontsize="xx-large");



In [None]:
# Probe analogies of the form init - sub + add on the word vectors; each call
# prints the five nearest neighbours of the resulting query.
three_cos_add(word_vectors, init="plus", sub="nat", add="bool")
three_cos_add(word_vectors, init="mult", sub="nat", add="bool")
three_cos_add(word_vectors, init="add", sub="0", add="mult")
three_cos_add(word_vectors, init="andb", sub="bool", add="prop")

# Same words as the third query, with the roles of "add" and "0" swapped.
three_cos_add(word_vectors, init="0", sub="add", add="mult")

In [None]:
# list of words
word_list = [
    # nat ops
    "add", "sub", "mult", "div",
    # bool ops
    "andb", "orb", "negb", "eqb",
    # list
    "app", "cons", "list",
    # relations
    "eq", "sym", "trans",
]

# perform PCA
pca_model = fit_pca(word_vectors)

# plot the list of words
# (the helper defined in this notebook is `wordlist_2dplot_pca`; the previous
# call to `wordlist_2dplot` referenced a name that is not defined)
wordlist_2dplot_pca(word_vectors, pca_model, word_list)

In [None]:
import pickle

# Persist the word vectors next to the notebook.
# NOTE(review): an earlier cell writes a different object (`vocab`) to
# "../models/vocab.pickle"; this one writes `word_vectors` to "./vocab.pickle"
# under the same file name -- confirm that both files are intended.
with open("vocab.pickle", "wb") as f:
    pickle.dump(word_vectors, f)

In [None]:
!ls -a | grep pickle