# Word embedding


In [1]:
import os
os.chdir("./src") # the source code directory
os.getcwd()
from wordvector import WordVector
from CBOW import CBOW
import loaddoc

import numpy as np
import sklearn.utils
import tensorflow as tf
import pickle

# Load file text8


In [6]:
files = ["../data/text8"] # the input file path
word_array, dictionary, reverse_dict, num_lines, num_words = docload2.build_word_array(
    files, vocab_size=50000, keep_punc=True, sent_token=False)

print('Document loaded and processed: {} lines, {} words.\nDictionary size: {}'
      .format(num_lines, num_words, len(dictionary)))

Document loaded and processed: 1 lines, 17005207 words.
Dictionary size: 50000


In [0]:
# Limit the size of document to limit the size of vocabulary
a = np.where(word_array[0:200000] == len(dictionary))[0]
word_array[a] = np.random.randint(0, len(dictionary), len(a)) # replace non-existing words with random integer

## Neural Net Architecture
![](notebook_images/NN_diagram.png)

In [21]:
graph_params = {'batch_size': 32,
                'vocab_size': len(dictionary),
                "seq_len" : 4,
                'embed_size': 100,
                'hid_size': 100,
                'neg_samples': 64,
                'learning_rate': 0.01,
                'momentum': 0.9,
                'drop_rate': 0.2,
                'embed_noise': 0.1,
                'hid_noise': 0.3,
                'optimizer': 'Momentum',
                'model_name': '../model-save/model_save'}

print('Building training set ...')
x, y = CBOW.build_training_set(word_array[0:200000], graph_params["seq_len"])

# shuffle and split 10% validation data
x_shuf, y_shuf = sklearn.utils.shuffle(x, y, random_state=0)
split = round(x.shape[0]*0.9)
x_val, y_val = (x_shuf[split:, :], y_shuf[split:, :])
x_train, y_train = (x_shuf[:split, :], y_shuf[:split, :])

print('Training set built.')

model = CBOW(graph_params)
print('Model built. Vocab size = {}. Document length = {} words.'
      .format(np.max(x)+1, num_words))

print('Training ...')
results = model.train(x_train, y_train, x_val, y_val, epochs=100, verbose=True, write_summary=True, 
                      pretrain_model=None)
print("Save training history")
with open("../results/hist.pkl", "wb") as f:
    pickle.dump(results, f)


Building training set ...
Training set built.
INFO:tensorflow:Summary name gradients/embed_layer/embedding_lookup_grad/Reshape:0 is illegal; using gradients/embed_layer/embedding_lookup_grad/Reshape_0 instead.
INFO:tensorflow:Summary name gradients/hidden_layer/MatMul_grad/tuple/control_dependency_1:0 is illegal; using gradients/hidden_layer/MatMul_grad/tuple/control_dependency_1_0 instead.
INFO:tensorflow:Summary name gradients/hidden_layer/add_grad/tuple/control_dependency_1:0 is illegal; using gradients/hidden_layer/add_grad/tuple/control_dependency_1_0 instead.
INFO:tensorflow:Summary name gradients/nce_loss/embedding_lookup_grad/Reshape:0 is illegal; using gradients/nce_loss/embedding_lookup_grad/Reshape_0 instead.
INFO:tensorflow:Summary name gradients/nce_loss/embedding_lookup_1_grad/Reshape:0 is illegal; using gradients/nce_loss/embedding_lookup_1_grad/Reshape_0 instead.
Model built. Vocab size = 50000. Document length = 17005207 words.
Training ...
Epoch 1: total batch = 5624,

# Save the results

In [0]:
with open("../results/hist.pkl", "rb") as f:
    results = pickle.load(f)

# Word Similarities
Find closest words by calculating the cosine similarity between the given word and other words in dictionary. The cosine similarity uses two word embeding vectors.
1. From the output of embedding layer
2. From the output of hidden layer



In [0]:
word_vector_embed = WordVector(results['embed_weights'], dictionary)
word_vector_nce = WordVector(results['nce_weights'], dictionary)

In [14]:
word = "see"
print('Embedding layer: 8 closest words to:', "'" + word + "'")
print(word_vector_embed.n_closest(word=word, num_closest=8, metric='cosine'), '\n')
print('Hidden-to-output layer: 8 closest words to:', "'" + word + "'")
print(word_vector_nce.n_closest(word=word, num_closest=8, metric='cosine'))

Embedding layer: 8 closest words to: 'see'
['however', 'but', 'produce', 'therefore', 'timor', 'employees', 'include', 'arsenal'] 

Hidden-to-output layer: 8 closest words to: 'see'
['prevent', 'include', 'live', 'become', 'learn', 'script', 'call', 'show']


# Analogies  

Find some candidates of D in the analogy that A to B is like C to D given A, B, C. It is done by calculating vector $\mathbf{x_d}=\mathbf{x_b}-\mathbf{x_a}+\mathbf{x_c}$.  

In [16]:
print(word_vector_nce.analogy('had', 'has', 'was', 5))

['was', 'has', 'is', 'contains', 'which']


# Predict Words in a Passage

## Original Passage

** american individualist anarchism benjamin tucker in one eight two five josiah warren had participated in a valiantly experiment headed by robert owen called new harmony which failed in a few years amidst much internal conflict warren blamed the community s failure on a lack of individual sovereignty and a lack of private property warren proceeded to organise crick anarchist communities which respected what he called the sovereignty of the individual at utopia and modern times in one eight three three warren wrote and published the peaceful anac which some have noted to be the first anarchist periodical ever published benjamin tucker says that warren**

## Reconstructed Passage

** american democratic anarchism benjamin tucker in one nine two five hours lincoln also thought in a seismic scale on by robert s unicameral new salem which failed in a few years because much internal conflict lincoln blamed the abacus s lives on a topic of individual anthropology and the number of private property aristotle proceeded to be grierson autistic although as held what he called the development of the individual at autistic and modern alchemy in one nine six three lincoln wrote and published a pacific disgraceful class they have said to be the first individual schools ever published benjamin rand says that rand**

In [18]:
# grab 100 word passage from book
# reverse_dict = word_vector_nce.get_reverse_dict()
def get_passage(reverse_dict, word_array, start, end):
    
    passage = [x for x in map(lambda x: reverse_dict[x], word_array[start:end])]
    # print passage with some crude formatting (e.g. space after comma)
    readable = ''
    for word in passage:
        if word == '"':
            readable += word
        elif word in ['?', '!', '.', ',']:
            readable += word + ' '
        else: 
            readable += ' ' + word
    print(readable)

get_passage(reverse_dict, word_array, 1000, 1104)

 american individualist anarchism benjamin tucker in one eight two five josiah warren had participated in a valiantly experiment headed by robert owen called new harmony which failed in a few years amidst much internal conflict warren blamed the community s failure on a lack of individual sovereignty and a lack of private property warren proceeded to organise crick anarchist communities which respected what he called the sovereignty of the individual at utopia and modern times in one eight three three warren wrote and published the peaceful anac which some have noted to be the first anarchist periodical ever published benjamin tucker says that warren


In [24]:
def pred_passage(model, word_array, start, end, seq_len):
    # use model to replace words in original passage with predicted words
    # need to grab 2 words before and after passage
    shift = seq_len // 2
    x, y = CBOW.build_training_set(word_array[start-shift : end+shift], seq_len)
    # x, y = CBOW.build_training_set(word_array[start-shift : end+shift])
    y_pred = model.predict(x, "5")
    passage_predict = [x for x in map(lambda x: reverse_dict[x], y_pred)]
    # print predicted passage
    readable = ''
    for word in passage_predict:
        if word == '"':
            readable += word
        elif word in ['?', '!', '.', ',']:
            readable += word + ' '
        else: 
            readable += ' ' + word
    print(readable)
# model = CBOW(graph_params)
pred_passage(model, word_array, 1000, 1104, graph_params["seq_len"])

INFO:tensorflow:Restoring parameters from ../model-save/model_save-5
 american democratic anarchism benjamin tucker in one nine two five hours lincoln also thought in a seismic scale on by robert s unicameral new salem which failed in a few years because much internal conflict lincoln blamed the abacus s lives on a topic of individual anthropology and the number of private property aristotle proceeded to be grierson autistic although as held what he called the development of the individual at autistic and modern alchemy in one nine six three lincoln wrote and published a pacific disgraceful class they have said to be the first individual schools ever published benjamin rand says that rand
