# Preliminaries

In [None]:
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt

def norm_vec(v):
    """Scale ``v`` to unit Euclidean length.

    Parameters
    ----------
    v : array-like
        Vector to normalize.

    Returns
    -------
    ``v / ||v||``, or ``v`` itself (unchanged) when its norm is zero, so the
    division never produces NaNs.
    """
    mag = np.linalg.norm(v)
    if mag == 0:
        return v
    # Reuse the magnitude computed above instead of calling norm() a second time.
    return v / mag

from sklearn.preprocessing import normalize

def normalize_rows(x):
    """Return ``x`` with each row rescaled by sklearn's ``normalize`` (axis=1)."""
    row_scaled = normalize(x, axis=1)
    return row_scaled

def normalize_columns(x):
    """Return ``x`` with each column rescaled by sklearn's ``normalize`` (axis=0)."""
    column_scaled = normalize(x, axis=0)
    return column_scaled

def check_float(potential_float):
    """Return True when ``float(potential_float)`` succeeds, otherwise False.

    Any object is accepted: values that cannot be converted simply yield
    False instead of raising.
    """
    try:
        float(potential_float)
    except (TypeError, ValueError, OverflowError):
        # ValueError: unparsable strings such as "abc".
        # TypeError: non-numeric objects such as None or a list.
        # OverflowError: integers too large to be represented as a float.
        return False
    return True

def round_if_float(v, prec=3):
    """Round float-convertible values to ``prec`` decimals; return anything else as is."""
    if not check_float(v):
        return v
    as_float = float(v)
    return round(as_float, prec)

from IPython.display import display, HTML
def list_table(the_list, color_nums=False):
    """Render a list of rows as an inline HTML table in the notebook.

    Parameters
    ----------
    the_list : iterable of iterables
        Rows of cell values. Float-convertible values are rounded with
        ``round_if_float`` before being shown.
    color_nums : str or False
        When truthy, it is used as the CSS colour for non-zero numeric cells,
        which are also drawn in bold. When falsy, no cell is highlighted.

    Returns
    -------
    Whatever ``display`` returns for the assembled HTML.
    """
    from html import escape

    parts = ["<table style= 'border: 1px solid black; display:inline-block'>"]
    for row in the_list:
        parts.append("<tr>")
        for col in row:
            # Escape the cell text so values containing <, > or & (e.g. a
            # token like "<unk>") are shown literally instead of being parsed
            # as markup.
            text = escape(str(round_if_float(col)))
            if color_nums and check_float(col) and float(col) != 0:
                parts.append("<td align='left' style='border: .5px solid gray; color: {1}; font-weight: bold'>{0}</td>".format(text, color_nums))
            else:
                parts.append("<td align='left' style='border: .5px solid gray;'>{0}</td>".format(text))
        parts.append("</tr>")
    parts.append("</table>")
    return display(HTML(''.join(parts)))

def show_labeled_table(mat, col_names=None, row_names=None, nrows=10, ncols=10, color_nums="red"):
    """Display the top-left corner of a 2-D matrix with optional labels.

    Parameters
    ----------
    mat : 2-D array
        Matrix to preview; only ``mat[:nrows, :ncols]`` is shown.
    col_names, row_names : sequence or None
        Labels for the columns / rows. Either may be omitted.
    nrows, ncols : int
        Maximum number of rows / columns to show.
    color_nums : str or False
        Forwarded to ``list_table`` to highlight non-zero numbers.
    """
    sml = mat[:nrows, :ncols]
    # Slice the labels by the size of the slice actually obtained: the matrix
    # may have fewer rows/columns than requested, and stacking labels of a
    # different length would raise.
    shown_rows, shown_cols = sml.shape
    if col_names is not None:
        sml = np.vstack([col_names[:shown_cols], sml])
    if row_names is not None:
        rnames = [[p] for p in row_names[:shown_rows]]
        if col_names is not None:
            # Extra corner cell so the label column lines up with the header row.
            new_col = np.array([["_"]] + rnames)
        else:
            new_col = np.array(rnames)
        sml = np.hstack((new_col, sml))
    return list_table(sml, color_nums)

def compute_doc_vector(tdoc, vocab):
    """Return the term-count vector of a tokenized document.

    Parameters
    ----------
    tdoc : sequence of hashable tokens
        The tokenized document.
    vocab : sequence
        Vocabulary words; entry ``i`` of the result is the number of times
        ``vocab[i]`` occurs in ``tdoc``.

    Returns
    -------
    numpy.ndarray with one count per vocabulary word.
    """
    from collections import Counter

    # Count every token once up front instead of rescanning the document for
    # each vocabulary word.
    counts = Counter(tdoc)
    return np.array([counts[w] for w in vocab])

# Load, prepare the training corpus as before

In [None]:
import re
fname = 'corpora/seasons_training.txt'
# Read through a context manager so the file handle is closed as soon as the
# text is in memory.
with open(fname) as f:
    raw = f.read().lower()
# Each document is the text found between <text> and </text> tags.
whole_training_docs = re.findall(r"<text>([\s\S]*?)</text>", raw)

# Split every document into sentences; each sentence becomes one training doc,
# named d<document index>p<sentence index within the document>.
training_docs = []
para_names = []
for i, d in enumerate(whole_training_docs):
    new_docs = nltk.sent_tokenize(d)
    training_docs += new_docs
    new_names = ["d{}p{}".format(i, p) for p in range(len(new_docs))]
    para_names += new_names

len(training_docs)

# Tokenize

In [None]:
from seasons_module import seasons_tokenize
# Read the stop list (one entry per line) and close the file right away.
with open("lists/seasons_stop_list.txt") as f:
    stop_list = set(f.read().split("\n"))

# Tokenize every training doc and drop the stop-list words.
tokenized_training_docs = []
for doc in training_docs:
    tdoc = seasons_tokenize(doc)
    tdoc = [w for w in tdoc if w not in stop_list]
    tokenized_training_docs.append(tdoc)

# Word2Vec

Word2Vec is an algorithm that takes tokenized sentences and produces a set of word vectors for us.

We'll use the implementation in the gensim library ([docs](https://radimrehurek.com/gensim/models/word2vec.html)).

There are a few online pages that go into a bit of detail explaining word2vec. (For [example](https://jalammar.github.io/illustrated-word2vec/).)

In [None]:
from gensim.models import Word2Vec
# Train word vectors on the tokenized training docs; each word gets a
# 200-dimensional vector.
wv_model = Word2Vec(
    sentences=tokenized_training_docs,
    vector_size=200,
)

The trained model exposes (as `wv_model.wv`) what gensim calls a [keyed vector](https://radimrehurek.com/gensim/models/keyedvectors.html#gensim.models.keyedvectors.KeyedVectors) object, which we can use to, among other things, get the vector for a word.

In [None]:
# Keep a handle on the model's keyed vectors; later cells query this object.
kvecs = wv_model.wv
# Indexing by a word returns that word's vector.
kvecs["side"]

In [None]:
def compare_words(w1, w2, wvecs):
    """Return ``wvecs.similarity(w1, w2)``, the similarity score of the two words."""
    score = wvecs.similarity(w1, w2)
    return score

def get_word_vector(w1, wvecs):
    """Look up ``w1`` in ``wvecs`` and return its vector passed through ``norm_vec``."""
    raw_vector = wvecs[w1]
    return norm_vec(raw_vector)

In [None]:
# Sanity check: similarity of a word with itself.
compare_words("side", "side", kvecs)

In [None]:
# Similarity score for the pair "northern" / "hemisphere".
compare_words("northern", "hemisphere", kvecs)

In [None]:
# Similarity score for the pair "close" / "hemisphere", for comparison.
compare_words("close", "hemisphere", kvecs)

In [None]:
# Tabulate the entries that similar_by_key returns for "hemisphere".
list_table(kvecs.similar_by_key("hemisphere"))

In [None]:
# Tabulate the entries that similar_by_key returns for "tilt".
list_table(kvecs.similar_by_key('tilt'))

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def squish_matrix(X, ncomponents=2):
    """Reduce ``X`` to ``ncomponents`` dimensions with PCA and return the result."""
    return PCA(n_components=ncomponents).fit_transform(X)

def double_squish(X, ncomponents=2, pca_components=50, perplexity=15):
    """Reduce ``X`` with PCA first, then with t-SNE.

    Parameters
    ----------
    X : 2-D array-like, one row per sample
    ncomponents : int
        Dimensionality of the final t-SNE embedding.
    pca_components : int
        Upper bound on the intermediate PCA dimensionality (default 50).
    perplexity : float
        Upper bound on the t-SNE perplexity (default 15).

    Returns
    -------
    The t-SNE embedding of the PCA-reduced data.
    """
    n_samples, n_features = np.shape(X)
    # PCA cannot keep more components than min(n_samples, n_features), so
    # clamp the request instead of failing on small inputs.
    pca_dims = min(pca_components, n_samples, n_features)
    reduc = PCA(n_components=pca_dims).fit_transform(X)
    # t-SNE requires perplexity < n_samples; clamp for small inputs.
    tsne_perplexity = min(perplexity, n_samples - 1)
    return TSNE(n_components=ncomponents, random_state=0, perplexity=tsne_perplexity).fit_transform(reduc)


def alt_squish(X, ncomponents=2, perplexity=15):
    """Reduce ``X`` to ``ncomponents`` dimensions with t-SNE (fixed random_state=0).

    Parameters
    ----------
    X : 2-D array-like, one row per sample
    ncomponents : int
        Dimensionality of the embedding.
    perplexity : float
        Upper bound on the t-SNE perplexity (default 15).
    """
    # t-SNE requires perplexity < n_samples; clamp so that small inputs
    # (e.g. a handful of word vectors) do not raise.
    n_samples = np.shape(X)[0]
    tsne_perplexity = min(perplexity, n_samples - 1)
    return TSNE(n_components=ncomponents, random_state=0, perplexity=tsne_perplexity).fit_transform(X)

In [None]:
def plot_matrix(mat, labels, number_to_plot=10, figsize=(10, 10), c="red"):
    """Scatter-plot the first rows of a 2- or 3-column matrix with text labels.

    Parameters
    ----------
    mat : 2-D numpy array
        One point per row; must have at most 3 columns.
    labels : sequence
        ``labels[i]`` is drawn next to point ``i``.
    number_to_plot : int
        Maximum number of leading rows to plot.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    c : colour or sequence of colours
        Passed to ``scatter``. A sequence with one entry per row of ``mat``
        is trimmed to the points actually plotted.

    Prints a message and returns None when ``mat`` has more than 3 columns.
    """
    if mat.shape[1] > 3:
        print("too many dimensions")
        return
    # Never plot more points than there are rows or labels.
    number_to_plot = min(number_to_plot, mat.shape[0], len(labels))
    # A per-row colour sequence must be cut down together with the points,
    # otherwise scatter() rejects the size mismatch. Numeric sequences of
    # length 3 or 4 are left alone because they may be a single RGB(A) colour.
    per_point_colors = (
        isinstance(c, (list, tuple, np.ndarray))
        and len(c) == mat.shape[0]
        and (len(c) not in (3, 4) or isinstance(c[0], str))
    )
    if per_point_colors:
        c = c[:number_to_plot]
    fig = plt.figure(figsize=figsize, dpi=80, facecolor='w', edgecolor='k')
    pts = mat[:number_to_plot]
    if mat.shape[1] == 3:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(pts[:, 0], pts[:, 1], pts[:, 2], c=c)
        for i in range(number_to_plot):
            ax.text(mat[i, 0], mat[i, 1], mat[i, 2], labels[i])
    else:
        plt.scatter(pts[:, 0], pts[:, 1], c=c)
        for i in range(number_to_plot):
            plt.annotate(labels[i], mat[i])

In [None]:
%matplotlib notebook
import copy, random
def plot_similar(the_word, kvecs, dims=2, n=10, seed=None):
    """Plot a word, its nearest neighbours, and a random sample of similar words.

    Parameters
    ----------
    the_word : str
        Query word (drawn in red).
    kvecs : keyed-vector object
        Must support ``similar_by_key(word, topn)`` and ``kvecs[word]``.
    dims : int
        Dimensionality of the plot, forwarded to ``alt_squish``.
    n : int
        Number of nearest neighbours (blue) and of randomly drawn words
        (green) to include.
    seed : int or None
        When given, the random draw uses its own seeded generator so the
        figure is reproducible; when None the global ``random`` state is used.
    """
    # Candidate pool: the words returned by a top-1000 similarity query.
    similar_words = [w[0] for w in kvecs.similar_by_key(the_word, 1000)]
    found_vocab = [the_word]
    colors = ["red"]

    nearest = similar_words[:n]
    found_vocab += nearest
    colors += ["blue"] * len(nearest)

    # Shuffle the pool and take the first n as the random sample.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(similar_words)
    sampled = similar_words[:n]
    found_vocab += sampled
    colors += ["green"] * len(sampled)

    folded_vocab_matrix = np.array([kvecs[w] for w in found_vocab])
    squished_matrix = alt_squish(folded_vocab_matrix, dims)
    # Plot every collected point: a fixed cap would drop points (and no
    # longer match the colour list) once 2*n + 1 exceeds it.
    plot_matrix(squished_matrix, found_vocab, len(found_vocab), c=colors)

In [None]:
# Plot "hemisphere" in 2-D: red = the query word, blue = its most similar words,
# green = words drawn at random from the similarity list.
plot_similar("hemisphere", kvecs, dims=2)

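Word vectors can also be used to solve analogies of the form `a : b :: c : d` ("a is to b as c is to d"), where we are given a, b and c and have to find the word ‘d’.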

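The associated word vectors va, vb, vc, vd are (approximately) related to each other by `vb – va = vd – vc`, i.e. `vd = vb – va + vc`: we add the vectors for b and c, subtract the vector for a, and look for the word whose vector is closest to the result.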

Example: `Paris : France :: Berlin : Germany`

In [None]:
# a : b :: c : ?  ->  most_similar(positive=[b, c], negative=[a])
positive_words, negative_words = ['France', 'Berlin'], ['Paris']
# The model in this notebook is `kvecs`. The training text was lower-cased
# when it was loaded, so these illustrative capitalized words may be missing
# from the vocabulary; check membership instead of letting the lookup raise
# a KeyError.
missing_words = [w for w in positive_words + negative_words if w not in kvecs]
if missing_words:
    print("not in the model vocabulary:", missing_words)
    sims = []
else:
    sims = kvecs.most_similar(positive=positive_words, negative=negative_words)