# Preliminaries

The same preliminaries as before.

In [None]:
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt

def norm_vec(v):
    """Scale a vector to unit Euclidean length.

    Parameters:
        v: the vector to normalize (anything ``np.linalg.norm`` accepts).

    Returns:
        ``v`` divided by its norm, or ``v`` itself (unchanged) when the norm
        is zero, so that no division by zero takes place.
    """
    mag = np.linalg.norm(v)
    if mag == 0:
        return v
    # Reuse the magnitude computed above instead of calling norm() again.
    return v / mag

from sklearn.preprocessing import normalize

def normalize_rows(x):
    """Apply ``normalize`` along axis 1, i.e. rescale each row of ``x``."""
    row_scaled = normalize(x, axis=1)
    return row_scaled

def normalize_columns(x):
    """Apply ``normalize`` along axis 0, i.e. rescale each column of ``x``."""
    column_scaled = normalize(x, axis=0)
    return column_scaled

def check_float(potential_float):
    """Report whether ``float()`` can convert the given value.

    Parameters:
        potential_float: any object (string, number, None, ...).

    Returns:
        True when ``float(potential_float)`` succeeds, False otherwise.
    """
    try:
        float(potential_float)
    except (TypeError, ValueError):
        # ValueError: unparsable text such as "abc".
        # TypeError: objects float() does not accept at all, e.g. None or a list.
        return False
    return True

def round_if_float(v, prec=3):
    """Round a value when it can be read as a float; otherwise pass it through.

    Parameters:
        v: the value to round; numeric strings such as "3.14159" count as floats.
        prec: number of decimal places handed to ``round``.

    Returns:
        ``round(float(v), prec)`` when ``float(v)`` succeeds, else ``v`` unchanged.
    """
    try:
        # Convert once and keep the result, rather than testing and re-parsing.
        number = float(v)
    except (TypeError, ValueError):
        # Not numeric (e.g. a word, None, a list): hand the value back untouched.
        return v
    return round(number, prec)

from IPython.display import display, HTML
def list_table(the_list, color_nums=False):
    """Render a list of rows as an HTML table in the notebook output.

    Parameters:
        the_list: iterable of rows, each an iterable of cell values.
        color_nums: when truthy, it is used as the CSS colour for cells that
            hold a non-zero number, and those cells are also set in bold.

    Returns:
        Whatever ``display`` returns for the generated ``HTML`` object.
    """
    # Imported locally under another name because "html" is a natural local name here.
    from html import escape

    def cell_text(value):
        # Escape the cell so that text containing <, > or & cannot break the markup.
        return escape(str(round_if_float(value)))

    parts = ["<table style= 'border: 1px solid black; display:inline-block'>"]
    for row in the_list:
        parts.append("<tr>")
        for col in row:
            if color_nums and check_float(col) and float(col) != 0:
                parts.append("<td align='left' style='border: .5px solid gray; color: {1}; font-weight: bold'>{0}</td>".format(cell_text(col), escape(str(color_nums))))
            else:
                parts.append("<td align='left' style='border: .5px solid gray;'>{0}</td>".format(cell_text(col)))
        parts.append("</tr>")
    parts.append("</table>")
    return display(HTML(''.join(parts)))

def show_labeled_table(mat, col_names=None, row_names=None, nrows=10, ncols=10, color_nums="red"):
    """Display the top-left corner of a matrix with optional row/column labels.

    Parameters:
        mat: matrix indexed as ``mat[:nrows, :ncols]`` (a 2-D NumPy array in this file).
        col_names: optional sequence of column labels, shown as a header row.
        row_names: optional sequence of row labels, shown as a leading column.
        nrows, ncols: maximum number of rows / columns to show.
        color_nums: forwarded to ``list_table``.

    Returns:
        The result of ``list_table`` for the labelled sub-table.
    """
    sml = mat[:nrows, :ncols]
    # Size the labels from the slice actually obtained: when the matrix is
    # smaller than nrows x ncols, slicing the labels by nrows/ncols would give
    # more labels than data and make vstack/hstack fail.
    shown_rows, shown_cols = sml.shape
    if col_names is not None:
        sml = np.vstack([col_names[:shown_cols], sml])
    if row_names is not None:
        rnames = [[name] for name in row_names[:shown_rows]]
        if col_names is not None:
            # Placeholder for the corner cell above the row labels.
            rnames = [["_"]] + rnames
        sml = np.hstack((np.array(rnames), sml))
    return list_table(sml, color_nums)

def compute_doc_vector(tdoc, vocab):
    """Return an array with, for each entry of ``vocab``, its count in ``tdoc``."""
    counts = list(map(tdoc.count, vocab))
    return np.array(counts)

# Word vectors for the seasons

## Load the training corpus. 

This time we'll split it into sentences

In [None]:
import re
fname = 'corpora/seasons_training.txt'
# Read inside a context manager so the file handle is closed right away.
with open(fname) as f:
    raw = f.read().lower()
# Every training document sits between <text> and </text>; the lazy
# [\s\S]*? lets a single document span several lines.
whole_training_docs = re.findall(r"<text>([\s\S]*?)</text>", raw)

In [None]:
# Split every whole document into sentences; each sentence becomes its own
# training document, named d<document index>p<sentence index>.
training_docs = []
para_names = []
for doc_idx, whole_doc in enumerate(whole_training_docs):
    sentences = nltk.sent_tokenize(whole_doc)
    training_docs.extend(sentences)
    para_names.extend(
        "d{}p{}".format(doc_idx, sent_idx) for sent_idx in range(len(sentences))
    )

Now we have close to 14000 documents

In [None]:
# Number of sentence-level training documents collected above.
len(training_docs)

## Tokenize the training documents

Now we tokenize the ~14000 training documents

In [None]:
from seasons_module import seasons_tokenize
# Tokenize every training document, keeping the results in the same order.
tokenized_training_docs = [seasons_tokenize(sentence) for sentence in training_docs]

In [None]:
# Corpus-wide frequency distribution over all tokens of all training documents.
fdist = nltk.FreqDist(
    token for tokens in tokenized_training_docs for token in tokens
)

# Read the stop list (one entry per line) inside a context manager so the
# file handle is closed right away.
with open("lists/seasons_stop_list.txt") as f:
    stop_list = set(f.read().split("\n"))

# most_common() yields (word, count) pairs from most to least frequent;
# keep the words that are not stop words, then the 500 most frequent of those.
full_vocab = [word for word, _count in fdist.most_common() if word not in stop_list]
vocab = full_vocab[:500]

## Get the document vectors


When I first tried to compute the document vectors, the cell took a long time to run: 5 or 10 minutes. I had to poke around the internet and fiddle a bit. I figured out that building the big matrix in chunks and then putting those chunks together made a big difference.

In [None]:
from IPython.display import display, clear_output

def wfactor(tf):
    """Log-scale a term frequency: 0 stays 0, anything else becomes 1 + ln(tf)."""
    return 0 if tf == 0 else (1 + np.log(tf))

def compute_doc_vector(token_list, vocab):
    """Return one ``wfactor``-weighted count per vocabulary word for a document.

    Each entry is ``wfactor`` applied to the number of times the word occurs
    in ``token_list``; entries follow the order of ``vocab``.
    """
    weights = []
    for word in vocab:
        occurrences = token_list.count(word)
        weights.append(wfactor(occurrences))
    return np.array(weights)

# Build the document-by-term matrix, one row per non-empty document.
# Rows are collected in a plain list and stacked once at the end; growing a
# NumPy array with concatenate on every row copies the array each time.
rows = []
# Names of the documents that were kept, aligned with the matrix rows.
# para_names itself still contains the names of the dropped (empty) documents.
training_row_names = []
empty_rows = 0
for i, tdoc in enumerate(tokenized_training_docs):
    if i % 500 == 0:
        # Progress indicator, refreshed in place every 500 documents.
        clear_output(wait=True)
        display(str(i))
    new_row = compute_doc_vector(tdoc, vocab)
    if np.linalg.norm(new_row) == 0:
        # The document contains no vocabulary word at all: leave it out.
        empty_rows += 1
        continue
    rows.append(new_row)
    training_row_names.append(para_names[i])

clear_output(wait=True)
display(str(i) + " joining chunks")

if rows:
    training_dt_matrix2 = np.vstack(rows)
else:
    # No usable documents: keep a well-formed (0, len(vocab)) matrix.
    training_dt_matrix2 = np.empty((0, len(vocab)))
print("empty rows {}".format(empty_rows))

In [None]:
# (number of non-empty documents, vocabulary size)
training_dt_matrix2.shape

In [None]:
# Show the top-left 15 x 10 corner of the document-by-term matrix.
# NOTE(review): documents with an all-zero vector were skipped when the matrix
# was built, but para_names was not filtered, so these row labels can be
# shifted relative to the matrix rows -- confirm.
show_labeled_table(training_dt_matrix2, vocab, para_names, nrows=15, ncols=10)

## Word vectors with reduced dimensions

If, as before, we think of each of the columns in our table as a vector for the words, then each word is a vector with 13,264 dimensions.

We can use something called the singular value decomposition to reduce the number of dimensions. You read about this in the paper by Landauer. This is the trick that is used in latent semantic analysis.

First we pick a number of dimensions. I'm using a slider widget simply because it's fun.

In [None]:
import ipywidgets as widgets
# Slider for the number of reduced dimensions; its value is read as w.value
# in the SVD cell below.
w = widgets.IntSlider(value=100, max=200, description="rdims")
display(w)

In [None]:
# Number of dimensions picked with the slider.
rdims = w.value
# SVD of the term-by-document matrix (the transpose puts words on the rows).
T, S, Dt = np.linalg.svd(training_dt_matrix2.T, full_matrices=False)
# Keep only the first rdims columns of T, then rescale every word row.
T_reduced = T[:, :rdims]
T_normed = normalize_rows(T_reduced)
print("shape of t_normed is {}".format(T_normed.shape))
show_labeled_table(
    T_normed, col_names=None, row_names=vocab, nrows=10, ncols=100, color_nums=None
)

The SVD produces three matrices. We can think of the first one, "T", as having one row for each word in our vocabulary, and we can keep as many of its columns as we want.

In [None]:
def get_word_vector(w, vocab, mat):
    """Return the row of ``mat`` at the position of ``w`` in ``vocab``, passed through ``norm_vec``."""
    row = mat[vocab.index(w)]
    return norm_vec(row)

def compare_word_vectors(w1, w2, vocab, mat):
    """Return the dot product of the ``get_word_vector`` results for ``w1`` and ``w2``."""
    first = get_word_vector(w1, vocab, mat)
    second = get_word_vector(w2, vocab, mat)
    return np.dot(first, second)

def get_doc_vector(doc, vocab, td_mat):
    """Sum the word vectors of every token of ``doc`` that occurs in ``vocab``.

    Tokens outside the vocabulary are ignored; the result has one entry per
    column of ``td_mat`` and is all zeros when no token is known.
    """
    total = np.zeros(td_mat.shape[1])
    known_tokens = (token for token in doc if token in vocab)
    for token in known_tokens:
        total = total + get_word_vector(token, vocab, td_mat)
    return total

In [None]:
# Similarity (dot product of the normalized reduced vectors) of "close" and "closer".
compare_word_vectors("close", "closer", vocab, T_reduced)

In [None]:
def most_similar(w, vocab, mat, n=10):
    """List the ``n`` vocabulary words whose vectors are most similar to ``w``.

    Parameters:
        w: the query word.
        vocab: vocabulary list; its order matches the rows of ``mat``.
        mat: matrix holding one word vector per row.
        n: number of entries to return.

    Returns:
        A list of ``[word, similarity, frequency]`` entries sorted by
        decreasing similarity, where frequency is read from the global ``fdist``.
    """
    if not vocab:
        # Nothing to compare against.
        return []
    # The query vector does not change inside the loop: look it up once
    # instead of once per candidate word.
    target = get_word_vector(w, vocab, mat)
    sims = [
        [w2, np.dot(target, get_word_vector(w2, vocab, mat)), fdist[w2]]
        for w2 in vocab
        if w2 != w
    ]
    sims.sort(key=lambda item: item[1], reverse=True)
    return sims[:n]

In [None]:
# The ten vocabulary words most similar to "hemisphere" in the reduced space.
most_similar("hemisphere", vocab, T_reduced)

In [None]:
# The ten vocabulary words most similar to "tilt" in the reduced space.
most_similar("tilt", vocab, T_reduced)

## PCA for plotting

I'll show you a way of squishing any higher dimensional matrix down to a smaller number of dimensions for the purpose of making plots.

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def squish_matrix(X, ncomponents=2):
    """Reduce ``X`` to ``ncomponents`` columns with PCA and return the result."""
    reducer = PCA(n_components=ncomponents)
    projected = reducer.fit_transform(X)
    return projected

def double_squish(X, ncomponents=2):
    """Reduce ``X`` with PCA first and then with t-SNE.

    Parameters:
        X: 2-D data matrix, one sample per row.
        ncomponents: number of output dimensions produced by t-SNE.

    Returns:
        The t-SNE embedding of the PCA-reduced data.
    """
    n_samples, n_features = np.shape(X)[:2]
    # PCA cannot extract more components than min(n_samples, n_features), so
    # cap the intermediate size of 50 for small inputs instead of failing.
    pca_dims = min(50, n_samples, n_features)
    reduc = PCA(n_components=pca_dims).fit_transform(X)
    return TSNE(n_components=ncomponents, random_state=0, perplexity=15).fit_transform(reduc)

def alt_squish(X, ncomponents=2):
    """Reduce ``X`` to ``ncomponents`` columns with t-SNE (fixed seed 0, perplexity 15)."""
    embedder = TSNE(n_components=ncomponents, random_state=0, perplexity=15)
    return embedder.fit_transform(X)

In [None]:
%matplotlib widget
def plot_matrix(mat, labels, number_to_plot=10, figsize=(10, 10), c="red"):
    """Scatter-plot the first rows of a 2- or 3-column matrix with text labels.

    Prints a message and returns without plotting when ``mat`` has more than
    three columns. At most ``number_to_plot`` rows (capped at the number of
    rows of ``mat``) are drawn, each annotated with the matching ``labels`` entry.
    """
    ndims = mat.shape[1]
    if ndims > 3:
        print("too many dimensions")
        return
    number_to_plot = min(number_to_plot, mat.shape[0])
    shown = mat[:number_to_plot]
    fig = plt.figure(figsize=figsize, dpi=80, facecolor='w', edgecolor='k')
    if ndims == 3:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(shown[:, 0], shown[:, 1], shown[:, 2], c=c)
        for i in range(number_to_plot):
            ax.text(mat[i, 0], mat[i, 1], mat[i, 2], labels[i])
    else:
        plt.scatter(shown[:, 0], shown[:, 1], c=c)
        for i in range(number_to_plot):
            plt.annotate(labels[i], mat[i])

In [None]:
# Squish the normalized word vectors down to 3 dimensions and plot the
# first 10 vocabulary words.
sm = squish_matrix(T_normed, 3)
plot_matrix(sm, vocab, 10, figsize=(10, 10))

In [None]:
# Squish the normalized word vectors down to 3 dimensions and plot the
# first 10 vocabulary words.
# NOTE(review): this cell repeats the preceding cell verbatim -- confirm
# whether a different projection or dimension count was intended here.
sm = squish_matrix(T_normed, 3)
plot_matrix(sm, vocab, 10, figsize=(10, 10))

Let's instead plot a few selected words

In [None]:
selected_words = ["tilt", "hemisphere", "northern", "southern", "side", "day", "night", "moon", "spin", "closer", "close", "near", "far", "farther"]
# Keep only the selected words that made it into the vocabulary.
# Comprehensions are used so that no loop variable leaks into the notebook
# namespace: a "for w in ..." loop here would overwrite the slider widget
# named w, whose .value is read in the SVD cell.
found_vocab = [word for word in selected_words if word in vocab]
# One normalized word vector per found word, in the same order as found_vocab.
# (To plot PCA-squished coordinates instead, sm[vocab.index(word)] could be used.)
folded_vocab_list = [get_word_vector(word, vocab, T_normed) for word in found_vocab]
folded_vocab_matrix = np.array(folded_vocab_list)

In [None]:
# Table of the selected word vectors (first 10 columns), labelled by word.
show_labeled_table(folded_vocab_matrix, None, found_vocab, nrows=25, color_nums=None)

In [None]:
# Squish the selected word vectors down to 2 dimensions and plot them.
# NOTE(review): number_to_plot is 10 while selected_words lists 14 words, so
# some found words may be left off the plot -- confirm this is intended.
sm = squish_matrix(folded_vocab_matrix, 2)
plot_matrix(sm, found_vocab, 10, figsize=(5, 5))

In [None]:
import copy, random
def plot_similar(the_word, vocab, mat, dims=2, n=10):
    """Plot a word together with its nearest neighbours and some random words.

    The query word is drawn in red, its ``n`` most similar words in blue and
    ``n`` randomly chosen vocabulary words in green, after squishing their
    vectors down to ``dims`` dimensions with ``alt_squish``.

    Parameters:
        the_word: the query word; must be in ``vocab``.
        vocab: vocabulary list; its order matches the rows of ``mat``.
        mat: matrix holding one word vector per row.
        dims: number of dimensions of the plot.
        n: how many similar words and how many random words to add.
    """
    # Forward n so that asking for more than 10 neighbours actually works;
    # most_similar would otherwise fall back to its own default of 10.
    similar_words = [entry[0] for entry in most_similar(the_word, vocab, mat, n)]

    # A shallow copy is enough for shuffling a list of words.
    shuffled = list(vocab)
    random.shuffle(shuffled)
    random_words = shuffled[:n]

    found_vocab = [the_word] + similar_words + random_words
    colors = ["red"] + ["blue"] * len(similar_words) + ["green"] * len(random_words)

    folded_vocab_matrix = np.array([mat[vocab.index(word)] for word in found_vocab])
    squished_matrix = alt_squish(folded_vocab_matrix, dims)
    # Plot every collected word, so the colour list always matches the points.
    plot_matrix(squished_matrix, found_vocab, len(found_vocab), c=colors)

In [None]:
%matplotlib notebook
# 2-D plot of "hemisphere" (red), its nearest neighbours (blue) and random words (green).
plot_similar("hemisphere", vocab, T_normed, 2)