# Preliminaries

In [None]:
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt

def norm_vec(v):
    """Scale a vector to unit Euclidean length.

    Args:
        v: Array-like of numbers.

    Returns:
        ``v`` divided by its norm. When the norm is zero, ``v`` itself is
        returned unchanged so an all-zero vector never causes a division
        by zero.
    """
    mag = np.linalg.norm(v)
    if mag == 0:
        return v
    # Reuse the magnitude computed above instead of calling norm() again.
    return v / mag

from sklearn.preprocessing import normalize

def normalize_rows(x):
    """Normalize each row of ``x`` with scikit-learn's ``normalize`` (``axis=1``, default norm)."""
    return normalize(x, axis=1)

def normalize_columns(x):
    """Normalize each column of ``x`` with scikit-learn's ``normalize`` (``axis=0``, default norm)."""
    return normalize(x, axis=0)

def check_float(potential_float):
    """Report whether a value can be converted with ``float()``.

    Args:
        potential_float: Any object (string, number, ``None``, ...).

    Returns:
        ``True`` if ``float(potential_float)`` succeeds, ``False`` otherwise.
    """
    try:
        float(potential_float)
    except (TypeError, ValueError):
        # ValueError: unparsable string; TypeError: unsupported type such as
        # None or a list. Either way the value is not float-like.
        return False
    return True

def round_if_float(v, prec=3):
    """Round float-convertible values; pass everything else through.

    Args:
        v: Value to format.
        prec: Number of decimal places to keep (default 3).

    Returns:
        ``round(float(v), prec)`` when ``check_float(v)`` is true,
        otherwise ``v`` unchanged.
    """
    return round(float(v), prec) if check_float(v) else v

# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML


def list_table(the_list, color_nums=False):
    """Render a sequence of rows as an inline HTML table in the notebook.

    Args:
        the_list: Iterable of rows, each an iterable of cell values.
        color_nums: Falsy to render every cell plainly. Otherwise the value
            is inserted as the CSS ``color`` of cells that hold a non-zero,
            float-convertible value; those cells are also drawn bold.

    Returns:
        Whatever ``display`` returns for the assembled ``HTML`` object.
    """
    plain_cell = "<td align='left' style='border: .5px solid gray;'>{0}</td>"
    colored_cell = "<td align='left' style='border: .5px solid gray; color: {1}; font-weight: bold'>{0}</td>"

    html = ["<table style= 'border: 1px solid black; display:inline-block'>"]
    for row in the_list:
        html.append("<tr>")
        for col in row:
            # Highlight only non-zero numeric cells so they stand out from the zeros.
            if color_nums and check_float(col) and not float(col) == 0:
                html.append(colored_cell.format(round_if_float(col), color_nums))
            else:
                html.append(plain_cell.format(round_if_float(col)))
        html.append("</tr>")
    html.append("</table>")
    return display(HTML(''.join(html)))

def show_labeled_table(mat, col_names=None, row_names=None, nrows=10, ncols=10, color_nums="red"):
    """Display the top-left corner of a 2-D array with optional labels.

    Args:
        mat: 2-D array supporting ``mat[:nrows, :ncols]`` slicing.
        col_names: Optional sequence of column labels, shown as a header row.
        row_names: Optional sequence of row labels, shown as a first column.
        nrows: Maximum number of data rows to show.
        ncols: Maximum number of data columns to show.
        color_nums: Passed through to ``list_table``.

    Returns:
        The result of ``list_table`` on the labeled sub-table.
    """
    sml = mat[:nrows, :ncols]
    # The slice may be smaller than requested when `mat` is small; size the
    # labels by what was actually obtained so the stacks below line up.
    shown_rows, shown_cols = sml.shape
    if col_names is not None:
        sml = np.vstack([col_names[:shown_cols], sml])
    if row_names is not None:
        rnames = [[p] for p in row_names[:shown_rows]]
        if col_names is not None:
            # Placeholder for the corner cell above the row labels.
            new_col = np.array([["_"]] + rnames)
        else:
            new_col = np.array(rnames)
        sml = np.hstack((new_col, sml))
    return list_table(sml, color_nums)

def compute_doc_vector(tdoc, vocab):
    """Build a raw term-count vector for one tokenized document.

    Args:
        tdoc: Tokenized document; must provide ``count``.
        vocab: Iterable of terms; its order fixes the vector's component order.

    Returns:
        NumPy array with ``tdoc.count(term)`` for each term in ``vocab``.
    """
    counts = []
    for term in vocab:
        counts.append(tdoc.count(term))
    return np.array(counts)

# Problems and solutions

There are a number of problems with our approach to getting document vectors. When our vocabulary is large, we will have many, many dimensions. This can get unwieldy.

Even more problematic, many blocks of text will have a dot product of zero, even when they have a similar meaning, since they are using different words to say the same thing

One way to think about the source of this problem is that we don't really have any way in which we have captured the meaning of words. For example, "towards" and "closer" probably have related meanings. But if one block of text uses "towards" and another "closer," then our vectors for the two documents will have zero dot product.

### Word vectors to the rescue

* A solution to both of these problems is to come up with a way of representing the words as vectors.
* Words with similar meanings will correspond to similar vectors.
* We will get the vectors for documents by combining the vectors for words.
* Coming up with good ways of finding the vectors for words is a place where a lot of recent work has focused.

### A "training" or auxiliary corpus

One way to do this is to use some sort of auxiliary corpus to create a set of word vectors, and then apply those word vectors to your research data. That's what we'll do now.

# Word vectors for the seasons

I went around the internet and collected a hundred or so text documents that use words that are related to the seasons. We'll use that to create some word vectors

## Load the training corpus

In [None]:
import re
# Read the whole training corpus, lower-case it, and pull out the body of
# every <text>...</text> element.
fname = 'corpora/seasons_training.txt'
# The context manager closes the file handle even if read() raises.
with open(fname) as f:
    raw = f.read().lower()
whole_training_docs = re.findall(r"<text>([\s\S]*?)</text>", raw)

In [None]:
# Number of <text>...</text> documents extracted from the corpus.
len(whole_training_docs)

So there are 196 training documents - essentially the text of web pages.

We are going to split this up into paragraphs so that we have smaller contexts for words. We'll end up with over 3000 paragraphs.

I'm also creating a list called `para_names` that will contain simple names for these paragraphs. You'll see where we use these below.

In [None]:
# Break every document into blank-line-separated paragraphs and give each
# paragraph a "d<document index>p<paragraph index>" label.
training_docs = []
para_names = []
for doc_idx, whole_doc in enumerate(whole_training_docs):
    paragraphs = whole_doc.split("\n\n")
    training_docs.extend(paragraphs)
    para_names.extend(
        "d{}p{}".format(doc_idx, para_idx) for para_idx in range(len(paragraphs))
    )

In [None]:
# Total number of paragraphs across all training documents.
len(training_docs)

## Tokenize the training documents

Now we tokenize the 3000+ training documents

In [None]:
from seasons_module import seasons_tokenize
# Tokenize every paragraph; the result keeps the order of training_docs.
tokenized_training_docs = [seasons_tokenize(paragraph) for paragraph in training_docs]

## From tokenized training documents to word vectors

So how do we use these tokenized training documents to create word vectors?

There are many ways. In particular there are increasingly sophisticated libraries that will just do this for us, and we will look at one of these later. But I want to go through one approach so you get the sense for what is going on.

### Create document vectors

Create a document vector for each paragraph of the training corpus and put those vectors in a big matrix.

We will use the same small vocabulary we used in the last notebook, just to keep things simple. (I'll just copy it over.)

I'm going to drop any vectors that have all zeros as entries.

In [None]:
# Small fixed vocabulary; its order defines the component order of every
# document vector built from it.
vocab = ['towards', 'closer', 'orbits', 'spring', 'center', 'moves', 'point', 'farther', 'middle', 'ovally']

def wfactor(tf):
    """Log-scale a raw term frequency.

    Args:
        tf: Number of occurrences of a term.

    Returns:
        ``0`` when ``tf`` is zero, otherwise ``1 + ln(tf)``.
    """
    # Guard first: log(0) is undefined, and an absent term should weigh nothing.
    if tf == 0:
        return 0
    return 1 + np.log(tf)


def compute_doc_vector(token_list, vocab):
    """Build a log-weighted term vector for one tokenized document.

    Args:
        token_list: Tokenized document; must provide ``count``.
        vocab: Iterable of terms; its order fixes the vector's component order.

    Returns:
        NumPy array holding ``wfactor`` of each vocabulary term's count.
    """
    term_counts = map(token_list.count, vocab)
    return np.array([wfactor(n) for n in term_counts])

# Build the document-term matrix: one row per paragraph that contains at
# least one vocabulary word, one column per vocabulary term.
#
# Rows are collected in a list and stacked once at the end. Seeding the
# matrix with np.zeros(len(vocab)) and vstack-ing onto it would leave a
# spurious all-zero first row and copy the whole matrix on every iteration.
kept_rows = []
# Names of the paragraphs that were kept, in the same order as the matrix rows.
training_para_names = []

for name, tdoc in zip(para_names, tokenized_training_docs):
    new_row = compute_doc_vector(tdoc, vocab)
    if np.linalg.norm(new_row) == 0:
        # Paragraph uses none of the vocabulary words; drop it.
        continue
    kept_rows.append(new_row)
    training_para_names.append(name)

# reshape keeps the result 2-D (0 x len(vocab)) even when nothing was kept.
training_dt_matrix = np.array(kept_rows, dtype=float).reshape(-1, len(vocab))

The result of this is a table with 10 columns (one for each term in the vocabulary) and 555 rows.

The reason we have only 555 rows is that we dropped all of the rows for documents that didn't include any of the words in our vocabulary

In [None]:
# Dimensions of the document-term matrix; there is one column per vocabulary term.
training_dt_matrix.shape

In [None]:
# Preview the first 15 rows of the document-term matrix: vocab labels the
# columns, para_names the rows.
# NOTE(review): para_names has one entry per paragraph, while the matrix omits
# paragraphs with no vocabulary words, so row labels may not line up — confirm.
show_labeled_table(training_dt_matrix, vocab, para_names, nrows=15, ncols=10)

## Word vectors

So now we've got this big table. The trick is that we can think of the *columns* as vectors that represent the meaning of words. In this case they will be 555 dimensional vectors.

In [None]:
# Same preview again, this time to read the columns as word vectors.
# NOTE(review): para_names has one entry per paragraph, while the matrix omits
# paragraphs with no vocabulary words, so row labels may not line up — confirm.
show_labeled_table(training_dt_matrix, vocab, para_names, nrows=15, ncols=10)

Let's transpose it, so the terms are on the rows

In [None]:
# Term-document matrix: one row per vocabulary term, one column per document.
training_td_matrix = training_dt_matrix.T

In [None]:
# Preview the term-document matrix: vocab labels the rows, para_names the columns.
# NOTE(review): para_names has one entry per paragraph, while the matrix omits
# paragraphs with no vocabulary words, so column labels may not line up — confirm.
show_labeled_table(training_td_matrix, para_names, vocab, nrows=15, ncols=15)

Now we can compare two words using these word vectors. Let's look at a couple.

In [None]:
def get_word_vector(w, vocab, mat):
    """Look up the normalized vector for one vocabulary word.

    Args:
        w: Word to look up; must be present in ``vocab``.
        vocab: List of terms; position ``i`` corresponds to ``mat[i]``.
        mat: Matrix indexed by term position.

    Returns:
        ``norm_vec`` applied to the entry of ``mat`` at the word's index.

    Raises:
        ValueError: If ``w`` is not in ``vocab`` (raised by ``list.index``).
    """
    row = vocab.index(w)
    return norm_vec(mat[row])

def compare_word_vectors(w1, w2, vocab, mat):
    """Dot product of the normalized vectors of two vocabulary words.

    Args:
        w1: First word; must be in ``vocab``.
        w2: Second word; must be in ``vocab``.
        vocab: List of terms aligned with the entries of ``mat``.
        mat: Matrix indexed by term position.

    Returns:
        ``np.dot`` of the two vectors returned by ``get_word_vector``.
    """
    first = get_word_vector(w1, vocab, mat)
    second = get_word_vector(w2, vocab, mat)
    return np.dot(first, second)

In [None]:
# Dot product of the normalized training vectors for "closer" and "towards".
compare_word_vectors("closer", "towards", vocab, training_td_matrix)

In [None]:
# Dot product of the normalized training vectors for "closer" and "farther".
compare_word_vectors("closer", "farther", vocab, training_td_matrix)

## Document vectors from word vectors

We can use our word vectors to create document vectors for other documents, such as the ones we were working with in the last notebook

In [None]:
# Transcript excerpts keyed by a short document name ("d1" .. "d8").
# The text is kept verbatim, including its irregular spacing and punctuation.
raw_transcript_docs = {
    "d1": "That's because of the sun is in the center and the Earth moves around the sun and the Earth is like at one point in the winter", 
    "d2": "it's like farther away from the sun and towards the summer it's closer it's near, towards the sun.",
    "d3": "The sun's in the middle  and the Earth kind of orbits around it.",
    "d4": "And like say at one - it's probably more of an ovally type thing  In the winter, er probably this will be winter since it's further away",
    "d5": "that's winter would be like, the Earth orbits around the sun .  Like summer is the closest to the sun", 
    "d6": "Spring is kind of a little further away, and then like Fall  is further away then spring but not as far as winter, and then winter is the furthest.",
    "d7": "the sun doesn't, like the flashlight and the bulb, it hits summer, the lines like fade in , they get there closer, like quicker",
    "d8": "And by the time they get there [winter], it fades and it's a lot colder for winter"
}

# Parallel lists: the name at position i belongs to the tokenized text at position i.
transcript_doc_names = list(raw_transcript_docs)
tokenized_transcript_docs = [
    seasons_tokenize(text) for text in raw_transcript_docs.values()
]

We take the vector for each word in the document and add them together.

In [None]:
def get_doc_vector(doc, vocab, td_mat):
    """Build a document vector by summing the vectors of its known words.

    Args:
        doc: Iterable of tokens.
        vocab: List of terms aligned with the rows of ``td_mat``.
        td_mat: 2-D term-document matrix.

    Returns:
        Array of length ``td_mat.shape[1]``: the sum of ``get_word_vector``
        for every token of ``doc`` found in ``vocab`` (repeated tokens count
        each time). All zeros when no token is in ``vocab``.
    """
    zero = np.zeros(td_mat.shape[1])
    known_word_vectors = (
        get_word_vector(w, vocab, td_mat) for w in doc if w in vocab
    )
    # sum() adds left to right starting from `zero`.
    return sum(known_word_vectors, zero)

In [None]:
# Document vector for the first transcript document, built from the training word vectors.
v1 = get_doc_vector(tokenized_transcript_docs[0], vocab, training_td_matrix)
# Its length equals the number of columns of training_td_matrix.
v1.shape

In [None]:
def compare_folded_doc_vectors(docA, docB, vocab, mat):
    """Dot product of the normalized document vectors of two transcript docs.

    Reads the module-level ``transcript_doc_names`` and
    ``tokenized_transcript_docs`` lists to resolve the names.

    Args:
        docA: Name of the first document, as listed in ``transcript_doc_names``.
        docB: Name of the second document.
        vocab: List of terms aligned with the rows of ``mat``.
        mat: Term-document matrix passed on to ``get_doc_vector``.

    Returns:
        ``np.dot`` of the two ``norm_vec``-normalized document vectors.

    Raises:
        ValueError: If either name is not in ``transcript_doc_names``.
    """
    unit_vectors = []
    for doc_name in (docA, docB):
        tokens = tokenized_transcript_docs[transcript_doc_names.index(doc_name)]
        unit_vectors.append(norm_vec(get_doc_vector(tokens, vocab, mat)))
    return np.dot(unit_vectors[0], unit_vectors[1])

In [None]:
# Compare transcript documents "d1" and "d3" using the training word vectors.
compare_folded_doc_vectors("d1", "d3", vocab, training_td_matrix)