In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from numpy import dot
import pandas as pd

In [None]:
def norm_vec(v):
    """Scale ``v`` to unit Euclidean length.

    Parameters
    ----------
    v : array-like accepted by ``np.linalg.norm``.

    Returns
    -------
    ``v`` divided by its norm. When the norm is zero, ``v`` is returned
    unchanged, because dividing by zero would fill the result with NaN.
    """
    length = np.linalg.norm(v)
    if length == 0:
        # An all-zero vector has no direction; leave it as it is.
        return v
    return v / length
np.set_printoptions(precision=3)

def normalize_rows(x):
    """Scale every row of the 2-D array ``x`` to unit L2 length.

    Rows whose norm is zero are left as all zeros; dividing them by their
    norm would produce NaN.

    Returns a new array with the same shape as ``x``.
    """
    norms = np.linalg.norm(x, ord=2, axis=1, keepdims=True)
    # Replace zero norms with 1 so that all-zero rows divide to zero, not NaN.
    safe_norms = np.where(norms == 0, 1, norms)
    return x / safe_norms

def normalize_columns(x):
    """Scale every column of the 2-D array ``x`` to unit L2 length.

    Columns whose norm is zero are left as all zeros; dividing them by their
    norm would produce NaN.

    Returns a new array with the same shape as ``x``.
    """
    norms = np.linalg.norm(x, ord=2, axis=0, keepdims=True)
    # Replace zero norms with 1 so that all-zero columns divide to zero, not NaN.
    safe_norms = np.where(norms == 0, 1, norms)
    return x / safe_norms

# Vectors for *words*

We're going to prepare a version of the corpus here where we treat each student turn of talk (utterance) as a document.

I'm not going to normalize the vectors this time.

In [None]:
from utilities import *
from seasons_module import load_seasons_corpus_as_utterances
from seasons_module import load_seasons_comparison_files

import numpy as np
def norm_vec(vec):
    """Return ``vec`` scaled to unit length, or ``vec`` itself if it is all zeros.

    The length is the square root of the dot product of ``vec`` with itself.
    """
    squared_length = np.dot(vec, vec)
    if squared_length != 0:
        return vec / np.sqrt(squared_length)
    # Zero vector: there is nothing to scale, so hand back the same object.
    return vec
    
def pure_tf(tf, df, cf, N):
    """Weighting scheme that returns the term frequency ``tf`` untouched.

    ``df``, ``cf`` and ``N`` are accepted but ignored.
    """
    raw_frequency = tf
    return raw_frequency

def tf(tf):
    """Log-scaled term frequency: 0 for a zero count, otherwise 1 + ln(count)."""
    return 0 if tf == 0 else (1 + np.log(tf))

def weighted_word(the_text, word):
    """Return the ``tf`` weight of ``word`` given its count in ``the_text``.

    ``the_text`` only needs a ``count`` method; the count is passed to ``tf``.
    """
    occurrences = the_text.count(word)
    return tf(occurrences)

def compute_normed_doc_vector(word_list, vocab):
    """Build the weighted vector of ``word_list`` over ``vocab``, then pass it through ``norm_vec``.

    The vector has one entry per word of ``vocab``, in vocabulary order.
    """
    weights = [weighted_word(word_list, term) for term in vocab]
    return norm_vec(np.array(weights))

def compute_doc_vector(word_list, vocab):
    """Build the un-normalized weighted vector of ``word_list`` over ``vocab``.

    The vector has one entry per word of ``vocab``, in vocabulary order.
    """
    weights = []
    for term in vocab:
        weights.append(weighted_word(word_list, term))
    return np.array(weights)

def prepare_seasons_corpus_by_utterance(vocab_size=50, norm_vecs=False):
    """Vectorize the seasons corpus, treating every utterance as a document.

    Parameters
    ----------
    vocab_size : int
        Number of most frequent non-stop-list words kept as the vocabulary.
    norm_vecs : bool
        When true, each utterance vector is built with
        ``compute_normed_doc_vector``; otherwise with ``compute_doc_vector``.

    Returns
    -------
    (doc_vectors, new_vocab)
        ``doc_vectors`` is a list with one vector per utterance, in corpus
        iteration order; ``new_vocab`` is the list of vocabulary words, most
        frequent first.
    """
    # Imported here because this cell runs before the notebook's `import nltk`.
    import nltk

    seasons_corpus = load_seasons_corpus_as_utterances()

    # `with` closes the stop-list file once it has been read.
    with open("lists/seasons_stop_list.txt") as f:
        stop_list = set(f.read().split("\n"))

    # Count only words that are not on the stop list.
    # The first element of each corpus entry is iterated as its utterances.
    word_fdist = nltk.FreqDist()
    for fname in seasons_corpus.keys():
        for utterance in seasons_corpus[fname][0]:
            pruned_transcript_words = [w for w in utterance if w not in stop_list]
            word_fdist.update(pruned_transcript_words)

    # most_common() yields (word, count) pairs, so the stop-list test must look
    # at the word itself; testing the whole pair never excludes anything.
    new_vocab = [word for word, _count in word_fdist.most_common(vocab_size)
                 if word not in stop_list]

    # Compute the document vector for each utterance.
    vectorize = compute_normed_doc_vector if norm_vecs else compute_doc_vector
    doc_vectors = []
    for fname in seasons_corpus.keys():
        for utterance in seasons_corpus[fname][0]:
            doc_vectors.append(vectorize(utterance, new_vocab))
    return doc_vectors, new_vocab


In [None]:
# Build one un-normalized vector per utterance using the function's defaults
# (vocab_size=50, norm_vecs=False) and keep the matching vocabulary list.
doc_vectors, vocab = prepare_seasons_corpus_by_utterance()

In [None]:
vp = doc_vectors[0]

Next we put all of these vectors into a big array.

In [None]:
X = np.array(doc_vectors)

In [None]:
X.shape

Now we have a matrix where each row corresponds to one of the 1212 utterances and each column corresponds to one of the words in the vocabulary.

One way to look at this is to make a big heatmap. We're just going to do it for the first 100 rows.

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Heatmap of the first 100 rows of X: one row per utterance, one column per
# vocabulary word. The figure is created first so the pyplot calls below draw on it.
fig=plt.figure(figsize=(10, 15), dpi= 80, facecolor='w', edgecolor='k')
n = len(vocab)
part_of_X = X[:100]
x_tick_marks = np.arange(n)
# y_tick_marks is only referenced by the commented-out yticks call below.
y_tick_marks = np.arange(part_of_X.shape[0])
# Label each column with its vocabulary word, rotated so the labels fit.
plt.xticks(x_tick_marks, vocab, fontsize=8, rotation=90)
# plt.yticks(y_tick_marks, labels=None)
# Move the x ticks and their labels from the bottom edge to the top edge.
plt.tick_params("x", top=True, labeltop=True, bottom=False, labelbottom=False)
# Logarithmic colour scale.
# NOTE(review): zero cells have no logarithm; confirm how LogNorm renders them.
plt.imshow(part_of_X, norm=matplotlib.colors.LogNorm(), interpolation='nearest', cmap='YlOrBr')

## Get ready to have your mind blown

This is the cool, tricky, surprising part. We can think of each column as telling us something about the meaning of the word. That means we have, for each word, a vector 1212 numbers long. And we can measure the similarity in meaning of two words by scaling their two vectors to unit length and taking the dot product (this is the cosine similarity).

The idea here is that "we know a word by the company it keeps."

In [None]:
def get_column_vector(w, X, vocabulary=None):
    """Return the column of ``X`` that belongs to word ``w``.

    Parameters
    ----------
    w : the word to look up.
    X : 2-D array whose columns are in the same order as the vocabulary.
    vocabulary : optional sequence of words giving the column order of ``X``.
        When omitted, the notebook-level ``vocab`` is used. Pass it explicitly
        whenever ``X`` was built from a different vocabulary than the current
        ``vocab``, which later cells rebind.

    Raises
    ------
    ValueError
        If ``w`` is not in the vocabulary.
    """
    if vocabulary is None:
        # Original behaviour: fall back to the global vocabulary.
        vocabulary = vocab
    cnumber = vocabulary.index(w)
    return X[:, cnumber]

def compare_words(w1, w2, X, vocabulary=None):
    """Return the cosine similarity of the column vectors of ``w1`` and ``w2``.

    Both columns are passed through ``norm_vec`` before the dot product.
    ``vocabulary`` is forwarded to ``get_column_vector``; see there.
    """
    v1 = get_column_vector(w1, X, vocabulary)
    v2 = get_column_vector(w2, X, vocabulary)
    return np.dot(norm_vec(v1), norm_vec(v2))

In [None]:
get_column_vector("earth", X)

In [None]:
compare_words("northern", "hemisphere", X)

In [None]:
compare_words("northern", "side", X)

Using the power of math, it is easy to compare every word vector to every other word vector and to put it in a big array. This isn't particularly important. But it's nifty.

First we need to normalize the columns. 
Then we transpose the normalized array.
Finally we multiply the transposed array by the normalized, untransposed array.

In [None]:
Xnorm = normalize_columns(X)

In [None]:
Xtnorm = np.transpose(Xnorm)

In [None]:
mat = np.dot(Xtnorm, Xnorm)

Now we can make a heatmap that displays this array.

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Heatmap of `mat`, with both axes labelled by vocabulary word. The figure is
# created first so the pyplot calls below draw on it.
fig=plt.figure(figsize=(10, 10), dpi= 80, facecolor='w', edgecolor='k')
n = len(vocab)
x_tick_marks = np.arange(n)
y_tick_marks = np.arange(n)
# The same vocabulary labels go on both axes; x labels are rotated to fit.
plt.xticks(x_tick_marks, vocab, fontsize=8, rotation=90)
plt.yticks(y_tick_marks, vocab, fontsize=8)
# Move the x ticks and their labels from the bottom edge to the top edge.
plt.tick_params("x", top=True, labeltop=True, bottom=False, labelbottom=False)
# Logarithmic colour scale.
# NOTE(review): zero cells have no logarithm; confirm how LogNorm renders them.
plt.imshow(mat, norm=matplotlib.colors.LogNorm(), interpolation='nearest', cmap='YlOrBr')

### Wordvecs with a larger corpus

In [None]:
def training_tokenize(text):
    """Split ``text`` into tokens, separating punctuation from words.

    A series of regex substitutions inserts spaces around punctuation, then
    the result is split on whitespace.

    Returns a list of token strings.
    """

    # Separate most punctuation when it directly follows a word character.
    text = re.sub(r"(\w)([\.\-\/&\";:\(\)\?\!\]\[\{\}\*])", r'\1 \2 ', text)

    # Separate commas if they're followed by space.
    # (E.g., don't separate 2,500)
    text = re.sub(r"(,\s)", r' \1', text)

    # Separate leading and trailing single and double quotes.
    text = re.sub(r"('\s)", r' \1', text)
    text = re.sub(r"(\s\')", r'\1 ', text)
    text = re.sub(r"(\"\s)", r' \1', text)
    text = re.sub(r"(\s\")", r'\1 ', text)

    # Separate parentheses where appropriate.
    text = re.sub(r"(\)\s)", r' \1', text)
    text = re.sub(r"(\s\()", r'\1 ', text)

    # Separate periods that come before newline or end of string.
    # Raw string: the same regex, without the invalid "\." string escape.
    text = re.sub(r'\. *(\n|$)', ' . ', text)
    return text.split()


# The pattern below is compiled as soon as this cell runs, so `re` must already
# be imported here; the notebook's other `import re` is in a later cell.
import re

# Any character followed by a contraction suffix, case-insensitively.
contraction_patterns = re.compile(r"(?i)(.)('ll|'re|'ve|n't|'s|'m|'d)\b")
def is_contraction(the_text):
    """Return the first contraction match found in ``the_text``, or ``None``.

    The result is a ``re`` match object, which is truthy, so callers can use
    it directly as a boolean test.
    """
    return contraction_patterns.search(the_text)

def alpha_only(ltext):
    """Lower-case the tokens of ``ltext`` and keep only word-like ones.

    A non-empty token is kept when it is purely alphabetic, starts with
    ``<``, or is matched by ``is_contraction``.
    """
    kept = []
    for token in ltext:
        if len(token) == 0:
            continue
        if token.isalpha() or token[0] == '<' or is_contraction(token):
            kept.append(token.lower())
    return kept

In [None]:
import re
def load_training_corpus(fname):
    """Read the training corpus at ``fname`` and return tokenized paragraphs.

    The file is lower-cased. Each document is the text that follows a
    ``<text>`` tag, up to the next ``<``. Documents are split into paragraphs
    on blank lines, and each paragraph is tokenized with ``training_tokenize``
    and filtered with ``alpha_only``.

    Returns a list of token lists, one per paragraph.
    """
    # `with` closes the file once it has been read.
    with open(fname) as f:
        raw = f.read().lower()
    docs = re.findall(r"<text>([^<]*)", raw)
    paras = []
    for doc in docs:
        paras.extend(doc.split("\n\n"))
    tdocs = [alpha_only(training_tokenize(para)) for para in paras]
    return tdocs

In [None]:
tdocs = load_training_corpus('corpora/seasons_training.txt')

In [None]:
len(tdocs)

In [None]:
# create the vocabulary
import nltk
all_words = []
# Count every token across all training paragraphs.
fdist = nltk.FreqDist()
for tdoc in tdocs:
    fdist.update(tdoc)
# `with` closes the stop-list file once it has been read.
with open("lists/seasons_stop_list.txt") as f:
    stop_list = set(f.read().split("\n"))
# Take the 2000 most frequent words, then drop stop words, so the vocabulary
# can end up shorter than 2000.
# NOTE: this rebinds the notebook-level `vocab` that get_column_vector reads.
vocab = [word for word, _count in fdist.most_common(2000) if word not in stop_list]

In [None]:
from collections import Counter

# One raw-count vector per training paragraph, in vocabulary order.
doc_vectors = []
for tdoc in tdocs:
    # Tally the paragraph once and look each vocabulary word up in the tally;
    # this avoids rescanning the paragraph for every word. Missing words count 0.
    token_counts = Counter(tdoc)
    dvec = [token_counts[w] for w in vocab]
    doc_vectors.append(dvec)

In [None]:
len(vocab)

In [None]:
# Stack the per-paragraph count lists into one 2-D array
# (one row per paragraph, one column per vocabulary word).
dt_matrix = np.array(doc_vectors)
# Scale each word column to unit L2 length.
dt_normed = normalize_columns(dt_matrix)

In [None]:
"winter" in stop_list

In [None]:
compare_words("northern", "hemisphere", dt_normed)