In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from numpy import linalg
from numpy import dot
import pandas as pd

In [None]:
def norm_vec(v):
    """Scale ``v`` to unit Euclidean length.

    Args:
        v: Array-like numeric vector.

    Returns:
        ``v`` divided by its norm. A vector whose norm is zero is returned
        unchanged, because dividing by zero would fill it with NaN.
    """
    magnitude = np.linalg.norm(v)
    if magnitude == 0:
        # Nothing to scale; avoid the 0/0 division.
        return v
    return v / magnitude
# Print floating-point array values with 3 digits of precision.
np.set_printoptions(precision=3)

def normalize_rows(x):
    """Scale every row of the 2-D array ``x`` to unit Euclidean (L2) length.

    Args:
        x: 2-D numeric array.

    Returns:
        An array of the same shape in which each non-zero row has norm 1.
        All-zero rows are left as zeros instead of turning into NaN.
    """
    row_norms = np.linalg.norm(x, ord=2, axis=1, keepdims=True)
    # Dividing a zero row by 1 keeps it zero and avoids 0/0.
    row_norms[row_norms == 0] = 1
    return x / row_norms

def normalize_columns(x):
    """Scale every column of the 2-D array ``x`` to unit Euclidean (L2) length.

    Args:
        x: 2-D numeric array.

    Returns:
        An array of the same shape in which each non-zero column has norm 1.
        All-zero columns are left as zeros instead of turning into NaN.
    """
    column_norms = np.linalg.norm(x, ord=2, axis=0, keepdims=True)
    # Dividing a zero column by 1 keeps it zero and avoids 0/0.
    column_norms[column_norms == 0] = 1
    return x / column_norms

<h1>Seasons with LSA</h1>

Latent Semantic Analysis (LSA, also called Latent Semantic Indexing, LSI) is all about squishing down the number of dimensions used to represent each word.

In [None]:
from utilities import *
from seasons_module import load_seasons_corpus_as_utterances
from seasons_module import load_seasons_comparison_files

import numpy as np
def norm_vec(vec):
    """Return ``vec`` scaled to unit length; a zero vector comes back as is."""
    squared_length = np.dot(vec, vec)
    if squared_length != 0:
        return vec / np.sqrt(squared_length)
    # All components are zero, so there is no direction to preserve.
    return vec
    
def pure_tf(tf, df, cf, N):
    """Return the raw term frequency ``tf`` unchanged.

    ``df``, ``cf`` and ``N`` are accepted but ignored by this weighting.
    NOTE(review): presumably document frequency, collection frequency and
    corpus size, judging by the names only; confirm against callers.
    """
    return tf

def tf(tf):
    """Log-scaled term weight for a raw count.

    Returns 0 when the count ``tf`` is 0, otherwise ``1 + ln(tf)``.
    Inside the body the parameter ``tf`` shadows this function's own name.
    """
    return 0 if tf == 0 else 1 + np.log(tf)

def weighted_word(the_text, word):
    """Weight of ``word`` in ``the_text``: ``tf`` applied to its occurrence count.

    The count comes from ``the_text.count(word)``, so for a list it counts
    equal elements.
    """
    occurrences = the_text.count(word)
    return tf(occurrences)

def compute_normed_doc_vector(word_list, vocab):
    """Unit-length version of the document vector for ``word_list``.

    Builds the same per-word weights as ``compute_doc_vector`` and passes
    the resulting array through ``norm_vec``.
    """
    raw_vector = compute_doc_vector(word_list, vocab)
    return norm_vec(raw_vector)

def compute_doc_vector(word_list, vocab):
    """Array holding the ``weighted_word`` weight of each ``vocab`` entry.

    Entries follow the order of ``vocab``; the weights are computed against
    ``word_list``.
    """
    weights = []
    for term in vocab:
        weights.append(weighted_word(word_list, term))
    return np.array(weights)

def prepare_seasons_corpus_by_utterance(vocab_size=50, norm_vecs=False):
    """Build one term-weight vector per utterance in the seasons corpus.

    The vocabulary is made of the ``vocab_size`` most frequent words over all
    utterances, after dropping every word listed (one per line) in
    ``lists/seasons_stop_list.txt``.

    Args:
        vocab_size: Number of most-common words to keep as the vocabulary.
        norm_vecs: When true, vectors are built with
            ``compute_normed_doc_vector``; otherwise with
            ``compute_doc_vector``.

    Returns:
        A ``(doc_vectors, new_vocab)`` tuple: a list with one vector per
        utterance, in corpus iteration order, and the list of vocabulary
        words ordered from most to least frequent.
    """
    seasons_corpus = load_seasons_corpus_as_utterances()

    # Use a context manager so the stop-list file handle is always closed.
    with open("lists/seasons_stop_list.txt") as stop_file:
        stop_list = set(stop_file.read().split("\n"))

    # Element [0] of each corpus entry is iterated as that file's utterances;
    # collect them once so both passes below see the same sequence.
    utterances = [
        utterance
        for fname in seasons_corpus.keys()
        for utterance in seasons_corpus[fname][0]
    ]

    # NOTE(review): `nltk` is not imported in the visible cells; presumably it
    # arrives through `from utilities import *`. Confirm.
    word_fdist = nltk.FreqDist()
    for utterance in utterances:
        word_fdist.update(w for w in utterance if w not in stop_list)

    # most_common() yields (word, count) pairs, so the stop-list check has to
    # look at the word itself; testing the pair never matched anything.
    new_vocab = [
        word
        for word, _count in word_fdist.most_common(vocab_size)
        if word not in stop_list
    ]

    # Compute the document vector for each utterance.
    vectorize = compute_normed_doc_vector if norm_vecs else compute_doc_vector
    doc_vectors = [vectorize(utterance, new_vocab) for utterance in utterances]
    return doc_vectors, new_vocab


In [None]:
# Build the per-utterance vectors with the defaults (vocab_size=50, unnormalized).
doc_vectors, vocab = prepare_seasons_corpus_by_utterance()
# Stack them into a matrix: one row per utterance, one column per vocabulary word.
X = np.array(doc_vectors)

In [None]:
# (number of utterances, number of vocabulary words)
X.shape

<h2>Seasons with LSI</h2>

The singular value decomposition (SVD) gives us a way of taking the word vectors and squishing them down to fewer dimensions.

We are going to squish them all the way down to two dimensions.

In [None]:
# Number of leading SVD dimensions kept for each word vector.
dims = 2

In [None]:
# Decompose the word-by-utterance matrix (the transpose of X) with a reduced SVD.
T, S, Dt = np.linalg.svd(X.T, full_matrices=False)
# Keep the first `dims` columns of T: one `dims`-dimensional row per vocabulary word.
T_reduced = T[:, :dims]
# Same word vectors with every row rescaled by normalize_rows.
T_normed = normalize_rows(T_reduced)

Now `T_reduced` is an array in which each row is the two-dimensional vector for one word in our vocabulary.

In [None]:
# (vocabulary size, dims): one row per word, one column per kept dimension.
T_reduced.shape

In [None]:
def get_row_vector(w, T):
    """Return the row of ``T`` at the position of word ``w`` in the global ``vocab``.

    Raises:
        ValueError: if ``w`` is not in ``vocab`` (from ``vocab.index``).
    """
    return T[vocab.index(w)]

def compare_words(w1, w2, T):
    """Cosine similarity between the rows of ``T`` for words ``w1`` and ``w2``.

    Each row is fetched with ``get_row_vector`` and passed through
    ``norm_vec`` before the dot product is taken.
    """
    unit_1, unit_2 = (norm_vec(get_row_vector(word, T)) for word in (w1, w2))
    return np.dot(unit_1, unit_2)

In [None]:
# Reduced vector for the word "tilt" (raises ValueError if it is not in vocab).
get_row_vector("tilt", T_reduced)

In [None]:
# Cosine similarity between the reduced vectors for "farther" and "closer".
compare_words("farther", "closer", T_reduced)

Since there are only two dimensions, we can show the word vectors on a plot.

In [None]:
# Scatter the raw reduced word vectors and label each point with its word.
fig, ax = plt.subplots(figsize=(20, 10), dpi=80, facecolor='w', edgecolor='k')
ax.scatter(T_reduced[:, 0], T_reduced[:, 1])
for word, point in zip(vocab, T_reduced):
    ax.annotate(word, point)
# Title and axis labels let the figure be read on its own.
ax.set_xlabel("LSA dimension 1")
ax.set_ylabel("LSA dimension 2")
ax.set_title("Word vectors in the first two LSA dimensions");

In [None]:
# Scatter the row-normalized word vectors and label each point with its word.
fig, ax = plt.subplots(figsize=(20, 10), dpi=80, facecolor='w', edgecolor='k')
ax.scatter(T_normed[:, 0], T_normed[:, 1])
for word, point in zip(vocab, T_normed):
    ax.annotate(word, point)
# Title and axis labels let the figure be read on its own.
ax.set_xlabel("LSA dimension 1")
ax.set_ylabel("LSA dimension 2")
ax.set_title("Row-normalized word vectors in the first two LSA dimensions");