# Week 3: Vectors in Context
This notebook accompanies the week 3 lecture.

In [None]:
# Suppress every Python warning for the rest of the session so that warning
# messages do not clutter the cell outputs below.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# setup
import sys
import subprocess
import pkg_resources
from collections import Counter
import re


# Distributions this notebook relies on; any that are not visible to the
# running interpreter get pip-installed into that same interpreter.
required = {'spacy', 'scikit-learn', 'numpy', 'pandas', 'torch'}
installed = set()
for pkg in pkg_resources.working_set:
    installed.add(pkg.key)
missing = required.difference(installed)

if len(missing) > 0:
    # use the kernel's own executable so the install lands in its environment
    python = sys.executable
    subprocess.check_call([python, '-m', 'pip', 'install'] + list(missing),
                          stdout=subprocess.DEVNULL)

import spacy
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
import pickle

from spacy.lang.en import English
en = English()

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
device = torch.device("cpu")

def simple_tokenizer(doc, model=en):
    """Tokenize one document into lowercase alphabetic tokens.

    Parameters
    ----------
    doc : str
        Raw text of a single document.
    model : callable
        Pipeline used to parse ``doc``; it is called as ``model(doc)`` and
        must yield tokens exposing ``lower_``, ``is_alpha`` and ``like_url``.
        Defaults to the module-level ``en`` pipeline.

    Returns
    -------
    list of str
        Lowercased text of every token that is alphabetic and does not look
        like a URL.
    """
    parsed = model(doc)
    return [t.lower_ for t in parsed if t.is_alpha and not t.like_url]

## Moving beyond unigrams
Our work up to this point has mainly revolved around single-word tokens.  One way to include a bit more context is to move to bigrams, trigrams and maybe beyond (N-grams).  

Let's see how this may help us get better measures of similarity.

In [None]:
# Three toy reviews: a positive one, one that negates "bad" (so it reads as
# positive), and a negative one.
docs = ['The movie was good',
        'The movie was not bad, it was good',
        'The movie was bad']


In [None]:
# Pairwise cosine similarity of the toy reviews as the n-gram range grows
# from unigrams only up to unigrams + bigrams + trigrams.
for i in (1, 2, 3):
    cv = CountVectorizer(ngram_range=(1, i))
    cv.fit(docs)
    counts = cv.transform(docs)
    print('Using %s-grams' % i, cosine_similarity(counts), sep='\n')

You can see here that with only unigrams, the second review, which has a negation of the word "bad", is marked as just as similar to the "good review" as the "bad review".  But once you get to the bigrams and trigrams, the second review is closer to the good review, which actually makes more sense if you read it.

But let's take a look at what this does to the vocabulary size:

In [None]:
# Show the learned vocabulary, and its size, for each n-gram range.
for i in (1, 2, 3):
    cv = CountVectorizer(ngram_range=(1, i))
    cv.fit(docs)
    counts = cv.transform(docs)
    learned_vocab = cv.vocabulary_
    print('Using %s-grams' % i, learned_vocab, len(learned_vocab), sep='\n')

And this is just a simple corpus! Imagine if we had a realistic set of reviews, we could imagine many possible combinations of bigrams and trigrams.

This is one of the reasons why it makes sense to move into sequence-based models, where we have some information being shared throughout the full parsing of the document.  This leads us to:

## Recurrent Neural Networks


In [None]:
def doc_to_index(docs, vocab, unk_idx=1):
    """Map tokenized documents to lists of vocabulary indices.

    Parameters
    ----------
    docs : iterable of iterable of str
        Documents that have already been split into tokens.
    vocab : dict
        Mapping from token to integer index.
    unk_idx : int, optional
        Index substituted for any token that is not a key of ``vocab``.
        Defaults to 1, the slot this notebook reserves for ``_UNK``.

    Returns
    -------
    list of list of int
        One index list per document, in the same order as ``docs``.
    """
    return [[vocab.get(w, unk_idx) for w in d] for d in docs]

def pad_sequence(seqs, seq_len=200):
    """Return an integer array of shape (len(seqs), seq_len) holding the sequences.

    Sequences shorter than ``seq_len`` are right-aligned, so the zeros used
    as padding sit at the front of the row. Longer sequences keep only their
    first ``seq_len`` entries. Empty sequences give an all-zero row.
    """
    padded = np.zeros((len(seqs), seq_len), dtype=int)
    for row, seq in zip(padded, seqs):
        if not len(seq):
            continue
        # `row` is a view into `padded`, so this writes into the result
        row[-len(seq):] = np.asarray(seq)[:seq_len]
    return padded

def onehot_encode(data, vocab, seq_len=200):
    """One-hot encode index sequences.

    Returns a float32 array of shape (len(data), seq_len, len(vocab)) in
    which entry [doc, position, index] is 1 for every index appearing at
    that position of that document, and 0 everywhere else.
    """
    n_docs, n_words = len(data), len(vocab)
    encoded = np.zeros((n_docs, seq_len, n_words), dtype='float32')
    for doc_pos, doc in enumerate(data):
        word_ids = np.asarray(doc, dtype=int)
        positions = np.arange(len(word_ids))
        # one (position, word id) cell is switched on per token
        encoded[doc_pos, positions, word_ids] = 1
    return encoded

In [None]:
# you will need to change this to where ever the file is stored
data_location = '../data/assignment_1_reviews.pkl'
# NOTE: unpickling can execute arbitrary code -- only load files you trust
with open(data_location, 'rb') as f:
    all_text = pickle.load(f)
# assumes the pickled dict holds exactly two lists, negative reviews first and
# positive reviews second -- TODO confirm against the data file
neg, pos = all_text.values()
# join all reviews
all_reviews = neg+pos
# create binary indicator for positive review
is_positive = np.array([0]*len(neg)+[1]*len(pos))
# sample random 70% for fitting model (training)
# 30% will be simulating "new observations" (testing)
pct_sample = 0.7
# fixed seed so that Restart & Run All reproduces the same split
split_seed = 0
train_bool = np.random.default_rng(split_seed).random(len(all_reviews)) < pct_sample
reviews_train = [d for d, in_train in zip(all_reviews, train_bool) if in_train]
reviews_test = [d for d, in_train in zip(all_reviews, train_bool) if not in_train]
is_positive_train = is_positive[train_bool]
is_positive_test = is_positive[~train_bool]
print(len(reviews_train), len(reviews_test))

In [None]:
# Tokenize every review up front so the tokens line up with the word indices.
parsed_train = list(map(simple_tokenizer, reviews_train))
parsed_test = list(map(simple_tokenizer, reviews_test))
# The documents are already token lists, so the vectorizer's tokenizer just
# hands them back and lowercasing is switched off.
cv = CountVectorizer(lowercase=False, tokenizer=lambda tokens: tokens)
# **important** fit on the training split only, so nothing from the test
# split leaks into the vocabulary
cv.fit(parsed_train)
# token -> column index mapping learned from the training split
vocab = cv.vocabulary_

A note here: We'll be using this vocabulary to transform each token to an index (numeric).  However there are two "special" tokens that we'll need to add:

\_PAD: The model expects all inputs to be of the same length, so we've specified a sequence length.  If a document is longer than that, it gets truncated.  If it's shorter than that, it gets padded.  This token indicates that a particular element of the input document is padding, which is useful information for the model.

\_UNK: Depending on the vocab design, we may have certain tokens that are not included (i.e. do not have an index).  Any of these tokens are labelled as "unknown".

In [None]:
# Build the shifted vocabulary from the fitted vectorizer rather than from
# `vocab` itself, so re-running this cell does not shift the indices again.
# Indices 0 and 1 are reserved for the special tokens.
vocab = {token: idx + 2 for token, idx in cv.vocabulary_.items()}
vocab['_UNK'] = 1
vocab['_PAD'] = 0
# Keep the token lists in parsed_train / parsed_test untouched and store the
# index sequences under their own names; this keeps the cell idempotent.
indexed_train = doc_to_index(parsed_train, vocab)
padded_train = pad_sequence(indexed_train)
indexed_test = doc_to_index(parsed_test, vocab)
padded_test = pad_sequence(indexed_test)
# onehot encoding
onehot_train = onehot_encode(padded_train, vocab)
onehot_test = onehot_encode(padded_test, vocab)

In [None]:
# Wrap the one-hot arrays and their labels as PyTorch datasets.
# (No separate validation split is built; the test split doubles as one below.)
batch_size = 100

train_data = TensorDataset(
    torch.from_numpy(onehot_train),
    torch.from_numpy(is_positive_train),
)
test_data = TensorDataset(
    torch.from_numpy(onehot_test),
    torch.from_numpy(is_positive_test),
)

# drop_last=True discards the final partial batch so every batch has exactly
# batch_size rows, matching the hidden state size used later
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True, drop_last=True)

In [None]:
class SentimentNet(nn.Module):
    """Sentiment classifier: stacked LSTM -> dropout -> linear layer -> sigmoid.

    Adapted from https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """Build the network.

        Parameters
        ----------
        vocab_size : int
            Length of the input vector at each time step (it is passed to the
            LSTM as its input size).
        output_size : int
            Number of outputs of the fully-connected layer.
        embedding_dim : int
            Accepted to keep the constructor signature stable; this model has
            no embedding layer and does not use the value.
        hidden_dim : int
            Size of the LSTM hidden state.
        n_layers : int
            Number of stacked LSTM layers.
        drop_prob : float, optional
            Dropout probability, used both between LSTM layers and on the
            LSTM output.
        """
        super().__init__()
        # size of the output of the fully-connected layer
        self.output_size = output_size
        # number of stacked LSTM layers (the linear layer is not counted here)
        self.n_layers = n_layers
        # dimensions of our hidden state, what is passed from one time point to the next
        self.hidden_dim = hidden_dim
        # LSTM layer; batch_first=True means inputs are (batch, seq, feature)
        self.lstm = nn.LSTM(vocab_size, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        # dropout, similar to regularization
        self.dropout = nn.Dropout(drop_prob)
        # fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)
        # sigmoid activation
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden):
        """Run one forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Batch of sequences, shaped (batch, seq_len, vocab_size).
        hidden : tuple of torch.Tensor
            LSTM (hidden state, cell state) pair, e.g. from ``init_hidden``.

        Returns
        -------
        out : torch.Tensor
            One value per sequence, shape (batch,): the last sigmoid output
            computed for the final time step.
        hidden : tuple of torch.Tensor
            Updated LSTM state.
        """
        batch_size = x.size(0)
        lstm_out, hidden = self.lstm(x, hidden)
        # flatten (batch, seq, hidden) to (batch*seq, hidden) for the linear layer
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.sigmoid(out)
        # back to one row per sequence, then keep only the final entry
        out = out.view(batch_size, -1)
        out = out[:, -1]
        return out, hidden

    def init_hidden(self, batch_size):
        """Return a zero (hidden state, cell state) pair for the LSTM.

        Each tensor has shape (n_layers, batch_size, hidden_dim) and is
        created on the same device and with the same dtype as the model's
        parameters.
        """
        reference = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (torch.zeros(shape, device=reference.device, dtype=reference.dtype),
                torch.zeros(shape, device=reference.device, dtype=reference.dtype))

In [None]:
# with one-hot inputs, each time step is a vector as long as the vocabulary
vocab_size = len(vocab)
output_size = 1
hidden_dim = 512
n_layers = 2
embedding_dim = 400

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
model.to(device)

lr=0.005
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
epochs = 2
counter = 0
print_every = 5
clip = 5
# np.inf (lowercase) -- the np.Inf alias no longer exists in NumPy 2.0
valid_loss_min = np.inf

model.train()
for i in range(epochs):
    # fresh zero state at the start of every epoch
    h = model.init_hidden(batch_size)
    for inputs, labels in train_loader:
        counter += 1
        # cut the carried-over state out of the graph so backprop stops at
        # the batch boundary
        h = tuple([e.data for e in h])
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output, h = model(inputs, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # clip gradients to guard against exploding gradients
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        if counter%print_every == 0:
            val_h = model.init_hidden(batch_size)
            val_losses = []
            # eval mode turns dropout off; no_grad skips building a graph
            model.eval()
            with torch.no_grad():
                # the test split doubles as the validation set here
                for inp, lab in test_loader:
                    inp, lab = inp.to(device), lab.to(device)
                    out, val_h = model(inp, val_h)
                    val_loss = criterion(out.squeeze(), lab.float())
                    val_losses.append(val_loss.item())

            model.train()
            mean_val_loss = np.mean(val_losses)
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(mean_val_loss))
            # checkpoint whenever the validation loss is the best seen so far
            if mean_val_loss <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,mean_val_loss))
                valid_loss_min = mean_val_loss

Now that we have a trained LSTM model, we can compare it to the results of something simple like count vectors + SVM.



In [None]:
# count vector approach
# re-tokenize the raw reviews so the vectorizer receives token lists
parsed_train = [simple_tokenizer(d) for d in reviews_train]
parsed_test = [simple_tokenizer(d) for d in reviews_test]
train_vecs = cv.transform(parsed_train).toarray()
test_vecs = cv.transform(parsed_test).toarray()
svc = LinearSVC()
svc.fit(train_vecs, is_positive_train)
svc_preds = svc.predict(test_vecs)
# scoring accuracy against the held-out labels (uses svc_preds, the
# predictions computed just above)
print('SVC accuracy:', accuracy_score(is_positive_test, svc_preds))

In [None]:
# pytorch LSTM model
# eval mode switches dropout off so predictions are deterministic
model.eval()
num_correct = 0
num_evaluated = 0
# start from a fresh zero state instead of the one left over from training
h = model.init_hidden(batch_size)
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        output, h = model(inputs, h)
        # takes output, rounds to 0/1
        pred = torch.round(output.squeeze())
        # take the correct labels, check against preds
        correct_tensor = pred.eq(labels.float().view_as(pred))
        # sum the number of correct
        num_correct += int(correct_tensor.sum().item())
        num_evaluated += labels.size(0)
# calc accuracy over the samples actually scored: the loader uses
# drop_last=True, so len(test_loader.dataset) would overcount
test_acc = num_correct/num_evaluated if num_evaluated else float('nan')
print('LSTM accuracy:', test_acc)

Yikes.  All that work and we have a model that doesn't perform as well as the simple model.

There are several issues that will be elaborated on in the slides.  But the main one we're going to focus on is word embeddings.  Currently each element in an observation is a one-hot encoded vector for the word index.  That's a pretty huge vector, and it's mostly zeros.  What if we had a denser, more informative representation of an individual word?

### Word-level representations
Remember from Week 2: Our document-level representations can also be used to create word-level representations.  For count vectors and tfidf vectors, we can just transpose the matrix from document-word to word-document.  For matrix factorization, part of the estimation involves creating a word-component matrix.

In [None]:
# number of components for both factorizations
n_components = 10

# raw count and tf-idf document vectors over the full review set
cv = CountVectorizer(tokenizer=simple_tokenizer)
count_vecs = cv.fit_transform(all_reviews)
tfidf = TfidfVectorizer(tokenizer=simple_tokenizer)
tfidf_vecs = tfidf.fit_transform(all_reviews)

# NMF is fit on the tf-idf vectors, LDA on the raw counts
nmf = NMF(n_components=n_components)
nmf_vecs = nmf.fit_transform(tfidf_vecs)
lda = LatentDirichletAllocation(n_components=n_components)
lda_vecs = lda.fit_transform(count_vecs)

In [None]:
# Transpose each document-level matrix so that rows correspond to words.
count_words, tfidf_words = count_vecs.T, tfidf_vecs.T
nmf_words, lda_words = nmf.components_.T, lda.components_.T
for rep in (count_words, tfidf_words, nmf_words, lda_words):
    print(rep.shape)

Ideally, these word-level representations encode some amount of the word's meaning.  So let's test them with a few words.  Intuitively, we know that the words "good" and "bad" should be pretty different from each other (semantically, at least).  We know that "good" and "great" should be pretty similar.  Let's see how our representations capture that.

In [None]:
seed_words = ['good', 'great', 'bad']
# row of each seed word in the word-level matrices
seed_idxs = list(map(cv.vocabulary_.__getitem__, seed_words))
# pairwise similarity of the seed words under each representation
for rep in (count_words, tfidf_words, nmf_words, lda_words):
    print(cosine_similarity(rep[seed_idxs]))

None of them seem to do great with the good to bad similarity; they're often more similar than good to great.  Bad to great, however, seems more promising.

But generally: These representations are based on a very small corpus and a very specific context.  What word representations (or embeddings) like Word2vec or GloVe try to do is make more general representations of the word based on its context in a large corpus of non-specific context.  Let's see how SpaCy's GloVe-based representations do on this task.

In [None]:
# only the md and lg models contain GloVe vectors
# NOTE(review): spacy.load needs this model package to be installed already
# (presumably via `python -m spacy download en_core_web_md`) -- confirm for
# your environment.
nlp = spacy.load('en_core_web_md')

In [None]:
# Parse each seed word with the model and read off its `vector` attribute.
glove_words = list(map(lambda word: nlp(word).vector, seed_words))
# each vector is 300-dimensional dense representation
print(glove_words[0][:10], glove_words[0].shape, sep='\n')

In [None]:
# pairwise cosine similarity of the seed-word vectors; rows and columns
# follow the order of seed_words
cosine_similarity(glove_words)

Wow! This works really well.  We can see that good is pretty close to great, less close to bad.  Bad is far from good, but farther from great.  This is what we'd intuitively expect.

This isn't to say that GloVe is always preferred.  Depending on the context, other word representations might be more useful.  But let's go with GloVe and run our RNN model with this instead of the sparse representation.

### RNN with GloVe vectors
Instead of one-hot vectors for each word, we need the 300-dimensional word vector.  We can get this from SpaCy and a quick way of doing so is to use the vocabulary we already fit.

In [None]:
# One row per vocabulary entry, placed at that entry's index.
glove_vecs = np.zeros((len(vocab), 300))
for token, row in vocab.items():
    # parse the token with the spaCy model and store its vector
    glove_vecs[row] = nlp(token).vector

In [None]:
def glove_lookup(data, glove_vecs, seq_len=200):
    """Replace every word index with its vector.

    Parameters
    ----------
    data : sequence of sequences of int
        Index sequences, one per document; each must be no longer than
        ``seq_len``.
    glove_vecs : array-like, shape (n_words, dim)
        Row ``k`` is the vector for word index ``k``.
    seq_len : int, optional
        Length of the time axis of the result.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (len(data), seq_len, dim). Positions past the
        end of a document are left as zeros.
    """
    glove_vecs = np.asarray(glove_vecs)
    # take the vector width from the matrix instead of hard-coding 300
    dim = glove_vecs.shape[1]
    glove_data = np.zeros((len(data), seq_len, dim), dtype='float32')
    for i, d in enumerate(data):
        idxs = np.asarray(d, dtype=int)
        # one fancy-indexed lookup per document instead of one per token
        glove_data[i, :len(idxs)] = glove_vecs[idxs]
    return glove_data

In [None]:
# glove lookup: swap each word index in the padded sequences for its row of
# glove_vecs, for both the training and the test split
glove_train = glove_lookup(padded_train, glove_vecs)
glove_test = glove_lookup(padded_test, glove_vecs)

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

# Same dataset/loader setup as for the one-hot model, now over GloVe inputs.
# (No separate validation split is built; the test split doubles as one below.)
batch_size = 100

train_data = TensorDataset(
    torch.from_numpy(glove_train),
    torch.from_numpy(is_positive_train),
)
test_data = TensorDataset(
    torch.from_numpy(glove_test),
    torch.from_numpy(is_positive_test),
)

# drop_last=True discards the final partial batch so every batch has exactly
# batch_size rows, matching the hidden state size used later
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True, drop_last=True)

In [None]:
# vocab_size = the shape of the input for a particular time point
# with GloVe vectors, it's the length of the vector; 300
vocab_size = 300
output_size = 1
hidden_dim = 512
n_layers = 2
embedding_dim = 400

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
model.to(device)

lr=0.005
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
epochs = 2
counter = 0
print_every = 5
clip = 5
# np.inf (lowercase) -- the np.Inf alias no longer exists in NumPy 2.0
valid_loss_min = np.inf

model.train()
for i in range(epochs):
    # fresh zero state at the start of every epoch
    h = model.init_hidden(batch_size)
    for inputs, labels in train_loader:
        counter += 1
        # cut the carried-over state out of the graph so backprop stops at
        # the batch boundary
        h = tuple([e.data for e in h])
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output, h = model(inputs, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # clip gradients to guard against exploding gradients
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        if counter%print_every == 0:
            val_h = model.init_hidden(batch_size)
            val_losses = []
            # eval mode turns dropout off; no_grad skips building a graph
            model.eval()
            with torch.no_grad():
                # the test split doubles as the validation set here
                for inp, lab in test_loader:
                    inp, lab = inp.to(device), lab.to(device)
                    out, val_h = model(inp, val_h)
                    val_loss = criterion(out.squeeze(), lab.float())
                    val_losses.append(val_loss.item())

            model.train()
            mean_val_loss = np.mean(val_losses)
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(mean_val_loss))
            # checkpoint whenever the validation loss is the best seen so far
            if mean_val_loss <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,mean_val_loss))
                valid_loss_min = mean_val_loss

In [None]:
# pytorch LSTM model
# eval mode switches dropout off so predictions are deterministic
model.eval()
num_correct = 0
num_evaluated = 0
# start from a fresh zero state instead of the one left over from training
h = model.init_hidden(batch_size)
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        output, h = model(inputs, h)
        # takes output, rounds to 0/1
        pred = torch.round(output.squeeze())
        # take the correct labels, check against preds
        correct_tensor = pred.eq(labels.float().view_as(pred))
        # sum the number of correct
        num_correct += int(correct_tensor.sum().item())
        num_evaluated += labels.size(0)
# calc accuracy over the samples actually scored: the loader uses
# drop_last=True, so len(test_loader.dataset) would overcount
test_acc = num_correct/num_evaluated if num_evaluated else float('nan')
print('LSTM accuracy:', test_acc)