In [0]:
%%bash
!(stat -t /usr/local/lib/*/dist-packages/google/colab > /dev/null 2>&1) && exit 
rm -rf 6864-hw1
git clone https://github.com/lingo-mit/6864-hw1.git

Cloning into '6864-hw1'...


In [0]:
import sys
sys.path.append("/content/6864-hw1")

import csv
import itertools as it
import numpy as np
np.random.seed(0)

import lab_util

## Introduction

This lab will explore three different ways of using unlabeled text data to learn pretrained word representations. It will describe the effects of different modeling decisions (representation learning objective, context size, etc.) on both qualitative properties of learned representations and their effect on a downstream prediction problem.


The lab will work with a dataset of product reviews. It looks like this:

In [0]:
data = []
n_positive = 0
n_disp = 0
with open("/content/6864-hw1/reviews.csv") as reader:
  csvreader = csv.reader(reader)
  next(csvreader)
  for id, review, label in csvreader:
    label = int(label)

    # hacky class balancing
    if label == 1:
      if n_positive == 2000:
        continue
      n_positive += 1
    if len(data) == 4000:
      break

    data.append((review, label))
    
    if n_disp > 5:
      continue
    n_disp += 1
    print("review:", review)
    print("rating:", label, "(good)" if label == 1 else "(bad)")
    print()

print(f"Read {len(data)} total reviews.")
np.random.shuffle(data)
reviews, labels = zip(*data)
train_reviews = reviews[:3000]
train_labels = labels[:3000]
val_reviews = reviews[3000:3500]
val_labels = labels[3000:3500]
test_reviews = reviews[3500:]
test_labels = labels[3500:]

review: I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.
rating: 1 (good)

review: Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".
rating: 0 (bad)

review: This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother an

## Part 1: word representations via matrix factorization

First, the term--document matrix is constructed (look at `lab_util.py` in the file browser to see how this works).

In [0]:
vectorizer = lab_util.CountVectorizer()
vectorizer.fit(train_reviews)
td_matrix = vectorizer.transform(train_reviews).T
print(f"TD matrix is {td_matrix.shape[0]} x {td_matrix.shape[1]}")

print(td_matrix)

TD matrix is 2006 x 3000
[[3. 3. 5. ... 1. 6. 4.]
 [1. 0. 0. ... 0. 0. 0.]
 [2. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [0]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

def learn_reps_lsa(matrix, rep_size):
  # `matrix` is a `|V| x n` matrix, where `|V|` is the number of words in the
  # vocabulary. This function should return a `|V| x rep_size` matrix with each
  # row corresponding to a word representation. The `sklearn.decomposition` 
  # package may be useful.

    svd_model = TruncatedSVD(n_components=rep_size)
    svd_model.fit(matrix)
    svd_matrix=svd_model.transform(matrix)
    
    return svd_matrix

Let's look at some representations:

In [0]:
reps = learn_reps_lsa(td_matrix, 64)
words = ["good", "bad", "cookie", "jelly", "dog", "the", "4"]
show_tokens = [vectorizer.tokenizer.word_to_token[word] for word in words]
lab_util.show_similar_words(vectorizer.tokenizer, reps, show_tokens)

good 47
  pretty 0.670
  liked 0.782
  everyone 0.811
  really 0.817
  beat 0.865
bad 201
  taste 0.595
  really 0.614
  ok 0.623
  . 0.638
  didn't 0.654
cookie 504
  nana's 0.471
  cookies 0.601
  gluten 0.816
  free 0.838
  shortbread 0.844
jelly 351
  bread 1.039
  online 1.043
  low 1.043
  hoping 1.094
  twist 1.102
dog 925
  pet 0.264
  dogs 0.308
  food 0.385
  nutritious 0.433
  pets 0.475
the 36
  . 0.331
  <unk> 0.366
  of 0.394
  and 0.402
  to 0.422
4 292
  1 0.206
  6 0.210
  2 0.319
  70 0.434
  5 0.458


Here, the TF-IDF transform is implemented.

In [0]:
import math

def transform_tfidf(matrix):
  # `matrix` is a `|V| x |D|` matrix of raw counts, where `|V|` is the 
  # vocabulary size and `|D|` is the number of documents in the corpus. This
  # function should (nondestructively) return a version of `matrix` with the
  # TF-IDF transform appliied.

  thresholded = matrix > 1
  dfs = thresholded.sum(axis=1)[:, np.newaxis]
  idfs = np.log(matrix.shape[1]) - np.log(dfs + 1e-8)

  return matrix * idfs

Let's see how does this change the learned similarity function.

In [0]:
td_matrix_tfidf = transform_tfidf(td_matrix)
#reps = learn_reps_lsa(td_matrix, 1000)  #SVD만 적용 bigger the better
reps_tfidf = learn_reps_lsa(td_matrix_tfidf, 64)  #TFIDF로 Term-doc 관련성/중요도 적용 후 SVD 적용
#w_co = learn_reps_lsa(w_tt, 1000)  #TFIDF로 Term-doc 관련성/중요도 적용 후 SVD 적용
lab_util.show_similar_words(vectorizer.tokenizer, reps_tfidf, show_tokens)

good 47
  but 0.326
  . 0.401
  like 0.413
  is 0.430
  too 0.433
bad 201
  taste 0.391
  like 0.451
  just 0.511
  me 0.514
  but 0.521
cookie 504
  cookies 0.658
  nana's 0.697
  gluten 0.744
  flour 0.821
  free 0.830
jelly 351
  mixing 0.809
  advertised 0.963
  save 0.963
  muffins 0.976
  vanilla 0.986
dog 925
  pet 0.362
  dogs 0.404
  food 0.468
  pets 0.544
  switched 0.554
the 36
  . 0.086
  of 0.103
  to 0.122
  and 0.132
  in 0.157
4 292
  6 0.178
  1 0.206
  2 0.468
  70 0.506
  5 0.601


Now that we have some representations, let's see if we can do something useful with them.

Below is the implementation of a feature function that represents a document as the sum of its
learned word embeddings.

The remaining code trains a logistic regression model on a set of *labeled* reviews; we're interested in seeing how much representations learned from *unlabeled* reviews improve classification.

In [0]:
REP_DICT = learn_reps_lsa(td_matrix_tfidf, 64)

def word_featurizer(xs):
  # normalize
  return xs / np.sqrt((xs ** 2).sum(axis=1, keepdims=True))

def lsa_featurizer(xs):
  # This function takes in a matrix in which each row contains the word counts
  # for the given review. It should return a matrix in which each row contains
  # the learned feature representation of each review (e.g. the sum of LSA 
  # word representations).

  feats = sum(np.outer(xs[:, i], REP_DICT[i, :]) for i in range(xs.shape[1]))
  # normalize
  return feats / np.sqrt((feats ** 2).sum(axis=1, keepdims=True))

def combo_featurizer(xs):
  return np.concatenate((word_featurizer(xs), lsa_featurizer(xs)), axis=1)

def train_model(featurizer, xs, ys):
  import sklearn.linear_model
  xs_featurized = featurizer(xs)
  model = sklearn.linear_model.LogisticRegression()
  model.fit(xs_featurized, ys)
  return model

def eval_model(model, featurizer, xs, ys):
  xs_featurized = featurizer(xs)
  pred_ys = model.predict(xs_featurized)
  print("test accuracy", np.mean(pred_ys == ys))

def training_experiment(name, featurizer, n_train):
  print(f"{name} features, {n_train} examples")
  train_rv = vectorizer.transform(train_reviews[:n_train])
  train_lb = train_labels[:n_train]
  test_rv = vectorizer.transform(test_reviews)
  test_lb = test_labels
  model = train_model(featurizer, train_rv, train_lb)
  eval_model(model, featurizer, test_rv, test_lb)
  print()

training_experiment("word", word_featurizer, 20)
training_experiment("lsa", lsa_featurizer, 20)
training_experiment("combo", combo_featurizer, 20)

training_experiment("word", word_featurizer, 100)
training_experiment("lsa", lsa_featurizer, 100)
training_experiment("combo", combo_featurizer, 100)

training_experiment("word", word_featurizer, 1000)
training_experiment("lsa", lsa_featurizer, 1000)
training_experiment("combo", combo_featurizer, 1000)

word features, 20 examples
test accuracy 0.526

lsa features, 20 examples
test accuracy 0.524

combo features, 20 examples
test accuracy 0.526

word features, 100 examples
test accuracy 0.616

lsa features, 100 examples
test accuracy 0.6

combo features, 100 examples
test accuracy 0.624

word features, 1000 examples
test accuracy 0.784

lsa features, 1000 examples
test accuracy 0.682

combo features, 1000 examples
test accuracy 0.784



## Part 2: word representations via language modeling

This section will train a word embedding model with a word2vec-style objective rather than a matrix factorization objective.

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as torch_data
from torch.autograd import Variable

class Word2VecModel(nn.Module):
  # A torch module implementing a word2vec predictor. The `forward` function
  # should take a batch of context word ids as input and predict the word 
  # in the middle of the context as output, as in the CBOW model from lecture.

  def __init__(self, vocab_size, embed_dim):
        # Your code here!
        super().__init__()
        self.embeds = nn.Embedding(vocab_size, embed_dim)
        self.linear1 = nn.Linear(embed_dim, 64)
        self.linear2 = nn.Linear(64, vocab_size)
      
  def forward(self, context):
      # Context is an `n_batch x n_context` matrix of integer word ids
      # this function should return a set of scores for predicting the word 
      # in the middle of the context
        # Your code here!
        
        output = self.embeds(context) # get the embeddings
        output = output.sum(dim=1)
        output = F.relu(self.linear1(output)) # pass through first layer
        output = self.linear2(output) # pass through second layer

        return output

In [0]:
import time

def learn_reps_word2vec(corpus, window_size, rep_size, n_epochs, n_batch):
  # This method takes in a corpus of training sentences. It returns a matrix of
  # word embeddings with the same structure as used in the previous section of 
  # the assignment. (You can extract this matrix from the parameters of the 
  # Word2VecModel.)

  tokenizer = lab_util.Tokenizer()
  tokenizer.fit(corpus)
  tokenized_corpus = tokenizer.tokenize(corpus)
  #(array([48,  0,  8, 44]), 49),
  #(array([ 0, 49, 44,  3]), 8),   ngrams format: (context words , target word)
  ngrams = lab_util.get_ngrams(tokenized_corpus, window_size) #context matrix

  device = torch.device('cuda')  # run on colab gpu
  model = Word2VecModel(tokenizer.vocab_size, rep_size).to(device)
  opt = optim.Adam(model.parameters(), lr=0.001) 
  loss_fn = nn.CrossEntropyLoss()

  loader = torch_data.DataLoader(ngrams, batch_size=n_batch, shuffle=True)

  start = time.time()
  for epoch in range(n_epochs):
    epoch_loss = 0
    n_batches = 0
    #total_loss = 0
    for context, label in loader:
      # as described above, `context` is a batch of context word ids, and
      # `label` is a batch of predicted word labels
      #context_var = torch.grad.Variable(torch.LongTensor(context))
      preds = model(context.to(device))
      loss=loss_fn(preds, label.to(device))
      
      opt.zero_grad()
      loss.backward()
      opt.step()

      epoch_loss += loss.item()
      n_batches += 1

    epoch_loss /= n_batches
    if (epoch+1)%10 ==0:
      print(f"epoch {epoch+1}: {epoch_loss}, {time.time()-start}s")
      start = time.time()

  # reminder: you want to return a `vocab_size x embedding_size` numpy array
  embedding_matrix = next(model.embeds.parameters())
  return embedding_matrix.cpu().detach().numpy()

In [0]:
reps_word2vec = learn_reps_word2vec(train_reviews, 1, 32, 200, 1024)

epoch 10: 4.260442705190137, 27.9047908782959s
epoch 20: 3.991017733620347, 27.69009256362915s
epoch 30: 3.8554277089651157, 27.87818193435669s
epoch 40: 3.770531847235862, 27.725236415863037s
epoch 50: 3.7098305814721613, 27.70301628112793s
epoch 60: 3.6639440443631863, 27.525153875350952s
epoch 70: 3.628050940760066, 27.614866018295288s
epoch 80: 3.598982144830825, 27.57706379890442s
epoch 90: 3.5748120631171525, 27.48516273498535s
epoch 100: 3.55465012453915, 27.574446439743042s
epoch 110: 3.537092481213116, 27.471062660217285s
epoch 120: 3.5218868746739647, 27.32981514930725s
epoch 130: 3.5082695037684637, 27.559032678604126s
epoch 140: 3.4957497602098444, 27.448981046676636s
epoch 150: 3.4851980548672907, 27.528251886367798s
epoch 160: 3.475864617565598, 27.421628952026367s
epoch 170: 3.4667931335249196, 27.49956250190735s
epoch 180: 3.459347671337342, 27.48327136039734s
epoch 190: 3.4517817997307367, 27.20704436302185s
epoch 200: 3.4457431411028803, 27.177741527557373s


After training the embeddings, we can try to visualize the embedding space to see if it makes sense. First, we can take any word in the space and check its closest neighbors.

In [0]:
lab_util.show_similar_words(vectorizer.tokenizer, reps_word2vec, show_tokens)

good 47
  fine 0.529
  bad 0.559
  nice 0.596
  decent 0.627
  tasty 0.716
bad 201
  good 0.559
  easy 0.665
  weird 0.809
  strong 0.811
  greatest 0.826
cookie 504
  sized 0.827
  fat 0.895
  version 1.000
  yum 1.025
  meals 1.030
jelly 351
  stock 0.848
  run 1.014
  loose 1.035
  chowder 1.055
  spot 1.097
dog 925
  baby 0.775
  cat 0.792
  junk 0.814
  old 0.918
  pet 0.938
the 36
  their 0.679
  my 0.810
  your 0.841
  our 0.967
  any 0.990
4 292
  2 0.485
  5 0.522
  25 0.727
  3 0.731
  15 0.773


We can also cluster the embedding space. Clustering in 4 or more dimensions is hard to visualize, and even clustering in 2 or 3 can be difficult because there are so many words in the vocabulary. One thing we can try to do is assign cluster labels and qualitiatively look for an underlying pattern in the clusters.

In [0]:
from sklearn.cluster import KMeans

indices = KMeans(n_clusters=10).fit_predict(reps_word2vec)
zipped = list(zip(range(vectorizer.tokenizer.vocab_size), indices))
np.random.shuffle(zipped)
zipped = zipped[:100]
zipped = sorted(zipped, key=lambda x: x[1])
for token, cluster_idx in zipped:
  word = vectorizer.tokenizer.token_to_word[token]
  print(f"{word}: {cluster_idx}")

junk: 0
owner: 0
greta: 0
preference: 0
barbecue: 0
switch: 0
manufacturer: 0
mccann's: 0
his: 0
tin: 0
pain: 0
suspect: 0
chicken: 0
lobster: 0
kinds: 0
record: 0
soup: 0
negative: 1
total: 1
tells: 1
bone: 1
source: 1
deal: 1
snack: 1
many: 1
larger: 1
suggested: 2
eating: 2
molasses: 2
began: 2
how: 2
purchasing: 2
needed: 2
wanting: 2
lost: 2
stock: 2
left: 2
off: 2
bed: 3
high: 3
economical: 3
decent: 3
missing: 3
order: 4
avoid: 4
thinking: 4
went: 4
seemed: 4
teaspoon: 4
dip: 4
chocolates: 5
possible: 5
nectar: 5
lemonade: 5
oven: 5
ago: 5
reviews: 5
bodied: 6
restaurant: 6
orange: 6
kibble: 6
balance: 6
salty: 6
pasta: 6
acid: 6
oils: 6
.: 7
hoping: 7
otherwise: 7
wow: 7
else: 7
i've: 7
they're: 7
i'll: 7
since: 7
yeah: 7
china: 7
purina: 8
benefit: 8
home: 8
multiple: 8
from: 8
figured: 8
walmart: 8
gritty: 8
sure: 8
u: 9
serving: 9
breakfast: 9
favor: 9
minute: 9
wheat: 9
value: 9
homemade: 9
above: 9
fried: 9
person: 9
hands: 9
salads: 9
maple: 9


Finally, we can use the trained word embeddings to construct vector representations of full reviews. One common approach is to simply average all the word embeddings in the review to create an overall embedding.

In [0]:
def lsa_featurizer(xs):
  feats = sum(np.outer(xs[:, i], reps_word2vec[i, :]) for i in range(xs.shape[1]))

  # normalize
  return feats / np.sqrt((feats ** 2).sum(axis=1, keepdims=True))

training_experiment("word2vec", lsa_featurizer, 10)

word2vec features, 10 examples
test accuracy 0.474

