In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

import nltk
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
nltk.download("movie_reviews")

from collections import defaultdict, Counter
import math
import random

random.seed(0) # Don't change
torch.manual_seed(0)  # Don't change
np.random.seed(0) # Don't change


train_X, train_Y = [], []
test_X, test_Y = [], []

for polarity in movie_reviews.categories():
    label = 0 if polarity == 'neg' else 1
    for fid in movie_reviews.fileids(polarity):
        if random.randrange(5) == 0:
            test_X.append([w for w in movie_reviews.words(fid)])
            test_Y.append(label)
        else:
            train_X.append([w for w in movie_reviews.words(fid)])
            train_Y.append(label)

print(train_X[0], train_Y[0])

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'ba

# Assignment II
Doing Assignment II by modifying the following code cell.
Your solution should be based on feedforward neural network (FNN or MLP) with word embeddings.
You are free to adjust the FNN with different dimension settings, vocabulary, overfitting prevention, and so on,
but you can not use other architectures (e.g., CNN/RNN/Transformer or the Naive Bayes classifier from Assignment I) in this assignment.


In [2]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


EMBEDDING_DIM = 100
EPOCHS = 50

class TextClassifier(nn.Module):
  def init_embeddings(self, vocab):
    self.word_to_ix = {}
    weights = []
    ix = 0
    for w in vocab:
      self.word_to_ix[w] = ix
      ix += 1
    self.vocab_size = len(self.word_to_ix)
    self.embeddings = nn.EmbeddingBag(self.vocab_size, EMBEDDING_DIM)

  def __init__(self, vocab, classes):
    super(TextClassifier, self).__init__()
    self.classes = classes
    self.init_embeddings(vocab)
    self.fc1 = nn.Linear(self.embeddings.embedding_dim, 50)
    self.fc1.weight.data.uniform_(-0.5, 0.5)
    self.fc1.bias.data.zero_()
    self.fc2 = nn.Linear(50, 20)
    self.fc2.weight.data.uniform_(-0.5, 0.5)
    self.fc2.bias.data.zero_()
    self.out = nn.Linear(20, len(self.classes))
    self.out.weight.data.uniform_(-0.5, 0.5)
    self.out.bias.data.zero_()
    self.relu = nn.ReLU()

  def forward(self, inputs, offsets):
    embedded = self.embeddings(inputs, offsets)
    return self.out(self.relu(self.fc2(self.relu(self.fc1(embedded)))))

def make_doc_vector(doc, word_to_ix):
  idxs = [word_to_ix[w] for w in doc if w in word_to_ix]
  return torch.tensor(idxs, dtype=torch.long)

def generate_batch(batch):
  label = torch.tensor([entry[0] for entry in batch])
  text = [entry[1] for entry in batch]
  offsets = [0] + [len(entry) for entry in text]
  offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
  text = torch.cat(text)
  return text, offsets, label

def build_vocab(X):
  word_count = Counter()
  for x in X:
    for w in x:
      word_count[w] += 1
  # The order of keys in a dictionary/set is not deterministic,
  # so sorting in the following statement is important to avoid randomness.
  return [w for (w, c) in sorted(word_count.items()) if c >= 6]

def build_model(X, Y):
  model = TextClassifier(build_vocab(X), [0, 1]).to(device)
  loss_function = nn.CrossEntropyLoss().to(device)
  optimizer = optim.Adam(model.parameters())

  train_set = []
  yc = Counter()
  for x, y in zip(X, Y):
    entry = []
    yc[y] += 1
    entry.append(torch.LongTensor([y]))
    entry.append(make_doc_vector(x, model.word_to_ix))
    train_set.append(entry)
  print(yc)
  data = DataLoader(train_set, batch_size=16, shuffle=True, collate_fn=generate_batch)

  for epoch in range(EPOCHS):
    train_loss, train_acc = 0, 0
    print("Epoch: %d" % epoch)
    for _, (x, offsets, y) in enumerate(data):
      model.zero_grad()
      x, offsets, y = x.to(device), offsets.to(device), y.to(device)
      pred = model(x, offsets)
      loss = loss_function(pred, y)
      loss.backward()
      optimizer.step()
      train_loss += loss.item()
      train_acc += (pred.argmax(1) == y).sum().item()
    print("Loss: %g, Acc: %g" % (train_loss / len(train_set), train_acc / len(train_set)))
  return model

In [3]:
model = build_model(train_X, train_Y)

Counter({1: 803, 0: 775})
Epoch: 0
Loss: 0.0421303, Acc: 0.579214
Epoch: 1
Loss: 0.038291, Acc: 0.687579
Epoch: 2
Loss: 0.0318874, Acc: 0.774398
Epoch: 3
Loss: 0.0239092, Acc: 0.842205
Epoch: 4
Loss: 0.0152749, Acc: 0.922687
Epoch: 5
Loss: 0.00909809, Acc: 0.963878
Epoch: 6
Loss: 0.00535729, Acc: 0.97782
Epoch: 7
Loss: 0.00286174, Acc: 0.995564
Epoch: 8
Loss: 0.00137265, Acc: 0.998099
Epoch: 9
Loss: 0.00074278, Acc: 1
Epoch: 10
Loss: 0.00044762, Acc: 1
Epoch: 11
Loss: 0.000293346, Acc: 1
Epoch: 12
Loss: 0.000203297, Acc: 1
Epoch: 13
Loss: 0.000149962, Acc: 1
Epoch: 14
Loss: 0.000113523, Acc: 1
Epoch: 15
Loss: 8.80247e-05, Acc: 1
Epoch: 16
Loss: 6.99068e-05, Acc: 1
Epoch: 17
Loss: 5.76091e-05, Acc: 1
Epoch: 18
Loss: 4.58473e-05, Acc: 1
Epoch: 19
Loss: 3.7902e-05, Acc: 1
Epoch: 20
Loss: 3.19071e-05, Acc: 1
Epoch: 21
Loss: 2.67353e-05, Acc: 1
Epoch: 22
Loss: 2.34634e-05, Acc: 1
Epoch: 23
Loss: 2.05173e-05, Acc: 1
Epoch: 24
Loss: 1.73228e-05, Acc: 1
Epoch: 25
Loss: 1.50802e-05, Acc: 1
Epoc

In [4]:
def predict(model, document):
  probs = model(make_doc_vector(document, model.word_to_ix).to(device), torch.tensor([0]).cumsum(dim=0).to(device))
  return int(torch.argmax(probs))

print(predict(model, "this is a uninteresting movie".split(" ")))
print(predict(model, "a good movie of this year".split(" ")))


0
1


## Do Evaluation

In [5]:
correct, total = 0, 0

for x, y in zip(test_X, test_Y):
    prediction = predict(model, x)
    if prediction == y:
        correct += 1
    total += 1

print("%d / %d = %g" % (correct, total, correct / total))

361 / 422 = 0.85545
