In [14]:
import os
from google.colab import drive
import pandas as pd
import numpy as np
import re
from collections import Counter
import spacy
import nltk
from nltk.corpus import stopwords
from string import punctuation
nltk.download("stopwords")


from gensim.sklearn_api import D2VTransformer, W2VTransformer
from sklearn import metrics
from sklearn.base import BaseEstimator, MetaEstimatorMixin, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import LinearSVC


import torch
from torch import nn
import torchtext
from torchtext import data, datasets
from torchtext.vocab import Vocab, GloVe, Vectors
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_packed_sequence as unpack
from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import pad_sequence
import torch.utils.model_zoo as model_zoo

# Load a spaCy pipeline through the 'en' shortcut name.
# NOTE(review): `spacy_en` is not referenced by any other cell visible in this
# notebook -- confirm it is still needed before keeping this (slow) load.
spacy_en = spacy.load('en')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# CoVe Model

In [9]:
# Download location of the pretrained MT-LSTM checkpoint, keyed by model name.
model_urls = {
    'wmt-lstm' : 'https://s3.amazonaws.com/research.metamind.io/cove/wmtlstm-8f474287.pth'
}

# Directory handed to model_zoo.load_url as the download/cache location.
MODEL_CACHE = '.torch'


class MTLSTM(nn.Module):
    """Bidirectional LSTM encoder initialised from the pretrained 'wmt-lstm' checkpoint.

    The checkpoint is fetched with ``model_zoo.load_url`` (cached in
    ``model_cache``) and loaded either into a single two-layer LSTM
    (``layer1`` only) or into two stacked one-layer LSTMs (``layer0`` set), so
    that the first layer's outputs can be returned as well.
    """

    def __init__(self, n_vocab=None, vectors=None, residual_embeddings=False, layer0=False, layer1=True, trainable=False, model_cache=MODEL_CACHE):
        """Initialize an MTLSTM. If layer0 and layer1 are True, they are concatenated along the last dimension so that layer0 outputs
           contribute the first 600 entries and layer1 contributes the second 600 entries. If residual embeddings is also true, inputs
           are also concatenated along the last dimension with any outputs such that they form the first 300 entries.

        Arguments:
            n_vocab (int): If not None, initialize MTLSTM with an embedding matrix with n_vocab vectors
            vectors (Float Tensor): If not None, initialize embedding matrix with specified vectors (These should be 300d CommonCrawl GloVe vectors)
            residual_embeddings (bool): If True, concatenate the input GloVe embeddings with contextualized word vectors as final output
            layer0 (bool): If True, return the outputs of the first layer of the MTLSTM
            layer1 (bool): If True, return the outputs of the second layer of the MTLSTM
            trainable (bool): If True, do not detach outputs; i.e. train the MTLSTM (recommended to leave False)
            model_cache (str): path to the model file for the MTLSTM to load pretrained weights (defaults to the best MTLSTM from (McCann et al. 2017) --
                               that MTLSTM was trained with 300d 840B GloVe on the WMT 2017 machine translation dataset).

        Raises:
            ValueError: if neither layer0 nor layer1 is requested.
        """
        super(MTLSTM, self).__init__()
        # Fail fast: reject an unusable configuration before downloading anything.
        if not (layer0 or layer1):
            raise ValueError('At least one of layer0 and layer1 must be True.')
        self.layer0 = layer0
        self.layer1 = layer1
        self.residual_embeddings = residual_embeddings
        self.trainable = trainable
        self.embed = False
        if n_vocab is not None:
            self.embed = True
            self.vectors = nn.Embedding(n_vocab, 300)
            if vectors is not None:
                self.vectors.weight.data = vectors
        state_dict = model_zoo.load_url(model_urls['wmt-lstm'], model_dir=model_cache)
        if layer0:
            # Split the two-layer checkpoint: first-layer weights go to rnn0 ...
            layer0_dict = {k: v for k, v in state_dict.items() if 'l0' in k}
            self.rnn0 = nn.LSTM(300, 300, num_layers=1, bidirectional=True, batch_first=True)
            self.rnn0.load_state_dict(layer0_dict)
            if layer1:
                # ... and second-layer weights are renamed so they fit a one-layer LSTM.
                layer1_dict = {k.replace('l1', 'l0'): v for k, v in state_dict.items() if 'l1' in k}
                self.rnn1 = nn.LSTM(600, 300, num_layers=1, bidirectional=True, batch_first=True)
                self.rnn1.load_state_dict(layer1_dict)
        else:
            # Only the top layer is wanted: load the checkpoint as one two-layer LSTM.
            # The already-loaded state_dict is reused instead of loading it again.
            self.rnn1 = nn.LSTM(300, 300, num_layers=2, bidirectional=True, batch_first=True)
            self.rnn1.load_state_dict(state_dict)


    def forward(self, inputs, lengths, hidden=None):
        """
        Arguments:
            inputs (Tensor): If MTLSTM handles embedding, a Long Tensor of size (batch_size, timesteps).
                             Otherwise, a Float Tensor of size (batch_size, timesteps, features).
            lengths (Long Tensor): lengths of each sequence for handling padding
            hidden (Float Tensor): initial hidden state of the LSTM

        Returns:
            Tensor: the selected pieces (inputs if residual_embeddings, then
            layer0 outputs, then layer1 outputs) concatenated along dim 2, in
            the original batch order; detached unless the model is trainable.
        """
        if self.embed:
            inputs = self.vectors(inputs)
        if not isinstance(lengths, torch.Tensor):
            lengths = torch.as_tensor(lengths, dtype=torch.long, device=inputs.device)
        # Packing needs the batch sorted by decreasing length; `restore` undoes that.
        lens, indices = torch.sort(lengths, 0, True)
        _, restore = torch.sort(indices, 0)
        outputs = [inputs] if self.residual_embeddings else []
        # When the (padded) inputs are part of the result, the LSTM outputs must be
        # re-padded to the same number of timesteps or torch.cat below would fail
        # whenever inputs are padded beyond the longest sequence. Without residual
        # embeddings the historical behaviour (pad to the longest length) is kept.
        total_length = inputs.size(1) if self.residual_embeddings else None
        packed_inputs = pack(inputs[indices], lens.tolist(), batch_first=True)

        # NOTE(review): `hidden` is forwarded unchanged to both LSTMs although the
        # batch was re-ordered above; callers passing a non-None state should
        # confirm the ordering/shape they supply.
        if self.layer0:
            outputs0, hidden_t0 = self.rnn0(packed_inputs, hidden)
            unpacked_outputs0 = unpack(outputs0, batch_first=True, total_length=total_length)[0]
            outputs.append(unpacked_outputs0[restore])
            # The second layer consumes the (still packed) first-layer outputs.
            packed_inputs = outputs0
        if self.layer1:
            outputs1, hidden_t1 = self.rnn1(packed_inputs, hidden)
            unpacked_outputs1 = unpack(outputs1, batch_first=True, total_length=total_length)[0]
            outputs.append(unpacked_outputs1[restore])

        outputs = torch.cat(outputs, 2)
        return outputs if self.trainable else outputs.detach()

In [10]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load IMDb

In [1]:
# Mount Google Drive at /content/drive; the dataset CSV is read from there below.
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Bring in IMDB data and make training/testing splits
df = pd.read_csv('/content/drive/MyDrive/cs510_nlp/project/IMDB_Dataset.csv')

# Shuffle row positions before splitting. The seed is fixed so the same split
# (and therefore the same reported metrics) is obtained after a kernel restart,
# and the row count comes from the frame instead of a hard-coded 50000.
SPLIT_SEED = 42
idx = np.random.RandomState(SPLIT_SEED).permutation(len(df))
n_train = len(df) // 2
train_idx = idx[:n_train]
test_idx = idx[n_train:]

# Make splits. `idx` holds row positions, so index positionally with .iloc;
# labels are encoded as 0 for 'negative' and 1 for anything else.
x_train = df.review.iloc[train_idx].to_list()
y_train = [0 if s == 'negative' else 1 for s in df.sentiment.iloc[train_idx]]
x_test = df.review.iloc[test_idx].to_list()
y_test = [0 if s == 'negative' else 1 for s in df.sentiment.iloc[test_idx]]

# Transformers

In [18]:
class WordTokenizer(BaseEstimator, TransformerMixin):
  """Tokenize input strings based on a simple word-boundary pattern.

  Stateless transformer: `fit` learns nothing and `transform` maps an iterable
  of documents to a list of token lists. Only runs of two or more word
  characters are kept as tokens.
  """

  # Compiled once for the class instead of on every transform() call.
  _TOKEN_PATTERN = re.compile(r"(?u)\b\w\w+\b")

  def fit(self, X, y=None):
    """No-op; present so the tokenizer can be used as a pipeline step."""
    return self

  def transform(self, X):
    """Return one list of tokens per document in X."""
    return [self._TOKEN_PATTERN.findall(doc) for doc in X]

class W2VTransformerDocLevel(W2VTransformer):
  """Extend Gensim's Word2Vec sklearn-wrapper class to further transform
  word-vectors into doc-vectors by averaging the words in each document."""

  def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1,
    workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
    trim_rule=None, sorted_vocab=1, batch_words=10000):
    # Forward by keyword so the call cannot silently mis-assign an argument
    # if the parent's parameter order ever differs.
    super().__init__(size=size, alpha=alpha, window=window, min_count=min_count,
                     max_vocab_size=max_vocab_size, sample=sample, seed=seed, workers=workers,
                     min_alpha=min_alpha, sg=sg, hs=hs, negative=negative, cbow_mean=cbow_mean,
                     hashfxn=hashfxn, iter=iter, null_word=null_word, trim_rule=trim_rule,
                     sorted_vocab=sorted_vocab, batch_words=batch_words)

  def transform(self, docs):
    """Average the word vectors of each tokenized document.

    Arguments:
      docs: iterable of documents, each an iterable of word tokens.

    Returns:
      np.ndarray with one row per document. Out-of-vocabulary words are
      skipped; a document without any in-vocabulary word gets a zero vector
      (averaging an empty matrix would otherwise yield NaN and a ragged result).
    """
    wv = self.gensim_model.wv
    doc_vecs = []
    for doc in docs:
      # keep only the words the trained model knows about
      known_vectors = [wv[word] for word in doc if word in wv]
      if known_vectors:
        # column-wise mean of the document's word matrix
        doc_vecs.append(np.mean(known_vectors, axis=0))
      else:
        doc_vecs.append(np.zeros(wv.vector_size, dtype=np.float32))
    return np.array(doc_vecs)


class CoVeTransformer(BaseEstimator, TransformerMixin):
    """Turn raw documents into fixed-size vectors with GloVe embeddings + MTLSTM.

    Each document is tokenized, mapped to vocabulary ids, run through an
    MTLSTM built with residual_embeddings=True, and pooled over the timestep
    dimension with an element-wise median.

    Arguments:
        tokenizer: object whose ``transform`` maps a list of strings to a list
            of token lists. Stored unmodified so get_params()/clone() round-trip.
        vectors (str): passed as ``vectors`` to the Vocab built from the data.
        device: device the model and the input tensors are moved to.
    """

    def __init__(self, tokenizer=WordTokenizer(), vectors='glove.6B.300d', device='cpu'):
        self.tokenizer = tokenizer
        self.vectors = vectors
        self.device = device
        self.vocab = None
        # Built lazily on first transform() and reused afterwards.
        self.model = None


    def text_pipeline(self, x):
        """Map the first document of the list ``x`` to its vocabulary ids."""
        return [self.vocab[token] for token in self.tokenizer.transform(x)[0]]


    def fit(self, X=None, y=None):
        """Build the vocabulary from X (no-op when X is None)."""
        if X is not None:
            self._build_vocab(X)
        return self


    def transform(self, X):
        """Return one pooled CoVe vector per document as a NumPy array.

        If no vocabulary exists yet (fit was skipped or called without data),
        it is built from X, as before.
        NOTE(review): a document that yields no tokens produces a zero-length
        sequence, which the packing step in MTLSTM is not expected to accept
        -- confirm inputs are non-empty.
        """
        print(f"Generating CoVe vectors")
        if self.vocab is None:
            self._build_vocab(X)

        print(f"Converting reviews to CoVe vectors")
        if self.model is None:
            # Constructing MTLSTM loads the pretrained weights, so do it once
            # rather than on every transform() call.
            self.model = MTLSTM(n_vocab=len(self.vocab), vectors=self.vocab.vectors, residual_embeddings=True)
            self.model.to(self.device)
            self.model.eval()
        cove_out = []
        # Inference only: skip building the autograd graph for every document.
        with torch.no_grad():
            for i, x in enumerate(X):
                if i % 1000 == 0:
                    print(f"Transforming record {i}")
                text, lens = self._collate_batch([x])
                vec = self.model(text, lens)
                # median over timesteps, then drop the batch dimension of size 1
                cove_out.append(vec.median(dim=1).values.squeeze(0))

        return torch.stack(cove_out).cpu().numpy()


    def _build_vocab(self, X):
        """Count tokens over X and build the Vocab with the configured vectors."""
        print(f"Building vocabulary")
        self.counter = Counter()
        for tokens in self.tokenizer.transform(X):
            self.counter.update(tokens)
        self.vocab = Vocab(self.counter, vectors=self.vectors)
        # The embedding matrix depends on the vocabulary, so drop any cached model.
        self.model = None


    def _collate_batch(self, batch):
        """Convert raw documents to a padded id tensor plus their lengths, on self.device."""
        text_list, len_list = [], []
        for _text in batch:
            processed_text = torch.tensor(self.text_pipeline([_text]), dtype=torch.int64)
            text_list.append(processed_text)
            len_list.append(processed_text.shape[0])
        text_list = pad_sequence(text_list, batch_first=True)
        len_list = torch.tensor(len_list).long()

        return text_list.to(self.device), len_list.to(self.device)

# GloVe (6B) + CoVe

In [22]:
# CoVe features (default GloVe vectors) feeding a class-balanced linear SVM.
pipeline = Pipeline(steps=[
    ('feature_extraction', CoVeTransformer(device=device)),
    ('clf', LinearSVC(C=1.0, class_weight="balanced")),
])

In [23]:
# Train model
# Fit on the training reviews, then label the held-out reviews
predicted = pipeline.fit(x_train, y_train).predict(x_test)

# Per-class precision / recall / F1 on the test split
results = metrics.classification_report(y_true=y_test, y_pred=predicted)

Transforming record 3000
Transforming record 4000
Transforming record 5000
Transforming record 6000
Transforming record 7000
Transforming record 8000
Transforming record 9000
Transforming record 10000
Transforming record 11000
Transforming record 12000
Transforming record 13000
Transforming record 14000
Transforming record 15000
Transforming record 16000
Transforming record 17000
Transforming record 18000
Transforming record 19000
Transforming record 20000
Transforming record 21000
Transforming record 22000
Transforming record 23000
Transforming record 24000
Generating CoVe vectors
Converting reviews to CoVe vectors
Transforming record 0
Transforming record 1000
Transforming record 2000
Transforming record 3000
Transforming record 4000
Transforming record 5000
Transforming record 6000
Transforming record 7000
Transforming record 8000
Transforming record 9000
Transforming record 10000
Transforming record 11000
Transforming record 12000
Transforming record 13000
Transforming record 14000

In [24]:
# Show the classification report computed for `pipeline`.
print(results)

              precision    recall  f1-score   support

           0       0.81      0.81      0.81     12479
           1       0.81      0.81      0.81     12521

    accuracy                           0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000



# GloVe (840B) + CoVe

In [19]:
# Same model as before, but with the 840B GloVe vectors behind CoVe.
pipeline2 = Pipeline(steps=[
    ('feature_extraction', CoVeTransformer(vectors='glove.840B.300d', device=device)),
    ('clf', LinearSVC(C=1.0, class_weight="balanced")),
])

In [20]:
# Train model
# Fit on the training reviews, then label the held-out reviews
predicted2 = pipeline2.fit(x_train, y_train).predict(x_test)

# Per-class precision / recall / F1 on the test split
results2 = metrics.classification_report(y_true=y_test, y_pred=predicted2)

Generating CoVe vectors
Building vocabulary
Converting reviews to CoVe vectors
Transforming record 0
Transforming record 1000
Transforming record 2000
Transforming record 3000
Transforming record 4000
Transforming record 5000
Transforming record 6000
Transforming record 7000
Transforming record 8000
Transforming record 9000
Transforming record 10000
Transforming record 11000
Transforming record 12000
Transforming record 13000
Transforming record 14000
Transforming record 15000
Transforming record 16000
Transforming record 17000
Transforming record 18000
Transforming record 19000
Transforming record 20000
Transforming record 21000
Transforming record 22000
Transforming record 23000
Transforming record 24000
Generating CoVe vectors
Converting reviews to CoVe vectors
Transforming record 0
Transforming record 1000
Transforming record 2000
Transforming record 3000
Transforming record 4000
Transforming record 5000
Transforming record 6000
Transforming record 7000
Transforming record 8000
Tra

In [21]:
# Show the classification report computed for `pipeline2`.
print(results2)

              precision    recall  f1-score   support

           0       0.85      0.85      0.85     12479
           1       0.85      0.85      0.85     12521

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



# GloVe (6B) + CoVe + NGrams

In [25]:
# Build ngrams pipeline
# Sparse branch: unigram + bigram counts re-weighted by TF-IDF
vect = CountVectorizer(ngram_range=(1, 2))
tf_idf = TfidfTransformer()
ngram_tf_idf_pipeline = Pipeline(steps=[("vect", vect), ("tf_idf", tf_idf)])

# Dense branch: GloVe -> CoVe document vectors
glove_cove_pipeline = CoVeTransformer(device=device)

# Both branches side by side as one feature matrix
features = FeatureUnion(transformer_list=[
    ("ngram_tf_idf_pipeline", ngram_tf_idf_pipeline),
    ("glove_cove_pipeline", glove_cove_pipeline),
])

# Class-balanced linear SVM on top of the combined features
classifier = LinearSVC(C=1.0, class_weight="balanced")

# Full model: features -> classifier
pipeline3 = Pipeline(steps=[("features", features), ("classifier", classifier)])

In [26]:
# Train model
# Fit on the training reviews, then label the held-out reviews
predicted3 = pipeline3.fit(x_train, y_train).predict(x_test)

# Per-class precision / recall / F1 on the test split
results3 = metrics.classification_report(y_true=y_test, y_pred=predicted3)

Generating CoVe vectors
Building vocabulary
Converting reviews to CoVe vectors
Transforming record 0
Transforming record 1000
Transforming record 2000
Transforming record 3000
Transforming record 4000
Transforming record 5000
Transforming record 6000
Transforming record 7000
Transforming record 8000
Transforming record 9000
Transforming record 10000
Transforming record 11000
Transforming record 12000
Transforming record 13000
Transforming record 14000
Transforming record 15000
Transforming record 16000
Transforming record 17000
Transforming record 18000
Transforming record 19000
Transforming record 20000
Transforming record 21000
Transforming record 22000
Transforming record 23000
Transforming record 24000
Generating CoVe vectors
Converting reviews to CoVe vectors
Transforming record 0
Transforming record 1000
Transforming record 2000
Transforming record 3000
Transforming record 4000
Transforming record 5000
Transforming record 6000
Transforming record 7000
Transforming record 8000
Tra

In [27]:
# Show the classification report computed for `pipeline3`.
print(results3)

              precision    recall  f1-score   support

           0       0.91      0.89      0.90     12479
           1       0.89      0.92      0.90     12521

    accuracy                           0.90     25000
   macro avg       0.90      0.90      0.90     25000
weighted avg       0.90      0.90      0.90     25000



# GloVe (840B) + CoVe + NGrams

In [28]:
# Build ngrams pipeline
# Sparse branch: unigram + bigram counts re-weighted by TF-IDF
vect = CountVectorizer(ngram_range=(1, 2))
tf_idf = TfidfTransformer()
ngram_tf_idf_pipeline = Pipeline(steps=[("vect", vect), ("tf_idf", tf_idf)])

# Dense branch: GloVe (840B) -> CoVe document vectors
glove_cove_pipeline = CoVeTransformer(vectors='glove.840B.300d', device=device)

# Both branches side by side as one feature matrix
features = FeatureUnion(transformer_list=[
    ("ngram_tf_idf_pipeline", ngram_tf_idf_pipeline),
    ("glove_cove_pipeline", glove_cove_pipeline),
])

# Class-balanced linear SVM on top of the combined features
classifier = LinearSVC(C=1.0, class_weight="balanced")

# Full model: features -> classifier
pipeline4 = Pipeline(steps=[("features", features), ("classifier", classifier)])

In [31]:
# Train model
# Fit on the training reviews, then label the held-out reviews
predicted4 = pipeline4.fit(x_train, y_train).predict(x_test)

# Per-class precision / recall / F1 on the test split
results4 = metrics.classification_report(y_true=y_test, y_pred=predicted4)

Generating CoVe vectors
Converting reviews to CoVe vectors
Transforming record 0
Transforming record 1000
Transforming record 2000
Transforming record 3000
Transforming record 4000
Transforming record 5000
Transforming record 6000
Transforming record 7000
Transforming record 8000
Transforming record 9000
Transforming record 10000
Transforming record 11000
Transforming record 12000
Transforming record 13000
Transforming record 14000
Transforming record 15000
Transforming record 16000
Transforming record 17000
Transforming record 18000
Transforming record 19000
Transforming record 20000
Transforming record 21000
Transforming record 22000
Transforming record 23000
Transforming record 24000


In [32]:
# Show the classification report computed for `pipeline4`.
print(results4)

              precision    recall  f1-score   support

           0       0.91      0.90      0.91     12479
           1       0.90      0.91      0.91     12521

    accuracy                           0.91     25000
   macro avg       0.91      0.91      0.91     25000
weighted avg       0.91      0.91      0.91     25000

