# Lab 6: RNNs
## Goal:
- Understand the mechanics of RNNs in Pytorch
- Train RNN based neural networks on text data
- Basics of word embedding and how to use them

In [29]:
import re
import os
import time
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import gensim
from gensim.models import KeyedVectors
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, Dataset

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

print("Device being used: %s" %device)

No GPU available, using the CPU instead.
Device being used: cpu


[nltk_data] Downloading package punkt to /Users/yuxuan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Problem Setup

In [30]:
np.random.seed(1111)

df = pd.read_csv('./Sentiment.csv')
df.head()

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


Let's first look at some basic intuition and stats of the data

In [31]:
# Training data is a string of words
df.loc[0, 'text']

'RT @NancyLeeGrahn: How did everyone feel about the Climate Change question last night? Exactly. #GOPDebate'

In [32]:
pd.DataFrame(df.groupby('sentiment').count()['text'])

Unnamed: 0_level_0,text
sentiment,Unnamed: 1_level_1
Negative,8493
Neutral,3142
Positive,2236


For simplicity, 
- we only use ```X = 'text'``` and ```y = 'Sentiment'``` from the original dataframe. 
- We only look at positive (1) and negative (0) tweets.

In [33]:
df = df[['sentiment', 'text']]
df = df[df['sentiment'] != 'Neutral']
df['sentiment'] = [1 if s == "Positive" else 0 for s in df['sentiment']]
df.groupby('sentiment').count()

Unnamed: 0_level_0,text
sentiment,Unnamed: 1_level_1
0,8493
1,2236


In [34]:
train_data, test_data = train_test_split(df, test_size=0.10, random_state=42)
train_data.index = np.arange(len(train_data))
test_data.index = np.arange(len(test_data))
train_data.groupby('sentiment').count().apply(lambda x: 100 * x / float(x.sum()))

Unnamed: 0_level_0,text
sentiment,Unnamed: 1_level_1
0,79.152858
1,20.847142


### Input representations

#### Build vocabulary
We need to build a vocabulary using words in our training data. Any words in the test set that are not in our vocabulary will be replaced with an ```<UNK>``` token. We will also add a ```<PAD>``` token as padding.

For computational purposes, we'll only take words that appeared more than 3 times.

In [7]:
UNK = "<UNK>"
PAD = "<PAD>"

def build_vocab(sentences, min_count=3, max_vocab=None):
    """
    Build vocabulary from sentences (list of strings)
    """
    # keep track of the number of appearance of each word
    word_count = Counter()
    
    for sentence in sentences:
        # Regular expression operations: [] (indicate a set of characters), 
        sentence = re.sub('[\\(\[#.!?,\'\/\])0-9]', ' ', sentence)
        word_count.update(word_tokenize(sentence.lower()))
    
    vocabulary = list([w for w in word_count if word_count[w] > min_count]) + [UNK, PAD]
    indices = dict(zip(vocabulary, range(len(vocabulary))))

    return vocabulary, indices

vocabulary, vocab_indices = build_vocab(train_data['text'])

print(len(vocabulary))

3069


In [8]:
word_tokenize("I love NYU")

['I', 'love', 'NYU']

In [35]:
vocab_indices

{'this': 0,
 'is': 1,
 'great': 2,
 '-': 3,
 'let': 4,
 's': 5,
 'have': 6,
 'a': 7,
 'bunch': 8,
 'of': 9,
 'rich': 10,
 'men': 11,
 'make': 12,
 'decisions': 13,
 'about': 14,
 'plannedparenthood': 15,
 'gopdebates': 16,
 '@': 17,
 'megynkelly': 18,
 'had': 19,
 'jebbush': 20,
 'trump': 21,
 'was': 22,
 'foxnews': 23,
 'has': 24,
 'changed': 25,
 'for': 26,
 'the': 27,
 'worse': 28,
 'tcot': 29,
 'rt': 30,
 ':': 31,
 'gopdebate': 32,
 'show': 33,
 'hands': 34,
 'like': 35,
 'st': 36,
 'day': 37,
 'school': 38,
 'we': 39,
 're': 40,
 'very': 41,
 'little': 42,
 'too': 43,
 'many': 44,
 '&': 45,
 'amp': 46,
 ';': 47,
 'i': 48,
 'want': 49,
 'driving': 50,
 'his': 51,
 'fellow': 52,
 'patients': 53,
 'around': 54,
 'in': 55,
 '``': 56,
 'one': 57,
 'over': 58,
 'http': 59,
 't': 60,
 'co': 61,
 'monaeltahawy': 62,
 'any': 63,
 'candidate': 64,
 'received': 65,
 'word': 66,
 'from': 67,
 'god': 68,
 'presidential': 69,
 'hopefuls': 70,
 'america': 71,
 'christian': 72,
 'brotherhood': 73

## Model Time

#### Word representations
Next, we neeed to convert each word/token in the sentences into its index in the vocabulary so that pytorch can use it. We do this for both train and test set.

### DataLoader

In [36]:
class TweetDataset(Dataset):
    def __init__(self, vocab_index, df, label = 'sentiment'):
        self.vocab_index = vocab_index
        self.df = df
        self.label = label
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, key):
        sentence = self.df.loc[key, 'text']
        sentence = re.sub('[\\(\[#.!?,\'\/\])0-9]', ' ', sentence)
        token_indices = np.array([self.vocab_index[word] if word in self.vocab_index else self.vocab_index['<UNK>'] for word in word_tokenize(sentence.lower())])
        return (torch.tensor(token_indices) , self.df.loc[key, self.label])


def pad_collate(batch):
    (xx, yy) = zip(*batch)
    x_lens = [len(x) for x in xx]
    
    # I want to    eat an     apple
    # I am   going to  sleep  PAD  
    # batch_first: output will be in B x T x * if True, or in T x B x * otherwise
    xx_pad = pad_sequence(xx, batch_first=True, padding_value=len(vocabulary)-1)

    return torch.as_tensor(xx_pad), torch.as_tensor(x_lens), torch.LongTensor(yy)
    

BATCH_SIZE = 32
# shuffle: set to True to have the data reshuffled at every epoch
train_loader = DataLoader(TweetDataset(vocab_indices, train_data),
                          batch_size=BATCH_SIZE,
                          shuffle=True,
                          collate_fn = pad_collate)
test_loader = DataLoader(TweetDataset(vocab_indices, test_data),
                         batch_size=BATCH_SIZE,
                         shuffle=True,
                         collate_fn = pad_collate)

Let's get a general idea of what an instance of training batch will be like.

In [37]:
sample_input = next(iter(train_loader))
print("Padded sequence".center(80, '*'))
print(sample_input[0][0])
print("Length of sequence".center(80, '*'))
print(sample_input[1])
print("Label of sequence".center(80, '*'))
print(sample_input[2])

********************************Padded sequence*********************************
tensor([  30,   17, 3067,   31,   48,  319, 2201, 3067, 3067,  264,   27, 3067,
        1965,    9, 3067,  152,  266,   32, 3068, 3068, 3068, 3068, 3068, 3068,
        3068, 3068, 3068, 3068, 3068, 3068])
*******************************Length of sequence*******************************
tensor([18, 16, 15, 23, 27, 25, 27, 26, 22, 17, 16, 12, 26, 11, 28, 25, 30, 21,
        24, 30, 28, 23, 24, 21, 19, 20, 21, 11, 12,  5, 21, 21])
*******************************Label of sequence********************************
tensor([0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 1, 1, 0, 0])


### RNN

- Each input word is represented by a vector of dimension ```embedding_dim```. Check out ```nn.Embedding``` to see how to initialize embeddings randomly.
- Your model should take the following input parameters
    - ```hidden_dim```: The number of features in the hidden state h of your RNN layer
    - ```output_dim```: Number of output classes
    - ```vocab_size``` Size of your vocabulary. 
    - ```embedding_dim```: Dimension of word embeddings
- Your model should consist of an RNN layer (you can use either ```nn.RNN``` or ```nn.LSTM```) followed by a linear layer.
- $h_{0}$ (and $c$ if you use LSTM) should be initialized as a zero vector of dimension ```hidden_dim```. You might want to check out ```nn.Parameter```

In [38]:
class RNN(nn.Module):
    def __init__(self, hidden_dim, output_dim, 
                 vocab_size, embedding_dim, rnn='LSTM'):
        super(RNN, self).__init__()
        
        self.emb = nn.Embedding(vocab_size, embedding_dim, padding_idx=vocab_size-1)
        self.hidden_dim = hidden_dim
        self.rnn_fn = rnn
        assert self.rnn_fn in ['LSTM', 'RNN']
        self.rnn = getattr(nn, rnn)(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x, x_len):
        x = self.emb(x)
        
        # output:  tensor containing the output features (h_t) from the last layer of the RNN, tensor containing the hidden state for t = seq_len.
        # pack_padded_sequence: Packs a Tensor containing padded sequences of variable length.
        # enforce_sorted: if True, the input is expected to contain sequences sorted by length in a decreasing order. If False, the input will get sorted unconditionally.
        _, last_hidden = self.rnn(pack_padded_sequence(x, x_len.to('cpu'), batch_first=True, enforce_sorted=False))
        if self.rnn_fn == 'LSTM':
            # (h,c)
            # c_0: tensor containing the initial cell state for each element in the batch.
            last_hidden = last_hidden[0]
        out = self.fc(last_hidden.view(-1, self.hidden_dim))
        return out

### Train and validation loop

In [39]:
def train(model, train_loader=train_loader, test_loader=test_loader, 
          learning_rate=0.001, num_epoch=10, print_every=100):
    # Training steps
    start_time = time.time()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=10**(-5))
    for epoch in range(num_epoch):
        model.train()
        for i, (data, data_len, labels) in enumerate(train_loader):
            data, data_len, labels = data.to(device), data_len.to(device), labels.to(device)
            outputs = model(data, data_len)
            model.zero_grad()
            loss = loss_fn(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()

             # report performance
            if (i + 1) % print_every == 0:
                print('Train set | epoch: {:3d} | {:6d}/{:6d} batches | Loss: {:6.4f}'.format(
                    epoch, i + 1, len(train_loader), loss.item()))     
    
    # Evaluate after every epochh
        correct = 0
        total = 0
        model.eval()

        predictions = []
        truths = []

        with torch.no_grad():
            for i, (data, data_len, labels) in enumerate(test_loader):
                data, data_len, labels = data.to(device), data_len.to(device), labels.to(device)
                outputs = model(data, data_len)
                pred = outputs.data.max(-1)[1]
                predictions += list(pred.cpu().numpy())
                truths += list(labels.cpu().numpy())
                total += labels.size(0)
                correct += (pred == labels).sum()
                
            acc = (100 * correct / total)
            auc = roc_auc_score(truths, predictions)
            elapse = time.strftime('%H:%M:%S', time.gmtime(int((time.time() - start_time))))
            print('Test set | Accuracy: {:6.4f} | AUC: {:4.2f} | time elapse: {:>9}'.format(
                acc, auc, elapse))

Run the code block below to check your model performance.

In [40]:
torch.manual_seed(42)
rnn_model = RNN(40, 2, len(vocabulary), 50, rnn='RNN').to(device)
train(rnn_model, train_loader, test_loader)

Train set | epoch:   0 |    100/   302 batches | Loss: 0.5967
Train set | epoch:   0 |    200/   302 batches | Loss: 0.4869
Train set | epoch:   0 |    300/   302 batches | Loss: 0.6579
Test set | Accuracy: 80.9879 | AUC: 0.58 | time elapse:  00:00:01
Train set | epoch:   1 |    100/   302 batches | Loss: 0.5395
Train set | epoch:   1 |    200/   302 batches | Loss: 0.6252
Train set | epoch:   1 |    300/   302 batches | Loss: 0.3192
Test set | Accuracy: 81.9199 | AUC: 0.62 | time elapse:  00:00:02
Train set | epoch:   2 |    100/   302 batches | Loss: 0.3423
Train set | epoch:   2 |    200/   302 batches | Loss: 0.4585
Train set | epoch:   2 |    300/   302 batches | Loss: 0.3538
Test set | Accuracy: 83.0382 | AUC: 0.66 | time elapse:  00:00:04
Train set | epoch:   3 |    100/   302 batches | Loss: 0.3827
Train set | epoch:   3 |    200/   302 batches | Loss: 0.2538
Train set | epoch:   3 |    300/   302 batches | Loss: 0.4507
Test set | Accuracy: 84.0634 | AUC: 0.68 | time elapse:  0

In [41]:
lstm_model = RNN(40, 2, len(vocabulary), 50, rnn='LSTM').to(device)
train(lstm_model)

Train set | epoch:   0 |    100/   302 batches | Loss: 0.3871
Train set | epoch:   0 |    200/   302 batches | Loss: 0.7016
Train set | epoch:   0 |    300/   302 batches | Loss: 0.5600
Test set | Accuracy: 81.2675 | AUC: 0.58 | time elapse:  00:00:02
Train set | epoch:   1 |    100/   302 batches | Loss: 0.4215
Train set | epoch:   1 |    200/   302 batches | Loss: 0.4763
Train set | epoch:   1 |    300/   302 batches | Loss: 0.2787
Test set | Accuracy: 84.2498 | AUC: 0.66 | time elapse:  00:00:04
Train set | epoch:   2 |    100/   302 batches | Loss: 0.4776
Train set | epoch:   2 |    200/   302 batches | Loss: 0.2461
Train set | epoch:   2 |    300/   302 batches | Loss: 0.1980
Test set | Accuracy: 83.9702 | AUC: 0.67 | time elapse:  00:00:07
Train set | epoch:   3 |    100/   302 batches | Loss: 0.3883
Train set | epoch:   3 |    200/   302 batches | Loss: 0.2145
Train set | epoch:   3 |    300/   302 batches | Loss: 0.2379
Test set | Accuracy: 85.0885 | AUC: 0.70 | time elapse:  0

### Model predictions

In [42]:
def sentences_to_padded_index_sequences(words, sentences):
    for i, s in enumerate(sentences):
        # h tokens
        token_indices = np.array([words[w] if w in words else words['<UNK>'] for w in word_tokenize(s.lower())])
    return token_indices, len(token_indices)

In [43]:
def test_sentence(sentence, model):
    model.eval()
    test_tensor, len_sent = sentences_to_padded_index_sequences(vocab_indices, [sentence])
    score = model(torch.LongTensor(test_tensor.astype(int)).unsqueeze(0).to(device), torch.as_tensor([len_sent]).to(device)).cpu().data.numpy().squeeze()
    label = np.argmax(score)
    return ("positive" if label == 1 else "negative", score[label])

In [44]:
test_sentence("Today's weather is so good!", rnn_model)

('positive', 0.7944534)

In [45]:
test_sentence("The way he talked to me is awful", rnn_model)

('negative', 1.5269691)

## Word Embeddings and How to Use Them

When using deep learning methods on NLP tasks, we usually utilize [word embedding](https://en.wikipedia.org/wiki/Word_embedding). To put it briefly, word embedding represent words, or tokens, in a vocabulary as a distributed numerical vector. There are a lot of methods to obtain a word embedding, with some of the most famous shallow models being Word2Vec, GloVe, and FastText while the deeper models are BERT, RoBERTa, T5. It is not difficult to find a general purpose word embedding trained by one of the aforementioned methods on the Internet that's been trained with a massive amount of data. It is usually a good idea to use these pre-trained embedding to save yourself some time and computing resource.

In this lab, we will be using the [GloVe embedding](https://nlp.stanford.edu/projects/glove/) developed by Stanford,  one of the state-of-the-art word embedding. Please download the file ```glove.6B.50d.txt``` [here](https://drive.google.com/file/d/1JweINiA5JvTNLTm663LH8OdWssK2Kcid/view?usp=sharing).

### Find similar words

The word embedding vectors can help us find words with similar meanings. Word similarities can be measured by [Cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity).

In [46]:
from gensim.scripts.glove2word2vec import glove2word2vec
# load embedding

_ = glove2word2vec('./glove.6B.50d.txt', 'tmp_file')
glove_embedding = KeyedVectors.load_word2vec_format('tmp_file')

  _ = glove2word2vec('./glove.6B.50d.txt', 'tmp_file')


In [49]:
glove_embedding.similar_by_word('school', topn=5)

[('college', 0.9344995617866516),
 ('schools', 0.868353009223938),
 ('campus', 0.8472231030464172),
 ('graduate', 0.8460071682929993),
 ('elementary', 0.8369437456130981)]

### Word arithmetic

In [50]:
glove_embedding.similar_by_word(glove_embedding['worse'] - glove_embedding['better'] + glove_embedding['best'], topn=1)

[('worst', 0.8109661340713501)]

### Train an LSTM model with GloVe embedding

- Create the new 'dictionary' to send it into our dataset class
- Load the GloVe Embedding into our nn.Embedding layer of the model

In [52]:
import numpy as np
from tqdm import tqdm
# load embedding
emb_dim = 50
with open('./glove.6B.50d.txt') as f:
    glove_embedding = []
    words = {}

    for i, line in tqdm(enumerate(f)):
        s = line.split()
        glove_embedding.append(np.asarray(s[1:]))
        
        words[s[0]] = len(words)
 
        
# add unknown to word and char
glove_embedding.append(np.random.rand(emb_dim))
words["<UNK>"] = len(words)

# add padding
glove_embedding.append(np.zeros(emb_dim))
words["<PAD>"] = len(words)


glove_embedding = np.array(glove_embedding).astype(float)

400000it [00:03, 133295.33it/s]


In [53]:
len(words)

400002

In [54]:
glove_embedding.shape

(400002, 50)

In [55]:
train_loader_glove = DataLoader(TweetDataset(words, train_data),
                                batch_size=BATCH_SIZE,
                                shuffle=True,
                                collate_fn = pad_collate)
test_loader_glove = DataLoader(TweetDataset(words, test_data),
                               batch_size=BATCH_SIZE,
                               shuffle=True,
                               collate_fn = pad_collate)

In [56]:
glove_model = RNN(40, 2, len(glove_embedding), 50, rnn='LSTM')
glove_model.emb.weight.data.copy_(torch.from_numpy(glove_embedding))

tensor([[ 0.4180,  0.2497, -0.4124,  ..., -0.1841, -0.1151, -0.7858],
        [ 0.0134,  0.2368, -0.1690,  ..., -0.5666,  0.0447,  0.3039],
        [ 0.1516,  0.3018, -0.1676,  ..., -0.3565,  0.0164,  0.1022],
        ...,
        [ 0.0726, -0.5139,  0.4728,  ..., -0.1891, -0.5902,  0.5556],
        [ 0.0560,  0.8742,  0.3537,  ...,  0.2650,  0.6108,  0.7820],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [28]:
train(glove_model.to(device), train_loader=train_loader_glove, test_loader=test_loader_glove)

Train set | epoch:   0 |    100/   302 batches | Loss: 0.4760
Train set | epoch:   0 |    200/   302 batches | Loss: 0.4253
Train set | epoch:   0 |    300/   302 batches | Loss: 0.3312
Test set | Accuracy: 80.9879 | AUC: 0.58 | time elapse:  00:00:11
Train set | epoch:   1 |    100/   302 batches | Loss: 0.2679
Train set | epoch:   1 |    200/   302 batches | Loss: 0.4546
Train set | epoch:   1 |    300/   302 batches | Loss: 0.3785
Test set | Accuracy: 84.4362 | AUC: 0.67 | time elapse:  00:00:23
Train set | epoch:   2 |    100/   302 batches | Loss: 0.3231
Train set | epoch:   2 |    200/   302 batches | Loss: 0.5888
Train set | epoch:   2 |    300/   302 batches | Loss: 0.4438
Test set | Accuracy: 81.5471 | AUC: 0.76 | time elapse:  00:00:35
Train set | epoch:   3 |    100/   302 batches | Loss: 0.4040
Train set | epoch:   3 |    200/   302 batches | Loss: 0.3123
Train set | epoch:   3 |    300/   302 batches | Loss: 0.3777
Test set | Accuracy: 83.7838 | AUC: 0.72 | time elapse:  0