In [None]:
import glob
import json
import os
import random
import time

import spacy
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data
from torchtext import datasets

In [None]:
# Colab-only setup: mount Google Drive and make the notebook folder the working
# directory, so relative paths used later ('.data', 'tut2-model.pt') resolve
# inside Drive.
from google.colab import drive
drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/Colab Notebooks')
# Sanity check: list the working directory, most recently modified first.
!ls -alt

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
total 13581
-rw------- 1 root root    25159 Jun 27 13:25  lstm.ipynb
-rw------- 1 root root 13835868 Jun 27 13:24  tut2-model.pt
drwx------ 2 root root     4096 Jun 27 13:18  .vector_cache
drwx------ 3 root root     4096 Jun 26 06:54  .data
-rw------- 1 root root    36496 Jun 25 17:08 ' Upgraded Sentiment Analysis.ipynb'


In [None]:
# Width of the word embeddings. The same value is interpolated into the GloVe
# vector name ("glove.6B.<dim>d") when the vocabulary is built and is passed to
# the model, so the two always agree.
EMBEDDING_DIM = 100

In [None]:
class Steam(data.Dataset):
    """Steam review dataset stored as one JSON file per split.

    A split is read from ``<root>/steam/aclSteam/<split>.json``.  The file must
    contain a JSON array of indexable records; position ``LABEL_INDEX`` of a
    record is used as the label and position ``TEXT_INDEX`` as the review text.
    The meaning of the remaining positions is not used here.
    """

    name = 'steam'
    dirname = 'aclSteam'

    # Positions of the fields inside each JSON record.
    LABEL_INDEX = 5
    TEXT_INDEX = 6

    @staticmethod
    def sort_key(ex):
        """Order examples by text length so bucketed batches need little padding."""
        return len(ex.text)

    def __init__(self, path, text_field, label_field, **kwargs):
        """Load one split.

        Arguments:
            path: Path of the split file *without* the ``.json`` extension.
            text_field: Field used to process the review text.
            label_field: Field used to process the label.
            **kwargs: Forwarded to ``data.Dataset``.

        Records whose text is empty (falsy) are skipped.
        """
        fields = [('text', text_field), ('label', label_field)]

        # JSON is UTF-8 by specification; do not depend on the platform's
        # default encoding, which may differ outside Colab.
        with open(f'{path}.json', 'r', encoding='utf-8') as f:
            records = json.load(f)

        examples = [
            data.Example.fromlist(
                [record[self.TEXT_INDEX], record[self.LABEL_INDEX]], fields)
            for record in records
            if record[self.TEXT_INDEX]
        ]
        super().__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls, text_field, label_field, root='.data',
               train='train', test='test', **kwargs):
        """Create the train and test datasets.

        Arguments:
            text_field: Field used to process the review text.
            label_field: Field used to process the label.
            root: Directory that contains the ``steam/aclSteam`` folder.
            train: File stem of the training split.
            test: File stem of the test split.
            **kwargs: Forwarded to the dataset constructor.

        No validation split is loaded (``validation=None``).
        """
        path = os.path.join(root, cls.name, cls.dirname)
        return super().splits(
            path=path, root=root,
            text_field=text_field, label_field=label_field,
            train=train, validation=None, test=test, **kwargs)

    @classmethod
    def iters(cls, batch_size=32, device=0, root='.data', vectors=None, **kwargs):
        """Build default fields, vocabularies and bucketed train/test iterators.

        Arguments:
            batch_size: Number of examples per batch.
            device: Device argument handed to ``BucketIterator.splits``.
            root: Directory that contains the ``steam/aclSteam`` folder.
            vectors: Optional pre-trained vectors for the text vocabulary.
            **kwargs: Forwarded to ``splits``.
        """
        TEXT = data.Field()
        LABEL = data.Field(sequential=False)

        train, test = cls.splits(TEXT, LABEL, root=root, **kwargs)

        # Both vocabularies are built from the training split only.
        TEXT.build_vocab(train, vectors=vectors)
        LABEL.build_vocab(train)

        return data.BucketIterator.splits(
            (train, test), batch_size=batch_size, device=device)


In [None]:
SEED = 1234

# Seed PyTorch and request deterministic cuDNN kernels for repeatable runs.
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# include_lengths=True makes batch.text a (tensor, lengths) pair; the training
# and evaluation loops unpack it and pass the lengths to the model.
TEXT = data.Field(tokenize='spacy', include_lengths=True)
# Float labels, as required by the BCEWithLogitsLoss criterion used later.
LABEL = data.LabelField(dtype = torch.float)

In [None]:
# Load the train and test splits from the default root ('.data').
train_data, test_data = Steam.splits(TEXT, LABEL)

In [None]:
# Hold out part of the training set for validation (torchtext's default split
# ratio applies).  random.seed() returns None, so passing its result as
# random_state only worked through the seeding side effect; seed explicitly and
# hand split() the resulting generator state instead.
# NOTE: this cell rebinds train_data, so running it twice splits the already
# reduced set again -- reload the splits before re-running it.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

In [None]:
# Upper bound on the number of distinct tokens kept in the text vocabulary
# (special tokens are added on top of this).
MAX_VOCAB_SIZE = 25_000

# Build the vocabulary from the training split only and attach pre-trained
# GloVe vectors of matching width; unk_init initialises vectors for tokens
# that have no pre-trained embedding.
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = f"glove.6B.{EMBEDDING_DIM}d", 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

In [None]:
BATCH_SIZE = 128

# Use the GPU when one is available; batches are created on this device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# sort_within_batch=True orders the examples of each batch by length, which the
# model's pack_padded_sequence call relies on (it is called without
# enforce_sorted=False).
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

In [None]:
class RNN(nn.Module):
    """LSTM sentence classifier over padded, length-sorted batches.

    Pipeline: embedding -> dropout -> (multi-layer, optionally bidirectional)
    LSTM over the packed sequence -> dropout on the final hidden state ->
    linear layer.  The output is raw (un-activated) scores.

    Arguments:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Width of each embedding vector.
        hidden_dim: Width of the LSTM hidden state per direction.
        output_dim: Number of output scores per example.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM also reads the sequence backwards.
        dropout: Dropout probability (embeddings, between LSTM layers, and on
            the final hidden state).
        pad_idx: Vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        # nn.LSTM applies dropout only between layers; with a single layer it
        # has no effect and merely triggers a warning, so disable it there.
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout if n_layers > 1 else 0.0)
        
        # The classifier consumes one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        """Score a batch.

        Arguments:
            text: Token indices, shape [sent len, batch size].
            text_lengths: True length of every sequence in the batch (tensor or
                list), sorted in decreasing order.

        Returns:
            Tensor of shape [batch size, output dim] with raw scores.
        """
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence; recent PyTorch versions require the lengths on the CPU
        #even when the batch itself lives on the GPU
        lengths = text_lengths.cpu() if torch.is_tensor(text_lengths) else text_lengths
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths)
        
        #only the final hidden state is used, so the per-step outputs are not unpacked
        _, (hidden, _) = self.rnn(packed_embedded)
        
        #hidden = [num layers * num directions, batch size, hid dim]
        
        if self.rnn.bidirectional:
            #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            #single direction: the last entry is the top layer's final state
            hidden = hidden[-1,:,:]
        
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(self.dropout(hidden))

In [None]:
# Model hyper-parameters.  EMBEDDING_DIM is deliberately not set here: it is
# defined once near the top of the notebook because it also selects the GloVe
# vectors.
INPUT_DIM = len(TEXT.vocab)
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Keyword arguments keep the long positional list from being mis-ordered.
model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [None]:
def count_parameters(model):
    """Return the total number of scalar weights in `model` that require gradients."""
    trainable = 0
    for param in model.parameters():
        # frozen parameters (requires_grad=False) are not counted
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Report the model size; the ',' format spec inserts thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,810,857 trainable parameters


In [None]:
# Vectors attached to the vocabulary by build_vocab, one row per vocabulary entry.
pretrained_embeddings = TEXT.vocab.vectors

# Expect [vocabulary size, EMBEDDING_DIM].
print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [None]:
# Overwrite the randomly initialised embedding table in place with the
# pre-trained vectors (this replaces every row, including the padding row).
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [-0.2000,  0.0117, -0.1582,  ...,  0.0754, -1.0379, -0.2080],
        [ 0.2750, -0.2916,  0.3303,  ..., -0.1843,  0.1615,  0.2162],
        [-0.8096,  1.1721,  1.4240,  ..., -0.1483,  0.1093, -0.8568]])

In [None]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# The copy above filled the <unk> and <pad> rows with non-zero values; reset
# both to zero so these special tokens start out carrying no signal.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# Visual check: the rows of the two special tokens should now be all zeros.
print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [-0.2000,  0.0117, -0.1582,  ...,  0.0754, -1.0379, -0.2080],
        [ 0.2750, -0.2916,  0.3303,  ..., -0.1843,  0.1615,  0.2162],
        [-0.8096,  1.1721,  1.4240,  ..., -0.1483,  0.1093, -0.8568]])


In [None]:
# Adam over all model parameters, using the optimizer's default hyper-parameters.
optimizer = optim.Adam(model.parameters())

In [None]:
# Binary cross-entropy on raw logits: the loss applies the sigmoid itself, so
# the model's forward pass returns un-activated scores.
criterion = nn.BCEWithLogitsLoss()

# Move model and loss to the same device the iterators produce batches on.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (8 of 10 right gives 0.8, not 8).

    `preds` holds raw logits and `y` the 0./1. targets; the result is a
    zero-dimensional tensor.
    """
    # squash the logits into probabilities, then snap them to 0. or 1.
    hard_preds = torch.sigmoid(preds).round()
    # float mask of hits so it can be summed and divided
    hits = hard_preds.eq(y).float()
    return hits.sum() / len(hits)

In [None]:
def F_score(logit, label, threshold=0.5, beta=1):
    """
    Precision, recall and F-beta score of binary predictions.

    Arguments:
        logit: Raw model scores; a sigmoid is applied before thresholding.
        label: Ground-truth targets with the same shape as `logit`.
        threshold: A prediction counts as positive when its probability is
            strictly greater than this value.  The labels are binarised with
            the same threshold, so 0./1. labels need 0 <= threshold < 1.
        beta: Weight of recall relative to precision (1 gives the F1 score).

    Returns:
        (precision, recall, f_score) as zero-dimensional float tensors.  A small
        epsilon in every denominator makes undefined ratios evaluate to 0.
    """
    predicted = torch.sigmoid(logit) > threshold
    actual = label > threshold

    # True negatives do not enter any of the three metrics, so they are not counted.
    TP = (predicted & actual).sum().float()
    FP = (predicted & ~actual).sum().float()
    FN = (~predicted & actual).sum().float()

    eps = 1e-12  # guards against 0/0 when a class is absent
    precision = TP / (TP + FP + eps)
    recall = TP / (TP + FN + eps)
    f_score = (1 + beta**2) * precision * recall / (beta**2 * precision + recall + eps)
    return precision, recall, f_score

In [None]:
def train(model, iterator, optimizer, criterion):
    """
    Run one optimisation pass over `iterator`.

    Each batch must expose `text` as a (tokens, lengths) pair and `label` as the
    targets.  Returns (mean batch loss, mean batch accuracy), both averaged
    over the number of batches.
    """
    model.train()

    loss_total = 0
    acc_total = 0

    for batch in iterator:
        tokens, lengths = batch.text
        targets = batch.label

        # clear gradients left over from the previous step
        optimizer.zero_grad()

        # model output is [batch size, 1]; drop the trailing axis to match the targets
        logits = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """
    Evaluate `model` on every batch of `iterator` without updating it.

    Each batch must expose `text` as a (tokens, lengths) pair and `label` as the
    targets.

    Returns:
        (loss, accuracy, precision, recall, f1_score) as Python floats.
        Loss and accuracy are means over batches (same convention as `train`).
        Precision, recall and F1 are computed once over the predictions of the
        whole iterator: averaging per-batch values would not give the dataset
        metric, and the averaged F1 would not correspond to the averaged
        precision and recall.
    """
    
    epoch_loss = 0
    epoch_acc = 0
    # one logit / label per example, kept to score the full set at the end
    all_predictions = []
    all_labels = []
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.text
            
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

            all_predictions.append(predictions)
            all_labels.append(batch.label)
   
    num_batches = len(iterator)
    epoch_loss /= num_batches
    epoch_acc /= num_batches

    precision, recall, f1_score = F_score(torch.cat(all_predictions), torch.cat(all_labels))

    return epoch_loss, epoch_acc, precision.item(), recall.item(), f1_score.item()

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole (minutes, seconds)."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # seconds left once the whole minutes are removed, truncated to an integer
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
N_EPOCHS = 5

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

    valid_loss, valid_acc, valid_precision, valid_recall, valid_f1_score = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    # The timing covers both the training pass and the validation pass.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the weights of the best epoch (by validation loss); the file is
    # overwritten each time the loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    print(f'\t Val. Recall: {valid_recall*100:.2f}% |  Val. Precision: {valid_precision*100:.2f}% | Val. F1 score: {valid_f1_score*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 23s
	Train Loss: 0.396 | Train Acc: 81.62%
	 Val. Loss: 0.299 |  Val. Acc: 86.93%
	 Val. Recall: 70.43% |  Val. Precision: 81.29% | Val. F1 score: 74.88%
Epoch: 02 | Epoch Time: 1m 23s
	Train Loss: 0.306 | Train Acc: 86.67%
	 Val. Loss: 0.280 |  Val. Acc: 88.04%
	 Val. Recall: 74.40% |  Val. Precision: 81.91% | Val. F1 score: 77.36%
Epoch: 03 | Epoch Time: 1m 23s
	Train Loss: 0.279 | Train Acc: 88.14%
	 Val. Loss: 0.265 |  Val. Acc: 88.71%
	 Val. Recall: 76.53% |  Val. Precision: 82.36% | Val. F1 score: 78.90%
Epoch: 04 | Epoch Time: 1m 23s
	Train Loss: 0.262 | Train Acc: 88.98%
	 Val. Loss: 0.274 |  Val. Acc: 88.47%
	 Val. Recall: 80.67% |  Val. Precision: 79.57% | Val. F1 score: 79.57%
Epoch: 05 | Epoch Time: 1m 23s
	Train Loss: 0.250 | Train Acc: 89.57%
	 Val. Loss: 0.267 |  Val. Acc: 89.10%
	 Val. Recall: 75.44% |  Val. Precision: 84.52% | Val. F1 score: 79.33%


In [None]:
# Restore the checkpoint written by the training loop (the weights with the
# lowest validation loss).  map_location remaps the stored tensors onto the
# current device, so a checkpoint saved on a GPU runtime also loads on a
# CPU-only one.
model.load_state_dict(torch.load('tut2-model.pt', map_location=device))

test_loss, test_acc, test_precision, test_recall, test_f1_score = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
print(f'\t Test. Recall: {test_recall*100:.2f}% |  Test. Precision: {test_precision*100:.2f}% | Test. F1 score: {test_f1_score*100:.2f}%')

Test Loss: 0.269 | Test Acc: 88.66%
	 Test. Recall: 76.33% |  Test. Precision: 82.22% | Test. F1 score: 78.76%


In [None]:
# spaCy pipeline used to tokenize ad-hoc sentences, in line with the
# tokenize='spacy' setting of the TEXT field.
# NOTE(review): the 'en' shortcut is no longer accepted by spaCy 3; there the
# model has to be loaded by its package name (e.g. 'en_core_web_sm').
nlp = spacy.load('en')

def predict_sentiment(model, sentence):
    """
    Score a single raw sentence with the trained model.

    Arguments:
        model: Trained classifier taking (token indices, lengths).
        sentence: Text to classify; it is tokenized with the spaCy tokenizer
            and mapped through the TEXT vocabulary.

    Returns:
        float in [0, 1]: the sigmoid of the model's logit, i.e. the estimated
        probability of whichever class LABEL.vocab maps to 1 (inspect
        LABEL.vocab.stoi to see which one that is).

    Raises:
        ValueError: if the sentence yields no tokens, since a zero-length
            sequence cannot be packed for the LSTM.
    """
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError('sentence must contain at least one token')
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    # add the batch axis: [sent len] -> [sent len, 1]
    tensor = tensor.unsqueeze(1)
    # lengths intentionally stay on the CPU, as sequence packing expects
    length_tensor = torch.LongTensor(length)
    # inference only: skip building the autograd graph
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [None]:
# NOTE(review): the recorded output for this clearly negative sentence is close
# to 1, so scores near 1 appear to mean "negative" for this label vocabulary --
# confirm with LABEL.vocab.stoi before interpreting the numbers.
predict_sentiment(model, "This game is terrible")

0.8718990683555603

In [None]:
# Counterpart check with a clearly positive sentence.
predict_sentiment(model, "This game is great")

0.007436361629515886