In [None]:
'''
Sentiment analysis on the IMDB movie-review dataset
using an LSTM classifier.
'''

In [None]:
import torch
from torchtext.legacy import data
from torchtext.legacy import datasets
import random
# Seed every RNG the pipeline relies on so that the train/validation split,
# batch shuffling and weight initialisation are repeatable after a kernel
# restart. Requesting deterministic cuDNN kernels alone does not achieve that.
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# include_lengths=True makes every batch expose (token_ids, lengths); the
# lengths are what the model uses to pack the padded sequences.
text = data.Field(tokenize='spacy',
                  tokenizer_language='en_core_web_sm',
                  include_lengths=True)
# Labels are stored as floats because the loss used later is BCEWithLogitsLoss.
label = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(text, label)
# Keep 80% of the official training set for training and hold out the rest
# for validation; passing the seeded RNG state pins the shuffle to SEED.
train_data, valid_data = train_data.split(split_ratio=0.8,
                                          random_state=random.getstate())

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:08<00:00, 10.5MB/s]


In [None]:
# Label vocabulary: built from the labels that occur in the training split.
label.build_vocab(train_data)

# Token vocabulary: built from the training split only, limited with
# max_size=25000 and paired with the pretrained "glove.6B.100d" vectors.
# Tokens that have no pretrained vector get one from torch.Tensor.normal_.
text.build_vocab(
    train_data,
    unk_init=torch.Tensor.normal_,
    vectors="glove.6B.100d",
    max_size=25000,
)

.vector_cache/glove.6B.zip: 862MB [02:43, 5.29MB/s]                           
100%|█████████▉| 399157/400000 [00:22<00:00, 18013.63it/s]

In [None]:
# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# All three splits share the same batching options. sort_within_batch=True
# matters here because the model packs its padded input sequences.
iterator_options = dict(batch_size=64, sort_within_batch=True, device=device)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), **iterator_options)

In [None]:
import torch.nn as nn
class LSTM(nn.Module):
    """Sequence classifier: embedding -> (bi)directional LSTM -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each token embedding.
        hidden_dim: size of the LSTM hidden state (per direction).
        output_dim: number of output logits per example.
        n_layers: number of stacked LSTM layers.
        bidirectional: if True the LSTM reads the sequence in both directions
            and the head receives both final states concatenated.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers (only when ``n_layers > 1``) and to the final hidden state.
        pad_idx: index of the padding token in the embedding table.
    """

    def __init__(self, vocab_size, embedding_dim,
                 hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,
                                      embedding_dim,
                                      padding_idx=pad_idx)
        # nn.LSTM applies dropout *between* layers, so it is meaningless (and
        # triggers a warning) for a single-layer network.
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)
        # The head sees one final state per direction.
        n_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * n_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of logits per sequence in the batch.

        Args:
            text: integer token ids laid out as (seq_len, batch), since neither
                the packing call nor the LSTM uses ``batch_first``.
            text_lengths: true (unpadded) length of every sequence in the batch.

        Returns:
            Tensor of shape (batch, output_dim) with raw, un-squashed logits.
        """
        embedded = self.dropout(self.embedding(text))
        # Lengths must live on the CPU for packing. enforce_sorted=False lets
        # the model accept batches that are not sorted by length; results are
        # returned in the caller's original batch order.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.to('cpu'), enforce_sorted=False)
        # Only the final hidden state is needed, so the per-step outputs are
        # never unpacked.
        _, (hidden, _) = self.rnn(packed_embedded)
        if self.rnn.bidirectional:
            # Last layer: hidden[-2] is the forward pass, hidden[-1] the backward one.
            final_state = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            final_state = hidden[-1, :, :]
        return self.fc(self.dropout(final_state))

In [None]:
# Create model
import torch.optim as optim

# Hyper-parameters of the baseline classifier.
EMBEDDING_DIM = 100    # must match the width of the "glove.6B.100d" vectors
HIDDEN_DIM = 256
OUTPUT_DIM = 1         # one logit per review, consumed by BCEWithLogitsLoss
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

pad_ix = text.vocab.stoi[text.pad_token]
model = LSTM(len(text.vocab), EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
             N_LAYERS, BIDIRECTIONAL, DROPOUT, pad_ix)

# The vocabulary carries pretrained GloVe vectors, but a freshly built
# nn.Embedding starts from random weights: copy the vectors in explicitly,
# otherwise they are loaded and never used.
pretrained_vectors = text.vocab.vectors
assert pretrained_vectors.shape == model.embedding.weight.shape, \
    "pretrained vectors do not match the embedding table"
with torch.no_grad():
    model.embedding.weight.copy_(pretrained_vectors)
    # The copy overwrites the padding row; reset it to zeros.
    model.embedding.weight[pad_ix].zero_()

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def binary_accuracy(preds, y):
    """Return the fraction of predictions that agree with the binary targets.

    `preds` holds raw logits: they are passed through a sigmoid and rounded to
    0/1 before being compared element-wise with `y`.
    """
    predicted_labels = torch.sigmoid(preds).round()
    hits = predicted_labels.eq(y).float()
    return hits.sum() / len(hits)

def train_model(model, iter, optimizer, criterion):
    """Run one optimisation pass over `iter`.

    Returns the mean per-batch loss and the mean per-batch accuracy; every
    batch weighs the same in both averages regardless of its size.
    """
    model.train()
    loss_total, acc_total = 0, 0
    for batch in iter:
        tokens, lengths = batch.text
        targets = batch.label

        optimizer.zero_grad()
        logits = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += binary_accuracy(logits, targets)
    n_batches = len(iter)
    return loss_total / n_batches, acc_total / n_batches

def evaluate_model(model, iter, criterion):
    """Score `model` on `iter` without updating any weights.

    The model is switched to eval mode and gradients are disabled. Returns the
    mean per-batch loss and the mean per-batch accuracy.
    """
    model.eval()
    batch_losses, batch_accs = [], []
    with torch.no_grad():
        for batch in iter:
            tokens, lengths = batch.text
            logits = model(tokens, lengths).squeeze(1)
            batch_losses.append(criterion(logits, batch.label).item())
            batch_accs.append(binary_accuracy(logits, batch.label))
    n_batches = len(iter)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

In [None]:
# Train the model for 10 epochs, keeping the weights of the epoch with the
# lowest validation loss in 'LSTM.pt'.

best_valid_loss = float('inf')
for epoch in range(10):
    train_loss, train_acc = train_model(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate_model(model, valid_iterator, criterion)

    # Checkpoint only when validation loss improves on the best seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'LSTM.pt')

    # One report per epoch, one metric per line.
    print(f'Epoch: {epoch+1}',
          f'Train Loss: {train_loss:.3f}',
          f'Train Acc: {train_acc*100:.2f}%',
          f'Val Loss: {valid_loss:.3f}',
          f'Val Acc: {valid_acc*100:.2f}%',
          '---------------',
          sep='\n')

Epoch: 1
Train Loss: 0.673
Train Acc: 57.65%
Val Loss: 0.644
Val Acc: 63.59%
---------------
Epoch: 2
Train Loss: 0.658
Train Acc: 60.37%
Val Loss: 0.674
Val Acc: 55.44%
---------------
Epoch: 3
Train Loss: 0.653
Train Acc: 62.04%
Val Loss: 0.652
Val Acc: 60.90%
---------------
Epoch: 4
Train Loss: 0.569
Train Acc: 70.90%
Val Loss: 0.472
Val Acc: 78.34%
---------------
Epoch: 5
Train Loss: 0.474
Train Acc: 78.09%
Val Loss: 0.412
Val Acc: 81.53%
---------------
Epoch: 6
Train Loss: 0.401
Train Acc: 82.05%
Val Loss: 0.361
Val Acc: 84.30%
---------------
Epoch: 7
Train Loss: 0.350
Train Acc: 84.94%
Val Loss: 0.332
Val Acc: 86.17%
---------------
Epoch: 8
Train Loss: 0.315
Train Acc: 86.95%
Val Loss: 0.317
Val Acc: 87.01%
---------------
Epoch: 9
Train Loss: 0.289
Train Acc: 87.95%
Val Loss: 0.319
Val Acc: 87.60%
---------------
Epoch: 10
Train Loss: 0.264
Train Acc: 89.27%
Val Loss: 0.354
Val Acc: 85.11%
---------------


In [None]:
# Restore the checkpoint saved during training and score it on the test split.
model.load_state_dict(torch.load('LSTM.pt'))
test_loss, test_acc = evaluate_model(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f}',
      f'Test Acc: {test_acc*100:.2f}%',
      sep='\n')

Test Loss: 0.319
Test Acc: 86.62%


In [None]:
# Load embedded layer from Question 4 & create new model

import torch.optim as optim

# 'CBOW.pt' is read as a state dict; map_location lets a checkpoint saved on a
# GPU be opened on a CPU-only machine as well.
cbow = torch.load('CBOW.pt', map_location=device)
cbow_embeddings = cbow['embeddings.weight']
cbow_vocab_size, cbow_embedding_dim = cbow_embeddings.shape

# The iterators emit token ids from `text.vocab`, so the embedding table must
# have a row for each of them.
if cbow_vocab_size < len(text.vocab):
    raise ValueError(
        f'CBOW embedding table has {cbow_vocab_size} rows but the vocabulary '
        f'has {len(text.vocab)} tokens')

# Padding positions in the batches use the torchtext pad index.
cbow_pad_ix = text.vocab.stoi[text.pad_token]
model = LSTM(cbow_vocab_size, cbow_embedding_dim, 256, 1, 2, True, 0.5, cbow_pad_ix)

# Actually initialise the embedding layer from the CBOW weights; building the
# model with a matching size alone leaves it randomly initialised.
# NOTE(review): this is only meaningful if the CBOW model was trained with the
# same token -> index mapping as `text.vocab` — confirm against Question 4.
with torch.no_grad():
    model.embedding.weight.copy_(cbow_embeddings)

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

In [None]:
# Train the model for 10 epochs, keeping the weights of the epoch with the
# lowest validation loss in 'CBOW_LSTM.pt'.

best_valid_loss = float('inf')
for epoch in range(10):
    train_loss, train_acc = train_model(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate_model(model, valid_iterator, criterion)

    # Checkpoint only when validation loss improves on the best seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'CBOW_LSTM.pt')

    # One report per epoch, one metric per line.
    print(f'Epoch: {epoch+1}',
          f'Train Loss: {train_loss:.3f}',
          f'Train Acc: {train_acc*100:.2f}%',
          f'Val Loss: {valid_loss:.3f}',
          f'Val Acc: {valid_acc*100:.2f}%',
          '---------------',
          sep='\n')

Epoch: 1
Train Loss: 0.674
Train Acc: 57.71%
Val Loss: 0.633
Val Acc: 67.76%
---------------
Epoch: 2
Train Loss: 0.645
Train Acc: 62.78%
Val Loss: 0.556
Val Acc: 72.63%
---------------
Epoch: 3
Train Loss: 0.566
Train Acc: 71.55%
Val Loss: 0.546
Val Acc: 72.78%
---------------
Epoch: 4
Train Loss: 0.569
Train Acc: 69.96%
Val Loss: 0.601
Val Acc: 68.73%
---------------
Epoch: 5
Train Loss: 0.529
Train Acc: 73.34%
Val Loss: 0.564
Val Acc: 72.03%
---------------
Epoch: 6
Train Loss: 0.404
Train Acc: 82.14%
Val Loss: 0.383
Val Acc: 84.08%
---------------
Epoch: 7
Train Loss: 0.357
Train Acc: 84.41%
Val Loss: 0.435
Val Acc: 84.67%
---------------
Epoch: 8
Train Loss: 0.325
Train Acc: 86.27%
Val Loss: 0.397
Val Acc: 84.49%
---------------
Epoch: 9
Train Loss: 0.294
Train Acc: 87.91%
Val Loss: 0.489
Val Acc: 82.67%
---------------
Epoch: 10
Train Loss: 0.274
Train Acc: 88.87%
Val Loss: 0.300
Val Acc: 87.72%
---------------


In [None]:
# Restore the checkpoint saved during training and score it on the test split.
model.load_state_dict(torch.load('CBOW_LSTM.pt'))
test_loss, test_acc = evaluate_model(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f}',
      f'Test Acc: {test_acc*100:.2f}%',
      sep='\n')

Test Loss: 0.301
Test Acc: 87.56%
