# 0- Import packages

In [1]:
import random
import time

import matplotlib.pyplot as plt1
import matplotlib.pyplot as plt2
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data
from torchtext import datasets

# 1- Import data

In [2]:
# Seed every source of randomness used below so that the train/validation/test
# partition (and the later weight initialisation) is the same on each
# "Restart & Run All".
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Labels are loaded as floats; text is tokenised with spaCy.
LABEL = data.LabelField(dtype = torch.float)
TEXT = data.Field(tokenize = 'spacy')

# The field list describes 12 CSV columns. Only column 6 (label) and column 11
# (text) are loaded; every other column is skipped with a (None, None) entry.
N_COLUMNS = 12
LABEL_COLUMN = 6
TEXT_COLUMN = 11
fields = [(None, None)] * N_COLUMNS
fields[LABEL_COLUMN] = ('label', LABEL)
fields[TEXT_COLUMN] = ('text', TEXT)

all_data = data.TabularDataset(path = 'data/data_filtered2_balanced_all.csv', format = 'csv', fields = fields, skip_header = True)
print(f'Number of data examples: {len(all_data)}')

# 80/20 split into (train + validation) / test, then 80/20 again into
# train / validation. Passing the seeded RNG state makes both splits repeatable.
train_valid_data, test_data = all_data.split(0.8, random_state = random.getstate())
train_data, valid_data = train_valid_data.split(0.8, random_state = random.getstate())
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of data examples: 98474
Number of training examples: 63023
Number of validation examples: 15756
Number of testing examples: 19695


# 2- Build vocabulary

In [3]:
# Upper bound on the number of distinct training tokens kept in the TEXT
# vocabulary (the special <unk> and <pad> entries shown below come on top).
MAX_VOCAB_SIZE = 95_000

# Both vocabularies are built from the training split only.
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

# Vocabulary sizes
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

# Quick inspection: token frequencies, first index->token entries, label mapping
print(f"Most common words: {TEXT.vocab.freqs.most_common(20)}") #most common words
print(f"Vocabulary:{TEXT.vocab.itos[:10]}") #to see vocabulary
print(f"Labels: {LABEL.vocab.stoi}") #to see labels

Unique tokens in TEXT vocabulary: 91383
Unique tokens in LABEL vocabulary: 2
Most common words: [('.', 294859), (',', 234939), ('the', 218011), ('I', 199050), ('and', 153835), ('a', 144628), ('to', 130619), (' ', 123870), ('it', 108243), ('of', 106011), ('is', 91993), ('in', 68043), ('this', 66577), ('for', 64489), ('that', 58664), ('was', 44935), ('you', 44331), ('my', 44159), ('with', 43051), ('not', 42503)]
Vocabulary:['<unk>', '<pad>', '.', ',', 'the', 'I', 'and', 'a', 'to', ' ']
Labels: defaultdict(<function _default_unk_index at 0x000002F1004157B8>, {'0': 0, '1': 1})


# 3- Build iterators

In [4]:
BATCH_SIZE = 32

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Bucket examples by their number of tokens so that each batch groups texts of
# similar length and needs as little <pad> filling as possible. The key must be
# the length: returning the token list itself orders examples lexicographically,
# which does nothing to limit padding.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key = lambda x: len(x.text),
    sort_within_batch = False,
    batch_size = BATCH_SIZE,
    device = device)

# 4- Build the model

In [8]:
#1- Define the model

class RNN(nn.Module):
    """Single-layer RNN classifier over sequences of token indices.

    Pipeline: embedding lookup -> ``nn.RNN`` -> linear layer applied to the
    final hidden state.

    Args:
        input_dim: size of the vocabulary (number of embedding rows).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the RNN hidden state.
        output_dim: number of outputs produced by the linear layer.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):

        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)

        self.rnn = nn.RNN(embedding_dim, hidden_dim)

        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text, text_lengths=None):
        """Compute one output row per sequence in the batch.

        Args:
            text: integer tensor of token indices, [sent len, batch size].
            text_lengths: optional true (unpadded) length of each sequence in
                the batch. When given, the batch is packed so the hidden state
                returned for a sequence is the one after its last real token
                rather than after trailing padding. When omitted, the whole
                padded sequence is consumed, exactly as before.

        Returns:
            Tensor of shape [batch size, output dim].
        """

        #text = [sent len, batch size]

        embedded = self.embedding(text)

        #embedded = [sent len, batch size, emb dim]

        if text_lengths is not None:
            # pack_padded_sequence expects the lengths on the CPU;
            # enforce_sorted=False lets the batch arrive in any order.
            lengths = torch.as_tensor(text_lengths).cpu()
            embedded = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, enforce_sorted=False)

        _, hidden = self.rnn(embedded)

        #hidden = [1, batch size, hid dim]
        # The former per-call `assert torch.equal(output[-1], hidden)` sanity
        # check was dropped: it added a comparison to every forward pass and
        # cannot be evaluated on packed input.

        return self.fc(hidden.squeeze(0))
    
#2- Create an instance of the RNN class
# Network sizes: one embedding row per vocabulary entry, a single output logit.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50
HIDDEN_DIM = 100
OUTPUT_DIM = 1

model = RNN(
    input_dim = INPUT_DIM,
    embedding_dim = EMBEDDING_DIM,
    hidden_dim = HIDDEN_DIM,
    output_dim = OUTPUT_DIM,
)

#3- Function to know how many trainable parameters
def count_parameters(model):
    """Return the total element count of the parameters of ``model`` that have ``requires_grad`` set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')


The model has 4,584,451 trainable parameters


# 5- Train the model

In [9]:
#0- Set up the plotting figure
# NOTE(review): `%matplotlib inline` is followed by `%matplotlib notebook` at
# the end of this group; presumably the later magic decides the backend that is
# actually in effect -- confirm, and drop whichever of the two is not wanted.
%matplotlib inline
%config InlineBackend.figure_format ='svg'
# The recorded output of this cell reports that autoreload was already loaded
# and suggests `%reload_ext autoreload` for re-runs.
%load_ext autoreload
%autoreload 2
%matplotlib notebook

#1- Move the model to the target device first
# The torch.optim documentation advises moving a model to its device before
# constructing an optimizer for it, so the optimizer is built from the
# parameters that will actually be trained.
model = model.to(device)

#2- Set up the optimizer (Adam with its default hyper-parameters; plain SGD
#   with lr=1e-3 is the alternative that was tried here previously)
optimizer = optim.Adam(model.parameters())

#3- Sigmoid and binary cross entropy, combined in a single loss on raw logits
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

#4- Accuracy function
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (e.g. 0.8 for 8 out of 10, not 8).

    preds holds raw logits: they go through a sigmoid and are rounded to the
    nearest integer before being compared element-wise with the targets y.
    """
    hits = torch.round(torch.sigmoid(preds)).eq(y).float()  # 1.0 where correct
    return hits.sum() / len(hits)

#5- Train function
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    For every batch the gradients are reset, the model output for
    ``batch.text`` is scored against ``batch.label`` with ``criterion``, and
    one optimizer step is taken.

    Returns:
        (mean batch loss, mean batch accuracy), each averaged over the number
        of batches in ``iterator``.
    """
    model.train()

    running_loss, running_acc = 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

#6- Evaluate function
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating its weights.

    The model is switched to evaluation mode and every batch is processed
    under ``torch.no_grad()``.

    Returns:
        (mean batch loss, mean batch accuracy), each averaged over the number
        of batches in ``iterator``.
    """
    model.eval()

    running_loss, running_acc = 0, 0

    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)

            batch_loss = criterion(logits, batch.label)
            batch_acc = binary_accuracy(logits, batch.label)

            running_loss += batch_loss.item()
            running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

#7- Epoch running time function
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and whole seconds."""
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - (minutes * 60))  # remainder, truncated
    return minutes, seconds

#8- Train the model
N_EPOCHS = 200

# Per-epoch history, filled in as training progresses (epochs that are never
# reached, e.g. after a manual interrupt, keep their initial 0).
train_loss_vec = np.zeros(N_EPOCHS)
valid_loss_vec = np.zeros(N_EPOCHS)
train_acc_vec = np.zeros(N_EPOCHS)
valid_acc_vec = np.zeros(N_EPOCHS)

# Lowest validation loss seen so far; the matching weights are checkpointed.
best_valid_loss = float('inf')


def _new_live_figure():
    """Create a figure with a single axes and show it so it can be redrawn in place."""
    fig, ax = plt1.subplots()
    fig.show()
    fig.canvas.draw()
    return fig, ax


def _redraw_curves(fig, ax, train_values, valid_values, quantity):
    """Replot the training/validation curves of ``quantity`` for the epochs run so far."""
    epochs_run = range(1, len(train_values) + 1)
    ax.clear()
    ax.plot(epochs_run, train_values, '-o', label=f'Training {quantity}')
    ax.plot(epochs_run, valid_values, '-o', label=f'Validation {quantity}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(quantity.capitalize())
    # Axes-level legend: it is discarded by ax.clear(), whereas a figure-level
    # legend added on every epoch would keep piling up on top of the old ones.
    ax.legend(loc='best')
    fig.canvas.draw()


fig1, ax1 = _new_live_figure()  # accuracy curves
fig2, ax2 = _new_live_figure()  # loss curves

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights that achieved the best validation loss so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    train_loss_vec[epoch] = train_loss
    valid_loss_vec[epoch] = valid_loss
    train_acc_vec[epoch] = train_acc
    valid_acc_vec[epoch] = valid_acc

    done = epoch + 1
    _redraw_curves(fig1, ax1, train_acc_vec[:done], valid_acc_vec[:done], 'accuracy')
    _redraw_curves(fig2, ax2, train_loss_vec[:done], valid_loss_vec[:done], 'loss')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Epoch: 01 | Epoch Time: 2m 21s
	Train Loss: 0.693 | Train Acc: 50.50%
	 Val. Loss: 0.693 |  Val. Acc: 49.93%




Epoch: 02 | Epoch Time: 2m 18s
	Train Loss: 0.693 | Train Acc: 50.44%
	 Val. Loss: 0.693 |  Val. Acc: 49.88%
Epoch: 03 | Epoch Time: 2m 20s
	Train Loss: 0.693 | Train Acc: 50.35%
	 Val. Loss: 0.693 |  Val. Acc: 49.96%
Epoch: 04 | Epoch Time: 2m 21s
	Train Loss: 0.693 | Train Acc: 50.53%
	 Val. Loss: 0.693 |  Val. Acc: 49.99%
Epoch: 05 | Epoch Time: 2m 21s
	Train Loss: 0.693 | Train Acc: 50.76%
	 Val. Loss: 0.693 |  Val. Acc: 50.00%
Epoch: 06 | Epoch Time: 2m 21s
	Train Loss: 0.692 | Train Acc: 50.63%
	 Val. Loss: 0.693 |  Val. Acc: 50.37%
Epoch: 07 | Epoch Time: 2m 21s
	Train Loss: 0.693 | Train Acc: 50.57%
	 Val. Loss: 0.693 |  Val. Acc: 50.05%
Epoch: 08 | Epoch Time: 2m 22s
	Train Loss: 0.692 | Train Acc: 50.76%
	 Val. Loss: 0.693 |  Val. Acc: 50.05%
Epoch: 09 | Epoch Time: 2m 18s
	Train Loss: 0.692 | Train Acc: 50.69%
	 Val. Loss: 0.693 |  Val. Acc: 50.05%
Epoch: 10 | Epoch Time: 2m 20s
	Train Loss: 0.692 | Train Acc: 50.66%
	 Val. Loss: 0.693 |  Val. Acc: 50.05%


KeyboardInterrupt: 

# 6- Final results

In [10]:
# Restore the checkpoint with the lowest validation loss. map_location remaps
# the stored tensors onto the current device, so a checkpoint written on a GPU
# can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load('tut1-model.pt', map_location = device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.692 | Test Acc: 50.40%
