# This is a practice notebook for pytorch on text (XXX)

We implemented 4 models:
* DNN

In [21]:
import numpy as np
import pandas as pd
import torch
import torchtext
import matplotlib.pyplot as plt
import time
from torch.autograd import Variable
from torch import nn, optim
from torch.optim import SGD,Adam
import torch.nn.functional as F
import random

In [8]:
seed = 1234
torch.manual_seed(seed)
#torch.device("cpu");
torch.device("cuda")

device(type='cuda')

### Dataset preparation

In [9]:
TEXT = torchtext.data.Field(tokenize = 'spacy')
LABEL = torchtext.data.LabelField(dtype = torch.float)

In [10]:
train, test = torchtext.datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


.data\imdb\aclImdb_v1.tar.gz: 100%|███████████████████████████████████████████████| 84.1M/84.1M [00:35<00:00, 2.37MB/s]


In [20]:
print(f'Number of training examples: {len(train)}')
print(f'Number of testing examples: {len(test)}')

Number of training examples: 25000
Number of testing examples: 25000


In [24]:
train, valid = train.split(random_state = random.seed(seed))

In [26]:
print(f'Number of training examples: {len(train)}')
print(f'Number of validation examples: {len(valid)}')
print(f'Number of testing examples: {len(test)}')

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [28]:
## Build the vocabulary
MAX_VOCAB_SIZE = 5000

TEXT.build_vocab(train, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train)

In [29]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 5002
Unique tokens in LABEL vocabulary: 2


In [30]:
## Most common word
print(TEXT.vocab.freqs.most_common(20))

[('the', 203566), (',', 192495), ('.', 165618), ('and', 109442), ('a', 109116), ('of', 100702), ('to', 93766), ('is', 76328), ('in', 61255), ('I', 54004), ('it', 53508), ('that', 49187), ('"', 44282), ("'s", 43329), ('this', 42445), ('-', 36692), ('/><br', 35752), ('was', 35034), ('as', 30384), ('with', 29774)]


In [32]:
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']
defaultdict(None, {'neg': 0, 'pos': 1})


In [35]:
## place into iterators
train_iterator, valid_iterator, test_iterator = torchtext.data.BucketIterator.splits(
    (train, valid, test), 
    batch_size = 128)

In [36]:
len(TEXT.vocab)

5002

# MODELS

### Recurrent Neural Net

In [81]:
class RNN(nn.Module):
    def __init__(self):
        super(RNN,self).__init__()
        
        self.embed = nn.Embedding(num_embeddings=5002,
                                  embedding_dim=100)
        self.rnn1 = nn.RNN(input_size=100,
                      hidden_size=256,
                      num_layers=2,
                      batch_first=True)
        self.fc1  = nn.Linear(100,1)
        
    def forward(self,x):
        
        #embed = [sent len, batch size, emb dim]
        x = self.embed(x)
        
        r_out,(h_n,h_c) = self.rnn1(x, None)
        
        x = self.fc1(r_out[:,-1,:])
        return(x)

model = RNN()
print(model)

RNN(
  (embed): Embedding(5002, 100)
  (rnn1): RNN(100, 256, num_layers=2, batch_first=True)
  (fc1): Linear(in_features=100, out_features=1, bias=True)
)


# TRAIN & TEST

### Count parameters

In [82]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')



The model has 723,533 trainable parameters


In [83]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc



### Training 

In [79]:
############### TRAINING for RNN

## Define optimizer
optimizer = SGD(model.parameters(), lr = 0.01, momentum = 0.5)
#optimizer = Adam(model.parameters(), lr = 0.01)

## Define loss function
criterion = nn.BCEWithLogitsLoss()

## Training starts here
epochs = 5
for e in range(epochs):
    running_loss = 0
    running_acc = 0
    timer = time.time()
    for batch in train_iterator:

        ## For each iteration reset gradient
        optimizer.zero_grad()
        
        ## Run the input data through the network (forward pass)
        output = model(batch.text)
        
        ## Calculate the losses using the loss functioon
        loss = criterion(output,batch.label)
        
        ## Binary acc
        acc = binary_accuracy(predictions, batch.label)
        
        ## Perform backpropagation 
        loss.backward()
        
        ## Updates the weights
        optimizer.step()
        
        running_loss += loss.item()
        running_acc += acc.item()
        
    else:
        
        with torch.no_grad():
            running_loss_val = 0
            running_acc_val = 0
            for batch_val in valid_iterator:

                ## Run the input data through the network (forward pass)
                output_val = model(batch_val.text).squeeze(1)

                ## Binary acc
                acc_val = binary_accuracy(output_val, batch_val.label)
                
                ## Calculate the losses using the loss functioon
                loss_val = criterion(output_val,batch_val.label)

                running_loss_val += loss_val.item()
                running_acc_val += acc_val.item()
            
        
        print("Epoch {} - Training acc: {:.6f} -Training loss: {:.6f} - Val acc: {:.6f} - Val loss: {:.6f} - Time: {:.4f}s".format(e+1, running_acc/len(train_iterator), running_loss/len(train_iterator), running_acc_val/len(valid_iterator), running_loss_val/len(valid_iterator), (time.time()-timer)))

AssertionError: 