# This is a practice notebook for pytorch on text (IMDB)

We implemented 4 models:
* RNN
* LSTM
* CNN
* Stacked CNN

In [1]:
import numpy as np
import pandas as pd
import os
import torch
import torchtext
import matplotlib.pyplot as plt
import time
from torch.autograd import Variable
from torch import nn, optim
from torch.optim import SGD,Adam
import torch.nn.functional as F
import random

In [2]:
seed = 1234
torch.manual_seed(seed)
#torch.device("cpu");
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

### Dataset preparation

In [3]:
## batch_first for CNN

TEXT = torchtext.data.Field(tokenize = 'spacy',batch_first=True)
LABEL = torchtext.data.LabelField(dtype = torch.float)

In [4]:
train, test = torchtext.datasets.IMDB.splits(TEXT, LABEL)

In [5]:
print(f'Number of training examples: {len(train)}')
print(f'Number of testing examples: {len(test)}')

Number of training examples: 25000
Number of testing examples: 25000


In [6]:
train, valid = train.split(random_state = random.seed(seed))

In [7]:
print(f'Number of training examples: {len(train)}')
print(f'Number of validation examples: {len(valid)}')
print(f'Number of testing examples: {len(test)}')

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [8]:
## Build the vocabulary
MAX_VOCAB_SIZE = 8000

TEXT.build_vocab(train, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train)

In [9]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 8002
Unique tokens in LABEL vocabulary: 2


In [10]:
## Most common word
print(TEXT.vocab.freqs.most_common(20))

[('the', 203566), (',', 192495), ('.', 165618), ('and', 109442), ('a', 109116), ('of', 100702), ('to', 93766), ('is', 76328), ('in', 61255), ('I', 54004), ('it', 53508), ('that', 49187), ('"', 44282), ("'s", 43329), ('this', 42445), ('-', 36692), ('/><br', 35752), ('was', 35034), ('as', 30384), ('with', 29774)]


In [11]:
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']
defaultdict(None, {'neg': 0, 'pos': 1})


In [12]:
## place into iterators
train_iterator, valid_iterator, test_iterator = torchtext.data.BucketIterator.splits(
    (train, valid, test), 
    batch_size = 128)

In [13]:
len(TEXT.vocab)

8002

# MODELS

### Recurrent Neural Net

In [366]:
class RNN(nn.Module):
    def __init__(self):
        super(RNN,self).__init__()
        
        self.embed = nn.Embedding(num_embeddings=52,
                                  embedding_dim=20)
        self.rnn1 = nn.RNN(input_size=20,
                      hidden_size=10,
                      num_layers=1)
        self.fc1  = nn.Linear(10,2)
        
    def forward(self,x):
        
        #embed = [sent len, batch size, emb dim]
        x = self.embed(x)
        rnn_out, hidden = self.rnn1(x,None)
        x = rnn_out[-1,:,:]
        x = self.fc1(x.squeeze(0))
        x = F.log_softmax(x)
        return(x)

model = RNN()
print(model)

RNN(
  (embed): Embedding(52, 20)
  (rnn1): RNN(20, 10)
  (fc1): Linear(in_features=10, out_features=2, bias=True)
)


### Long-short Term Memory

In [243]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN,self).__init__()
        self.embed = nn.Embedding(num_embeddings=52,
                                  embedding_dim=20)
        self.lstm = nn.LSTM(input_size=20,
                            hidden_size=20,
                            num_layers=1)
        self.fc = nn.Linear(20,1)

        
    def forward(self,x):
        embed = self.embed(x)
        lstm_out, hidden = self.lstm(embed,None)
        x = lstm_out[-1,:,:]
        x = self.fc(x.squeeze(1))
        return(x)

model = CNN()
print(model)

CNN(
  (embed): Embedding(52, 20)
  (lstm): LSTM(20, 20)
  (fc): Linear(in_features=20, out_features=1, bias=True)
)


### Convolutional Neural Net (Sequential)

In [14]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN,self).__init__()
        
        self.embed = nn.Embedding(num_embeddings=8002,
                                  embedding_dim=100)
        self.CNN = nn.Conv2d(in_channels=1,
                             out_channels=100,
                             kernel_size=3)
        self.fc1 = nn.Linear(80,30)
        self.fc2 = nn.Linear(30,2)
        
    def forward(self,x):

        x = self.embed(x)
        x = x.unsqueeze(1)
        x = self.CNN(x)
        x = F.max_pool2d(x,2)
        k = x.shape
        x = torch.flatten(x,start_dim = 1)
        self.fc1 = nn.Linear(k[1]*k[2]*k[3],30)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x.squeeze(0))
        x = F.log_softmax(x)
        return(x)

model = CNN().to(device)
print(model)

CNN(
  (embed): Embedding(8002, 100)
  (CNN): Conv2d(1, 100, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=80, out_features=30, bias=True)
  (fc2): Linear(in_features=30, out_features=2, bias=True)
)


### Convolutional Neural Net (Stack 3 Convs)

In [None]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN,self).__init__()
        
        self.embed = nn.Embedding(num_embeddings=52,
                                  embedding_dim=30)
        self.cnn1 = nn.Conv2d(in_channels=1,
                              out_channels=20,
                              kernel_size=2)
        self.cnn2 = nn.Conv2d(in_channels=1,
                              out_channels=20,
                              kernel_size=3)
        self.cnn3 = nn.Conv2d(in_channels=1,
                              out_channels=20,
                              kernel_size=4)
        self.fc1 = nn.Linear(60,30)
        self.fc2 = nn.Linear(30,2)
    
    def forward(self,x):
        
        x = self.embed(x)
        x = x.unsqueeze(1)
        
        x1 = F.relu(self.cnn1(x))
        x1 = F.max_pool2d(x1,2)
        x1 = torch.flatten(x1,start_dim=1)
        
        x2 = F.relu(self.cnn2(x))
        x2 = F.max_pool2d(x2,3)
        x2 = torch.flatten(x2,start_dim=1)
        
        x3 = F.relu(self.cnn3(x))
        x3 = F.max_pool2d(x3,4)
        x3 = torch.flatten(x3,start_dim=1)
        
        x = torch.cat((x1,x2,x3), dim=1)
        self.fc1 = nn.Linear(x.shape[1],30)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x.squeeze(0))
        x = F.log_softmax(x)
        return(x)

model = CNN()
print(model)

# TRAIN & TEST

### Count parameters

In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 803,692 trainable parameters


### Prepare accuracy function

In [16]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [17]:
def softmax_accuracy(probs,all_labels):
    def getClass(x):
        return(x.index(max(x)))

    all_predicted = probs.apply(getClass)
    all_predicted.reset_index(drop=True, inplace=True)
    
    acc = (pd.value_counts(all_predicted == all_labels)[1])/len(all_labels)
    return(acc)

### Pre-trained Embeddings (if available)

In [None]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# Then zero the initial weights of the unknown and padding tokens.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

### Training 

In [None]:
############### TRAINING for LSTM

## Define optimizer
optimizer = SGD(model.parameters(), lr = 0.01, momentum = 0.5)
#optimizer = Adam(model.parameters(), lr = 0.01)

## Define loss function
criterion = nn.BCEWithLogitsLoss()

## Training starts here
epochs = 5

for e in range(epochs):
    running_loss = 0
    running_acc = 0
    timer = time.time()
    for batch in train_iterator:

        ## For each iteration reset gradient
        optimizer.zero_grad()
        
        ## Run the input data through the network (forward pass)
        output = model(batch.text).squeeze(1)
        
        ## Calculate the losses using the loss functioon
        loss = criterion(output,batch.label)
        
        ## Binary acc
        acc = binary_accuracy(output, batch.label)
        
        ## Perform backpropagation 
        loss.backward()
        
        ## Updates the weights
        optimizer.step()
        
        running_loss += loss.item()
        running_acc += acc.item()
        
    else:
        
        with torch.no_grad():
            running_loss_val = 0
            running_acc_val = 0
            for batch_val in valid_iterator:

                ## Run the input data through the network (forward pass)
                output_val = model(batch_val.text).squeeze(1)

                ## Binary acc
                acc_val = binary_accuracy(output_val, batch_val.label)
                
                ## Calculate the losses using the loss functioon
                loss_val = criterion(output_val,batch_val.label)

                running_loss_val += loss_val.item()
                running_acc_val += acc_val.item()
            
        
        print("Epoch {} - Training acc: {:.6f} -Training loss: {:.6f} - Val acc: {:.6f} - Val loss: {:.6f} - Time: {:.4f}s".format(e+1, running_acc/len(train_iterator), running_loss/len(train_iterator), running_acc_val/len(valid_iterator), running_loss_val/len(valid_iterator), (time.time()-timer)))

In [19]:
############### TRAINING for RNN using SOFTMAX

## Define optimizer
optimizer = SGD(model.parameters(), lr = 0.01, momentum = 0.5)
#optimizer = Adam(model.parameters(), lr = 0.01)

## Define loss function
criterion = nn.NLLLoss().to(device)

## Training starts here
epochs = 5

for e in range(epochs):
    running_loss = 0
    all_labels = []
    timer = time.time()
    probs = pd.Series()
    for batch in train_iterator:

        ## For each iteration reset gradient
        optimizer.zero_grad()
        
        ## Run the input data through the network (forward pass)
        output = model(batch.text.to(device))
        
        ## Calculate the losses using the loss functioon
        loss = criterion(output,batch.label.type(torch.LongTensor).to(device))
        
        ## Softmax acc
        output = torch.exp(output)
        output = output.tolist()
        probs = probs.append(pd.Series(output),ignore_index=True)
        
        ## Perform backpropagation 
        loss.backward()
        
        ## Updates the weights
        optimizer.step()
        
        running_loss += loss.item()
        all_labels += batch.label.tolist()
        
    else:
        
        with torch.no_grad():
            running_loss_val = 0
            all_labels_val = []
            probs_val = pd.Series()
            for batch_val in valid_iterator:

                ## Run the input data through the network (forward pass)
                output_val = model(batch_val.text.to(device))

                ## Calculate the losses using the loss functioon
                loss_val = criterion(output_val,batch_val.label.type(torch.LongTensor).to(device))
                
                ## Softmax acc
                output_val = torch.exp(output_val)
                output_val = output_val.tolist()
                probs_val = probs_val.append(pd.Series(output_val),ignore_index=True)

                running_loss_val += loss_val.item()
                all_labels_val += batch_val.label.tolist()
            
        
        print("Epoch {} - Training acc: {:.6f} -Training loss: {:.6f} - Val acc: {:.6f} - Val loss: {:.6f} - Time: {:.4f}s".format(e+1, softmax_accuracy(probs,all_labels), running_loss/len(train_iterator), softmax_accuracy(probs_val,all_labels_val), running_loss_val/len(valid_iterator), (time.time()-timer)))

RuntimeError: CUDA out of memory. Tried to allocate 1.38 GiB (GPU 0; 8.00 GiB total capacity; 5.58 GiB already allocated; 402.77 MiB free; 5.60 GiB reserved in total by PyTorch)

### Testing

In [None]:
### For binary

with torch.no_grad():
    running_loss_test=0
    running_acc_test=0
    for batch in test_iterator:

        output_test = model(batch.text).squeeze(1)
        acc_test = binary_accuracy(output_test, batch.label)
        loss_test = criterion(output_test,batch.label)
        running_loss_test += loss_test.item()
        running_acc_test += acc_test.item()

    print('Test acc: ',running_acc_test/len(test_iterator))
    print('Test loss: ',running_loss_test/len(test_iterator))

In [None]:
### For SOFTMAX
with torch.no_grad():
    running_loss_test=0
    all_labels_test = []
    probs_test = pd.Series()
    for batch in test_iterator:

                ## Run the input data through the network (forward pass)
                output_test = model(batch.text)

                ## Calculate the losses using the loss functioon
                loss_test = criterion(output_test,batch.label.type(torch.LongTensor))
                
                ## Softmax acc
                output_test = torch.exp(output_test)
                output_test = output_test.tolist()
                probs_test = probs_test.append(pd.Series(output_test),ignore_index=True)

                running_loss_test += loss_test.item()
                all_labels_test += batch.label.tolist()

    print('Test acc: ', softmax_accuracy(probs_test,all_labels_test))
    print('Test loss: ',running_loss_test/len(test_iterator))