# Neural Network approach with RNN

## Libraries and data import

In [None]:
!pip install torchtext
!python -m spacy download en_core_web_sm

from google.colab import drive
import pandas as pd

import torch
from torchtext.legacy import data
from torchtext.legacy.data import Field, Dataset, Example

import random

import torch.optim as optim

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 8.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
#import of the processed datasets, stored in `dataset` under a short name per corpus
from_drive = True
dataset = dict()
path = "/content/gdrive/MyDrive/Magistrale/Stage/data"

if from_drive:
  #the CSV files live in Google Drive, which has to be mounted before reading
  drive.mount("/content/gdrive")
  data_dir = path
else:
  #the CSV files were uploaded to the Colab runtime: read them from /content
  #itself, not from a "content" folder below the (unmounted) Drive directory
  data_dir = "/content"

#one read per corpus; the file names follow the pattern preprocessed_<name>2.csv
for name in ("ace", "copd", "ppi"):
  dataset[name] = pd.read_csv(data_dir + "/preprocessed_" + name + "2.csv")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Preparing Data


In [None]:
#reproducibility: fixed PyTorch seed and deterministic cuDNN kernels
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

#FIELDS: describe how each column of the data is processed
#text is tokenized with the spaCy English model downloaded above
TEXT = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm')
#labels are stored as floats, as required by the loss used later on
LABEL = data.LabelField(dtype=torch.float)
#column name -> Field used to process that column
fields = dict(label=LABEL, text=TEXT)

#HYPERPARAMETERS

#dataset configuration
i = "ace"          #key of the dataset to use
clean_text = True  #True: use the "text_clean" column, False: use the "text" column

#bucket iterator configuration
BATCH_SIZE = 64
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

#neural network configuration
EMBEDDING_DIM = 100  #size of the dense word vectors (usually 50-250)
HIDDEN_DIM = 256     #size of the hidden states (usually 100-500)
OUTPUT_DIM = 1       #a single output value is enough for two classes

In [None]:
#removal and renaming of columns: keep only the label and the chosen text
#column, then give them the "label"/"text" names used as keys in `fields`
#NOTE: this cell rewrites dataset[i]; run it once per freshly loaded dataset
text_column = "text_clean" if clean_text else "text"
unwanted_columns = dataset[i].columns.difference(["Label", text_column])

#`columns=` replaces the positional axis argument of drop(labels, 1), which
#pandas deprecated in 1.3 and no longer accepts from 2.0 onwards
dataset[i] = (
    dataset[i]
    .drop(columns=unwanted_columns)
    .rename(columns={"Label": "label", text_column: "text"})
)

#final dataset with "label" and "text"

  


In [None]:
#sanity check: (rows, columns) of the selected dataset, then its first rows
print(dataset[i].shape)
dataset[i].head()

(2496, 2)


Unnamed: 0,label,text
0,0,distinct and combined vascular effects of ace ...
1,0,computerized surveillance of adverse drug reac...
2,0,glomerular size selective dysfunction in niddm...
3,0,total arterial compliance in ambulatory hypert...
4,0,racial differences in the outcome of left vent...


In [None]:
class DataFrameDataset(Dataset):
    """Class for using pandas DataFrames as a datasource"""
    def __init__(self, examples, fields, filter_pred=None):
        """
        Create a dataset from a pandas dataframe of examples and Fields
        Arguments:
            examples pd.DataFrame: DataFrame of examples
            fields {str: Field}: The Fields to use in this tuple. The
                string is a field name, and the Field is the associated field.
            filter_pred (callable or None): use only examples for which
                filter_pred(example) is true, or use all examples if None.
                Default is None
        """
        # One SeriesExample per DataFrame row
        self.examples = examples.apply(SeriesExample.fromSeries, args=(fields,), axis=1).tolist()
        if filter_pred is not None:
            # filter() returns a one-shot iterator in Python 3; materialise it
            # so the examples can be measured, indexed and iterated repeatedly
            self.examples = list(filter(filter_pred, self.examples))
        self.fields = dict(fields)
        # Unpack field tuples: a tuple key paired with a tuple of Fields is
        # expanded into one entry per name
        for n, f in list(self.fields.items()):
            if isinstance(n, tuple):
                self.fields.update(zip(n, f))
                del self.fields[n]

class SeriesExample(Example):
    """Example built from a single pandas Series (one DataFrame row)."""

    @classmethod
    def fromSeries(cls, data, fields):
        """Convert the Series to a dict and delegate to ``fromdict``."""
        record = data.to_dict()
        return cls.fromdict(record, fields)

    @classmethod
    def fromdict(cls, data, fields):
        """
        Build an example with one attribute per entry of ``fields``.

        Arguments:
            data (dict): maps column names to raw values.
            fields {str: Field or None}: for each name, the Field whose
                ``preprocess`` is applied to the raw value; with None the
                raw value is stored unchanged.
        Raises:
            ValueError: if a name in ``fields`` is missing from ``data``.
        """
        example = cls()

        for name in fields:
            if name not in data:
                raise ValueError("Specified key {} was not found in "
                "the input data".format(name))
            processor = fields[name]
            raw_value = data[name]
            if processor is None:
                setattr(example, name, raw_value)
            else:
                setattr(example, name, processor.preprocess(raw_value))
        return example

In [None]:
#wrap the selected pandas DataFrame in a DataFrameDataset, processing each column with its Field from `fields`
#note: despite its name, `df` is a DataFrameDataset and not a pandas DataFrame
df = DataFrameDataset(dataset[i], fields)

### Train, test and val split

In [None]:
#train and test split (80% / 20%, stratified on the label)
#seed Python's `random` module and hand its state to split() explicitly:
#random.seed() returns None, so passing its result as random_state only
#worked through the seeding side effect
random.seed(SEED)
train_data, test_data = df.split(split_ratio=0.8, stratified=True, strata_field='label', random_state=random.getstate())

print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

#checking an example: vars() accepts a single object, so the newline
#separator belongs to print() and not to vars()
print("\n", vars(train_data.examples[0]))

In [None]:
#train and val split (80% / 20% of the training portion, stratified on the label)
#NOTE: this cell overwrites `train_data` with a subset of itself, so running
#it more than once keeps shrinking the training set; re-run the train/test
#split first
#random.seed() returns None: seed first, then pass the generator state explicitly
random.seed(SEED)
train_data, valid_data = train_data.split(split_ratio=0.8, stratified=True, strata_field='label', random_state=random.getstate())

print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

In [None]:
#VOCABULARY: look-up table where every unique word in the data set has a corresponding _index_ (an integer).
#each _index_ is used to construct a _one-hot_ vector for each word.
#both vocabularies are built from the training split only
TEXT.build_vocab(train_data) #optional parameter: max_size = MAX_VOCAB_SIZE
LABEL.build_vocab(train_data)

Only build the vocabulary on the training set because when testing a machine learning system you do not want to look at the test set in any way. Also, do not include the validation set as you want it to reflect the test set as much as possible.

In [None]:
#number of entries in the text and label vocabularies built above
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 11152
Unique tokens in LABEL vocabulary: 2


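Besides the words found in the training set, the `TEXT` vocabulary contains the special tokens that torchtext adds by default: `<unk>`, used for words that are not in the vocabulary, and `<pad>`, used for padding.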

When feeding sentences into our model, you feed a _batch_ of them at a time, i.e. more than one at a time, and all sentences in the batch need to be the same size. Thus, to ensure each sentence in the batch is the same size, any sentence shorter than the longest one in the batch is padded.

In [None]:
#the 20 most frequent tokens counted while building the TEXT vocabulary, with their frequencies
print("Most common words:", TEXT.vocab.freqs.most_common(20))

[('the', 14847), ('of', 13867), ('and', 12581), ('in', 11351), ('with', 7105), ('to', 6979), ('patients', 5788), ('a', 5301), ('drug', 4476), ('0', 4106), ('effects', 3924), ('use', 3730), ('was', 3666), ('blood', 3643), ('therapeutic', 3491), ('were', 3414), ('therapy', 3130), ('1', 2926), ('treatment', 2891), ('or', 2817)]


In [None]:
#see the vocabulary directly using either stoi (string to int) or itos (int to string)
#here: the first 10 entries of the int-to-string list
print(TEXT.vocab.itos[:10])

#check the labels, ensuring 0 is for negative and 1 is for positive
#(the mapping is printed so that it can be verified by eye)
print(LABEL.vocab.stoi)

In [None]:
#creating the iterators

#ITERATORS: you iterate over iterators in the training/evaluation loop, 
#and they return a batch of examples (indexed and converted into tensors) at each iteration.

#BUCKET ITERATORS: special type of iterator that will return a batch of examples 
#where each example is of a similar length, minimizing the amount of padding per example.

#NOTE(review): no `sort_key` is passed here and DataFrameDataset does not define one
#in this notebook, so length-based bucketing may not actually take place - confirm
#against the torchtext BucketIterator documentation

#one iterator per split, all with the same batch size and placed on `device`
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device, 
    sort = False)

## Build the Model


All layers have their parameters initialized to random values, unless explicitly specified.

- **Embedding layer**: is used to transform the sparse one-hot vector into a dense embedding vector (dense as the dimensionality is a lot smaller and all the elements are real numbers). It is simply a single fully connected layer. 
As well as reducing the dimensionality of the input to the RNN, there is the theory that words with similar meaning are mapped close together in this dense vector space.

- **RNN**: takes in the dense vector and the previous hidden state $h_{t-1}$, which it uses to calculate the next hidden state, $h_t$.

- **Linear layer**: takes the final hidden state and feeds it through a fully connected layer, $f(h_T)$, transforming it to the correct output dimension.

In [1]:
import torch.nn as nn

class RNN(nn.Module):
    """
    Sentence classifier: embedding layer -> nn.RNN -> linear layer.

    Arguments:
        input_dim: number of rows of the embedding table (vocabulary size).
        embedding_dim: size of the dense word vectors.
        hidden_dim: size of the RNN hidden state.
        output_dim: size of the prediction produced for each sentence.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        #token index -> dense vector
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        #recurrent layer reading the sentence one time step at a time
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim)
        #maps the final hidden state to the prediction
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text):
        """
        Compute one prediction per sentence of the batch.

        text = [sent len, batch size], a tensor of token indexes
        returns a tensor of size [batch size, output dim]
        """
        #token_vectors = [sent len, batch size, emb dim]
        token_vectors = self.embedding(text)

        #all_states = [sent len, batch size, hid dim]: hidden state of every time step
        #last_state = [1, batch size, hid dim]: hidden state of the last time step
        all_states, last_state = self.rnn(token_vectors)

        #drop the leading dimension of size 1 -> [batch size, hid dim]
        last_state = last_state.squeeze(0)

        #sanity check: the last slice of the per-step states must be the final hidden state
        assert torch.equal(all_states[-1, :, :], last_state)

        #the final hidden state is fed through the linear layer to produce the prediction
        return self.fc(last_state)

The tensor `text` should have another dimension due to the one-hot vectors, however PyTorch conveniently stores a one-hot vector as its index value, i.e. the tensor representing a sentence is just a tensor of the indexes for each token in that sentence. The act of converting a list of tokens into a list of indexes is commonly called *numericalizing*.

In [None]:
#create an instance of the RNN class using the hyperparameters defined in the configuration cell
INPUT_DIM = len(TEXT.vocab) #input dimension: dimension of the one-hot vectors (equal to the vocabulary size)

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

- **Input dimension**: dimension of the one-hot vectors (equal to the vocabulary size).
- **Embedding dimension**: size of the dense word vectors. This is usually around 50-250 dimensions, but depends on the size of the vocabulary.
- **Hidden dimension**: size of the hidden states. This is usually around 100-500 dimensions, but also depends on factors such as on the vocabulary size, the size of the dense vectors and the complexity of the task.
- **Output dimension**: usually the number of classes. However in the case of only 2 classes the output value is between 0 and 1 and thus can be 1-dimensional, i.e. a single scalar real number.

In [None]:
def count_parameters(model):
  """
  Count the trainable parameters of a model.

  Only parameters with `requires_grad` set contribute; each one adds its
  number of elements. Useful to compare the size of different models.
  """
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      total += param.numel()
  return total

#report the size of the instantiated model, with thousands separators
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,207,105 trainable parameters


## Train the Model

In [None]:
#OPTIMIZER: algorithm we use to update the parameters of the module. 
#chosen optimizer: stochastic gradient descent (SGD) with a learning rate of 1e-3
#NOTE(review): the optimizer is created before the model is moved to `device`;
#presumably fine because .to() keeps the same parameter objects - confirm
optimizer = optim.SGD(model.parameters(), lr=1e-3)

#CRITERION: loss function
#chosen loss function: binary cross entropy with logits
criterion = nn.BCEWithLogitsLoss()

#place the model and the criterion on the GPU (if we have one)
model = model.to(device)
criterion = criterion.to(device)

The model currently outputs an unbounded real number. As the labels are either 0 or 1, you want to restrict the predictions to a number between 0 and 1. This can be done using the _sigmoid_ (logistic) function.

It is then possible to use the bounded scalar to calculate the loss using binary cross entropy.

The `BCEWithLogitsLoss` criterion carries out both the sigmoid and the binary cross entropy steps.

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (8 right out of 10 gives 0.8, not 8).

    preds: raw model outputs, before the sigmoid
    y: target labels (0 or 1)
    returns a tensor holding the accuracy of the batch
    """
    #squash the raw outputs into [0, 1], then round to the nearest integer to get hard 0/1 predictions
    probabilities = torch.sigmoid(preds)
    hard_preds = probabilities.round()

    #1.0 where the prediction matches the label, 0.0 elsewhere (float, so it can be averaged)
    hits = hard_preds.eq(y).to(torch.float)

    #average over the batch
    return hits.sum() / len(hits)

The loss and accuracy are accumulated across the epoch; the `.item()` method is used to extract a scalar from a tensor which only contains a single value.

Finally, we return the loss and accuracy, averaged across the epoch. The `len` of an iterator is the number of batches in the iterator.

You may recall when initializing the `LABEL` field, we set `dtype=torch.float`. This is because TorchText sets tensors to be `LongTensor`s by default, however our criterion expects both inputs to be `FloatTensor`s. Setting the `dtype` to be `torch.float` did this for us. The alternative method of doing this would be to do the conversion inside the `train` function by passing `batch.label.float()` instead of `batch.label` to the criterion.

In [1]:
def train(model, iterator, optimizer, criterion):
    """
    Train the model for one epoch, one batch at a time.

    model: module called on `batch.text`
    iterator: yields batches exposing `text` and `label`
    optimizer: updates the parameters after every batch
    criterion: loss computed from the predictions and `batch.label`
    returns (loss, accuracy), each averaged over the batches of the iterator
    """
    #"training mode" turns on dropout and batch normalization; this model
    #has neither, but it is good practice to set it anyway
    model.train()

    loss_total = 0
    acc_total = 0

    for batch in iterator:
        #gradients accumulate across backward passes, so reset them first
        optimizer.zero_grad()

        #predictions come out as [batch size, 1]; the criterion expects
        #[batch size], so the dimension of size 1 is removed
        logits = model(batch.text).squeeze(1)

        #loss and accuracy of this batch
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        #compute the gradient of every parameter, then apply the update
        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

Each parameter in a model has a `grad` attribute which stores the gradient calculated by `loss.backward()`. PyTorch does not automatically remove (or "zero") the gradients calculated from the last gradient calculation, so they must be manually zeroed.

Note, you do not need to do `model.forward(batch.text)`, simply calling the model works.

`evaluate` is similar to `train`, with a few modifications as you don't want to update the parameters when evaluating.

`model.eval()` puts the model in "evaluation mode", this turns off _dropout_ and _batch normalization_. Again, we are not using them in this model, but it is good practice to include them.

No gradients are calculated on PyTorch operations inside the `with no_grad()` block. This causes less memory to be used and speeds up computation.

The rest of the function is the same as `train`, with the removal of `optimizer.zero_grad()`, `loss.backward()` and `optimizer.step()`, as we do not update the model's parameters when evaluating.

In [None]:
def evaluate(model, iterator, criterion):
    """
    Evaluate the model on every batch of the iterator without updating it.

    model: module called on `batch.text`
    iterator: yields batches exposing `text` and `label`
    criterion: loss computed from the predictions and `batch.label`
    returns (loss, accuracy), each averaged over the batches of the iterator
    """
    #"evaluation mode": turns off dropout and batch normalization
    model.eval()

    loss_total, acc_total = 0, 0

    #no gradients are tracked inside this block
    with torch.no_grad():
        for batch in iterator:
            #[batch size, 1] -> [batch size]
            logits = model(batch.text).squeeze(1)

            loss_total += criterion(logits, batch.label).item()
            acc_total += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

We'll also create a function to tell us how long an epoch takes to compare training times between models.

In [None]:
import time

def epoch_time(start_time, end_time):
    """
    Split the time elapsed between two timestamps (in seconds) into
    whole minutes and whole remaining seconds.

    returns (minutes, seconds), both integers truncated towards zero
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

We then train the model through multiple epochs, an epoch being a complete pass through all examples in the training and validation sets.

At each epoch, if the validation loss is the best we have seen so far, we'll save the parameters of the model and then after training has finished we'll use that model on the test set.

In [None]:
#number of complete passes over the training and validation iterators
N_EPOCHS = 5

#lowest validation loss seen so far; starts at infinity so the first epoch always counts as an improvement
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    #one training pass followed by one evaluation pass on the validation set
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    #checkpoint the parameters whenever the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    #per-epoch report: duration, then loss and accuracy on both sets
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 2s
	Train Loss: 0.520 | Train Acc: 97.31%
	 Val. Loss: 0.433 |  Val. Acc: 92.66%
Epoch: 02 | Epoch Time: 0m 2s
	Train Loss: 0.332 | Train Acc: 97.81%
	 Val. Loss: 0.333 |  Val. Acc: 92.66%
Epoch: 03 | Epoch Time: 0m 1s
	Train Loss: 0.243 | Train Acc: 97.68%
	 Val. Loss: 0.286 |  Val. Acc: 92.66%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.195 | Train Acc: 97.69%
	 Val. Loss: 0.261 |  Val. Acc: 92.89%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.166 | Train Acc: 97.87%
	 Val. Loss: 0.248 |  Val. Acc: 92.89%


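You may have noticed that, although the loss decreases, the accuracy barely changes from one epoch to the next. This may indicate that the model is mostly predicting the majority class, in which case accuracy alone is not an informative metric here. This is due to several issues with the model which we'll improve in the next notebook.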

Finally, the metric we actually care about, the test loss and accuracy, which we get from our parameters that gave us the best validation loss.

In [None]:
#restore the parameters saved by the training loop (the ones with the best validation loss)
model.load_state_dict(torch.load('tut1-model.pt'))

#loss and accuracy on the held-out test set
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.159 | Test Acc: 97.70%
