# Neural Network approach with RNN

In [1]:
!python --version

Python 3.7.13


## Libraries and data import

In [2]:
!pip install -U torch==1.10.0
!pip install -U torchtext==0.11.0
!python -m spacy download en_core_web_sm

from google.colab import drive
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import torch
torch.cuda.empty_cache()
import torchtext
from torchtext.legacy import data
from torchtext.legacy.data import Field, Dataset, Example

import random

import torch.nn as nn
import torch.optim as optim

from sklearn.metrics import recall_score, precision_score, fbeta_score

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 14.5 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [3]:
#import of the processed dataset and the columns' names
from_drive = True
dataset = dict()
path = "/content/gdrive/MyDrive/Magistrale/Stage/data"

if from_drive == True: 
  drive.mount("/content/gdrive")
  dataset["ace"] = pd.read_csv(path + "/preprocessed_ace2.csv")
  dataset["copd"] = pd.read_csv(path + "/preprocessed_copd2.csv")
  dataset["ppi"] = pd.read_csv(path + "/preprocessed_ppi2.csv")
else: 
  dataset["ace"] = pd.read_csv(path + "/content/preprocessed_ace2.csv")
  dataset["copd"] = pd.read_csv(path + "/content/preprocessed_copd2.csv")
  dataset["ppi"] = pd.read_csv(path + "/content/preprocessed_ppi2.csv")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Preparing Data


In [4]:
#random seed for reproducibility
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

#FIELD: how the data should be processed
TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm',
                  include_lengths = True) #in order to know how long the actual sequences are
                  #this will cause `batch.text` to now be a tuple with the first element being the sentence 
                  #(a numericalized tensor that has been padded) and the second element being the actual lengths of the sentences

LABEL = data.LabelField(dtype = torch.float)
fields = {'label' : LABEL, "text" : TEXT}

#HYPERPARAMETERS

#dataset configuration
i = "ace" #which dataset
clean_text = True #otherwise, it will be used "text"

#bucket iterator configuration
BATCH_SIZE = 128 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#neural network configuration
EMBEDDING_DIM = 300 #size of the dense word vectors (usually 50-250)
#in this case, must be equal to that of the pre-trained GloVe vectors
HIDDEN_DIM = 256 #size of the hidden states (usually 100-500)
OUTPUT_DIM = 1 #number of classes
N_LAYERS = 5
BIDIRECTIONAL = True
DROPOUT = 0.5


To ensure the pre-trained vectors can be loaded into the model, the `EMBEDDING_DIM` must be equal to that of the pre-trained GloVe vectors loaded earlier.

In [5]:
#removal and renaming of columns

if clean_text == True: 
  dataset[i].drop(dataset[i].columns.difference(['Label', "text_clean"]), 1, inplace=True)
  dataset[i].rename(columns={'text_clean': 'text'}, inplace=True)

else: 
  dataset[i].drop(dataset[i].columns.difference(['Label', "text"]), 1, inplace=True)

dataset[i].rename(columns={'Label': 'label'}, inplace=True)

#final dataset with "label" and "text"

  after removing the cwd from sys.path.


In [6]:
print(dataset[i].shape)
dataset[i].head()

(2496, 2)


Unnamed: 0,label,text
0,0,distinct and combined vascular effects of ace ...
1,0,computerized surveillance of adverse drug reac...
2,0,glomerular size selective dysfunction in niddm...
3,0,total arterial compliance in ambulatory hypert...
4,0,racial differences in the outcome of left vent...


### Train, test and val split

In [7]:
train, test = train_test_split(dataset[i], train_size=0.5, random_state=random.seed(SEED), shuffle=True, stratify=dataset[i]['label'])

In [8]:
#OVERSAMPLING
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
# instantiating the random over sampler 
ros = RandomOverSampler(sampling_strategy=1, random_state=SEED)
# resampling X, y
X_ros, y_ros = ros.fit_resample(train["text"].values.reshape(-1, 1), train["label"].values)
# new class distribution 
print(Counter(y_ros))

Counter({0: 1229, 1: 1229})


In [9]:
oversampled_train = pd.DataFrame(X_ros, columns=["text"])
oversampled_train["label"] = y_ros

In [10]:
class DataFrameDataset(Dataset):
  """Class for using pandas DataFrames as a datasource"""
  def __init__(self, examples, fields, filter_pred=None):
    """
    Create a dataset from a pandas dataframe of examples and Fields
    Arguments:
      examples pd.DataFrame: DataFrame of examples
      fields {str: Field}: The Fields to use in this tuple. The
        string is a field name, and the Field is the associated field.
      filter_pred (callable or None): use only exanples for which
        filter_pred(example) is true, or use all examples if None.
        Default is None
    """
    self.examples = examples.apply(SeriesExample.fromSeries, args=(fields,), axis=1).tolist()
    if filter_pred is not None:
      self.examples = filter(filter_pred, self.examples)
    self.fields = dict(fields)
    # Unpack field tuples
    for n, f in list(self.fields.items()):
      if isinstance(n, tuple):
        self.fields.update(zip(n, f))
        del self.fields[n]

class SeriesExample(Example):
  """Class to convert a pandas Series to an Example"""

  @classmethod
  def fromSeries(cls, data, fields):
    return cls.fromdict(data.to_dict(), fields)

  @classmethod
  def fromdict(cls, data, fields):
    ex = cls()
      
    for key, field in fields.items():
      if key not in data:
        raise ValueError("Specified key {} was not found in "
        "the input data".format(key))
      if field is not None:
        setattr(ex, key, field.preprocess(data[key]))
      else:
        setattr(ex, key, data[key])
    return ex

In [11]:
# #pytorch dataset 
# df = DataFrameDataset(dataset[i], fields)

In [12]:
#pytorch dataset 
train_data = DataFrameDataset(oversampled_train, fields)
df = DataFrameDataset(test, fields)

In [13]:
#test and val split
test_data, valid_data = df.split(split_ratio=0.8, stratified=True, strata_field='label', random_state = random.seed(SEED))

print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')
print(f'Number of validation examples: {len(valid_data)}')

#checking an example
print("\n", vars(train_data.examples[0]))

Number of training examples: 2458
Number of testing examples: 998
Number of validation examples: 250

 {'label': 0, 'text': ['increased', 'urinary', 'nag', 'excretion', 'in', 'hypertensives', 'can', 'decline', 'with', 'antihypertensive', 'treatment', 'acetylglucosamine', 'urine', 'adult', 'aged', 'antihypertensive', 'agents', 'therapeutic', 'use', 'biomarkers', 'urine', 'creatinine', 'blood', 'diastole', 'female', 'fosinopril', 'therapeutic', 'use', 'glomerular', 'filtration', 'rate', 'humans', 'hypertension', 'drug', 'therapy', 'physiopathology', 'urine', 'male', 'middle', 'aged', 'potassium', 'blood', 'systole']}


In [14]:
#VOCABULARY: look up table where every unique word in your data set has a corresponding _index_ (an integer).
#each _index_ is used to construct a _one-hot_ vector for each word.
TEXT.build_vocab(train_data,
                 vectors = "glove.42B.300d", #use of pre-trained word embeddings
                 unk_init = torch.Tensor.normal_) #initialize words in pre-trained embeddings randomly via gaussian distribution

LABEL.build_vocab(train_data)

Only build the vocabulary on the training set because when testing a machine learning system you do not want to look at the test set in any way. Also, do not include the validation set as you want it to reflect the test set as much as possible.

In [15]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 9993
Unique tokens in LABEL vocabulary: 2


In [16]:
poscount = 0
for elem in range(len(train_data)):
  if train_data[elem].label == 1:
    poscount += 1 

In [17]:
poscount

1229

In [18]:
train_data[0].label

0

There is the addition of the `<pad>` token.

When feeding sentences into our model, you feed a _batch_ of them at a time, i.e. more than one at a time, and all sentences in the batch need to be the same size. Thus, to ensure each sentence in the batch is the same size, any shorter than the longest within the batch are padded.

In [19]:
print("Most common words:", TEXT.vocab.freqs.most_common(20))

Most common words: [('the', 27516), ('of', 25642), ('in', 19899), ('and', 18786), ('with', 12830), ('patients', 11968), ('to', 10611), ('a', 8964), ('0', 8955), ('was', 8146), ('effects', 6721), ('drug', 6634), ('were', 6046), ('use', 5934), ('p', 5821), ('treatment', 5726), ('captopril', 5630), ('1', 5617), ('therapeutic', 5462), ('infarction', 5184)]


In [20]:
#see the vocabulary directly using either stoi (string to int) or itos (int to string)
print(TEXT.vocab.itos[:10])

#check the labels, ensuring 0 is for negative and 1 is for positive
# LABEL.vocab.stoi = {1:1, 0:0}
print(LABEL.vocab.stoi)

['<unk>', '<pad>', 'the', 'of', 'in', 'and', 'with', 'patients', 'to', 'a']
defaultdict(None, {0: 0, 1: 1})


In [21]:
#creating the iterators

#ITERATORS: you iterate over iterators in the training/evaluation loop, 
#and they return a batch of examples (indexed and converted into tensors) at each iteration.

#BUCKET ITERATORS: special type of iterator that will return a batch of examples 
#where each example is of a similar length, minimizing the amount of padding per example.

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
  (train_data, valid_data, test_data), 
  batch_size = BATCH_SIZE,
  device = device, 
  sort_key = lambda x: len(x.text),
  sort_within_batch = True) #False, ##for packed padded sequences all of the tensors within a batch need to be sorted by their lengths

## Build the Model


## Build the Model

### Different RNN Architecture

Long Short-Term Memory (LSTM) is a type of RNN architecture. 

Standard RNNs suffer from the vanishing gradient problem. LSTMs overcome this by having an extra recurrent state called a _cell_, $c$ - which can be thought of as the "memory" of the LSTM - and using multiple _gates_ which control the flow of information into and out of the memory. We can simply think of the LSTM as a function of $x_t$, $h_t$ and $c_t$, instead of just $x_t$ and $h_t$.

$$(h_t, c_t) = \text{LSTM}(x_t, h_t, c_t)$$

Thus, the model using an LSTM looks something like (with the embedding layers omitted):

![](https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/assets/sentiment2.png?raw=1)

The initial cell state, $c_0$, like the initial hidden state is initialized to a tensor of all zeros. The final prediction is still, however, only made using the final hidden state, not the final cell state, i.e. $\hat{y}=f(h_T)$.

### Bidirectional RNN

The concept behind a bidirectional RNN is simple. As well as having an RNN processing the words in the sentence from the first to the last (a forward RNN), we have a second RNN processing the words in the sentence from the **last to the first** (a backward RNN). At time step $t$, the forward RNN is processing word $x_t$, and the backward RNN is processing word $x_{T-t+1}$. 

In PyTorch, the hidden state (and cell state) tensors returned by the forward and backward RNNs are stacked on top of each other in a single tensor. 

We make our sentiment prediction using a concatenation of the last hidden state from the forward RNN (obtained from final word of the sentence), $h_T^\rightarrow$, and the last hidden state from the backward RNN (obtained from the first word of the sentence), $h_T^\leftarrow$, i.e. $\hat{y}=f(h_T^\rightarrow, h_T^\leftarrow)$   

The image below shows a bi-directional RNN, with the forward RNN in orange, the backward RNN in green and the linear layer in silver.  

![](https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/assets/sentiment3.png?raw=1)

### Multi-layer RNN

Multi-layer RNNs (also called *deep RNNs*) are another simple concept. The idea is that we add additional RNNs on top of the initial standard RNN, where each RNN added is another *layer*. The hidden state output by the first (bottom) RNN at time-step $t$ will be the input to the RNN above it at time step $t$. The prediction is then made from the final hidden state of the final (highest) layer.

The image below shows a multi-layer unidirectional RNN, where the layer number is given as a superscript. Also note that each layer needs their own initial hidden state, $h_0^L$.

![](https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/assets/sentiment4.png?raw=1)

### Regularization

Although we've added improvements to our model, each one adds additional parameters. Without going into overfitting into too much detail, the more parameters you have in in your model, the higher the probability that your model will overfit (memorize the training data, causing a low training error but high validation/testing error, i.e. poor generalization to new, unseen examples). To combat this, we use regularization. More specifically, we use a method of regularization called *dropout*. Dropout works by randomly *dropping out* (setting to 0) neurons in a layer during a forward pass. The probability that each neuron is dropped out is set by a hyperparameter and each neuron with dropout applied is considered indepenently. One theory about why dropout works is that a model with parameters dropped out can be seen as a "weaker" (less parameters) model. The predictions from all these "weaker" models (one for each forward pass) get averaged together withinin the parameters of the model. Thus, your one model can be thought of as an ensemble of weaker models, none of which are over-parameterized and thus should not overfit.

All layers have their parameters initialized to random values, unless explicitly specified.

- **Embedding layer**: is used to transform the sparse one-hot vector into a dense embedding vector (dense as the dimensionality is a lot smaller and all the elements are real numbers). It is simply a single fully connected layer. 
As well as reducing the dimensionality of the input to the RNN, there is the theory that words with similar meaning are mapped close together in this dense vector space.

- **RNN**: takes in the dense vector and the previous hidden state $h_{t-1}$, which it uses to calculate the next hidden state, $h_t$.

- **Linear layer**: takes the final hidden state and feeds it through a fully connected layer, $f(h_T)$, transforming it to the correct output dimension.

In [22]:
class RNN(nn.Module): #the RNN class is a sub-class of nn.Module
  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx): #define the layers of the module 
      
    super().__init__()
    
    self.embedding = nn.Embedding(vocab_size, 
                                  embedding_dim, 
                                  padding_idx = pad_idx) #index of the pad token is passed as an argument to the embedding layer
    
    self.rnn = nn.LSTM(embedding_dim, #LSTM layer
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout) #adds dropout on the connections between hidden states in one layer to hidden states in the next layer
    #implementing bidirectionality and adding additional layers are done 
    #by passing values for the num_layers and bidirectional arguments for the RNN/LSTM
    
    self.fc = nn.Linear(hidden_dim * 2, output_dim) #linear layer
    #as the final hidden state of the LSTM has both a forward and a backward component,
    #which will be concatenated together, the size of the input to the nn.Linear layer 
    #is twice that of the hidden dimension size

    self.dropout = nn.Dropout(dropout) #dropout layer
    #the argument is the probability of dropping out each neuron
    #the dropout layer is used within the forward method after each layer we want to apply dropout to

      
  def forward(self, text, text_lengths): #called when feeding examples into the model
  #text_lengths is needed as we are passing the lengths of the sentences 
  #to be able to use packed padded sequences

    #text = [sent len, batch size] (tensor)
    #text is a batch of senteces, each having each word converted into a one-hot vector 

    #the input batch is passed through the embedding layer to get `embedded`, 
    #which gives us a dense vector representation of our sentences.
    embedded = self.dropout(self.embedding(text)) 
    
    #embedded = [sent len, batch size, emb dim] (tensor)
    
    #PACK SEQUENCE: before we pass our embeddings to the RNN, we need to pack them
    #this will cause our RNN to only process the non-padded elements of a sequence
    #lengths need to be on CPU
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
    
    #the RNN will then return packed_output (a packed sequence) as well as the hidden and cell states (both of which are tensors)
    packed_output, (hidden, cell) = self.rnn(packed_embedded) 
    #so LSTM returns the output and a tuple of the final hidden state and the final cell state
    
    #UNPACK SEQUENCE: unpack of the output sequence to transform it from a packed sequence to a tensor
    #the elements of output from padding tokens will be zero tensors (tensors where every element is zero)
    output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
    
    #output = [sent len, batch size, hid dim * num directions] (tensor)
    #output is the concatenation of the hidden state from every time step
    #output over padding tokens are zero tensors

    #hidden = [num layers * num directions, batch size, hid dim]
    #cell = [num layers * num directions, batch size, hid dim]
    
    #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
    #and apply dropout
    
    #the layers are ordered: [forward_layer_0, backward_layer_0, forward_layer_1, backward_layer 1, ..., forward_layer_n, backward_layer n]
    #as we want the final layer forward and backward hidden states, 
    #we get the last two hidden layers, hidden[-2,:,:] and hidden[-1,:,:], 
    #and concatenate them together before passing them to the linear layer (after applying dropout)
    hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
            
    #hidden = [batch size, hid dim * num directions]
    
    #the last hidden state, hidden, is fed through the linear layer to produce a prediction
    return self.fc(hidden)

The tensor `text` should have another dimension due to the one-hot vectors, however PyTorch conveniently stores a one-hot vector as it's index value, i.e. the tensor representing a sentence is just a tensor of the indexes for each token in that sentence. The act of converting a list of tokens into a list of indexes is commonly called *numericalizing*.

Notes: 
- Never use dropout on the input or output layers (`text` or `fc` in this case) because you only ever want to use dropout on intermediate layers.
- Without packed padded sequences, `hidden` and `cell` are tensors from the last element in the sequence, which will most probably be a pad token, however when using packed padded sequences they are both from the last non-padded element in the sequence.
- The `lengths` argument of `packed_padded_sequence` must be a CPU tensor so we explicitly make it one by using `.to('cpu')`.
- Usually, there is only the need to unpack output if you are going to use it later on in the model. Although in this case the unpack output is not used, it has been unpacked anyway.

In [23]:
#create an instance of the RNN class
INPUT_DIM = len(TEXT.vocab) #input dimension: dimension of the one-hot vectors (equal to the vocabulary size)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] #get the pad token index from the vocabulary
#this allows to get the actual string representing the pad token from the field's pad_token attribute, which is <pad> by default

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, 
            BIDIRECTIONAL, DROPOUT, PAD_IDX)

- **Input dimension**: dimension of the one-hot vectors (equal to the vocabulary size).
- **Embedding dimension**: size of the dense word vectors. This is usually around 50-250 dimensions, but depends on the size of the vocabulary.
- **Hidden dimension**: size of the hidden states. This is usually around 100-500 dimensions, but also depends on factors such as on the vocabulary size, the size of the dense vectors and the complexity of the task.
- **Output dimension**: usually the number of classes. However in the case of only 2 classes the output value is between 0 and 1 and thus can be 1-dimensional, i.e. a single scalar real number.

In [24]:
def count_parameters(model):
  """ 
  Function that tells how many trainable parameters the model has 
  so we can compare the number of parameters across different models.
  """
  return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 10,449,037 trainable parameters


In [25]:
#retrieve the embeddings from the field's vocab
pretrained_embeddings = TEXT.vocab.vectors

#check the embeddings are the correct size, [vocab size, embedding dim]
print(pretrained_embeddings.shape)

torch.Size([9993, 300])


In [26]:
#replace the initial weights of the embedding layer with the pre-trained embeddings
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-1.1172e-01, -4.9659e-01,  1.6307e-01,  ..., -1.4447e+00,
          8.4021e-01, -8.6684e-01],
        [ 1.0322e-01, -1.6268e+00,  5.7285e-01,  ...,  3.1804e-01,
         -1.6259e-01, -4.1701e-02],
        [-2.0838e-01, -1.4932e-01, -1.7528e-02,  ..., -5.4066e-01,
          2.1199e-01, -9.4357e-03],
        ...,
        [-6.1891e-01, -1.1732e+00, -1.2075e+00,  ...,  2.8668e+00,
         -6.7785e-01,  4.0222e-02],
        [ 2.2345e-01, -3.2344e-01, -2.0852e-03,  ...,  6.9014e-01,
         -6.9764e-02,  1.0198e-01],
        [ 4.3500e-01, -4.8822e-01, -1.4933e-01,  ..., -3.4345e-01,
         -6.2270e-03,  2.3741e-01]])

Replacing the initial weights of the embedding layer with the pre-trained embeddings should always be done on the `weight.data` and not the `weight`.

In [27]:
#initialization of <unk> and <pad> to all zeros

#get the index of the <unk> tokens
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

#initialize <unk> and <pad> tokens to all zeros
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

#check that the first two rows of the embedding weights matrix have been set to zeros
print(model.embedding.weight.data)

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-2.0838e-01, -1.4932e-01, -1.7528e-02,  ..., -5.4066e-01,
          2.1199e-01, -9.4357e-03],
        ...,
        [-6.1891e-01, -1.1732e+00, -1.2075e+00,  ...,  2.8668e+00,
         -6.7785e-01,  4.0222e-02],
        [ 2.2345e-01, -3.2344e-01, -2.0852e-03,  ...,  6.9014e-01,
         -6.9764e-02,  1.0198e-01],
        [ 4.3500e-01, -4.8822e-01, -1.4933e-01,  ..., -3.4345e-01,
         -6.2270e-03,  2.3741e-01]])


As our `<unk>` and `<pad>` token aren't in the pre-trained vocabulary they have been initialized using `unk_init` (an $\mathcal{N}(0,1)$ distribution) when building our vocab. It is preferable to initialize them both to all zeros to explicitly tell our model that, initially, they are irrelevant for determining relevance. 

We do this by manually setting their row in the embedding weights matrix to zeros. We get their row by finding the index of the tokens, which we have already done for the padding index.

Like initializing the embeddings, this should be done on the `weight.data` and not the `weight`.

As we passed the index of the pad token to the `padding_idx` of the embedding layer it will remain zeros throughout training, however the `<unk>` token embedding will be learned.

## Train the Model

**Optimizers** 
- **SGD**: updates all parameters with the same learning rate and choosing this learning rate can be tricky. 
- **Adam**: adapts the learning rate for each parameter, giving parameters that are updated more frequently lower learning rates and parameters that are updated infrequently higher learning rates. 
You do not have to provide an initial learning rate for Adam as PyTorch specifies a sensibile default initial learning rate.

In [28]:
#OPTIMIZER: algorithm we use to update the parameters of the module. 
#chosen optimizer: Adam
optimizer = optim.Adam(model.parameters())

#CRITERION: loss function
#chosen loss function: binary cross entropy with logits
weights = [1 - (len(dataset[i][dataset[i]["label"]==0]) / dataset[i].shape[0])]
weight = torch.FloatTensor(weights)
criterion = nn.BCEWithLogitsLoss(weight=weight)
#criterion = nn.BCEWithLogitsLoss()

#place the model and the criterion on the GPU (if we have one)
model = model.to(device)
criterion = criterion.to(device)

The model currently outputs an unbound real number. As the labels are either 0 or 1, you want to restrict the predictions to a number between 0 and 1. This can be done using the _sigmoid_ or _logit_ functions. 

Is it possible to use the bound scalar to calculate the loss using binary cross entropy. 

The `BCEWithLogitsLoss` criterion carries out both the sigmoid and the binary cross entropy steps.

In [29]:
def binary_metrics(preds, y):
  
  """
  Returns metrics per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
  """

  #feed the prediction through a sigmoid layer, squashing the values in [0, 1], then round them to the nearest integer
  rounded_preds = torch.round(torch.sigmoid(preds)) 

  #calculate how many rounded predictions equal the actual labels and average it across the batch
  correct = (rounded_preds == y).float() #convert into float for division 
  acc = correct.sum() / len(correct)

  y_true = y.detach().numpy()
  y_pred = rounded_preds.detach().numpy()

  recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
  precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
  f2 = fbeta_score(y_true, y_pred, average='binary', beta=2, zero_division=0)
  f3 = fbeta_score(y_true, y_pred, average='binary', beta=3, zero_division=0)

  #print(y)
  #sumpos = np.sum(rounded_preds.detach().numpy() == 1)

  return acc, recall, precision, f2, f3

In [30]:
def train(model, iterator, optimizer, criterion): #iterates over all examples, one batch at a time
  
  epoch_loss = 0
  epoch_acc = 0
  epoch_recall = 0
  epoch_precision = 0
  epoch_f2 = 0
  epoch_f3 = 0
  
  model.train() #put the model in "training mode", which turns on dropout and batch normalization
  
  for batch in iterator: #for each batch
    
    optimizer.zero_grad() #we first zero the gradients

    #separete batch.text before passing it to the model
    #batch.text is a tuple with the first element being the numericalized tensor 
    #and the second element being the actual lengths of each sequence
    text, text_lengths = batch.text

    #feed the batch of sentences,text, and their lenghts, text_lenghts into the model 
    predictions = model(text, text_lengths).squeeze(1)
    #squeeze is needed as the predictions are initially size [batch size, 1]
    #and we need to remove the dimension of size 1, as PyTorch expects the 
    #predictions input to our criterion function to be of size [batch size]    
    
    #loss and accuracy are then calculated using the predictions and the labels, batch.label, 
    #with the loss being averaged over all examples in the batch    
    
    #computation of loss 
    loss = criterion(predictions, batch.label)
    #criterion expects both input to be FloatTensors
    #that's wht in the label field we set dtype=torch.float 
    
    #computation of accuracy
    acc, recall, precision, f2, f3 = binary_metrics(predictions.cpu(), batch.label.cpu())

    loss.backward() #calculate the gradient of each parameter
    
    optimizer.step() #update the parameters using the gradients and optimizer algorithm

    #the loss and accuracy are accumulated across the epoch, 
    #the .item() method is used to extract a scalar from a tensor which only contains a single value
    epoch_loss += loss.item()
    epoch_acc += acc.item()
    epoch_recall += recall.item()
    epoch_precision += precision.item()
    epoch_f2 += f2.item()
    epoch_f3 += f3.item()

  #return the loss and accuracy, averaged across the epochs
  #the len of an iterator is the number of batches in the iterator
    
  return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_recall / len(iterator), epoch_precision / len(iterator), epoch_f2 / len(iterator), epoch_f3 / len(iterator)

**Train method notes**: 

- Each parameter in a model has a `grad` attribute which stores the gradient calculated by the `criterion`. PyTorch does not automatically remove (or "zero") the gradients calculated from the last gradient calculation, so they must be manually zeroed.

- You do not need to do `model.forward(batch.text)`, simply calling the model works.

- When initializing the `LABEL` field, we set `dtype=torch.float`. This is because TorchText sets tensors to be `LongTensor`s by default, however our criterion expects both inputs to be `FloatTensor`s. 
The alternative method of doing this would be to do the conversion inside the `train` function by passing `batch.label.float()` instad of `batch.label` to the criterion. 

In [31]:
def evaluate(model, iterator, criterion): #similar to train, without the the update of the parameters
  
  epoch_loss = 0
  epoch_acc = 0
  epoch_recall = 0
  epoch_precision = 0
  epoch_f2 = 0
  epoch_f3 = 0
  
  model.eval() #puts the model in "evaluation mode", which turns off dropout and batch normalization
  
  with torch.no_grad(): #in order to not calculate gradients

    for batch in iterator:

      #separate batch.text
      text, text_lengths = batch.text

      predictions = model(text, text_lengths).squeeze(1)
      
      loss = criterion(predictions, batch.label)
      
      acc, recall, precision, f2, f3 = binary_metrics(predictions.cpu(), batch.label.cpu())
      
      epoch_loss += loss.item()
      epoch_acc += acc.item()
      epoch_recall += recall.item()
      epoch_precision += precision.item()
      epoch_f2 += f2.item()
      epoch_f3 += f3.item()
      
  return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_recall / len(iterator), epoch_precision / len(iterator), epoch_f2 / len(iterator), epoch_f3 / len(iterator)

No gradients are calculated on PyTorch operations inside the `with no_grad()` block. This causes less memory to be used and speeds up computation.

The rest of the function is the same as `train`, with the removal of `optimizer.zero_grad()`, `loss.backward()` and `optimizer.step()`, as we do not update the model's parameters when evaluating.

In [32]:
import time

def epoch_time(start_time, end_time):
  """
  Returns how long an epoch takes, in order to compare training times between models
  """
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
  return elapsed_mins, elapsed_secs

We then train the model through multiple epochs, an epoch being a complete pass through all examples in the training and validation sets.

At each epoch, if the validation loss is the best we have seen so far, we'll save the parameters of the model and then after training has finished we'll use that model on the test set.

In [33]:
#train the model through multiple epochs
#EPOCH: complete pass through all examples in the training and validation sets

N_EPOCHS = 50

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc, train_recall, train_precision, train_f2, train_f3= train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc, valid_recall, valid_precision, valid_f2, valid_f3 = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    #at each epoch, if the validation loss is the best seen so far,
    #we'll save the parameters of the model and then, after training has finished,
    #we'll use that model on the test set
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Train Recall: {train_recall*100:.2f}% | Train Precision: {train_precision*100:.2f}% | Train F2: {train_f2*100:.2f}% | Train F3: {train_f3*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% | Val. Recall: {valid_recall*100:.2f}% | Val. Precision: {valid_precision*100:.2f}% | Val. F2: {valid_f2*100:.2f}% | Val. F3: {valid_f3*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 13s
	Train Loss: 0.010 | Train Acc: 35.70% | Train Recall: 36.00% | Train Precision: 34.63% | Train F2: 26.37% | Train F3: 28.38%
	 Val. Loss: 0.010 |  Val. Acc: 87.56% | Val. Recall: 75.00% | Val. Precision: 8.83% | Val. F2: 29.88% | Val. F3: 42.65%
Epoch: 02 | Epoch Time: 0m 13s
	Train Loss: 0.008 | Train Acc: 82.95% | Train Recall: 76.26% | Train Precision: 65.97% | Train F2: 69.65% | Train F3: 71.79%
	 Val. Loss: 0.007 |  Val. Acc: 76.79% | Val. Recall: 100.00% | Val. Precision: 6.45% | Val. F2: 25.64% | Val. F3: 40.82%
Epoch: 03 | Epoch Time: 0m 14s
	Train Loss: 0.005 | Train Acc: 88.84% | Train Recall: 76.43% | Train Precision: 72.41% | Train F2: 73.11% | Train F3: 74.46%
	 Val. Loss: 0.008 |  Val. Acc: 87.17% | Val. Recall: 75.00% | Val. Precision: 8.60% | Val. F2: 29.39% | Val. F3: 42.18%
Epoch: 04 | Epoch Time: 0m 14s
	Train Loss: 0.005 | Train Acc: 87.40% | Train Recall: 68.49% | Train Precision: 65.89% | Train F2: 64.81% | Train F3: 65.69%
	 Val. L

In [34]:
#test loss and accuracy (using parameters that gave the best validation loss)

model.load_state_dict(torch.load('tut2-model.pt'))

test_loss, test_acc, test_recall, test_precision, test_f2, test_f3 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Test Recall: {test_recall*100:.2f}% | Test Precision: {test_precision*100:.2f}% | Test F2: {test_f2*100:.2f}% | Test F3: {test_f3*100:.2f}%')

Test Loss: 0.002 | Test Acc: 96.80% | Test Recall: 21.67% | Test Precision: 17.11% | Test F2: 20.47% | Test F3: 21.04%
