In [1]:
!pip install torchtext==0.10.0.



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.10.0.
  Downloading torchtext-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 25.6 MB/s 
[?25hCollecting torch==1.9.0
  Downloading torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 2.8 kB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.12.1+cu113
    Uninstalling torch-1.12.1+cu113:
      Successfully uninstalled torch-1.12.1+cu113
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.1
    Uninstalling torchtext-0.13.1:
      Successfully uninstalled torchtext-0.13.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data , datasets
import random
import spacy

preparing data , we'll set the seed, define the Fields and get the train/valid/test splits

In [3]:
seed =1234
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

Text = data.Field(tokenize = 'spacy',tokenizer_language = 'en_core_web_sm',include_lengths= True)
Label = data.LabelField(dtype=torch.float)

load the IMDb dataset.

In [4]:
train_data,test_data = datasets.IMDB.splits(Text,Label)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:09<00:00, 8.63MB/s]


create the validation set from our training set.

In [5]:
train_data,valid_data=train_data.split(random_state=random.seed(seed))

downloading the vectors and associating them with the correct words in our vocabulary and we'll be using the "glove.6B.100d" vectors" glove is the algorithm used to calculate the vectors and by setting unk_init to torch.Tensor.normal_. This will now initialize those words via a Gaussian distribution.

In [6]:
from torchtext.vocab import Vectors
max_size = 25000
Text.build_vocab(train_data,max_size=max_size,vectors = "glove.6B.100d",unk_init=torch.Tensor.normal_)
Label.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [02:42, 5.31MB/s]                           
100%|█████████▉| 399999/400000 [00:13<00:00, 29980.73it/s]


placing the tensors on the GPU if one is available

In [7]:
Batch_size = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


create the iterators

In [8]:
train_iterator,valid_iterator,test_iterator = data.BucketIterator.splits((train_data,valid_data,test_data),
                                                                  batch_size=Batch_size,
                                                                  sort_within_batch=True,
                                                                  device=device)

build the model using the bidirectional recurrent neural network

In [55]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        
       
        
        embedded = self.dropout(self.embedding(text))
        
        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
       
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        
            
        return self.fc(hidden)




create an instance of our RNN class with the new parameters and arguments for the number of layers bidirectionality and dropout probability

In [56]:
input_dim=len(Text.vocab)
Embedding_dim =100
Hidden_dim=256
Output_dim =1
N_layers=2
Bidirectional = True
Dropout = 0.5
pad_IDX= Text.vocab.stoi[Text.pad_token]


model = RNN(input_dim,Embedding_dim,Hidden_dim,Output_dim,N_layers,Bidirectional,Dropout,pad_IDX)

We retrieve the embeddings from the field's vocab, and check they're the correct size

In [57]:
pre_embeddings = Text.vocab.vectors
print(pre_embeddings.shape)

torch.Size([25002, 100])


We then replace the initial weights of the embedding layer with the pre-trained embeddings

In [58]:
model.embedding.weight.data.copy_(pre_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 1.0512,  0.0020, -0.9155,  ..., -0.2175,  0.2258,  0.5867],
        [-0.3148,  0.1176,  0.5727,  ..., -0.1893,  0.2597, -0.3915],
        [-0.7507,  0.0280,  0.4090,  ..., -0.0273,  0.3827,  0.3968]])

initialize <unk> and <pad> token  to all zeros to explicitly tell our model that, initially, they are irrelevant for determining sentiment.

In [59]:
unk_idx = Text.vocab.stoi[Text.unk_token]
model.embedding.weight.data[unk_idx]=torch.zeros(Embedding_dim)
model.embedding.weight.data[pad_IDX]=torch.zeros(Embedding_dim)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 1.0512,  0.0020, -0.9155,  ..., -0.2175,  0.2258,  0.5867],
        [-0.3148,  0.1176,  0.5727,  ..., -0.1893,  0.2597, -0.3915],
        [-0.7507,  0.0280,  0.4090,  ..., -0.0273,  0.3827,  0.3968]])


We define the optimizer and the criterion and place the model and criterion on the GPU

In [60]:
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

 implement the function to calculate accuracy

In [52]:
def binary_accuracy(preds,y):
  rounded_preds=torch.round(torch.sigmoid(preds))
  correct = (rounded_preds==y).float()
  acc = correct.sum()/len(correct)
  return acc

We define a function for training our model.

In [53]:
def train(model,iterator,optimizer,criterion):
  epoch_loss=0
  epoch_acc = 0
  model.train()
  for batch in iterator:
    optimizer.zero_grad()
    text,text_lengths=batch.text
    preds = model(text,text_lengths).squeeze(1)
    loss = criterion(preds,batch.label)
    acc = binary_accuracy(preds,batch.label)
    loss.backward()
    optimizer.step()
    epoch_loss+= loss.item()
    epoch_acc+= acc.item()

  return   epoch_loss/len(iterator) , epoch_acc/len(iterator)

Then we define a function for testing our model.

In [26]:
def evaluate(model,iterator,criterion):
  epoch_loss=0
  epoch_acc=0
  with torch.no_grad():
    for batch in iterator:
      text,text_lengths= batch.text
      preds = model(text,text_lengths).squeeze(1)
      loss = criterion(preds,batch.label)
      acc = binary_accuracy(preds,batch.label)
      epoch_loss += loss.item()
      epoch_acc += acc.item()

  return    epoch_loss/len(iterator) , epoch_acc/len(iterator)


create a function to tell us how long our epochs are taking.

In [18]:
import time
def epoch_time(start,end):
  timeE = end - start
  minutes = int(timeE/60)
  secs = int(timeE-(minutes * 60))
  return minutes , secs

 we train our model.

In [62]:
N_epochs =5 
best_loss = float('inf')
for epoch in range(N_epochs):

  start = time.time()

  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  valid_loss,valid_acc = evaluate(model,valid_iterator,criterion)


  end = time.time()

  epoch_minutes ,epoch_seconds = epoch_time(start,end)


  if valid_loss<best_loss:
    best_loss = valid_loss
    torch.save(model.state_dict(),'tut2-model.pt')
  
  print(f'epoch_no : {epoch+1:02} | Epoch Time: {epoch_minutes}m {epoch_seconds}s')
  print(f'\train_loss : {train_loss:.2f} | train_acc: {train_acc*100:.2f}%')
  print(f'\valid_loss: {valid_loss:.2f} |  valid_acc: {valid_acc*100:.2f}%') 

epoch_no : 01 | Epoch Time: 0m 38s
	rain_loss : 0.60 | train_acc: 69.23%
alid_loss: 0.52 |  valid_acc: 75.58%
epoch_no : 02 | Epoch Time: 0m 37s
	rain_loss : 0.50 | train_acc: 76.79%
alid_loss: 0.45 |  valid_acc: 80.01%
epoch_no : 03 | Epoch Time: 0m 38s
	rain_loss : 0.38 | train_acc: 83.56%
alid_loss: 0.36 |  valid_acc: 83.62%
epoch_no : 04 | Epoch Time: 0m 38s
	rain_loss : 0.30 | train_acc: 87.97%
alid_loss: 0.37 |  valid_acc: 85.02%
epoch_no : 05 | Epoch Time: 0m 37s
	rain_loss : 0.27 | train_acc: 89.28%
alid_loss: 0.33 |  valid_acc: 86.30%


improved test accuracy.

In [63]:
model.load_state_dict(torch.load('tut2-model.pt'))
test_loss,test_acc = evaluate(model,test_iterator,criterion)
print(f'test_loss:{test_loss:.3f} | test_acc:{test_acc*100:3f}%')

test_loss:0.338 | test_acc:85.846387%


testing the user input.

In [68]:
nlp = spacy.load('en_core_web_sm')
def predict(model,sentence):
  model.eval()
  tokenized= [t.text for t in nlp.tokenizer(sentence)]
  indexed = [Text.vocab.stoi[t] for t in tokenized]
  length = [len(indexed)]
  tensor = torch.LongTensor(indexed).to(device)
  tensor = tensor.unsqueeze(1)
  length_tensor = torch.LongTensor(length)
  preds= torch.sigmoid(model(tensor,length_tensor))
  return preds.item()



negative review.

In [75]:
predict(model,"this film was boring")

0.07299040257930756

positive review.

In [70]:
predict(model,"this film is fantastic")

0.8819243907928467