In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import IMDB
from torchtext.data import Field, LabelField, BucketIterator

In [3]:
# Text field: reviews are tokenized with spaCy and lower-cased.
SRC = Field(lower=True, tokenize='spacy')
# Label field: sentiment labels are numericalized as 64-bit integers.
TRG = LabelField(dtype=torch.int64)

# Fetch the IMDB dataset (downloaded on first use) as a train / test pair.
train_data, test_data = IMDB.splits(text_field=SRC, label_field=TRG)

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:02<00:00, 35.5MB/s]


In [4]:
# Report how many examples the training and test splits contain.
n_train_examples = len(train_data.examples)
n_test_examples = len(test_data.examples)
print(f"Number of training examples: {n_train_examples}")
print(f"Number of testing examples: {n_test_examples}")

Number of training examples: 25000
Number of testing examples: 25000


In [5]:
# Show the attributes (tokenized text and label) of the training example at index 0.
first_example = train_data.examples[0]
print(vars(first_example))

{'text': ['for', 'a', 'comedy', 'this', 'has', 'a', 'decent', 'and', 'inventive', 'plot', 'and', 'trey', 'parker', 'and', 'matt', 'stone', "'s", 'comic', 'timing', 'is', 'perfect', '.', 'there', 'are', 'dozens', 'of', 'funny', 'moments', 'to', 'this', 'fantastic', 'movie', '.', 'i', 'especially', 'like', 'the', 'multitude', 'of', 'colors', 'and', 'the', 'way', 'the', 'clash', 'in', 'the', 'sports', 'arena', 'scenes', '.', 'robert', 'stacks', 'unsolved', 'mysteries', 'spoof', 'is', 'also', 'very', 'amusing', '.'], 'label': 'pos'}


In [6]:
# Build the source (text) and target (label) vocabularies from the training split.

# Source vocabulary: tokens seen at least 5 times, at most 10000 of them, each
# initialised from the pretrained "glove.6B.100d" word vectors.
SRC.build_vocab(
    train_data,
    vectors="glove.6B.100d",
    min_freq=5,
    max_size=10000,
)
TRG.build_vocab(train_data, min_freq=5)

# Inspect the label vocabulary and the size of both vocabularies.
print(vars(TRG.vocab))
print(f"Unique tokens in source vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in TRG vocabulary: {len(TRG.vocab)}")

.vector_cache/glove.6B.zip: 862MB [06:28, 2.22MB/s]                               
100%|█████████▉| 399999/400000 [00:25<00:00, 15812.95it/s]


{'freqs': Counter({'pos': 12500, 'neg': 12500}), 'itos': ['neg', 'pos'], 'unk_index': None, 'stoi': defaultdict(None, {'neg': 0, 'pos': 1}), 'vectors': None}
Unique tokens in source vocabulary: 10002
Unique tokens in TRG vocabulary: 2


In [7]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

BATCH_SIZE = 100

# Train and test iterators, built together so both share batch size and device.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    device=device,
    batch_size=BATCH_SIZE,
)



In [9]:
# Model class
class Model(nn.Module):
  """LSTM sequence classifier: embedding -> stacked LSTM -> two linear layers.

  Args:
    input_dim: vocabulary size (number of rows in the embedding matrix).
    output_dim: number of output classes (2 for positive / negative).
    emb_dim: dimension of each word embedding.
    hidden_dim: size of the LSTM hidden state.
    n_layers: number of stacked LSTM layers.
    dropout: dropout probability applied to the embeddings and between LSTM layers.
  """

  def __init__(self, input_dim, output_dim,emb_dim, hidden_dim, n_layers, dropout):
    super(Model, self).__init__()
    self.n_layers = n_layers
    self.hidden_dim = hidden_dim

    self.embedding = nn.Embedding(input_dim, emb_dim)
    # nn.LSTM applies dropout only *between* stacked layers; with a single layer it
    # has no effect and PyTorch emits a warning, so it is disabled in that case.
    lstm_dropout = dropout if n_layers > 1 else 0
    self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=lstm_dropout)

    self.fc1 = nn.Linear(hidden_dim, hidden_dim//2)
    self.fc2 = nn.Linear(hidden_dim//2, output_dim)

    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(dropout)

  def forward(self, src, src_len=None):
    """Return class scores for a batch of token-index sequences.

    Args:
      src: integer tensor of shape [src_len, batch_size].
      src_len: optional true (unpadded) length of every sequence in the batch, as a
        tensor or list of `batch_size` ints. When given, the batch is packed so the
        LSTM stops at each sequence's last real token instead of running over the
        padding. When omitted, the output at the last time step is used.

    Returns:
      Tensor of shape [batch_size, output_dim] with unnormalised class scores.
    """
    embedded = self.dropout(self.embedding(src)) # shape: [src_len, batch_size, emb_dim]

    if src_len is None:
      # output shape -> [src_len, batch_size, hidden_dim]
      output, _ = self.rnn(embedded)
      final_state = output[-1]
    else:
      # pack_padded_sequence requires the lengths on the CPU as int64.
      lengths = torch.as_tensor(src_len, dtype=torch.int64).cpu()
      packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths, enforce_sorted=False)
      # hidden shape -> [n_layers, batch_size, hidden_dim]; the last entry is the top
      # layer's state at each sequence's final real token.
      _, (hidden, _) = self.rnn(packed)
      final_state = hidden[-1]

    output = self.fc1(final_state)
    output = self.fc2(self.relu(output))
    return output

In [10]:
# Hyper-parameters.
INPUT_DIM = len(SRC.vocab)    # vocabulary size of the text field
OUTPUT_DIM = len(TRG.vocab)   # number of labels
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100             # not referenced by the cells shown here
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5             # not referenced by the cells shown here

# Build the classifier and move it to the selected device.
model = Model(
    input_dim=INPUT_DIM,
    output_dim=OUTPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    hidden_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
)
model = model.to(device)

# Overwrite the embedding weights with the pretrained vectors from the source vocabulary.
pretrained_vectors = SRC.vocab.vectors
model.embedding.weight.data.copy_(pretrained_vectors)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0402, -0.4874,  0.7354,  ...,  0.1813, -0.4743, -0.4879],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.5194, -0.6982,  0.2841,  ...,  0.3718, -0.4543, -0.4990]],
       device='cuda:0')

In [11]:
# Cross-entropy loss over the class scores.
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters.
optimizer = optim.Adam(params=model.parameters(), lr=3e-3)

# Learning-rate scheduler (optional): lowers the learning rate when the monitored
# value stops improving for more than `patience` steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True, patience=3)


# Model training function
def train(model, iterator, optimizer=optimizer, criterion=criterion, clip=1, scheduler=scheduler):
    """Run one training epoch over `iterator` and return the mean batch loss.

    Args:
        model: network to train; called as `model(batch.text)`.
        iterator: sized iterable of batches exposing `.text` and `.label` tensors.
        optimizer: optimizer updating `model`'s parameters.
        criterion: loss called as `criterion(output, labels)`.
        clip: maximum gradient norm passed to `clip_grad_norm_`.
        scheduler: learning-rate scheduler stepped once per epoch with the mean
            loss, or None to skip scheduling. Defaults to the notebook's scheduler
            so existing calls behave as before; pass a matching scheduler (or None)
            when supplying a different optimizer.

    Returns:
        The mean of the per-batch losses for the epoch, as a float.

    Raises:
        ValueError: if `iterator` contains no batches.
    """
    n_batches = len(iterator)
    if n_batches == 0:
        # Fail early with a clear message instead of a ZeroDivisionError at the end.
        raise ValueError("train() received an iterator with no batches")

    model.train()
    epoch_loss = 0.0
    total_correct = 0
    total_count = 0

    for batch in iterator:
        src = batch.text.to(device)
        trg = batch.label.to(device)

        optimizer.zero_grad()
        output = model(src)
        loss = criterion(output, trg)

        loss.backward()
        # Clip the gradient norm before the update to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
        # .item() keeps the running count a plain Python int rather than a device tensor.
        total_correct += (output.argmax(1) == trg).sum().item()
        total_count += len(trg)

    print(f'correct: {total_correct}/{total_count}')
    mean_loss = epoch_loss / n_batches
    if scheduler is not None:
        scheduler.step(mean_loss)
    return mean_loss # mean loss

In [None]:
# Train for a fixed number of epochs, printing the mean training loss after each one.
total_epoch = 120
for epoch in range(total_epoch):
  # train() also prints the number of correctly classified training examples.
  result = train(iterator=train_iterator, model=model)
  print(f'Epoch {epoch} -->', result)

correct: 12562/25000
Epoch 0 --> 0.6955907859802246


In [None]:
# function to experiment movie review sentences
import spacy

!python -m spacy download en # dwonload english from spacy

sp = spacy.load('en')


def predict(sentence, verbose=True):
  """Classify a movie review as positive or negative with the trained `model`.

  Args:
    sentence: either a raw review string, which is tokenized with the spaCy
      tokenizer `sp`, or an already-tokenized sequence of word strings.
    verbose: when True (the default) print the raw model output before returning.

  Returns:
    "---->> Positive Review" when the predicted class index is greater than 0,
    otherwise '---->> Negative Review'.

  Raises:
    ValueError: if the review contains no tokens.
  """
  if isinstance(sentence, str):
    tokens = [token.text for token in sp.tokenizer(sentence)]
  else:
    tokens = list(sentence)

  if not tokens:
    # An empty sequence cannot be fed through the LSTM; fail with a clear message.
    raise ValueError("predict() needs a review with at least one token")

  # Lower-case each word before the vocabulary lookup, then shape as [src_len, 1].
  token_ids = [SRC.vocab.stoi[token.lower()] for token in tokens]
  input_data = torch.tensor(token_ids, dtype=torch.int64).unsqueeze(1).to(device)

  model.eval()
  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    output = model(input_data)
  if verbose:
    print(output)

  predicted_class = output.argmax(1).item()
  if predicted_class > 0:
    return "---->> Positive Review"
  return '---->> Negative Review'

In [None]:
# predict() returns whether this sample review is classified as positive or negative.
predict('i have enjoyed this movie')