<a href="https://colab.research.google.com/github/asuvarna31/Pos-Tagging/blob/master/BiLSTMPoSTaggerEnglish.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn

from torchtext import data
from torchtext import datasets #already prepped datasets English

import spacy
import numpy as np
import time
import random #used for random seeding data

In [0]:
# Input field: tokens are lower-cased.
TEXT = data.Field(lower=True)
# Label field: the default <unk> token is disabled for the tag vocabulary.
PTB_Tags = data.Field(unk_token=None)

In [0]:
# One entry per dataset column, in order: the first column is exposed as
# `text`, the second is skipped via (None, None), the third as `ptbtags`.
fields = (
    ("text", TEXT),
    (None, None),
    ("ptbtags", PTB_Tags),
)

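Load the UDPOS (Universal Dependencies, English) dataset, passing `fields` so that each column is mapped to the field defined above; the second column is skipped and the Penn Treebank (PTB) tags are used as the labels.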

In [5]:
# Fetch UDPOS and split it into train / validation / test sets, with each
# column mapped through `fields`.
train_data, val_data, test_data = datasets.UDPOS.splits(fields)

downloading en-ud-v2.zip


en-ud-v2.zip: 100%|██████████| 688k/688k [00:00<00:00, 2.06MB/s]


extracting


In [6]:
 #min_frequency of words to be added in vocabulary = 2

# Attach the pretrained glove.6B.100d vectors to the vocabulary; unk_init is
# the in-place normal initialiser torch.Tensor.normal_.
# NOTE(review): the seeding cell appears after this one in the notebook, so
# any random draws made here are presumably not covered by SEED -- confirm
# the intended cell order.
TEXT.build_vocab(
    train_data,
    min_freq=2,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)


.vector_cache/glove.6B.zip: 862MB [06:26, 2.23MB/s]                          
100%|█████████▉| 399934/400000 [00:17<00:00, 23417.99it/s]

NameError: ignored

In [0]:
# Build the tag vocabulary from the training split.
PTB_Tags.build_vocab(train_data)

In [0]:
# Seed every random number generator the notebook relies on.
SEED = 1024

random.seed(SEED)        # Python's built-in RNG
np.random.seed(SEED)     # NumPy's global RNG
torch.manual_seed(SEED)  # PyTorch's RNG

# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [0]:
BATCH_SZ = 128

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, all sharing the same batch size and target device.
train_iterator, val_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=BATCH_SZ,
    device=device,
)

Model used for PoS-Tagging - a multi-layer bi-directional LSTM

In [0]:
class BiLSTM(nn.Module):
    """Token tagger: embedding -> (optionally bidirectional) stacked LSTM -> linear.

    Args:
        input_dim: number of rows in the embedding table.
        embedding_dim: width of each embedding vector (LSTM input size).
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of scores produced for every token.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability used on the embeddings, on the LSTM
            output, and between LSTM layers (the latter only when
            ``n_layers > 1``).
        pad_idx: index handed to the embedding layer as ``padding_idx``.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout, pad_idx):
        super().__init__()

        # Inter-layer dropout is only requested for a stacked LSTM.
        lstm_dropout = dropout if n_layers > 1 else 0
        # A bidirectional LSTM emits both directions, doubling the feature size.
        lstm_out_dim = hidden_dim * 2 if bidirectional else hidden_dim

        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=lstm_dropout,
        )
        self.fc = nn.Linear(lstm_out_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return per-token scores with ``output_dim`` as the last dimension.

        Assumes ``text`` holds token indices laid out as (seq_len, batch),
        since the LSTM is created with its default layout -- TODO confirm
        against the iterator.
        """
        token_vectors = self.dropout(self.embedding(text))
        # Only the per-step outputs are used; the final (hidden, cell) state is dropped.
        lstm_out, _ = self.lstm(token_vectors)
        return self.fc(self.dropout(lstm_out))



In [0]:
# Model hyper-parameters.
Input_dim = len(TEXT.vocab)                 # one embedding row per vocabulary entry
Embedding_dim = 100                         # same width as the glove.6B.100d vectors
Hidden_dim = 128
Output_dim = len(PTB_Tags.vocab)            # one score per tag
n_layers = 3
Bidirectional = True
Dropout = 0.25
Pad_Idx = TEXT.vocab.stoi[TEXT.pad_token]   # vocabulary index of the padding token

model = BiLSTM(
    input_dim=Input_dim,
    embedding_dim=Embedding_dim,
    hidden_dim=Hidden_dim,
    output_dim=Output_dim,
    n_layers=n_layers,
    bidirectional=Bidirectional,
    dropout=Dropout,
    pad_idx=Pad_Idx,
)



In [16]:
def init_wts(m):
    """Overwrite every parameter reachable from module ``m`` with N(0, 0.1) samples.

    All parameters are treated alike (weights and biases); nothing is returned.
    """
    for param in m.parameters():
        nn.init.normal_(param.data, mean=0, std=0.1)
        
# Run init_wts over the model; apply() returns the model itself, which is
# what this cell displays.
model.apply(init_wts)

BiLSTM(
  (embedding): Embedding(8866, 100, padding_idx=1)
  (lstm): LSTM(100, 128, num_layers=3, dropout=0.25, bidirectional=True)
  (fc): Linear(in_features=256, out_features=51, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [0]:
# Replace the embedding table with the vectors attached to the TEXT vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
embedding_table = model.embedding.weight.data
embedding_table.copy_(pretrained_embeddings)

# The padding row is reset to all zeros after the copy.
embedding_table[Pad_Idx] = torch.zeros(Embedding_dim)


In [0]:
# Adam over all model parameters, with its default settings.
optimizer = torch.optim.Adam(model.parameters())

# Targets equal to the tag padding index are ignored by the loss.
TAG_PAD_IDX = PTB_Tags.vocab.stoi[PTB_Tags.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=TAG_PAD_IDX)


In [0]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [0]:
def categorical_accuracy(preds, y, tag_pad_idx):
    """Accuracy of the arg-max predictions over the non-padding positions.

    Args:
        preds: 2-D tensor of scores, one row per token and one column per tag.
        y: 1-D tensor of gold tag indices, one entry per row of ``preds``.
        tag_pad_idx: tag index that marks padding; those positions are skipped.

    Returns:
        A float tensor of shape ``(1,)`` living on the same device as the
        inputs, so ``.item()`` works for callers. When every position is
        padding the result is 0.0 rather than NaN.
    """
    predicted = preds.argmax(dim=1)
    keep = y != tag_pad_idx  # boolean mask selecting the real (non-pad) tokens
    n_correct = (predicted[keep] == y[keep]).sum()
    # Build the denominator from the mask itself so it stays on the inputs'
    # device (a CPU-side FloatTensor cannot be mixed with CUDA tensors), and
    # clamp it so an all-padding batch does not divide by zero.
    n_real = keep.sum().clamp(min=1)
    return (n_correct.float() / n_real.float()).reshape(1)


In [0]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    """Run one optimisation pass over ``iterator``.

    For every batch the gradients are cleared, the model scores
    ``batch.text``, the loss against ``batch.ptbtags`` is back-propagated and
    the optimizer takes one step.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the number of
        batches (``len(iterator)``), so every batch counts equally.
    """
    model.train()  # training mode

    loss_total = 0
    acc_total = 0

    for batch in iterator:
        optimizer.zero_grad()

        scores = model(batch.text)

        # Collapse all leading dimensions so each token is one row of scores
        # paired with one gold tag.
        flat_scores = scores.view(-1, scores.shape[-1])
        flat_tags = batch.ptbtags.view(-1)

        loss = criterion(flat_scores, flat_tags)
        acc = categorical_accuracy(flat_scores, flat_tags, tag_pad_idx)

        loss.backward()
        optimizer.step()

        loss_total += loss.item()
        acc_total += acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches





In [0]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    """Score ``model`` on ``iterator`` without updating any weights.

    The model is switched to evaluation mode and every batch is processed
    under ``torch.no_grad()``.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the number of
        batches (``len(iterator)``).
    """
    model.eval()  # evaluation mode

    loss_total = 0
    acc_total = 0

    with torch.no_grad():
        for batch in iterator:
            scores = model(batch.text)

            # Collapse all leading dimensions so each token is one row of
            # scores paired with one gold tag.
            flat_scores = scores.view(-1, scores.shape[-1])
            flat_tags = batch.ptbtags.view(-1)

            loss = criterion(flat_scores, flat_tags)
            acc = categorical_accuracy(flat_scores, flat_tags, tag_pad_idx)

            loss_total += loss.item()
            acc_total += acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches


In [0]:
N_epochs = 10
best_valid_loss = float('inf')

for epoch in range(N_epochs):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)
    valid_loss, valid_acc = evaluate(model, val_iterator, criterion, TAG_PAD_IDX)

    # Checkpoint only when the validation loss improves on the best seen so far.
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    

In [0]:
# Restore the checkpoint written by the training loop. map_location remaps
# the stored tensors onto the current device, so a checkpoint saved on a GPU
# still loads on a CPU-only runtime (and vice versa).
model.load_state_dict(torch.load('tut1-model.pt', map_location=device))

# Report loss and accuracy on the held-out test split.
test_loss, test_acc = evaluate(model, test_iterator, criterion, TAG_PAD_IDX)

print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')