<a href="https://colab.research.google.com/github/aneeshc12/POS-tagging-and-LSTMs/blob/master/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# INLP A2 2020111018

In [116]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [117]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from copy import copy
from datetime import datetime
import matplotlib.pyplot as plt

In [118]:
# Parse data

# parse conllu, return sentences and tags
def parseConllu(path):
  X = []
  y = []
  pairs = []

  with open(path) as f:
    conllu = f.read().split('\n\n')
    for block in conllu:
      lines = block.split('\n')
      
      sentence = []
      tags = []
      for line in lines[2:]:
        print(line)
        items = line.split('\t')
        sentence.append(items[1])
        tags.append(items[3])

      pairs.append([sentence, tags])
      
  return np.array(pairs, dtype=object)

# generate mappings between indices and words in sequences
def assembleVocabulary(sequences, predefinedTags={"_unk": 0, "_pad": 1}): # no _bos or _eos
  vocab2idx = copy(predefinedTags)
  count = len(predefinedTags)

  # generate forward map
  for sequence in sequences:
    for word in sequence:
      if word not in vocab2idx:
        vocab2idx[word] = count
        count += 1

  # backwards map
  idx2vocab = {vocab2idx[k]: k for k in vocab2idx}

  return vocab2idx, idx2vocab

# encode a sequence of words as a float tensor, takes a sentence and a dict as inputs, return a float tensor
def encodeSequence(seq, toIdx):
  encoded = []
  for word in seq:
    if word not in toIdx:
      encoded.append(toIdx["_unk"])
    else:
      encoded.append(toIdx[word])
  encoded = torch.FloatTensor(encoded)
  return encoded

# pad all sequences with a "_pad" character uptil padding length, truncate larger strings
def padSequence(sequences, paddingLength):
  maxLength = 0
  paddedSeqs = []

  for seq in sequences:
    if len(seq) > maxLength:
      maxLength = len(seq)

  for seq in sequences:
    if len(seq) > paddingLength:
      paddedSeqs.append(seq[:paddingLength])
    else:
      paddingNeeded = paddingLength - len(seq)
      paddedSeqs.append(seq + ["_pad"] * paddingNeeded)

  paddedSeqs = np.array(paddedSeqs, dtype=object)
  return paddedSeqs

# begin and end sentences with "_bos" and "_eos" characters
def delimitSequence(sequences):
  delimited = []
  for seq in sequences:
    # delimited.append(["_bos"] + seq + ["_eos"])
    delimited.append(seq)

  delimited = np.array(delimited, dtype=object)
  return delimited

In [120]:
# load raw data
pairs = parseConllu("/content/drive/MyDrive/ud-english-treebanks/UD_English-GUM/en_gum-ud-train.conllu")

# meta::author = Claire Bailey-Ross, Andrew Beresford, Daniel Smith, Claire Warwick


IndexError: ignored

In [None]:
# visualise data lengths to get the best padding length
%matplotlib inline

nX = pairs[:, 0]

maxLength = 0
for seq in nX:
  if len(seq) > maxLength:
    maxLength = len(seq)

counts = np.zeros(maxLength+1)
for seq in nX:
  counts[len(seq)] += 1

plt.plot(np.arange(0,maxLength+1,1), counts)

# mean is about 12 for the altis dataset, setting padding to 18

In [None]:
# parse data, generate training, dev and test splits

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device is: ", device)

# load shuffle and preprocess data

np.random.shuffle(pairs)

X = (pairs[:, 0])
y = (pairs[:, 1])

X = delimitSequence(X)
y = delimitSequence(y)

X = padSequence(X, 15)    # mean for altis is ~12, using a padding of 20
y = padSequence(y, 15)    

# make vocabs
word2idx, idx2word = assembleVocabulary(X)
tag2idx, idx2tag = assembleVocabulary(y)

# split data
trainAmt = 0.8
devAmt = 0.1

trainIdx = int(trainAmt * pairs.shape[0])
devIdx = int(devAmt * pairs.shape[0])

trainX = X[:trainIdx]
trainY = y[:trainIdx]

devX = X[trainIdx:(trainIdx + devIdx)]
devY = y[trainIdx:(trainIdx + devIdx)]

testX = X[(trainIdx + devIdx):]
testY = y[(trainIdx + devIdx):]

In [None]:
# create dataloaders

# encode sequences internally 
class POSTagDataset(Dataset):
  def __init__(self, sentences, tags, word2idx, tag2idx):
    # encode and store sentences
    encSentences = torch.Tensor(encodeSequence(sentences[0], word2idx))
    for sentence in sentences[1:]:
      encSentences = torch.vstack([encSentences, encodeSequence(sentence, word2idx)])
    
    self.encSentences = encSentences

    # encode and store POS tags
    encPOS = torch.Tensor(encodeSequence(tags[0], tag2idx))
    for tag in tags[1:]:
      encPOS = torch.vstack([encPOS, encodeSequence(tag, tag2idx)])
    
    self.encPOS = encPOS
    assert(self.encPOS.shape[0] == self.encSentences.shape[0])

  def __len__(self):
    return self.encSentences.shape[0]

  def __getitem__(self, idx):
    return self.encSentences[idx], self.encPOS[idx]

In [None]:
trainDataset = POSTagDataset(trainX, trainY, word2idx, tag2idx)
devDataset = POSTagDataset(devX, devY, word2idx, tag2idx)
testDataset = POSTagDataset(testX, testY, word2idx, tag2idx)

In [None]:
# define main LSTM and experiment classes

# main lstm, take in encoded sente
class LSTM(nn.Module):
  def __init__(self, embeddingDim, hiddenDim, wordVocab, tagVocab, device):
    super(LSTM, self).__init__()


    self.hiddenDim = hiddenDim
    self.padIdx = 1
    self.wordEmbeddings = nn.Embedding(len(wordVocab), embeddingDim, 
                                       padding_idx=self.padIdx)         # encode each word as an embedding of size embeddingDim

    with torch.no_grad():
      self.wordEmbeddings.weight[self.padIdx,:] = 1.0  

    self.lstm = nn.LSTM(embeddingDim, hiddenDim, batch_first=True)      # main lstm 
    self.fc = nn.Linear(hiddenDim, len(tagVocab))                       # output a tag based on the hidden dim (using the internal state)

    self.logSoftMax = F.log_softmax

    self.device = device

  def forward(self, sentences):
    s = sentences.to(self.device)         # move to cuda
    embeddings = self.wordEmbeddings(s)
    out, _ = self.lstm(embeddings)
    out = self.fc(out)

    tagScores = self.logSoftMax(out, dim=1)
    return tagScores
    
# experiment class to manage training and testing
class Experiment():
  def __init__(self, embeddingDim, hiddenDim, wordVocab, tagVocab, device, batchSize=16, lossFunction=nn.CrossEntropyLoss, optimiser=torch.optim.SGD, lr=0.01):
    self.model = LSTM(embeddingDim, hiddenDim, wordVocab, tagVocab, device).to(device)
    self.lossFunction = lossFunction().to(device)
    self.optimiser = optimiser(self.model.parameters(), lr=lr)

    self.batchSize = batchSize
    self.device = device


  def evaluate(self, evalDataset):
    """
    Evaluate the containers LSTM on a torch dataset

    Return the average error with the containers loss function
    """
    evalDataloader = DataLoader(evalDataset, batch_size=self.batchSize, shuffle=True)
    with torch.no_grad():
      evalLoss = 0.0
      for i, (sentences, labels) in enumerate(iter(evalDataloader)):
        s = sentences.to(self.device)
        l = labels.to(self.device)

        self.model.zero_grad()
        tagScores = self.model(s.long())
        
        loss = self.lossFunction(tagScores.permute(0,2,1), l.long())        # permute tagscores to calculate the loss over a sentence

        evalLoss += loss

    return evalLoss/len(evalDataset)

  def train(self, trainDataset, devDataset, numEpochs=50, printStep=10, saveStep=100, savePath="/content/drive/MyDrive/Colab Notebooks/INLP/INLP-A2 weights/"):
    """
    Train the containers LSTM given a set of datasets and parameters

    Params:
    trainDataset: training dataset (torch)
    devDataset: dev dataset (torch)
    numEpochs:
    printStep: number of epochs before stats are printed
    saveStep: number of epochs before weights are saved
    savePath: default location weights are saved

    Returns nothing
    """

    # init dataloaders
    trainDataloader = DataLoader(trainDataset, batch_size=self.batchSize, shuffle=True)

    # iterate over all train batches, train the LSTM with sentences and labels
    # keep iterating until performance on validation drops (early stoppage)
    lastDevLoss = np.inf
    for epoch in range(numEpochs):

      # train over training data
      trainingLoss = 0.0
      for i, (sentences, labels) in enumerate(iter(trainDataloader)):
        s = sentences.to(self.device)     # move to cuda
        l = labels.to(self.device)

        # zero grad, compute scores, evaluate loss, backprop, update weights
        self.model.zero_grad()
        tagScores = self.model(s.long())
        
        loss = self.lossFunction(tagScores.permute(0,2,1), l.long())        # permute tagscores to calculate the loss over a sentence
        loss.backward()
        self.optimiser.step()

        trainingLoss += loss
      trainingLoss /= len(trainDataloader)

      # evaluate on dev data
      devLoss = self.evaluate(devDataset)

      # print every printStep
      if epoch % printStep == 0:
        print("Epoch %d | avg. training error: %f | avg. dev error: %f" % (epoch, trainingLoss, devLoss))
      
      # save every save step
      if epoch % saveStep == 0:
        filePath = savePath + datetime.now().strftime("%m-%d-%H-%M-%S") + ".pt"
        torch.save(self.model.state_dict, filePath)

      # early stoppage
      if devLoss > lastDevLoss:
        filePath = savePath + datetime.now().strftime("%m-%d-%H-%M-%S") + ".pt"
        torch.save(self.model.state_dict, filePath)

        print("Increase in dev loss, stopping training")
        print("Epoch %d | avg. training error: %f | avg. dev error: %f" % (epoch, trainingLoss, devLoss))
        break
      else:
        lastDevLoss = devLoss


def inference(model, sentence):
  """
  Given a model and a sentence, print predicted POS tags

  Return a tensor of tag indices
  """
  with torch.no_grad():
    processedSentence = sentence.lower().split(' ')
    encodedSentence = encodeSequence(processedSentence, word2idx).long()
    print(encodedSentence)
    results = model(encodedSentence)

  return results
    
    


In [None]:
# example training and evaluation

e1 = Experiment(50, 100, word2idx, tag2idx, device, lr=0.01)
e1.train(trainDataset, devDataset, numEpochs=250)
testLoss = e1.evaluate(testDataset)
print("Loss on the test dataset: ", testLoss)

In [None]:
results = inference(e1.model, "which flights leave april twelfth from indianapolis and arrive in montreal around 10 pm").to('cpu')

In [None]:
results
for r in results:
  # print(r)
  idx = int(np.argmax(r))
  print(idx, idx2tag[idx])

In [None]:
print(testX[1])
print(testY[1])