<a href="https://colab.research.google.com/github/aneeshc12/POS-tagging-and-LSTMs/blob/master/Copy_of_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# INLP A2 2020111018

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from copy import copy

In [3]:
# Parse data

# parse conllu, return sentences and tags
def parseConllu(path):
  X = []
  y = []
  pairs = []

  with open(path) as f:
    conllu = f.read().split('\n\n')
    for block in conllu:
      lines = block.split('\n')
      
      sentence = []
      tags = []
      for line in lines[2:]:
        items = line.split('\t')
        sentence.append(items[1])
        tags.append(items[3])

      pairs.append([sentence, tags])
      
  return np.array(pairs, dtype=object)

# generate mappings between indices and words in sequences
def assembleVocabulary(sequences, predefinedTags={"_unk": 0, "_pad": 1, "_bos": 2, "_eos": 3}):
  vocab2idx = copy(predefinedTags)
  count = len(predefinedTags)

  # generate forward map
  for sequence in sequences:
    for word in sequence:
      if word not in vocab2idx:
        vocab2idx[word] = count
        count += 1

  # backwards map
  idx2vocab = {vocab2idx[k]: k for k in vocab2idx}

  return vocab2idx, idx2vocab

# encode a sequence of words as a float tensor, takes a sentence and a dict as inputs, return a float tensor
def encodeSequence(seq, toIdx):
  encoded = []
  for word in seq:
    if word not in toIdx:
      encoded.append(toIdx["_unk"])
    else:
      encoded.append(toIdx[word])
  encoded = torch.FloatTensor(encoded)
  return encoded

# pad all sequences with a "_pad" character
def padSequence(sequences):
  maxLength = 0
  paddedSeqs = []

  for seq in sequences:
    if len(seq) > maxLength:
      maxLength = len(seq)

  for seq in sequences:
    paddingNeeded = maxLength - len(seq)
    paddedSeqs.append(seq + ["_pad"] * paddingNeeded)

  paddedSeqs = np.array(paddedSeqs, dtype=object)
  return paddedSeqs

# begin and end sentences with "_bos" and "_eos" characters
def delimitSequence(sequences):
  delimited = []
  for seq in sequences:
    delimited.append(["_bos"] + seq + ["_eos"])

  delimited = np.array(delimited, dtype=object)
  return delimited

In [4]:
# parse data, generate training, dev and test splits

# load shuffle and preprocess data
pairs = parseConllu("/content/drive/MyDrive/ud-english-treebanks/UD_English-Atis/en_atis-ud-train.conllu")

np.random.shuffle(pairs)

X = (pairs[:, 0])
y = (pairs[:, 1])

X = delimitSequence(X)
y = delimitSequence(y)

X = padSequence(X)
y = padSequence(y)

# make vocabs
word2idx, idx2word = assembleVocabulary(X)
tag2idx, idx2tag = assembleVocabulary(y)

# split data
trainAmt = 0.7
devAmt = 0.1

trainIdx = int(trainAmt * pairs.shape[0])
devIdx = int(devAmt * pairs.shape[0])

trainX = X[:trainIdx]
trainY = y[:trainIdx]

devX = X[trainIdx:(trainIdx + devIdx)]
devY = y[trainIdx:(trainIdx + devIdx)]

testX = X[(trainIdx + devIdx):]
testY = y[(trainIdx + devIdx):]

In [5]:
# create dataloaders

# encode sequences internally 
class POSTagDataset(Dataset):
  def __init__(self, sentences, tags, word2idx, tag2idx):
    # encode and store sentences
    encSentences = torch.Tensor(encodeSequence(sentences[0], word2idx))
    for sentence in sentences[1:]:
      encSentences = torch.vstack([encSentences, encodeSequence(sentence, word2idx)])
    
    self.encSentences = encSentences

    # encode and store POS tags
    encPOS = torch.Tensor(encodeSequence(tags[0], tag2idx))
    for tag in tags[1:]:
      encPOS = torch.vstack([encPOS, encodeSequence(tag, tag2idx)])
    
    self.encPOS = encPOS
    assert(self.encPOS.shape[0] == self.encSentences.shape[0])

  def __len__(self):
    return self.encSentences.shape[0]

  def __getitem__(self, idx):
    return self.encSentences[idx], self.encPOS[idx]

In [30]:
trainDataset = POSTagDataset(trainX, trainY, word2idx, tag2idx)
devDataset = POSTagDataset(devX, devY, word2idx, tag2idx)
testDataset = POSTagDataset(testX, testY, word2idx, tag2idx)

print(trainX.shape)
print(trainY.shape)

(2992, 48)
(2992, 48)


In [45]:
# define main LSTM and experiment classes

# main lstm, take in encoded sente
class LSTM(nn.Module):
  def __init__(self, embeddingDim, hiddenDim, wordVocab, tagVocab):
    super(LSTM, self).__init__()


    self.hiddenDim = hiddenDim
    self.padIdx = 1
    self.wordEmbeddings = nn.Embedding(len(wordVocab), embeddingDim, 
                                       padding_idx=self.padIdx)         # encode each word as an embedding of size embeddingDim

    self.lstm = nn.LSTM(embeddingDim, hiddenDim, batch_first=True)      # main lstm 
    self.fc = nn.Linear(hiddenDim, len(tagVocab))                       # output a tag based on the hidden dim (using the internal state)

    self.logSoftMax = F.log_softmax

  def forward(self, sentences):
    embeddings = self.wordEmbeddings(sentences)
    out, _ = self.lstm(embeddings)
    out = self.fc(out)

    tagScores = self.logSoftMax(out, dim=1)
    return tagScores
    
# experiment class to manage training and testing
class Experiment():
  def __init__(self, embeddingDim, hiddenDim, wordVocab, tagVocab, batchSize=16, lossFunction=nn.CrossEntropyLoss, optimiser=torch.optim.SGD, lr=0.01):
    self.model = LSTM(embeddingDim, hiddenDim, wordVocab, tagVocab)
    self.lossFunction = lossFunction()
    self.optimiser = optimiser(self.model.parameters(), lr=lr)

    self.batchSize = batchSize

  def train(self, trainX, trainY, devX, devY, numEpochs=50):
    # init dataloaders
    trainDataloader = DataLoader(trainDataset, batch_size=self.batchSize, shuffle=True)
    devDataloader = DataLoader(devDataset, batch_size=self.batchSize, shuffle=True)

    # iterate over all train batches, train the LSTM with sentences and labels
    # keep iterating until performance on validation drops (early stoppage)
    lastDevLoss = np.inf
    for epoch in range(numEpochs):

      # train over training data
      trainingLoss = 0.0
      for i, (sentences, labels) in enumerate(iter(trainDataloader)):
        self.model.zero_grad()
        tagScores = self.model(sentences.long())
        
        loss = self.lossFunction(tagScores.permute(0,2,1), labels.long())        # permute tagscores to calculate the loss over a sentence
        loss.backward()
        self.optimiser.step()

        trainingLoss += loss
      trainingLoss /= len(trainDataloader)

      # evaluate on dev data
      devLoss = 0.0
      with torch.no_grad():
        for i, (sentences, labels) in enumerate(iter(devDataloader)):
          self.model.zero_grad()
          tagScores = self.model(sentences.long())
          
          loss = self.lossFunction(tagScores.permute(0,2,1), labels.long())        # permute tagscores to calculate the loss over a sentence

          devLoss += loss
        devLoss /= len(devDataloader)

      if epoch % 2 == 0:
        print("Epoch %d | avg. training error: %f | avg. dev error: %f" % (epoch, trainingLoss, devLoss))
      
      if devLoss > lastDevLoss:
        print("Increase in dev loss, stopping training")
        break
      else:
        lastDevLoss = devLoss




# for i, (ts, tl) in enumerate(iter(trainDataloader)):
#   print("iter ", i)
#   print(ts.shape, ts[0])
#   print(tl.shape, tl[0])

trainDataloader = DataLoader(trainDataset, batch_size=16, shuffle=True)
trainSentences, trainLabels = next(iter(trainDataloader))

# trainSentences.view(trainSentences.shape[0], trainSentences.shape[1], 1),

myLSTM = LSTM(30, 50, word2idx, tag2idx)
# embs = myLSTM.wordEmbeddings(trainSentences.long())
# res, _ = myLSTM.lstm(embs)
# tagg = myLSTM.fc(res)
# scores = myLSTM.logSoftMax(tagg)

# scores = myLSTM(trainSentences.long())
# loss = nn.CrossEntropyLoss()



# print("score shep: ", scores.permute(0,2,1).shape)
# print("llb shep: ", trainLabels.shape)
# op = loss(scores.permute(0,2,1), trainLabels.long())

e1 = Experiment(30, 50, word2idx, tag2idx)
e1.train(trainX, trainY, devX, devY, numEpochs=100)


Epoch 0 | avg. training error: 2.834647 | avg. dev error: 2.828374
Epoch 2 | avg. training error: 2.808918 | avg. dev error: 2.802219
Epoch 4 | avg. training error: 2.775244 | avg. dev error: 2.765501
Epoch 6 | avg. training error: 2.720396 | avg. dev error: 2.703934
Epoch 8 | avg. training error: 2.634596 | avg. dev error: 2.614754
Epoch 10 | avg. training error: 2.541312 | avg. dev error: 2.523913
Epoch 12 | avg. training error: 2.438318 | avg. dev error: 2.415653
Epoch 14 | avg. training error: 2.288844 | avg. dev error: 2.246604
Epoch 16 | avg. training error: 1.982841 | avg. dev error: 1.850685
Epoch 18 | avg. training error: 1.192079 | avg. dev error: 1.030116
Epoch 20 | avg. training error: 0.841808 | avg. dev error: 0.802546
Epoch 22 | avg. training error: 0.758259 | avg. dev error: 0.737147
Epoch 24 | avg. training error: 0.719347 | avg. dev error: 0.701685
Epoch 26 | avg. training error: 0.693474 | avg. dev error: 0.677733
Epoch 28 | avg. training error: 0.673405 | avg. dev e

KeyboardInterrupt: ignored