<a href="https://colab.research.google.com/github/aneeshc12/POS-tagging-and-LSTMs/blob/master/Copy_of_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# INLP A2 2020111018

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [35]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from copy import copy
from datetime import datetime

In [3]:
# Parse data

# parse conllu, return sentences and tags
def parseConllu(path):
  X = []
  y = []
  pairs = []

  with open(path) as f:
    conllu = f.read().split('\n\n')
    for block in conllu:
      lines = block.split('\n')
      
      sentence = []
      tags = []
      for line in lines[2:]:
        items = line.split('\t')
        sentence.append(items[1])
        tags.append(items[3])

      pairs.append([sentence, tags])
      
  return np.array(pairs, dtype=object)

# generate mappings between indices and words in sequences
def assembleVocabulary(sequences, predefinedTags={"_unk": 0, "_pad": 1, "_bos": 2, "_eos": 3}):
  vocab2idx = copy(predefinedTags)
  count = len(predefinedTags)

  # generate forward map
  for sequence in sequences:
    for word in sequence:
      if word not in vocab2idx:
        vocab2idx[word] = count
        count += 1

  # backwards map
  idx2vocab = {vocab2idx[k]: k for k in vocab2idx}

  return vocab2idx, idx2vocab

# encode a sequence of words as a float tensor, takes a sentence and a dict as inputs, return a float tensor
def encodeSequence(seq, toIdx):
  encoded = []
  for word in seq:
    if word not in toIdx:
      encoded.append(toIdx["_unk"])
    else:
      encoded.append(toIdx[word])
  encoded = torch.FloatTensor(encoded)
  return encoded

# pad all sequences with a "_pad" character
def padSequence(sequences):
  maxLength = 0
  paddedSeqs = []

  for seq in sequences:
    if len(seq) > maxLength:
      maxLength = len(seq)

  for seq in sequences:
    paddingNeeded = maxLength - len(seq)
    paddedSeqs.append(seq + ["_pad"] * paddingNeeded)

  paddedSeqs = np.array(paddedSeqs, dtype=object)
  return paddedSeqs

# begin and end sentences with "_bos" and "_eos" characters
def delimitSequence(sequences):
  delimited = []
  for seq in sequences:
    delimited.append(["_bos"] + seq + ["_eos"])

  delimited = np.array(delimited, dtype=object)
  return delimited

In [18]:
# parse data, generate training, dev and test splits

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device is: ", device)

# load shuffle and preprocess data
pairs = parseConllu("/content/drive/MyDrive/ud-english-treebanks/UD_English-Atis/en_atis-ud-train.conllu")

np.random.shuffle(pairs)

X = (pairs[:, 0])
y = (pairs[:, 1])

X = delimitSequence(X)
y = delimitSequence(y)

X = padSequence(X)
y = padSequence(y)

# make vocabs
word2idx, idx2word = assembleVocabulary(X)
tag2idx, idx2tag = assembleVocabulary(y)

# split data
trainAmt = 0.7
devAmt = 0.1

trainIdx = int(trainAmt * pairs.shape[0])
devIdx = int(devAmt * pairs.shape[0])

trainX = X[:trainIdx]
trainY = y[:trainIdx]

devX = X[trainIdx:(trainIdx + devIdx)]
devY = y[trainIdx:(trainIdx + devIdx)]

testX = X[(trainIdx + devIdx):]
testY = y[(trainIdx + devIdx):]

Device is:  cuda


In [19]:
# create dataloaders

# encode sequences internally 
class POSTagDataset(Dataset):
  def __init__(self, sentences, tags, word2idx, tag2idx):
    # encode and store sentences
    encSentences = torch.Tensor(encodeSequence(sentences[0], word2idx))
    for sentence in sentences[1:]:
      encSentences = torch.vstack([encSentences, encodeSequence(sentence, word2idx)])
    
    self.encSentences = encSentences

    # encode and store POS tags
    encPOS = torch.Tensor(encodeSequence(tags[0], tag2idx))
    for tag in tags[1:]:
      encPOS = torch.vstack([encPOS, encodeSequence(tag, tag2idx)])
    
    self.encPOS = encPOS
    assert(self.encPOS.shape[0] == self.encSentences.shape[0])

  def __len__(self):
    return self.encSentences.shape[0]

  def __getitem__(self, idx):
    return self.encSentences[idx], self.encPOS[idx]

In [20]:
trainDataset = POSTagDataset(trainX, trainY, word2idx, tag2idx)
devDataset = POSTagDataset(devX, devY, word2idx, tag2idx)
testDataset = POSTagDataset(testX, testY, word2idx, tag2idx)

In [44]:
# define main LSTM and experiment classes

# main lstm, take in encoded sente
class LSTM(nn.Module):
  def __init__(self, embeddingDim, hiddenDim, wordVocab, tagVocab, device):
    super(LSTM, self).__init__()


    self.hiddenDim = hiddenDim
    self.padIdx = 1
    self.wordEmbeddings = nn.Embedding(len(wordVocab), embeddingDim, 
                                       padding_idx=self.padIdx)         # encode each word as an embedding of size embeddingDim

    self.lstm = nn.LSTM(embeddingDim, hiddenDim, batch_first=True)      # main lstm 
    self.fc = nn.Linear(hiddenDim, len(tagVocab))                       # output a tag based on the hidden dim (using the internal state)

    self.logSoftMax = F.log_softmax

    self.device = device

  def forward(self, sentences):
    s = sentences.to(self.device)         # move to cuda
    embeddings = self.wordEmbeddings(s)
    out, _ = self.lstm(embeddings)
    out = self.fc(out)

    tagScores = self.logSoftMax(out, dim=1)
    return tagScores
    
# experiment class to manage training and testing
class Experiment():
  def __init__(self, embeddingDim, hiddenDim, wordVocab, tagVocab, device, batchSize=16, lossFunction=nn.CrossEntropyLoss, optimiser=torch.optim.SGD, lr=0.01):
    self.model = LSTM(embeddingDim, hiddenDim, wordVocab, tagVocab, device).to(device)
    self.lossFunction = lossFunction().to(device)
    self.optimiser = optimiser(self.model.parameters(), lr=lr)

    self.batchSize = batchSize
    self.device = device


  def evaluate(self, evalDataset):
    evalDataloader = DataLoader(evalDataset, batch_size=self.batchSize, shuffle=True)
    with torch.no_grad():
      evalLoss = 0.0
      for i, (sentences, labels) in enumerate(iter(evalDataloader)):
        s = sentences.to(self.device)
        l = labels.to(self.device)

        self.model.zero_grad()
        tagScores = self.model(s.long())
        
        loss = self.lossFunction(tagScores.permute(0,2,1), l.long())        # permute tagscores to calculate the loss over a sentence

        evalLoss += loss

    return evalLoss/len(evalDataset)

  def train(self, trainDataset, devDataset, numEpochs=50, printStep=10, saveStep=100, savePath="/content/drive/MyDrive/Colab Notebooks/INLP/INLP-A2 weights/"):
    # init dataloaders
    trainDataloader = DataLoader(trainDataset, batch_size=self.batchSize, shuffle=True)

    # iterate over all train batches, train the LSTM with sentences and labels
    # keep iterating until performance on validation drops (early stoppage)
    lastDevLoss = np.inf
    for epoch in range(numEpochs):

      # train over training data
      trainingLoss = 0.0
      for i, (sentences, labels) in enumerate(iter(trainDataloader)):
        s = sentences.to(self.device)     # move to cuda
        l = labels.to(self.device)

        # zero grad, compute scores, evaluate loss, backprop, update weights
        self.model.zero_grad()
        tagScores = self.model(s.long())
        
        loss = self.lossFunction(tagScores.permute(0,2,1), l.long())        # permute tagscores to calculate the loss over a sentence
        loss.backward()
        self.optimiser.step()

        trainingLoss += loss
      trainingLoss /= len(trainDataloader)

      # evaluate on dev data
      devLoss = self.evaluate(devDataset)

      # print every printStep
      if epoch % printStep == 0:
        print("Epoch %d | avg. training error: %f | avg. dev error: %f" % (epoch, trainingLoss, devLoss))
      
      # save every save step
      if epoch % saveStep == 0:
        filePath = savePath + datetime.now().strftime("%m-%d-%H-%M-%S") + ".pt"
        torch.save(self.model.state_dict, filePath)

      # early stoppage
      if devLoss > lastDevLoss:
        filePath = savePath + datetime.now().strftime("%m-%d-%H-%M-%S") + ".pt"
        torch.save(self.model.state_dict, filePath)

        print("Increase in dev loss, stopping training")
        print("Epoch %d | avg. training error: %f | avg. dev error: %f" % (epoch, trainingLoss, devLoss))
        break
      else:
        lastDevLoss = devLoss




# for i, (ts, tl) in enumerate(iter(trainDataloader)):
#   print("iter ", i)
#   print(ts.shape, ts[0])
#   print(tl.shape, tl[0])

trainDataloader = DataLoader(trainDataset, batch_size=16, shuffle=True)
trainSentences, trainLabels = next(iter(trainDataloader))

# trainSentences.view(trainSentences.shape[0], trainSentences.shape[1], 1),

# myLSTM = LSTM(30, 50, word2idx, tag2idx)
# embs = myLSTM.wordEmbeddings(trainSentences.long())
# res, _ = myLSTM.lstm(embs)
# tagg = myLSTM.fc(res)
# scores = myLSTM.logSoftMax(tagg)

# scores = myLSTM(trainSentences.long())
# loss = nn.CrossEntropyLoss()

e1 = Experiment(50, 100, word2idx, tag2idx, device, lr=0.01)
e1.train(trainDataset, devDataset, numEpochs=250)
e1.evaluate(testDataset)


Epoch 0 | avg. training error: 2.823730 | avg. dev error: 0.177983
Epoch 10 | avg. training error: 2.473327 | avg. dev error: 0.154401
Epoch 20 | avg. training error: 0.706971 | avg. dev error: 0.044986
Epoch 30 | avg. training error: 0.579968 | avg. dev error: 0.037244
Epoch 40 | avg. training error: 0.476098 | avg. dev error: 0.030643
Epoch 50 | avg. training error: 0.394650 | avg. dev error: 0.025507
Epoch 60 | avg. training error: 0.349984 | avg. dev error: 0.022702
Epoch 70 | avg. training error: 0.319026 | avg. dev error: 0.020700
Epoch 80 | avg. training error: 0.295236 | avg. dev error: 0.019223
Epoch 90 | avg. training error: 0.275942 | avg. dev error: 0.017907
Epoch 100 | avg. training error: 0.259718 | avg. dev error: 0.016901
Epoch 110 | avg. training error: 0.245715 | avg. dev error: 0.016004
Epoch 120 | avg. training error: 0.233407 | avg. dev error: 0.015198
Epoch 130 | avg. training error: 0.222459 | avg. dev error: 0.014565
Epoch 140 | avg. training error: 0.212645 | a

tensor(0.0131, device='cuda:0')

In [None]:
# save word and tag vocabs
import pickle

with open("/content/drive/MyDrive/Colab Notebooks/INLP/INLP-A2 weights/word2idx.pkl", "wb") as f:
  pickle.dump(word2idx, f)
with open("/content/drive/MyDrive/Colab Notebooks/INLP/INLP-A2 weights/tag2idx.pkl", "wb") as f:
  pickle.dump(tag2idx, f)
with open("/content/drive/MyDrive/Colab Notebooks/INLP/INLP-A2 weights/idx2tag.pkl", "wb") as f:
  pickle.dump(idx2tag, f)