# INLP A2 2020111018

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path used further down
# lives under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from copy import copy

In [None]:
# Parse data

# parse conllu, return sentences and tags
def parseConllu(path):
  """Read a CoNLL-U file and return its sentences paired with their tags.

  Sentences are separated by blank lines. Inside a sentence, comment lines
  (starting with '#') are skipped however many there are, and so are
  multiword-token ranges (ID like "1-2") and empty nodes (ID like "1.1"),
  which do not carry a tag of their own. Blocks without any token line
  (e.g. the one produced by the trailing blank line of the file) are ignored.

  Args:
    path: location of the .conllu file (read as UTF-8).

  Returns:
    An object array of shape (numSentences, 2): column 0 holds each
    sentence's list of word forms (CoNLL-U column 2), column 1 the matching
    list of tags (CoNLL-U column 4).
  """
  pairs = []

  with open(path, encoding="utf-8") as f:
    blocks = f.read().split('\n\n')

  for block in blocks:
    sentence = []
    tags = []
    for line in block.split('\n'):
      # skip padding blank lines and sentence-level comments
      if not line.strip() or line.startswith('#'):
        continue
      items = line.split('\t')
      # ranges ("1-2") and empty nodes ("1.1") are not real tokens
      if '-' in items[0] or '.' in items[0]:
        continue
      sentence.append(items[1])
      tags.append(items[3])

    # an empty block would otherwise become a bogus empty sentence
    if sentence:
      pairs.append([sentence, tags])

  # Fill the array cell by cell: np.array(pairs, dtype=object) would silently
  # turn into a 3-D array whenever every sentence has the same length.
  result = np.empty((len(pairs), 2), dtype=object)
  for row, (sentence, tags) in enumerate(pairs):
    result[row, 0] = sentence
    result[row, 1] = tags
  return result

# generate mappings between indices and words in sequences
def assembleVocabulary(sequences, predefinedTags={"_unk": 0, "_pad": 1, "_bos": 2, "_eos": 3}):
  """Build token->index and index->token lookups over every token in sequences.

  The reserved entries of predefinedTags are copied in first (the argument
  itself is never modified). Each token not seen before is then given an
  index equal to the vocabulary's size at that moment, in order of first
  appearance.

  Returns:
    A (vocab2idx, idx2vocab) tuple of dicts, one the inverse of the other.
  """
  # copy so the shared default dict is left untouched between calls
  vocab2idx = copy(predefinedTags)

  for sequence in sequences:
    for word in sequence:
      # numbering starts at len(predefinedTags) and grows by one per insert,
      # so the next index is always the current size of the dict
      vocab2idx.setdefault(word, len(vocab2idx))

  idx2vocab = {index: word for word, index in vocab2idx.items()}
  return vocab2idx, idx2vocab

# encode a sequence of words as a float tensor, takes a sentence and a dict as inputs, return a float tensor
def encodeSequence(seq, toIdx):
  """Translate the words of seq into their indices and pack them in a float tensor.

  Args:
    seq: iterable of words.
    toIdx: dict mapping words to indices; words it does not contain are
      encoded with the index stored under "_unk".

  Returns:
    A 1-D torch.FloatTensor with one entry per word of seq.
  """
  indices = [toIdx[word] if word in toIdx else toIdx["_unk"] for word in seq]
  return torch.FloatTensor(indices)

# pad all sequences with a "_pad" character
def padSequence(sequences):
  """Right-pad every sequence with "_pad" tokens up to the longest length.

  Args:
    sequences: iterable of token sequences. Each one may be a list or an
      array row (np.array(..., dtype=object) yields a 2-D array, and thus
      array rows, whenever all sequences already share one length).

  Returns:
    An object array holding the padded sequences, all of equal length.
    Empty input gives an empty object array.
  """
  # Work on plain lists: `ndarray + list` would try to broadcast (and fail)
  # instead of concatenating.
  rows = [list(seq) for seq in sequences]
  maxLength = max((len(row) for row in rows), default=0)

  paddedSeqs = [row + ["_pad"] * (maxLength - len(row)) for row in rows]
  return np.array(paddedSeqs, dtype=object)

# begin and end sentences with "_bos" and "_eos" characters
def delimitSequence(sequences):
  """Wrap every sequence in a leading "_bos" and a trailing "_eos" token.

  Args:
    sequences: iterable of token sequences; each one may be a list or an
      array row.

  Returns:
    An object array built from the delimited sequences.
  """
  # list(seq) matters for array rows: `["_bos"] + ndarray` broadcasts and
  # would glue "_bos"/"_eos" onto every single token instead of adding two
  # new elements.
  delimited = [["_bos"] + list(seq) + ["_eos"] for seq in sequences]
  return np.array(delimited, dtype=object)

In [None]:
# parse data, generate training, dev and test splits

# load shuffle and preprocess data
# Read every (sentence, tags) pair of the ATIS training treebank.
pairs = parseConllu("/content/drive/MyDrive/ud-english-treebanks/UD_English-Atis/en_atis-ud-train.conllu")

# Shuffle the pairs in place so the splits below are random.
# NOTE(review): no seed is set here, so the splits differ from run to run.
np.random.shuffle(pairs)

# Column 0 holds the sentences, column 1 the tag sequences.
X = (pairs[:, 0])
y = (pairs[:, 1])

# Add "_bos"/"_eos" around every sentence and every tag sequence.
X = delimitSequence(X)
y = delimitSequence(y)

# Padding happens before splitting, so all three splits share one length.
X = padSequence(X)
y = padSequence(y)

# make vocabs
# NOTE(review): both vocabularies are built from the full data, before the
# train/dev/test split, so dev/test words never map to "_unk" - confirm this
# is intended.
word2idx, idx2word = assembleVocabulary(X)
tag2idx, idx2tag = assembleVocabulary(y)

# split data: 70% train, 10% dev, the remainder test
trainAmt = 0.7
devAmt = 0.1

# number of examples in the train and dev splits (rounded down)
trainIdx = int(trainAmt * pairs.shape[0])
devIdx = int(devAmt * pairs.shape[0])

trainX = X[:trainIdx]
trainY = y[:trainIdx]

devX = X[trainIdx:(trainIdx + devIdx)]
devY = y[trainIdx:(trainIdx + devIdx)]

testX = X[(trainIdx + devIdx):]
testY = y[(trainIdx + devIdx):]

In [None]:
# create dataloaders

# encode sequences internally 
class POSTagDataset(Dataset):
  """Dataset of (encoded sentence, encoded tag sequence) tensor pairs.

  Args:
    sentences: equal-length (already padded) word sequences.
    tags: tag sequences aligned with sentences, also of equal length.
    word2idx: dict used to encode the words.
    tag2idx: dict used to encode the tags.

  Raises:
    ValueError: if the number of sentences and tag sequences differ.
  """
  def __init__(self, sentences, tags, word2idx, tag2idx):
    # one row per example; both tensors are float, as produced by encodeSequence
    self.encSentences = self._encodeAll(sentences, word2idx)
    self.encPOS = self._encodeAll(tags, tag2idx)

    if self.encPOS.shape[0] != self.encSentences.shape[0]:
      raise ValueError(
          "got %d tag sequences for %d sentences"
          % (self.encPOS.shape[0], self.encSentences.shape[0]))

  @staticmethod
  def _encodeAll(sequences, toIdx):
    """Encode every sequence and stack the results into one 2-D tensor."""
    encoded = [encodeSequence(seq, toIdx) for seq in sequences]
    if not encoded:
      # an empty split still yields a valid dataset of length 0
      return torch.empty((0, 0))
    # A single vstack over the whole list replaces the row-by-row vstack
    # (quadratic copying) and also gives a (1, length) tensor when there is
    # only one sequence.
    return torch.vstack(encoded)

  def __len__(self):
    return self.encSentences.shape[0]

  def __getitem__(self, idx):
    return self.encSentences[idx], self.encPOS[idx]

In [None]:
# Wrap each split in a dataset; all three are encoded with the same word and
# tag vocabularies.
trainDataset = POSTagDataset(trainX, trainY, word2idx, tag2idx)
devDataset = POSTagDataset(devX, devY, word2idx, tag2idx)
testDataset = POSTagDataset(testX, testY, word2idx, tag2idx)

In [None]:
# define main LSTM and experiment classes

# main LSTM tagger: takes in encoded sentences and scores a tag for every token
class LSTM(nn.Module):
  """Embedding -> LSTM -> linear layer tagger that scores a tag for every token.

  Args:
    embeddingDim: size of each word embedding.
    hiddenDim: size of the LSTM hidden state.
    wordVocab: word vocabulary; only its length is used (number of embeddings).
    tagVocab: tag vocabulary; only its length is used (number of output scores).
  """
  def __init__(self, embeddingDim, hiddenDim, wordVocab, tagVocab):
    super(LSTM, self).__init__()

    self.hiddenDim = hiddenDim
    # index of "_pad" under assembleVocabulary's default predefinedTags
    self.padIdx = 1
    # encode each word as an embedding of size embeddingDim
    self.wordEmbeddings = nn.Embedding(len(wordVocab), embeddingDim,
                                       padding_idx=self.padIdx)

    self.lstm = nn.LSTM(embeddingDim, hiddenDim, batch_first=True)      # main lstm
    self.fc = nn.Linear(hiddenDim, len(tagVocab))                       # map every hidden state to one score per tag

    self.logSoftMax = F.log_softmax

  def forward(self, sentences):
    """Return log-probabilities over tags for every token.

    Args:
      sentences: tensor of word indices, batch first (batch, sequence).
        Float tensors are accepted and cast to long for the embedding lookup.

    Returns:
      Tensor of shape (batch, sequence, len(tagVocab)) whose last dimension
      is log-softmax normalised.
    """
    embeddings = self.wordEmbeddings(sentences.long())
    # nn.LSTM returns (output, (h_n, c_n)); only the per-token output is needed
    out, _ = self.lstm(embeddings)
    out = self.fc(out)

    # normalise over the tag dimension (the last one), not over positions
    tagScores = self.logSoftMax(out, dim=-1)
    return tagScores
    
# experiment class to manage training and testing
class Experiment():
  """Bundle an LSTM tagger with its loss function and optimiser.

  Args:
    embeddingDim, hiddenDim, wordVocab, tagVocab: forwarded to LSTM.
    batchSize: stored for later use when batching data.
    lossFunction: loss class, instantiated without arguments.
    optimiser: optimiser class, instantiated on the model's parameters.
    lr: learning rate handed to the optimiser.
  """
  def __init__(self, embeddingDim, hiddenDim, wordVocab, tagVocab, batchSize=16, lossFunction=nn.CrossEntropyLoss, optimiser=torch.optim.SGD, lr=0.01):
    self.model = LSTM(embeddingDim, hiddenDim, wordVocab, tagVocab)
    self.lossFunction = lossFunction()
    # optimise this experiment's own model, not some global `model`
    self.optimiser = optimiser(self.model.parameters(), lr=lr)

    self.batchSize = batchSize

  def train(self, trainX, trainY, valX, valY, numEpochs=50):
    """Train the model (not implemented yet)."""
    # init dataloaders
    pass

# Smoke test: push one batch through the individual layers of the tagger.
trainDataloader = DataLoader(trainDataset, batch_size=16, shuffle=True)
trainSentences, trainLabels = next(iter(trainDataloader))

myLSTM = LSTM(30, 50, word2idx, tag2idx)
# the dataset stores indices as floats; the embedding lookup needs longs
embs = myLSTM.wordEmbeddings(trainSentences.long())
res, _ = myLSTM.lstm(embs)
tagg = myLSTM.fc(res)
# Pass dim explicitly: without it log_softmax falls back to a deprecated
# implicit choice (dim 0 for 3-D input), i.e. it would normalise across the
# batch instead of across the tags.
scores = myLSTM.logSoftMax(tagg, dim=-1)

print(scores, scores.shape)
print()

print(trainLabels.shape)
print()


print(tag2idx)

e1 = Experiment(30, 50, word2idx, tag2idx)


tensor([[[-2.7726, -2.7726, -2.7726,  ..., -2.7726, -2.7726, -2.7726],
         [-2.8033, -2.7417, -2.7417,  ..., -2.7384, -2.8320, -2.7908],
         [-2.8758, -2.6763, -2.8177,  ..., -2.8249, -2.7999, -2.7140],
         ...,
         [-2.7726, -2.7726, -2.7726,  ..., -2.7726, -2.7726, -2.7726],
         [-2.7726, -2.7726, -2.7726,  ..., -2.7726, -2.7726, -2.7726],
         [-2.7726, -2.7726, -2.7726,  ..., -2.7726, -2.7726, -2.7726]],

        [[-2.7726, -2.7726, -2.7726,  ..., -2.7726, -2.7726, -2.7726],
         [-2.8110, -2.7350, -2.8190,  ..., -2.7833, -2.7325, -2.8254],
         [-2.8728, -2.6551, -2.7697,  ..., -2.8390, -2.8786, -2.8229],
         ...,
         [-2.7726, -2.7726, -2.7726,  ..., -2.7726, -2.7726, -2.7726],
         [-2.7726, -2.7726, -2.7726,  ..., -2.7726, -2.7726, -2.7726],
         [-2.7726, -2.7726, -2.7726,  ..., -2.7726, -2.7726, -2.7726]],

        [[-2.7726, -2.7726, -2.7726,  ..., -2.7726, -2.7726, -2.7726],
         [-2.7890, -2.7987, -2.6946,  ..., -2

  scores = myLSTM.logSoftMax(tagg)
