<a href="https://colab.research.google.com/github/aneeshc12/POS-tagging-and-LSTMs/blob/master/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# INLP A2 2020111018

In [1]:
# Mount Google Drive at /content/drive so that files kept on Drive can be
# opened through ordinary filesystem paths for the rest of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install conllu

import numpy as np
import torch
import conllu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting conllu
  Downloading conllu-4.5.2-py2.py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-4.5.2


In [64]:
# Parse data

# parse conllu, return sentences and tags
def parseConllu(path):
  """Read a CoNLL-U file and pair every sentence with its tags.

  Args:
    path: location of the CoNLL-U file. It is read as UTF-8, the encoding the
      CoNLL-U format requires.

  Returns:
    An object array of shape (numSentences, 2). Column 0 holds each sentence
    as a list of word forms (2nd tab-separated field) and column 1 the
    matching list of tags (4th tab-separated field, UPOS in CoNLL-U).

  Raises:
    IndexError: if a token line has fewer than four tab-separated fields.
  """
  sentences = []
  words, tags = [], []

  with open(path, encoding="utf-8") as f:
    for rawLine in f:
      line = rawLine.rstrip("\r\n")

      # a blank line closes the current sentence; consecutive blank lines and
      # the blank line at the end of the file must not create empty sentences
      if not line.strip():
        if words:
          sentences.append((words, tags))
          words, tags = [], []
        continue

      # comment / metadata lines carry no tokens, however many there are
      if line.startswith("#"):
        continue

      items = line.split("\t")

      # multiword-token ranges ("3-4") and empty nodes ("3.1") are not
      # syntactic words and would otherwise duplicate surface forms
      if "-" in items[0] or "." in items[0]:
        continue

      words.append(items[1])
      tags.append(items[3])

  # the last sentence may not be followed by a blank line
  if words:
    sentences.append((words, tags))

  # fill cell by cell: np.array(..., dtype=object) would build a 3-D array
  # whenever every sentence happens to have the same length
  pairs = np.empty((len(sentences), 2), dtype=object)
  for i, (sentenceWords, sentenceTags) in enumerate(sentences):
    pairs[i, 0] = sentenceWords
    pairs[i, 1] = sentenceTags

  return pairs

# generate mappings between indices and words in sequences
def assembleVocabulary(sequences, predefinedTags={"_unk": 0, "_pad": 1, "_bos": 2, "_eos": 3}):
  """Build token -> index and index -> token lookup tables.

  Args:
    sequences: iterable of token sequences; tokens are indexed in order of
      first appearance.
    predefinedTags: reserved tokens with their fixed indices. The mapping is
      copied and never modified.

  Returns:
    A tuple (vocab2idx, idx2vocab): the forward map, containing the reserved
    tokens plus every token seen in sequences, and its inverse.
  """
  # copy before extending: writing into the default dict itself would make
  # every call share (and keep growing) one vocabulary, so a word vocabulary
  # and a tag vocabulary built back to back would be the same object
  vocab2idx = dict(predefinedTags)

  # continue after the largest reserved index so new tokens can never collide
  # with a reserved one, even if the reserved indices are not 0..n-1
  nextIdx = max(vocab2idx.values(), default=-1) + 1

  # generate forward map
  for sequence in sequences:
    for word in sequence:
      if word not in vocab2idx:
        vocab2idx[word] = nextIdx
        nextIdx += 1

  # backwards map
  idx2vocab = {idx: word for word, idx in vocab2idx.items()}

  return vocab2idx, idx2vocab

# pad all sequences with a "_pad" character
def padSequence(sequences, padToken="_pad"):
  """Right-pad every sequence to the length of the longest one.

  Args:
    sequences: iterable of token sequences. Rows may be lists, tuples or
      1-D arrays; each one is copied into a list first, so the inputs are
      left untouched.
    padToken: filler appended to the shorter sequences.

  Returns:
    An object array built from the padded rows; for scalar tokens such as
    strings its shape is (numSequences, maxLength).
  """
  # list(...) matters: `ndarray + list` broadcasts instead of concatenating
  # and `tuple + list` raises
  rows = [list(seq) for seq in sequences]

  maxLength = max((len(row) for row in rows), default=0)

  paddedSeqs = [row + [padToken] * (maxLength - len(row)) for row in rows]

  return np.array(paddedSeqs, dtype=object)

# begin and end sentences with "_bos" and "_eos" characters
def delimitSequence(sequences, bosToken="_bos", eosToken="_eos"):
  """Wrap every sequence in begin- and end-of-sentence markers.

  Args:
    sequences: iterable of token sequences (lists, tuples or 1-D arrays).
    bosToken: marker placed before the first token.
    eosToken: marker placed after the last token.

  Returns:
    A 1-D object array with one Python list per input sequence.
  """
  # list(seq) forces real concatenation; `["_bos"] + ndarray` would instead
  # broadcast and glue the marker onto every token
  delimited = [[bosToken] + list(seq) + [eosToken] for seq in sequences]

  # assign element by element so the result is always a 1-D array of lists;
  # np.array(..., dtype=object) turns equal-length rows into a 2-D array
  out = np.empty(len(delimited), dtype=object)
  for i, seq in enumerate(delimited):
    out[i] = seq

  return out

In [65]:
# parse data, generate training, dev and test splits

# fixed seed: the shuffle decides which sentences land in train / dev / test,
# so without it every run would train and evaluate on different data
SEED = 0
TRAIN_PATH = "/content/drive/MyDrive/ud-english-treebanks/UD_English-Atis/en_atis-ud-train.conllu"

# load shuffle and preprocess data
pairs = parseConllu(TRAIN_PATH)

# a private Generator keeps the split reproducible without touching NumPy's
# global random state; rows (sentence, tags) are shuffled together in place
np.random.default_rng(SEED).shuffle(pairs)

X = (pairs[:, 0])
y = (pairs[:, 1])

X = delimitSequence(X)
y = delimitSequence(y)

X = padSequence(X)
y = padSequence(y)

# make vocabs
# NOTE(review): both vocabularies are built from all sentences, i.e. before
# the split below, so dev and test contain no out-of-vocabulary tokens
word2idx, idx2word = assembleVocabulary(X)
tag2idx, idx2tag = assembleVocabulary(y)

# split data
trainAmt = 0.7
devAmt = 0.1

# trainIdx is the index where the dev split starts; devIdx is the *number* of
# dev sentences (not an index); whatever remains becomes the test split
trainIdx = int(trainAmt * pairs.shape[0])
devIdx = int(devAmt * pairs.shape[0])
devEnd = trainIdx + devIdx

trainX = X[:trainIdx]
trainY = y[:trainIdx]

devX = X[trainIdx:devEnd]
devY = y[trainIdx:devEnd]

testX = X[devEnd:]
testY = y[devEnd:]

# every sentence must end up in exactly one split
assert len(trainX) + len(devX) + len(testX) == len(X)
assert len(trainY) + len(devY) + len(testY) == len(y)