# Semantic Parsing Final Project
Link to the paper: https://aclanthology.org/P16-1004.pdf

Read through the paper fully before starting the assignment!

In [1]:
import torch
import torch.nn as nn

from google.colab import drive
drive.mount('/content/drive')

FILEPATH = "/content/drive/MyDrive/CSCI 1460/Final Project/"

Mounted at /content/drive


# Data Downloading
This cell obtains the pre-processed Jobs dataset (see the paper) that you will be using to train and evaluate your model. (Pre-processed meaning that argument identification, section 3.6, has already been done for you). You should only need to run this cell ***once***. Feel free to delete it after running. Create a folder in your Google Drive in which the code below will store the pre-processed data needed for this project. Modify `FILEPATH` above to direct to said folder. It should start with `drive/MyDrive/...`, feel free to take a look at previous assignments that use mounting Google Drive if you can't remember what it should look like. *Make sure the data path ends with a slash character ('/').* The below code will access the zip file containing the pre-processed Jobs dataset from the paper and extract the files into your folder! Feel free to take a look at the `train.txt` and `test.txt` files to see what the data looks like. :)

In [2]:
import requests
import io
import zipfile

# https://stackoverflow.com/questions/31126596/saving-response-from-requests-to-file
response = requests.get('http://dong.li/lang2logic/seq2seq_jobqueries.zip')
if response.status_code == 200:
  # https://stackoverflow.com/questions/3451111/unzipping-files-in-python
  with zipfile.ZipFile(io.BytesIO(response.content), "r") as zip_ref:
    zip_ref.extractall(FILEPATH)
  print("Extraction completed.")
else:
  print("Failed to download the zip file.")

Extraction completed.


# Data Pre-processing
The following code is defined for you! It extracts the queries (inputs to your Seq2Seq model) and logical forms (expected outputs) from the training and testing files. It also does important pre-processing such as padding the queries and logical forms and turns the words into vocab indices. **Look over and understand this code before you start the assignment!**

In [3]:
def extract_file(filename):
  """
  Extracts queries and corresponding logical forms from either
  train.txt or test.txt. (Feel free to take a look at the files themselves
  in your Drive!)

  Parameters
  ----------
  filename : str
      name of the file to extract from

  Returns
  ----------
  tuple[list[list[str]], list[list[str]]]
      a tuple of a list of queries and their corresponding logical forms
      each in the form of a list of string tokens
  """
  queries, logical_forms = [], []
  with open(FILEPATH + filename) as f:
    for line in f:
      line = line.strip() # remove new line character
      query, logical_form = line.split('\t')

      query = query.split(' ')[::-1] # reversed inputs are used the paper (section 4.2)
      logical_form = ["<s>"] + logical_form.split(' ') + ["</s>"]

      queries.append(query)
      logical_forms.append(logical_form)
  return queries, logical_forms

query_train, lf_train = extract_file('train.txt') # 500 instances
query_test, lf_test = extract_file('test.txt') # 140 instances

In [4]:
from collections import Counter

query_vocab = Counter()
for l in query_train:
  query_vocab.update(l)

query_word2idx = {}
for w, c in query_vocab.items():
  if c >= 2:
    query_word2idx[w] = len(query_word2idx)
query_word2idx['<UNK>'] = len(query_word2idx)
query_word2idx['<PAD>'] = len(query_word2idx)
query_idx2word = {i:word for word,i in query_word2idx.items()}

query_vocab = list(query_word2idx.keys())

lf_vocab = Counter()
for lf in lf_train:
  lf_vocab.update(lf)

lf_vocab['<UNK>'] = 0
lf_vocab['<PAD>'] = 0
lf_idx2word = {i:word for i, word in enumerate(lf_vocab.keys())}
lf_word2idx = {word:i for i, word in lf_idx2word.items()}

In [5]:
query_train_tokens = [[query_word2idx.get(w, query_word2idx['<UNK>']) for w in l] for l in query_train]
query_test_tokens = [[query_word2idx.get(w, query_word2idx['<UNK>']) for w in l] for l in query_test]

lf_train_tokens = [[lf_word2idx.get(w, lf_word2idx['<UNK>']) for w in l] for l in lf_train]
lf_test_tokens = [[lf_word2idx.get(w, lf_word2idx['<UNK>']) for w in l] for l in lf_test]

def pad(seq, max_len, pad_token_idx):
  """
  Pads a given sequence to the max length using the given padding token index

  Parameters
  ----------
  seq : list[int]
      sequence in the form of a list of vocab indices
  max_len : int
      length sequence should be padded to
  pad_token_idx
      vocabulary index of the padding token

  Returns
  ----------
  list[int]
      padded sequence
  """
  seq = seq[:max_len]
  padded_seq = seq + (max_len - len(seq)) * [pad_token_idx]
  return padded_seq

query_max_target_len = max([len(i) for i in query_train_tokens])
query_train_tokens = [pad(i, query_max_target_len, query_word2idx['<PAD>']) for i in query_train_tokens]
query_test_tokens = [pad(i, query_max_target_len, query_word2idx['<PAD>']) for i in query_test_tokens]

lf_max_target_len = int(max([len(i) for i in lf_train_tokens]) * 1.5)
lf_train_tokens = [pad(i, lf_max_target_len, lf_word2idx['<PAD>']) for i in lf_train_tokens]
lf_test_tokens = [pad(i, lf_max_target_len, lf_word2idx['<PAD>']) for i in lf_test_tokens]

# Data Loading
The following code creates a JobsDataset and DataLoaders to use with your implemented model. Take a look at the main function at the end of this stencil to see how they are used in context.

In [6]:
from torch.utils.data import Dataset, DataLoader, default_collate

class JobsDataset(Dataset):
  """Defines a Dataset object for the Jobs dataset to be used with Dataloader"""
  def __init__(self, queries, logical_forms):
    """
    Initializes a JobsDataset

    Parameters
    ----------
    queries : list[list[int]]
        a list of queries, which have been tokenized and padded, in the form
        of a list of vocab indices
    logical_forms : list[list[int]]
        a list of corresponding logical forms, which have been tokenized and
        padded, in the form of a list of vocab indices
    """
    self.queries = queries
    self.logical_forms = logical_forms

  def __len__(self) -> int:
    """
    Returns the amount of paired queries and logical forms in the dataset

    Returns
    ----------
    int
        length of the dataset
    """
    return len(self.queries)

  def __getitem__(self, idx: int) -> tuple[list[int], list[int]]:
    """
    Returns a paired query and logical form at the specified index

    Parameters
    ----------
    idx : int
        specified index of the dataset

    Returns
    ----------
    tuple[list[int], list[int]]
        paired query and logical form at the specified index, in the form of
        a list of vocab indices
    """
    return self.queries[idx], self.logical_forms[idx]

def build_datasets() -> tuple[JobsDataset, JobsDataset]:
  """
  Builds a train and a test dataset from the queries and logical forms
  train and test tokens

  Returns
  ----------
  tuple[JobsDataset, JobsDataset]
      a training and testing JobsDataset
  """
  jobs_train = JobsDataset(queries=query_train_tokens, logical_forms=lf_train_tokens)
  jobs_test = JobsDataset(queries=query_test_tokens, logical_forms=lf_test_tokens)
  return jobs_train, jobs_test

def collate(batch : list[tuple[list[int], list[int]]]) -> tuple[torch.Tensor, torch.Tensor]:
  """
  Used as collate_fn when creating the Dataloaders from the dataset

  Parameters
  ----------
  batch : list[tuple[list[int], list[int]]]
      a list of outputs of __getitem__

  Returns
  ----------
  tuple[torch.Tensor, torch.Tensor]
      a batched set of input sequences and a batched set of target sequences
  """
  src, tgt = default_collate(batch)
  return torch.stack(src), torch.stack(tgt)

def build_dataloaders(dataset_train: JobsDataset, dataset_test: JobsDataset,
                      train_batch_size: int) -> tuple[DataLoader, DataLoader]:
  """
  Used as collate_fn when creating the Dataloaders from the dataset, batching
  the training data according to the inputted batch size and batching the
  testing data with a batch size of 1

  Parameters
  ----------
  dataset_train : JobsDataset
      training dataset
  dataset_test : JobsDataset
      testing dataset
  train_batch_size : int
      batch size to be used during training

  Returns
  ----------
  tuple[DataLoader, DataLoader]
      a training and testing DataLoader
  """
  dataloader_train = DataLoader(dataset_train, batch_size=train_batch_size, shuffle=True, collate_fn=collate)
  dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False, collate_fn=collate)
  return dataloader_train, dataloader_test

# TODO: Define your model here!

In [7]:
from typing import Tuple

class Encoder(nn.Module):
  def __init__(self,
               src_embed_size: int,
               src_vocab_size: int,
               hidden_size: int,
               num_layers: int,
               dropout: float):
    """
    Initializes an Encoder module that uses an LSTM

      Parameters
      ----------
      src_embed_size: int
        size of embeddings for source vocab

      src_vocab_size: int
        vocab size of source

      hidden_size: int
        size of hidden layer in LSTM

      num_layers: int
        number of layers in LSTM

      dropout : float
        dropout used for LSTM
    """

    super(Encoder, self).__init__()
    self.embedding = nn.Embedding(src_vocab_size, src_embed_size)
    self.encoder = nn.LSTM(src_embed_size, hidden_size, num_layers=num_layers, dropout=dropout, batch_first=True)

  def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """
    Full encoder forward pass

      Parameters
      ----------
      x: torch.Tensor
        Vector of indexes representing input sequence

      Returns
      ----------
      Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
        torch.Tensor - output of LSTM (encodings)
        Tuple[torch.Tensor, torch.Tensor] - last hidden cell states of LSTM
    """

    embeddings = self.embedding(x)
    output, (h_n, c_n) = self.encoder(embeddings)
    return output, (h_n, c_n)

In [8]:
from typing import Tuple

class Attention(nn.Module):
  def __init__(self, hidden_size: int, dropout: float):
    """
    Initializes an attention module

    Paramters
    ---------
    hidden_size: int
      size of hidden state in encoder AND decoder

    dropout: float
      dropout used for linear layers
    """

    super(Attention, self).__init__()
    self.W1 = nn.Linear(hidden_size, hidden_size)
    self.W2 = nn.Linear(hidden_size, hidden_size)
    self.activation = nn.Tanh()
    self.dropout = nn.Dropout(dropout)

  def forward(self, encodings: torch.Tensor, hidden_state: torch.Tensor) -> torch.Tensor:
    """
    Applies attention to decoder output using encodings

    Parameters
    ----------
    encodings: torch.Tensor
      Full encodings output by encoder

    hidden_state: torch.Tensor
      Final hidden state output by decoder

    Returns
    ----------
    torch.Tensor
      Hidden state with attention applied
    """

    weights = torch.bmm(encodings, hidden_state.unsqueeze(2)).squeeze(2)
    weights = torch.softmax(weights, dim=1)
    context = torch.bmm(weights.unsqueeze(1), encodings).squeeze(1)
    h_att = self.activation(self.W1(hidden_state) + self.W2(context))
    h_att = self.dropout(h_att)
    return h_att


In [9]:
from typing import Tuple

class Decoder(nn.Module):
  def __init__(self,
               tgt_embed_size: int,
               tgt_vocab_size: int,
               hidden_size: int,
               num_layers: int,
               dropout: float,
               attention: Attention):
    """
    Initializes a Decoder module that uses an LSTM

      Parameters
      ----------
      tgt_emb_size: int
        size of embeddings for target vocab

      tgt_vocab_size: int
        vocab size of target

      hidden_size: int
        size of hidden layer in LSTM

      num_layers: int
        number of layers in LSTM

      dropout : float
        dropout used in LSTM and output layer

      attention: Attention
        attention module to be used in decoder
    """

    super(Decoder, self).__init__()
    self.embedding = nn.Embedding(tgt_vocab_size, tgt_embed_size)
    self.decoder = nn.LSTM(tgt_embed_size, hidden_size, num_layers=num_layers, dropout=dropout, batch_first=True)
    self.attention = attention
    self.output = nn.Linear(hidden_size, tgt_vocab_size)
    self.dropout = nn.Dropout(dropout)

  def forward(self,
              encodings: torch.Tensor,
              prev_token: torch.Tensor,
              state: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """
    Single decoder pass to return next token in sequence

      Parameters
      ----------
      encodings: torch.Tensor
        Encodings generated by encoder from original input sequence

      prev_token: torch.Tensor
        Index of previously predicted token

      state: Tuple[torch.Tensor, torch.Tensor]
        tuple of (hidden_state, cell_state) to pass to LSTM
        uses previous state of decoder, or final state of encoder for first pass

      Returns
      ----------
      Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
        torch.Tensor - output of LSTM (raw logits)
        Tuple[torch.Tensor, torch.Tensor] - last hidden and cell states of LSTM
    """
    prev_embedding = self.embedding(prev_token).unsqueeze(1)
    decodings, (h_n, c_n) = self.decoder(prev_embedding, state)
    final_hidden = h_n[-1]

    h_att = self.attention(encodings, final_hidden)

    output = self.output(h_att)
    output = self.dropout(output)
    return output, (h_n, c_n)

In [10]:
from random import random

class Seq2Seq(nn.Module):
    def __init__(self,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 src_embed_size: int,
                 tgt_embed_size: int,
                 hidden_size: int,
                 num_layers: int,
                 dropout: float):
        """
        Initializes a Seq2Seq model

        Parameters
        ----------
        src_vocab_size: int
          vocab size of source

        tgt_vocab_size: int
          vocab size of target

        src_embed_size: int
          size of embeddings for source vocab

        tgt_embed_size: int
          size of embeddings for target vocab

        hidden_size: int
          size of hidden layer for BOTH encoder and decoder LSTMs

        num_layers: int
          number of layers in both encoder and decoder LSTMs

        dropout: float
          dropout used in all layers
        """
        super(Seq2Seq, self).__init__()

        self.encoder = Encoder(
            src_embed_size,
            src_vocab_size,
            hidden_size,
            num_layers,
            dropout
        )

        self.attention = Attention(hidden_size, dropout)

        self.decoder = Decoder(
            tgt_embed_size,
            tgt_vocab_size,
            hidden_size,
            num_layers,
            dropout,
            self.attention
        )

    def forward(self, src: torch.Tensor, tgt: torch.Tensor, teacher_forcing_ratio: float) -> torch.Tensor:
        """
        Full Seq2Seq forward pass

        Parameters
        ----------
        src : torch.Tensor
          source sequence tensor

        tgt : torch.Tensor
          target sequence tensor (for teacher forcing)

        teacher_forcing_ratio : float
          determines the ratio at which model uses teacher forcing

        Returns
        ----------
        torch.Tensor
            logits for the predicted target sequence
        """

        batch_size = src.size(0)
        max_len = tgt.size(1)
        outputs = torch.zeros(batch_size, max_len, self.decoder.embedding.num_embeddings)

        encodings, prev_state = self.encoder(src)
        prev_token = tgt[:, 0]

        for i in range(1, max_len):
          output, prev_state = self.decoder(encodings, prev_token, prev_state)
          outputs[:, i, :] = output
          if random() < teacher_forcing_ratio:
            prev_token = tgt[:, i]
          else:
            prev_token = output.argmax(dim=1)
        return outputs

In [11]:
QUERY_VOCAB_LEN = len(query_vocab)
LF_VOCAB_LEN = len(lf_vocab)

def create_model():
  """
  Returns your model!

  Returns
  ----------
  Seq2Seq
      your model!
  """
  src_vocab_size, tgt_vocab_size = QUERY_VOCAB_LEN, LF_VOCAB_LEN
  src_embed_size, tgt_embed_size = 150, 150
  hidden_size, num_layers = 150, 1
  dropout = 0.3
  model = Seq2Seq(src_vocab_size,
                  tgt_vocab_size,
                  src_embed_size,
                  tgt_embed_size,
                  hidden_size,
                  num_layers,
                  dropout)

  for param in model.parameters():
    nn.init.uniform_(param, a=-0.08, b=0.08)

  return model

# TODO: Training and testing loops

In [12]:
LF_SOS_INDEX = lf_word2idx['<s>']
LF_EOS_INDEX = lf_word2idx['</s>']
LF_PAD_INDEX = lf_word2idx['<PAD>']

In [13]:
def train(model: nn.Module, train_dataloader: DataLoader, num_epochs: int=5,
          device: str="cuda") -> nn.Module:
  """
  Trains your model!

  Parameters
  ----------
  model : nn.Module
      your model!
  train_dataloader : DataLoader
      a dataloader of the training data from build_dataloaders
  num_epochs : int
      number of epochs to train for
  device : str
      device that the model is running on

  Returns
  ----------
  Seq2Seq
      your trained model
  """

  # Move model to device and set to train mode
  model = model.to(device)
  model.train()

  # Initialize loss, optimizer, and parameters
  loss_fn = nn.NLLLoss()
  optimizer = torch.optim.RMSprop(model.parameters(), lr=0.01, alpha=0.95)
  teacher_forcing = 1

  for epoch in range(num_epochs):
    epoch_loss = 0
    for src, tgt in train_dataloader:
      # Prepare src and tgt
      src, tgt = src.to(device), tgt.to(device)
      src, tgt = src.transpose(0, 1), tgt.transpose(0, 1)

      # Get logits, apply log_softmax to them
      optimizer.zero_grad()
      logits = model(src, tgt, teacher_forcing_ratio=teacher_forcing)
      logits = nn.functional.log_softmax(logits, dim=-1)

      # Flatten logits and target for loss function
      flattened_logits = logits[:, 1:].reshape(-1, logits.size(-1))
      flattened_tgt = tgt[:, 1:].reshape(-1)
      flattened_logits = flattened_logits.to(device)
      flattened_tgt = flattened_tgt.to(device)

      # Get clipped loss, gradients, and take optimizer step
      loss = loss_fn(flattened_logits, flattened_tgt)
      loss.backward()
      nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
      optimizer.step()
      epoch_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss / len(train_dataloader)}")
  return model

In [14]:
def evaluate(model: nn.Module, dataloader: DataLoader, device: str="cuda") -> tuple[int, int]:
  """
  Evaluates your model!

  Parameters
  ----------
  model : nn.Module
      your model!
  dataloader : DataLoader
      a dataloader of the testing data from build_dataloaders
  device : str
      device that the model is running on

  Returns
  ----------
  tuple[int, int]
      per-token accuracy and exact_match accuracy
  """

  # Move model to device and set to eval mode
  model = model.to(device)
  model.eval()

  # Initialize counts used for accuracy
  total_tokens = 0
  correct_tokens = 0
  total_seqs = 0
  correct_seqs = 0

  with torch.no_grad():
    for src, tgt in dataloader:
      # Prepare src and tgt
      src, tgt = src.to(device), tgt.to(device)
      src, tgt = src.transpose(0, 1), tgt.transpose(0, 1)

      # Get predictions
      logits = model(src, tgt, teacher_forcing_ratio=0)
      y_hat = logits.argmax(dim=-1)
      y_hat = y_hat.to(device)

      # For each tgt/pred pair in the batch
      for i in range(tgt.size(0)):
        # Find the EOS index and trim before SOS and after EOS inclusive
        eos_index = torch.nonzero((tgt[i] == LF_EOS_INDEX), as_tuple=True)[0].item()
        tgt_trimmed = tgt[i, 1:eos_index + 1]
        y_hat_trimmed = y_hat[i, 1:eos_index + 1]

        # Update counts used for accuracy
        correct_tokens += torch.count_nonzero(tgt_trimmed == y_hat_trimmed).item()
        total_tokens += tgt_trimmed.size(0)
        if torch.equal(tgt_trimmed, y_hat_trimmed):
          correct_seqs += 1
        total_seqs += 1

  # Compute desired accuracy metrics
  per_token_accuracy = correct_tokens / total_tokens
  exact_match_accuracy = correct_seqs / total_seqs
  return per_token_accuracy, exact_match_accuracy

# Run this!

In [15]:
def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    jobs_train, jobs_test = build_datasets()
    dataloader_train, dataloader_test = build_dataloaders(jobs_train, jobs_test, train_batch_size=20)
    model = create_model()
    model = train(model, dataloader_train, num_epochs=20, device=device)
    test_per_token_accuracy, test_exact_match_accuracy = evaluate(model, dataloader_test, device=device)
    print(f'Test Per-token Accuracy: {test_per_token_accuracy}')
    print(f'Test Exact-match Accuracy: {test_exact_match_accuracy}')
main()



Epoch 1/20, Loss: 2.6243874835968017
Epoch 2/20, Loss: 1.106929476261139
Epoch 3/20, Loss: 1.0697244691848755
Epoch 4/20, Loss: 1.0314664912223817
Epoch 5/20, Loss: 1.0175537633895875
Epoch 6/20, Loss: 1.0112549424171449
Epoch 7/20, Loss: 0.9823995518684387
Epoch 8/20, Loss: 0.9676519823074341
Epoch 9/20, Loss: 0.9394924521446228
Epoch 10/20, Loss: 0.936479172706604
Epoch 11/20, Loss: 0.9266954207420349
Epoch 12/20, Loss: 0.9075261092185974
Epoch 13/20, Loss: 0.8810160779953002
Epoch 14/20, Loss: 0.8789917635917663
Epoch 15/20, Loss: 0.8878867888450622
Epoch 16/20, Loss: 0.8574706125259399
Epoch 17/20, Loss: 0.8630762839317322
Epoch 18/20, Loss: 0.8610024857521057
Epoch 19/20, Loss: 0.8420480084419251
Epoch 20/20, Loss: 0.8542601418495178
Test Per-token Accuracy: 0.8815533980582524
Test Exact-match Accuracy: 0.7642857142857142
