# Step0: Set up Environment

In [142]:
!pip install spacy --quiet
!python -m spacy download en_core_web_lg

import json
import re
import unicodedata
import spacy
from spacy.language import Language
from spacy.tokenizer import Tokenizer
import torch.nn as nn
import torch
from torch.nn.utils.rnn import pad_sequence
import re
import spacy
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import time
import math
import torch
import torch.nn as nn
import numpy as np
from torch import optim
import torch.nn.functional as F


[notice] A new release of pip is available: 24.2 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
      -------------------------------------- 5.8/400.7 MB 32.0 MB/s eta 0:00:13
     - ------------------------------------ 13.4/400.7 MB 33.5 MB/s eta 0:00:12
     - ------------------------------------ 20.7/400.7 MB 34.4 MB/s eta 0:00:12
     -- ----------------------------------- 28.3/400.7 MB 34.5 MB/s eta 0:00:11
     --- ---------------------------------- 34.9/400.7 MB 33.6 MB/s eta 0:00:11
     ---- --------------------------------- 42.2/400.7 MB 34.0 MB/s eta 0:00:11
     ---- --------------------------------- 49.8/400.7 MB 34.1 MB/s eta 0:00:11
     ----- -------------------------------- 57.1/400.7 MB 34.4 MB/s eta 0:00:11
     ------ ------------------------------- 64.7/400.7 MB 34.4 MB/s eta 0:00:10
     ------ ------------------


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [143]:
device = 'cpu'
if torch.cuda.is_available():
    device = "cuda"
print(f"Using {device} device")

def asMinutes(s):
    """
    Converts seconds into a minutes and seconds format.
    
    Parameters:
    - s: The time in seconds.
    
    Returns:
    - A string representing the time in minutes and seconds ('Xd Xm').
    """
    m = math.floor(s / 60)  # Convert seconds to minutes, discarding any remainder.
    s -= m * 60  # Calculate the remaining seconds.
    return '%dm %ds' % (m, s)  # Format and return the string.

def timeSince(since, percent):
    """
    Calculates and formats the time elapsed since a starting point and estimates remaining time.
    
    Parameters:
    - since: The starting time (usually obtained via time.time()).
    - percent: The completion percentage of the task.
    
    Returns:
    - A string indicating both the elapsed time and the estimated remaining time.
    """
    now = time.time()  # Get the current time.
    s = now - since  # Calculate elapsed time since the start.
    es = s / (percent)  # Estimate the total time based on the current progress.
    rs = es - s  # Calculate the remaining time by subtracting elapsed time from the total estimated time.
    
    # Format and return the elapsed and remaining times as a string.
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

Using cuda device


# Step1: Methods Introduction

For this task, I need to implement LSTM for Encoder and Decoder. 

1. Data pre-processing
    - Read data from a JSON file and split it into query and question datasets, each with train, dev, and test splits.

2. Create a Spacy tokenizer to process the input and output sentences.
    - maybe need to consider about vocab

3. Data Loader:
    - Use the tokenizer to tokenize each input sentence and its corresponding label, adding the following special tokens: `<sos>`, `<eos>`, and `<pad>`.
    - Pad the tokenized sentences so that each batch has the same length.

4. Define the LSTM Encoder and Decoder:
    - Implement an Encoder and a Decoder using LSTM.
    - Combine the Encoder and Decoder into a sequence-to-sequence (seq2seq) model.
    - Optionally, use teacher forcing during training to enhance convergence.
    - Optionally, use bidirectional or more layers.

5. Define the training method:
    - Perform the feedforward step.
    - Calculate the loss between the predicted output and the target labels.
    - Perform backpropagation to compute the gradients.
    - Apply gradient clipping to prevent exploding gradients.
    - Record and print the loss in the terminal during training.

6. Define the testing method to evaluate the model's performance on the test dataset.

7. Set the hyperparameters for the seq2seq model, such as:
    - Learning rate
    - Batch size
    - Number of epochs
    - Hidden dimensions

8. Train the model using both the question and query training datasets.

9. Test the model using both the question and query testing datasets.
    - Remember to ignore `<sos>`, `<eos>`, and `<pad>`.
    - Remember not only shortest sql query is valid.

# Step 2: Pre-process Raw Data

In [144]:
def unicodeToAscii(s):
    # Convert a Unicode string 's' to plain ASCII.
    # This is done by first normalizing the string into its decomposed form using 'NFD',
    # which separates characters from their accents. Then, it filters out all nonspacing marks (Mn).
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Does not allow sql to have multiple space between each word
def normalize_whitespace(text):
    return re.sub(r'\s+', ' ', text).strip()

def preprocess_sentence(s:str) -> str:
    """
    Preprocesses sentence text for consistency
    """
    s = s.strip()
    s = normalize_whitespace(s)
    s = unicodeToAscii(s)
    s = s.strip()
    return s

def preprocess_dataset(dataset_loc = "atis.json",split_type=None, split=['dev', 'test', 'train']):
    
    # Read Dataset JSON file
    with open(dataset_loc) as f:
        dataset_json = json.load(f)

    processed_dataset = []
    variable_names = set()
    sql_templates = set()

    for sample in dataset_json:
        processed_sample = {}

        # Preprocess sql queries
        sql = [preprocess_sentence(query) for query in sample['sql']]

        # All valid sql queries for this examples sorted by their length
        sql = sorted(sql, key=len)

        # Adds shorests sql template to the set of sql templates
        sql_templates.add(sql[0])
        

        # Dictionary for variables/placeholders metadata
        variables_metadata = sample["variables"]

        # Delete 'location' key from variables dictionary
        # variable_type_mapping = {var['name']:var['type'] for var in variables_metadata}
        for var in variables_metadata:
            # Add current variable to set of all possible variable names
            variable_names.add(var.get("name"))
            var.pop('location', None)
        # query split for this sample
        query_split = sample['query-split']

        # Skips sample if its not the specified split_type or split
        if(split_type == "query"):
            if(query_split not in split):
                continue

        # Process each sentence
        for sentence in sample['sentences']:
            # Skips sample if its not the specified split_type or split
            if(split_type == "question"):
                if(sentence['question-split'] not in split):
                    continue
            # variables/placeholder mapping dictionary
            variables = sentence['variables']

            # Sentence text with variables/placeholders
            text_with_vars = preprocess_sentence(sentence['text'])

            # Replacing variables/placeholders in current sentence and sql query with their values from the variables dictionary
            text_with_vars_replaced = text_with_vars
            sql_with_vars_replaced = sql

            # Replace sentence and all sql variables with their values
            for var in variables:
                text_with_vars_replaced = text_with_vars_replaced.replace(var,variables[var])
                sql_with_vars_replaced = [query.replace(var,variables[var]) for query in sql_with_vars_replaced]

            # Taggingg expected output
            sentence_var_tagging_labels = []
            for word in text_with_vars.split():
                if(word in variables):
                    sentence_var_tagging_labels.append(word)
                else:
                    sentence_var_tagging_labels.append("-")

            # Appends preprocessed dictionary of current sentence to the processesed_dataset list
            processed_dataset.append({
                "text_with_vars":text_with_vars,
                "text_with_vars_replaced":text_with_vars_replaced,
                "sentence_var_tagging_labels":sentence_var_tagging_labels,
                "vars_metadata":variables_metadata,
                "variables":variables,
                "sql_with_vars": sql,
                "shortest_sql_with_vars":sql[0],
                "sql_with_vars_replaced": sql_with_vars_replaced,
                "shortest_sql_with_vars_replaced":sql_with_vars_replaced[0],
                "query_split":sample['query-split'],
                "question_split":sentence['question-split']
            })
    
    return processed_dataset,variable_names,sql_templates


question_train_data, question_train_vars, question_train_sqls = preprocess_dataset(dataset_loc="atis.json", split_type="question", split=["train"])
question_test_data, question_test_vars, question_test_sqls = preprocess_dataset(dataset_loc="atis.json", split_type="question", split=["test"])
question_dev_data, question_dev_vars, question_dev_sqls = preprocess_dataset(dataset_loc="atis.json", split_type="question", split=["dev"])


query_train_data, query_train_vars, query_train_sqls = preprocess_dataset(dataset_loc="atis.json", split_type="query", split=["train"])
query_test_data, query_test_vars, query_test_sqls = preprocess_dataset(dataset_loc="atis.json", split_type="query", split=["test"])
query_dev_data, query_dev_vars, query_dev_sqls = preprocess_dataset(dataset_loc="atis.json", split_type="query", split=["dev"])


def unicodeToAscii(s):
    # Convert  to plain ASCII by (1) normalizing the string into its decomposed form using 'NFD',
    # which separates characters from their accents, and (2) filtering out all nonspacing marks (Mn).
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    # First, convert the string to lowercase and strip leading and trailing whitespaces.
    # This helps in reducing the variation between different uses of capitalization and spaces.
    s = s.lower().strip()

    # Convert the string from Unicode to ASCII, removing diacritics (e.g., accents) from characters.
    # This is crucial for languages with accented characters, making the text processing uniform.
    s = unicodeToAscii(s)

    # Insert a space before any punctuation marks (.!?).
    # This ensures punctuation is treated as a separate word, aiding in tokenization for NLP tasks.
    # For example, "hello!" becomes "hello !".
    s = re.sub(r"([.!?])", r" \1", s)

    # Replace any sequence of characters that are not letters or punctuation marks (.!?)
    # with a single space. This step removes numbers and special characters,
    # focusing on retaining only textual information that's crucial for most NLP tasks.
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)

    # Finally, strip leading and trailing whitespaces that might have been added
    # during the normalization process, ensuring the output is tidy.
    return s.strip()

# Step 3: Create tokenizer, vocab

In [None]:

#====================================================================================
SOS_token = 0
EOS_token = 1
PAD_token = 2
UNK_token = 3

# Defines a class 'Lang' to manage language-specific data.
class Lang:
    # The class constructor that initializes a new instance of the language data handler.
    def __init__(self, name):
        self.name = name  # The name of the language (e.g., 'natural', 'sql').

        self.word2index = {}  # A dictionary to map words to their numeric index.
        self.index2word = {0: "SOS", 1: "EOS", 2: "PAD", 3: "UNK"}  # A dictionary to map numeric indices back to words, pre-filled with special tokens.

        self.word2count = {}  # A dictionary to count occurrences of each word.
        self.n_words = 4  # The total number of unique words in the vocabulary, starting with 2 to account for the special tokens.

    # Adds a sentence to the language model, incrementing the vocabulary and word counts.
    def addSentence(self, sentence):
        for word in sentence.split(' '):  # Splits the sentence into words and processes each word.
            self.addWord(word)

    # Adds a word to the language model, updating the necessary mappings and counts.
    def addWord(self, word):
        if word not in self.word2index.keys():
            # If the word is new, it is added to all relevant dictionaries and counters.
            self.word2index[word] = self.n_words  # Maps the word to the current count of unique words.
            self.word2count[word] = 1  # Initializes the word's count to 1.
            self.index2word[self.n_words] = word  # Maps the current count of unique words back to the word.
            self.n_words += 1  # Increments the total count of unique words.
        else:
            # If the word already exists, just increments its count.
            self.word2count[word] += 1


def readLangs(data):
    print("Reading lines...")
    # data can be train, test, dev

    # [[input1, sql1],[input2, sql2],...]
    pairs = []
    for sample in data:
        # 获取 text_with_vars_replaced 和 shortest_sql_with_vars_replaced
        text_with_vars_replaced = sample['text_with_vars_replaced']
        shortest_sql_with_vars_replaced = sample['shortest_sql_with_vars_replaced']
        
        # 将它们组合成一个元组并添加到 pairs 列表
        pairs.append([normalizeString(text_with_vars_replaced), shortest_sql_with_vars_replaced])

    natural_lang = Lang('natural')
    sql_lang = Lang('sql')

    # Return the 'Lang' objects for input and output languages, and the list of sentence pairs.
    return natural_lang, sql_lang, pairs

def filterPair(p):
    # Determine if a given pair of sentences ('p') should be kept based on length and prefix criteria.
    
    # Check if the first sentence in the pair is longer than the MAX_LENGTH.
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False  # Exclude the pair if the first sentence is too long.
    
    # Check if the second sentence in the pair is longer than the MAX_LENGTH.
    elif len(p[1].split(' ')) >= MAX_LENGTH:
        return False  # Exclude the pair if the second sentence is too long.
    
    return True

def filterPairs(pairs):
    # Filter a list of sentence pairs using the filterPair criteria.
    
    keep = []  # Initialize an empty list to store pairs that meet the filtering criteria.
    for pair in pairs:
        # For each pair in the input list, check if it should be kept.
        if filterPair(pair):
            keep.append(pair)  # Add the pair to the 'keep' list if it passes the filter.
    return keep  # Return the list of pairs that meet the filtering criteria.

MAX_LENGTH = 250

def prepareData(data):
    input_lang, output_lang, pairs = readLangs(data)
    print("Read %s sentence pairs" % len(pairs))

    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))

    print("Counting words...")
    # Process each sentence pair, adding the words from each sentence to their respective
    # language's vocabulary.
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])

    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

# Example usage of the prepareData function.
# Prepares the data for English to French translation (can be reversed).
input_lang_test, output_lang_test, pairs_test = prepareData(question_train_data)
# Print a random sentence pair from the prepared data to demonstrate the outcome.
print(random.choice(pairs_test))

Reading lines...
Read 4347 sentence pairs
Trimmed to 4345 sentence pairs
Counting words...
Counted words:
natural 671
sql 742
['how much is a first class ticket from boston to san francisco', 'SELECT DISTINCT FAREalias0.FARE_ID FROM AIRPORT_SERVICE AS AIRPORT_SERVICEalias0 , AIRPORT_SERVICE AS AIRPORT_SERVICEalias1 , CITY AS CITYalias0 , CITY AS CITYalias1 , FARE AS FAREalias0 , FARE_BASIS AS FARE_BASISalias0 WHERE ( CITYalias0.CITY_CODE = AIRPORT_SERVICEalias0.CITY_CODE AND CITYalias0.CITY_NAME = "BOSTON" AND CITYalias1.CITY_CODE = AIRPORT_SERVICEalias1.CITY_CODE AND CITYalias1.CITY_NAME = "SAN FRANCISCO" AND FAREalias0.FROM_AIRPORT = AIRPORT_SERVICEalias0.AIRPORT_CODE AND FAREalias0.TO_AIRPORT = AIRPORT_SERVICEalias1.AIRPORT_CODE ) AND FARE_BASISalias0.CLASS_TYPE = "FIRST" AND FAREalias0.FARE_BASIS_CODE = FARE_BASISalias0.FARE_BASIS_CODE ;']


In [146]:
print(output_lang_test.index2word)

{0: 'SOS', 1: 'EOS', 2: 'PAD', 3: 'UNK', 4: 'SELECT', 5: 'DISTINCT', 6: 'FLIGHTalias0.FLIGHT_ID', 7: 'FROM', 8: 'AIRPORT', 9: 'AS', 10: 'AIRPORTalias0', 11: ',', 12: 'FLIGHT', 13: 'FLIGHTalias0', 14: 'WHERE', 15: 'AIRPORTalias0.AIRPORT_CODE', 16: '=', 17: '"MKE"', 18: 'AND', 19: 'FLIGHTalias0.TO_AIRPORT', 20: ';', 21: '"DAL"', 22: 'AIRPORT_SERVICE', 23: 'AIRPORT_SERVICEalias0', 24: 'AIRPORT_SERVICEalias1', 25: 'CITY', 26: 'CITYalias0', 27: 'CITYalias1', 28: 'DATE_DAY', 29: 'DATE_DAYalias0', 30: 'DAYS', 31: 'DAYSalias0', 32: '(', 33: 'CITYalias1.CITY_CODE', 34: 'AIRPORT_SERVICEalias1.CITY_CODE', 35: 'CITYalias1.CITY_NAME', 36: '"BOSTON"', 37: 'DATE_DAYalias0.DAY_NUMBER', 38: '9', 39: 'DATE_DAYalias0.MONTH_NUMBER', 40: '8', 41: 'DATE_DAYalias0.YEAR', 42: '1991', 43: 'DAYSalias0.DAY_NAME', 44: 'DATE_DAYalias0.DAY_NAME', 45: 'FLIGHTalias0.FLIGHT_DAYS', 46: 'DAYSalias0.DAYS_CODE', 47: 'AIRPORT_SERVICEalias1.AIRPORT_CODE', 48: ')', 49: 'CITYalias0.CITY_CODE', 50: 'AIRPORT_SERVICEalias0.CITY_

# Step 4: Build LSTM Encoder and Decoder

In [147]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size  # Hidden size for LSTM.

        # Embedding layer to convert token indices to dense vectors.
        self.embedding = nn.Embedding(input_size, hidden_size)
        
        # LSTM layer instead of GRU, this is the main change.
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        
        # Dropout layer for regularization.
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        # Embedding the input tokens.
        embedded = self.embedding(input)
        
        # Dropout for regularization.
        embedded = self.dropout(embedded)
        
        # Passing through LSTM instead of GRU.
        output, (hidden, cell) = self.lstm(embedded)

        # Return LSTM output, hidden, and cell states.
        return output, (hidden, cell)
    
class AdditiveAttention(nn.Module):
    def __init__(self, hidden_size):
        super(AdditiveAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)
        self.out_size = hidden_size * 2  # Combined context and decoder states.

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)
        return context, weights

class NoAttention(nn.Module):
    def __init__(self, hidden_size):
        super(NoAttention, self).__init__()
        self.out_size = hidden_size

    def forward(self, query, keys):
        context = torch.zeros([query.shape[0], query.shape[1], 0]).to(device)
        weights = torch.zeros(keys.shape).to(device)
        return context, weights

class DotProductAttention(nn.Module):
    def __init__(self, hidden_size):
        super(DotProductAttention, self).__init__()
        self.out_size = hidden_size *2
    
    def forward(self, query, keys):
        scores = (query * keys).sum(-1)
        scores = scores.unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)
        return context, weights

def get_attention_module(name, hidden_size):
    if name == 'none':
        return NoAttention(hidden_size)
    elif name == "additive":
        return AdditiveAttention(hidden_size)
    elif name == "dot-product":
        return DotProductAttention(hidden_size)
    else:
        raise Exception(f"Attention type {name} is not defined")

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, attention_type="none", dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = get_attention_module(attention_type, hidden_size)
        # Replacing GRU with LSTM here.
        self.lstm = nn.LSTM(self.attention.out_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden  # LSTM hidden state (hidden, cell)

        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                decoder_input = target_tensor[:, i].unsqueeze(1)  # Teacher forcing
            else:
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # Inference mode

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions

    def forward_step(self, input, hidden, encoder_outputs):
        embedded = self.dropout(self.embedding(input))

        query = hidden[0].permute(1, 0, 2)  # LSTM hidden state (hidden, cell), need to use hidden[0]
        context, attn_weights = self.attention(query, encoder_outputs)
        input_lstm = torch.cat((embedded, context), dim=2)

        output, hidden = self.lstm(input_lstm, hidden)  # LSTM updates
        output = self.out(output)

        return output, hidden, attn_weights

# Step: Training Prepare

In [150]:
def indexesFromSentence(lang, sentence):
    # Converts a sentence into a list of word indices according to a given language's vocabulary.
    return [lang.word2index[word] for word in sentence.split(' ')]


def get_dataloader(batch_size, data):
    input_lang, output_lang, pairs = prepareData(data)

    n = len(pairs)
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)  # Initializes a numpy array for input sentence indices.
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)  # Initializes a numpy array for target sentence indices.

    for idx, (inp, tgt) in enumerate(pairs):
        inp_ids = indexesFromSentence(input_lang, inp) + [EOS_token]  # Gets input indices, appends EOS token.
        tgt_ids = indexesFromSentence(output_lang, tgt) + [EOS_token]  # Gets target indices, appends EOS token.
        # Fills the respective numpy arrays with indices.
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    # Converts the numpy arrays to PyTorch tensors and moves them to the specified device.
    train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))
    # Creates a DataLoader with random sampling for batch generation.
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    return input_lang, output_lang, train_dataloader, pairs



def train_epoch(dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    """
    Trains the model for one epoch using the given dataloader, encoder, decoder, and optimizers.

    Parameters:
    - dataloader: DataLoader providing batches of input and target tensors.
    - encoder: The encoder model which processes the input tensors.
    - decoder: The decoder model which generates the output sequence.
    - encoder_optimizer: Optimizer for updating the encoder's weights.
    - decoder_optimizer: Optimizer for updating the decoder's weights.
    - criterion: Loss function to calculate the difference between
                 the decoder's outputs and the target tensors.

    Returns:
    - The average loss over all batches in this epoch.
    """
    total_loss = 0  # Initialize total loss for this epoch.

    # Iterate over batches of data in the dataloader.
    for data in dataloader:
        input_tensor, target_tensor = data  # Unpack the batch into input and target tensors.

        # Clear gradients before processing the batch.
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # Pass the input tensor through the encoder.
        encoder_outputs, encoder_hidden = encoder(input_tensor)

        # Pass the encoder's outputs and hidden state to the decoder, along with the target tensor.
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        # Compute the loss between the decoder's output and the actual target tensor.
        # The .view(-1, decoder_outputs.size(-1)) reshapes the decoder's output
        # to a 2D tensor where rows correspond to batch elements concatenated together,
        # and columns correspond to the output size. The target is similarly flattened.
        loss = criterion(decoder_outputs.view(-1, decoder_outputs.size(-1)), target_tensor.view(-1))

        loss.backward()  # Compute the gradient of the loss with respect to model parameters.

        # Update the encoder and decoder parameters based on gradients.
        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()  # Accumulate the loss.

    # Calculate the average loss per batch for this epoch.
    return total_loss / len(dataloader)


def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001, print_every=1):
    """
    Trains an encoder-decoder model.

    Parameters:
    - train_dataloader: DataLoader providing batches of data for training.
    - encoder: The encoder part of the sequence-to-sequence model.
    - decoder: The decoder part of the sequence-to-sequence model.
    - n_epochs: Total number of epochs to train the models.
    - learning_rate: Learning rate for the optimizers.
    - print_every: Frequency of reporting the average loss.
    """
    start = time.time()  # Record the start time for calculating elapsed time.
    print_loss_total = 0  # Sum of losses, reset every 'print_every' epochs.

    # Initialize optimizers for both encoder and decoder with the Adam algorithm.
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    # Define the loss function. NLLLoss is common for classification problems.
    criterion = nn.NLLLoss()

    # Training loop over the specified number of epochs.
    for epoch in range(1, n_epochs + 1):
        # Perform one epoch of training and return the loss.
        loss = train_epoch(train_dataloader, encoder, decoder,
                           encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss  # Accumulate loss.

        # Every 'print_every' epochs, print the average loss and reset the total loss.
        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every  # Calculate average loss.
            print_loss_total = 0  # Reset total loss for the next 'print_every' epochs.
            # Print a summary: elapsed time, current epoch, progress (%), and average loss.
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                         epoch, epoch / n_epochs * 100, print_loss_avg))
            

def tensorFromSentence(lang, sentence):
    # Converts a sentence into a PyTorch tensor of word indices, appending the EOS (End of Sentence) token.
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)  # Appends the EOS token's index to signify the end of the sentence.
    # Converts the list of indices into a PyTorch tensor and returns it.
    return torch.tensor(indexes, dtype=torch.long, device=DEVICE).view(1, -1)


def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    # Temporarily disables gradient calculations to save memory and computations since they are not needed.
    with torch.no_grad():
        # Convert the input sentence into a tensor of word indices.
        input_tensor = tensorFromSentence(input_lang, sentence)

        # Pass the input tensor through the encoder to obtain its outputs and final hidden state.
        encoder_outputs, encoder_hidden = encoder(input_tensor)

        # Pass the encoder outputs and hidden state into the decoder to produce the output sequence.
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        # Select the top prediction (highest probability) from the decoder's output at each time step.
        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()  # Remove extraneous dimensions.

        decoded_words = []  # To store the decoded words.
        for idx in decoded_ids:
            # Check for the EOS token. If found, append '<EOS>' to the decoded words and stop decoding.
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            # Convert each index back to a word and append to the list of decoded words.
            decoded_words.append(output_lang.index2word[idx.item()])

    # Return the list of decoded words and any attention weights from the decoder.
    return decoded_words, decoder_attn

def evaluateRandomly(encoder, decoder, input_lang, output_lang, pairs, n=5):
    # Sets the encoder and decoder to evaluation mode, which turns off dropout and batch normalization,
    # ensuring consistent behavior for inference.
    encoder.eval()
    decoder.eval()

    # Loop over n examples chosen randomly.
    for i in range(n):
        # Randomly select a sentence pair from the global 'pairs' list.
        pair = random.choice(pairs)
        
        # Print the input sentence from the pair.
        print('>', pair[0])
        # Print the target (correct) translation or response.
        print('=', pair[1])
        
        # Use the 'evaluate' function to generate the output sentence for the input sentence.
        output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
        # Join the list of output words into a single sentence.
        output_sentence = ' '.join(output_words)
        
        # Print the model's translation or response.
        print('<', output_sentence)
        print('')  # Print a newline for readability between each evaluated pair.

def evaluateAll(encoder, decoder, input_lang, output_lang, pairs):
    # Sets the encoder and decoder to evaluation mode, which turns off dropout and batch normalization,
    # ensuring consistent behavior for inference.
    encoder.eval()
    decoder.eval()
    
    correct = 0

    # Loop over n examples chosen randomly.
    for pair in pairs:
        # Randomly select a sentence pair from the global 'pairs' list.
        # Use the 'evaluate' function to generate the output sentence for the input sentence.
        output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
        # Join the list of output words into a single sentence.
        if output_words[-1] == "<EOS>":
            output_sentence = ' '.join(output_words[:-1])
        else:
            output_sentence = ' '.join(output_words)
        if output_sentence == pair[1]:
            correct += 1
    
    return correct/len(pairs)

# Begin Training

In [None]:
# Set the size of the hidden layers in the encoder and decoder models.
hidden_size = 128
# Specify the batch size for training, determining how many examples are processed together.
batch_size = 32

input_lang, output_lang, train_dataloader, train_pairs = get_dataloader(batch_size, question_train_data)

encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words, "none").to(device)

train(train_dataloader, encoder, decoder, 10, print_every=1)

evaluateRandomly(encoder, decoder, input_lang, output_lang, train_pairs)
evaluateAll(encoder, decoder, input_lang, output_lang, train_pairs)

Reading lines...
Read 4347 sentence pairs
Trimmed to 4345 sentence pairs
Counting words...
Counted words:
natural 671
sql 742


In [None]:
encoder2 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder2 = AttnDecoderRNN(hidden_size, output_lang.n_words, "additive").to(device)

train(train_dataloader, encoder2, decoder2, 10, print_every=1)

evaluateRandomly(encoder2, decoder2, input_lang, output_lang, train_pairs)
evaluateAll(encoder2, decoder2, input_lang, output_lang, train_pairs)

1m 5s (- 9m 48s) (1 10%) 1.2584
2m 10s (- 8m 42s) (2 20%) 0.3852
3m 15s (- 7m 37s) (3 30%) 0.2436
4m 21s (- 6m 32s) (4 40%) 0.1853
5m 26s (- 5m 26s) (5 50%) 0.1554
6m 31s (- 4m 21s) (6 60%) 0.1371
7m 36s (- 3m 15s) (7 70%) 0.1249
8m 42s (- 2m 10s) (8 80%) 0.1151
9m 47s (- 1m 5s) (9 90%) 0.1065
10m 52s (- 0m 0s) (10 100%) 0.0994
> what city is the airport MCO in
= SELECT DISTINCT CITYalias0.CITY_CODE FROM AIRPORT AS AIRPORTalias0 , AIRPORT_SERVICE AS AIRPORT_SERVICEalias0 , CITY AS CITYalias0 WHERE AIRPORTalias0.AIRPORT_CODE = "MCO" AND AIRPORTalias0.AIRPORT_CODE = AIRPORT_SERVICEalias0.AIRPORT_CODE AND CITYalias0.CITY_CODE = AIRPORT_SERVICEalias0.CITY_CODE ;
< SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

0.0

In [None]:
encoder3 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder3 = AttnDecoderRNN(hidden_size, output_lang.n_words, "dot-product").to(device)

train(train_dataloader, encoder3, decoder3, 10, print_every=1)

evaluateRandomly(encoder3, decoder3, input_lang, output_lang, train_pairs)
evaluateAll(encoder3, decoder3, input_lang, output_lang, train_pairs)

0m 43s (- 6m 32s) (1 10%) 1.2471
1m 27s (- 5m 48s) (2 20%) 0.4123
2m 10s (- 5m 4s) (3 30%) 0.2583
2m 54s (- 4m 21s) (4 40%) 0.1948
3m 37s (- 3m 37s) (5 50%) 0.1628
4m 20s (- 2m 53s) (6 60%) 0.1437
5m 4s (- 2m 10s) (7 70%) 0.1308
5m 47s (- 1m 26s) (8 80%) 0.1216
6m 30s (- 0m 43s) (9 90%) 0.1144
7m 14s (- 0m 0s) (10 100%) 0.1091
> show me the earliest flight on 8 2 from BOSTON to DENVER that serves a meal
= SELECT DISTINCT FLIGHTalias0.FLIGHT_ID FROM AIRPORT_SERVICE AS AIRPORT_SERVICEalias0 , AIRPORT_SERVICE AS AIRPORT_SERVICEalias1 , CITY AS CITYalias0 , CITY AS CITYalias1 , DATE_DAY AS DATE_DAYalias0 , DAYS AS DAYSalias0 , FLIGHT AS FLIGHTalias0 , FOOD_SERVICE AS FOOD_SERVICEalias0 WHERE ( ( ( DATE_DAYalias0.DAY_NUMBER = 2 AND DATE_DAYalias0.MONTH_NUMBER = 8 AND DATE_DAYalias0.YEAR = 1991 AND DAYSalias0.DAY_NAME = DATE_DAYalias0.DAY_NAME AND FLIGHTalias0.FLIGHT_DAYS = DAYSalias0.DAYS_CODE AND FOOD_SERVICEalias0.MEAL_CODE = FLIGHTalias0.MEAL_CODE ) AND CITYalias1.CITY_CODE = AIRPORT_SER

0.0

In [None]:
class BiEncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(BiEncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, (h_n, c_n) = self.lstm(embedded)
        # 双向LSTM会输出 2 个方向的最后隐藏状态，我们将它拼接
        h_n = torch.cat((h_n[0], h_n[1]), dim=1).unsqueeze(0)  # [1, batch, hidden*2]
        c_n = torch.cat((c_n[0], c_n[1]), dim=1).unsqueeze(0)  # [1, batch, hidden*2]
        return output, (h_n, c_n)

class AdditiveAttention(nn.Module):
    def __init__(self, hidden_size):
        super(AdditiveAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)
        self.out_size = hidden_size * 2

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)
        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)
        return context, weights

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, attention_type="none", dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = get_attention_module(attention_type, hidden_size)
        self.lstm = nn.LSTM(self.attention.out_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden  # (h_n, c_n)
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                decoder_input = target_tensor[:, i].unsqueeze(1)  # Teacher forcing
            else:
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions

    def forward_step(self, input, hidden, encoder_outputs):
        embedded = self.dropout(self.embedding(input))
        h_n, c_n = hidden
        query = h_n.permute(1, 0, 2)  # [batch, 1, hidden]
        context, attn_weights = self.attention(query, encoder_outputs)
        input_lstm = torch.cat((embedded, context), dim=2)
        output, (h_n, c_n) = self.lstm(input_lstm, (h_n, c_n))
        output = self.out(output)
        return output, (h_n, c_n), attn_weights


enc_hidden_size = 64
hidden_size = 128
batch_size = 32

input_lang, output_lang, train_dataloader, train_pairs = get_dataloader(batch_size, question_train_data)

encoder4 = BiEncoderRNN(input_lang.n_words, enc_hidden_size).to(device)
decoder4 = AttnDecoderRNN(hidden_size, output_lang.n_words, "additive").to(device)

train(train_dataloader, encoder4, decoder4, 20, print_every=1)

evaluateRandomly(encoder4, decoder4, input_lang, output_lang, train_pairs)
evaluateAll(encoder4, decoder4, input_lang, output_lang, train_pairs)

Reading lines...
Read 4347 sentence pairs
Trimmed to 4315 sentence pairs
Counting words...
Counted words:
natural 896
sql 712
1m 5s (- 20m 44s) (1 5%) 1.2523
2m 10s (- 19m 35s) (2 10%) 0.3895
3m 15s (- 18m 30s) (3 15%) 0.2438
4m 21s (- 17m 24s) (4 20%) 0.1837
5m 26s (- 16m 20s) (5 25%) 0.1523
6m 32s (- 15m 14s) (6 30%) 0.1335
7m 37s (- 14m 9s) (7 35%) 0.1214
8m 42s (- 13m 4s) (8 40%) 0.1115
9m 48s (- 11m 58s) (9 45%) 0.1034
10m 53s (- 10m 53s) (10 50%) 0.0967
11m 58s (- 9m 47s) (11 55%) 0.0905
13m 3s (- 8m 42s) (12 60%) 0.0853
14m 9s (- 7m 37s) (13 65%) 0.0808
15m 14s (- 6m 31s) (14 70%) 0.0765
16m 19s (- 5m 26s) (15 75%) 0.0728
17m 24s (- 4m 21s) (16 80%) 0.0689
18m 30s (- 3m 15s) (17 85%) 0.0654
19m 35s (- 2m 10s) (18 90%) 0.0615
20m 40s (- 1m 5s) (19 95%) 0.0576
21m 46s (- 0m 0s) (20 100%) 0.0533
> i'd like to go from BOSTON to SAN FRANCISCO
= SELECT DISTINCT FLIGHTalias0.FLIGHT_ID FROM AIRPORT_SERVICE AS AIRPORT_SERVICEalias0 , AIRPORT_SERVICE AS AIRPORT_SERVICEalias1 , CITY AS CIT

0.0