# Step 0: Set Up Environment

In [None]:
!pip install spacy --quiet
!pip install torchtext
!python -m spacy download en_core_web_lg

import json
import re
import unicodedata
import spacy
from spacy.language import Language
from spacy.tokens import Doc
from spacy.tokenizer import Tokenizer
import torch
import torch.nn as nn
from collections import Counter
import torch
from torch.nn.utils.rnn import pad_sequence
import re
import spacy
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

#====================================================================================
# Select the compute device: use CUDA when it is available, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cpu" and torch.backends.mps.is_available():
    # MPS is detected here but not selected; the assignment to "mps" was left disabled.
    pass
print(f"Using {device} device")


[notice] A new release of pip is available: 24.2 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
      -------------------------------------- 5.8/400.7 MB 32.0 MB/s eta 0:00:13
     - ------------------------------------ 13.1/400.7 MB 32.9 MB/s eta 0:00:12
     - ------------------------------------ 20.7/400.7 MB 33.6 MB/s eta 0:00:12
     -- ----------------------------------- 27.5/400.7 MB 32.9 MB/s eta 0:00:12
     --- ---------------------------------- 34.9/400.7 MB 33.6 MB/s eta 0:00:11
     ---- --------------------------------- 42.5/400.7 MB 33.8 MB/s eta 0:00:11
     ---- --------------------------------- 49.8/400.7 MB 34.1 MB/s eta 0:00:11
     ----- -------------------------------- 57.4/400.7 MB 34.2 MB/s eta 0:00:11
     ------ ------------------------------- 64.7/400.7 MB 34.1 MB/s eta 0:00:10
     ------ ------------------


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


ModuleNotFoundError: No module named 'torchtext'

# Step 1: Methods Introduction

For this task, I need to implement an LSTM-based Encoder and Decoder.

1. Data pre-processing
    - Read data from a JSON file and split it into query and question datasets, each with train, dev, and test splits.

2. Create a spaCy tokenizer to process the input and output sentences.
    - The vocabulary may also need to be considered.

3. Data Loader:
    - Use the tokenizer to tokenize each input sentence and its corresponding label, adding the following special tokens: `<sos>`, `<eos>`, and `<pad>`.
    - Pad the tokenized sentences so that each batch has the same length.

4. Define the LSTM Encoder and Decoder:
    - Implement an Encoder and a Decoder using LSTM.
    - Combine the Encoder and Decoder into a sequence-to-sequence (seq2seq) model.
    - Optionally, use teacher forcing during training to enhance convergence.
    - Optionally, use bidirectional or more layers.

5. Define the training method:
    - Perform the feedforward step.
    - Calculate the loss between the predicted output and the target labels.
    - Perform backpropagation to compute the gradients.
    - Apply gradient clipping to prevent exploding gradients.
    - Record and print the loss in the terminal during training.

6. Define the testing method to evaluate the model's performance on the test dataset.

7. Set the hyperparameters for the seq2seq model, such as:
    - Learning rate
    - Batch size
    - Number of epochs
    - Hidden dimensions

8. Train the model using both the question and query training datasets.

9. Test the model using both the question and query testing datasets.
    - Remember to ignore `<sos>`, `<eos>`, and `<pad>`.
    - Remember that the shortest SQL query is not the only valid one.

# Step 2: Pre-process Raw Data

In [None]:
def unicodeToAscii(s):
    """Strip combining accent marks from a Unicode string.

    The string is first decomposed with NFD normalization, which splits each
    accented character into a base character followed by its combining marks.
    Every character in the "Mn" (nonspacing mark) category is then dropped.
    Characters that have no such decomposition are returned unchanged, so the
    result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

# Collapses whitespace so that SQL never has more than one space between words
def normalize_whitespace(text):
    """Return *text* with each whitespace run reduced to one space and the ends trimmed."""
    # str.split() with no arguments splits on whitespace runs and drops empty
    # leading/trailing pieces, so re-joining with single spaces does both jobs.
    return ' '.join(text.split())

def preprocess_sentence(s:str) -> str:
    """
    Normalizes a sentence or SQL string for consistency: trims it, collapses
    whitespace runs to single spaces, strips accent marks, and trims once more.
    """
    collapsed = normalize_whitespace(s.strip())
    return unicodeToAscii(collapsed).strip()

def preprocess_dataset(dataset_loc="atis.json", split_type=None, split=("dev", "test", "train")):
    """Load a text-to-SQL JSON dataset and flatten it into one record per sentence.

    Args:
        dataset_loc: Path of the JSON file. It must hold a list of samples, each
            with the keys 'sql', 'variables', 'query-split' and 'sentences'.
        split_type: "query" filters whole samples on their 'query-split' value,
            "question" filters individual sentences on their 'question-split'
            value; any other value (including None) keeps everything.
        split: Collection of split names to keep when ``split_type`` is set.

    Returns:
        A tuple ``(processed_dataset, variable_names, sql_templates)``:

        * processed_dataset: list of dicts, one per kept sentence, holding the
          sentence text and SQL both with placeholders and with the
          placeholders replaced by their values, the per-word tagging labels,
          the variable metadata and the two split names.
        * variable_names: set of every 'name' found in the samples' variable
          metadata (may contain None when an entry has no 'name').
        * sql_templates: set with the shortest preprocessed SQL of each sample.

        The two sets are collected from every sample in the file, including
        samples that the split filter removes from ``processed_dataset``.

    Raises:
        IndexError: if a sample has an empty 'sql' list.
    """
    # Decode explicitly as UTF-8 so the file is read the same way on every
    # platform instead of depending on the locale's default codec.
    with open(dataset_loc, encoding="utf-8") as f:
        dataset_json = json.load(f)

    processed_dataset = []
    variable_names = set()
    sql_templates = set()

    for sample in dataset_json:
        # All valid SQL queries for this sample, preprocessed and sorted by length
        sql = sorted((preprocess_sentence(query) for query in sample['sql']), key=len)

        # The shortest query is used as the sample's template
        sql_templates.add(sql[0])

        # Variable/placeholder metadata; the 'location' key is dropped in place
        variables_metadata = sample["variables"]
        for var in variables_metadata:
            variable_names.add(var.get("name"))
            var.pop('location', None)

        query_split = sample['query-split']

        # Query-level filtering drops the whole sample
        if split_type == "query" and query_split not in split:
            continue

        for sentence in sample['sentences']:
            # Question-level filtering drops individual sentences
            if split_type == "question" and sentence['question-split'] not in split:
                continue

            # Mapping of placeholder name -> value for this sentence
            variables = sentence['variables']

            # Sentence text that still contains the placeholders
            text_with_vars = preprocess_sentence(sentence['text'])

            text_with_vars_replaced = text_with_vars
            sql_with_vars_replaced = sql

            # Substitute the longest placeholder names first so that a name which
            # is a substring of another one cannot corrupt the longer placeholder
            # before it has been replaced.
            for var in sorted(variables, key=len, reverse=True):
                value = variables[var]
                text_with_vars_replaced = text_with_vars_replaced.replace(var, value)
                sql_with_vars_replaced = [query.replace(var, value) for query in sql_with_vars_replaced]

            # Expected tagging output: the placeholder name for words that are
            # placeholders, "-" for every other word
            sentence_var_tagging_labels = [
                word if word in variables else "-"
                for word in text_with_vars.split()
            ]

            processed_dataset.append({
                "text_with_vars": text_with_vars,
                "text_with_vars_replaced": text_with_vars_replaced,
                "sentence_var_tagging_labels": sentence_var_tagging_labels,
                "vars_metadata": variables_metadata,
                "variables": variables,
                "sql_with_vars": sql,
                "shortest_sql_with_vars": sql[0],
                "sql_with_vars_replaced": sql_with_vars_replaced,
                "shortest_sql_with_vars_replaced": sql_with_vars_replaced[0],
                "query_split": query_split,
                "question_split": sentence['question-split'],
            })

    return processed_dataset, variable_names, sql_templates

# Step 3: Create spacy tokenizer

In [None]:
#====================================================================================
# Define tokenizer
# This loads the default spaCy NLP pipeline, including the tokenizer, tagger, parser, ner, etc.
# The resulting `nlp` object is used as a module-level global by the functions below.
nlp = spacy.load('en_core_web_lg')

# Custom pipeline component that merges GPE entities into single tokens
@Language.component("entity_merger")
def entity_merger(doc):
    """
    Pipeline component that merges every entity labelled "GPE" into one token.

    For example, 'New York' would normally be tokenized as 'New' and 'York';
    after this component it is a single 'New York' token whose lemma is the
    entity text. This keeps the tokenization consistent with the dataset,
    where a city_name variable can hold a multi-word value such as 'New York'.
    Returns the same doc, modified in place.
    """
    gpe_spans = [span for span in doc.ents if span.label_ == "GPE"]
    with doc.retokenize() as retokenizer:
        # Queue the merges from the last entity to the first
        for span in reversed(gpe_spans):
            retokenizer.merge(span, attrs={"LEMMA": span.text})
    return doc

# Add the custom component after NER
# (it reads doc.ents, so it has to run once the "ner" component has set them)
nlp.add_pipe("entity_merger", after="ner")

#====================================================================================
# use embedded vocab for en_core_web_lg
# Width of every token vector built in this notebook; also the size of the zero
# vectors substituted for tokens that have no vector.
# NOTE(review): presumably matches the vector width of en_core_web_lg — confirm.
vector_size=300

def sentence_to_tensor(sentence, vector_size=vector_size):
    """Convert a sentence into a tensor of per-token word vectors.

    The sentence is run through the module-level ``nlp`` pipeline; each token
    contributes its spaCy vector, or a zero vector of length ``vector_size``
    when the token has no vector.

    Args:
        sentence: Text to tokenize and embed.
        vector_size: Length of the zero vector used for tokens without a vector.

    Returns:
        A tensor of shape [seq_len, vector_size]. A sentence that produces no
        tokens yields an empty tensor of shape [0, vector_size].
    """
    doc = nlp(sentence)
    vectors = [
        torch.tensor(token.vector) if token.has_vector else torch.zeros(vector_size)
        for token in doc
    ]
    if not vectors:
        # torch.stack raises on an empty list, so return an explicit empty
        # sequence that can still be concatenated and padded downstream.
        return torch.zeros(0, vector_size)
    return torch.stack(vectors)  # [seq_len, vector_size]

# Step 4: Data Loader

In [None]:
# Dataset that yields (question vectors, SQL vectors) pairs
class generationDataset(Dataset):
    """Map-style dataset over the preprocessed records.

    Each item is a pair of unpadded tensors built with ``sentence_to_tensor``:
    one for the record's "text_with_vars_replaced" sentence and one for its
    "shortest_sql_with_vars_replaced" query. ``nlp`` and ``max_len`` are stored
    on the instance but are not used by this class itself.
    """

    def __init__(self, data, nlp, max_len = 200):
        self.data, self.nlp, self.max_len = data, nlp, max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        question_text = self.data[idx]["text_with_vars_replaced"]
        sql_text = self.data[idx]["shortest_sql_with_vars_replaced"]
        # Neither sequence is padded here; padding is left to the batch collation
        question_vectors = sentence_to_tensor(question_text)
        sql_vectors = sentence_to_tensor(sql_text)
        return question_vectors, sql_vectors

# Marker vectors used when batching sequences of word vectors.
sos_vector = torch.ones(vector_size)      # <sos> vector, represented by all ones
eos_vector = -torch.ones(vector_size)     # <eos> vector, represented by all minus ones
pad_vector = torch.zeros(vector_size)  # padding uses the all-zero vector (the same value sentence_to_tensor gives tokens without a vector)

# Define collate function for each batch
def _wrap_and_pad(seqs):
    """Add <sos>/<eos> rows to each sequence and pad all of them to a common length."""
    # The marker vectors are 1-D; torch.cat needs them as [1, dim] rows to be
    # joined with the 2-D [seq_len, dim] sequences.
    sos_row = sos_vector.unsqueeze(0)
    eos_row = eos_vector.unsqueeze(0)
    wrapped = [torch.cat([sos_row, seq, eos_row]) for seq in seqs]

    # The target length is measured after the markers were added, so the
    # number of padding rows can never be negative.
    max_len = max(seq.size(0) for seq in wrapped)
    padded = [
        torch.cat([seq, pad_vector.repeat(max_len - seq.size(0), 1)])
        for seq in wrapped
    ]
    return torch.stack(padded)


def collate_fn(batch):
    """Collate (input, target) vector sequences into padded batch tensors.

    Args:
        batch: List of ``(input_seq, target_seq)`` pairs, each a 2-D tensor of
            shape [seq_len, dim] as produced by ``sentence_to_tensor``.

    Returns:
        ``(inputs_tensor, targets_tensor, input_lengths, target_lengths)``:

        * inputs_tensor: [batch_size, max_input_len + 2, dim]; every sequence
          is wrapped in an <sos> row and an <eos> row and then padded with
          ``pad_vector`` rows.
        * targets_tensor: same layout for the target sequences.
        * input_lengths / target_lengths: lists with the original number of
          rows of each sequence, i.e. not counting the <sos>/<eos> rows.
    """
    input_seqs, target_seqs = zip(*batch)  # batch is [(input1, target1), (input2, target2), ...]

    # Lengths of the raw sequences, before the marker rows are added
    input_lengths = [seq.size(0) for seq in input_seqs]
    target_lengths = [seq.size(0) for seq in target_seqs]

    inputs_tensor = _wrap_and_pad(input_seqs)    # [batch_size, max_input_len + 2, dim]
    targets_tensor = _wrap_and_pad(target_seqs)  # [batch_size, max_target_len + 2, dim]

    return inputs_tensor, targets_tensor, input_lengths, target_lengths

# Step 5: Build LSTM Encoder and Decoder

In [None]:
class Encoder(nn.Module):
    """Bidirectional LSTM encoder that returns only its final states.

    Args:
        emb_dim: Size of each input vector.
        hid_dim: Hidden size of the LSTM per direction.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout between stacked layers; ignored when ``n_layers`` is 1.
    """

    def __init__(self, emb_dim, hid_dim, n_layers=1, dropout=0.5):
        super().__init__()
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is combined with a single layer, so pass 0 in that case.
        self.lstm = nn.LSTM(
            emb_dim,
            hid_dim,
            n_layers,
            dropout=dropout if n_layers > 1 else 0.0,
            batch_first=True,
            bidirectional=True,
        )

    @staticmethod
    def _merge_directions(state):
        """Turn [n_layers * 2, batch, hid] into [n_layers, batch, 2 * hid]."""
        n_dirs = 2
        # Split the leading axis into (layer, direction) ...
        state = state.view(state.size(0) // n_dirs, n_dirs, state.size(1), state.size(2))
        # ... and place the two directions of each layer side by side.
        return torch.cat((state[:, 0], state[:, 1]), dim=2)

    def forward(self, src):  # src: [batch size, seq len, emb dim]
        """Encode ``src`` and return ``(hidden, cell)``, each [n_layers, batch, 2 * hid_dim].

        The per-step outputs of the LSTM are discarded.
        """
        _, (hidden, cell) = self.lstm(src)
        return self._merge_directions(hidden), self._merge_directions(cell)


# Define the Decoder
class Decoder(nn.Module):
    """Single-step LSTM decoder followed by a linear projection.

    The LSTM hidden size is ``hid_dim * 2`` so that it can be initialised with
    the concatenated forward/backward states of a bidirectional encoder whose
    per-direction hidden size is ``hid_dim``.

    Args:
        emb_dim: Size of each input vector.
        hid_dim: Per-direction hidden size of the matching encoder.
        output_dim: Size of the prediction produced for each step.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout between stacked layers; ignored when ``n_layers`` is 1.
    """

    def __init__(self, emb_dim, hid_dim, output_dim, n_layers=1, dropout=0.5):
        super().__init__()
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is combined with a single layer, so pass 0 in that case.
        self.lstm = nn.LSTM(
            emb_dim,
            hid_dim * 2,
            n_layers,
            dropout=dropout if n_layers > 1 else 0.0,
            batch_first=True,
        )
        self.fc_out = nn.Linear(hid_dim * 2, output_dim)  # projects the LSTM output to output_dim

    def forward(self, input_vec, hidden, cell):  # input_vec: [batch size, emb dim]
        """Run one decoding step.

        Args:
            input_vec: Current input, [batch size, emb dim].
            hidden: LSTM hidden state, [n_layers, batch size, hid_dim * 2].
            cell: LSTM cell state, same shape as ``hidden``.

        Returns:
            ``(prediction, hidden, cell)`` where prediction is
            [batch size, output dim] and the states are the updated LSTM states.
        """
        step_input = input_vec.unsqueeze(1)  # [batch size, 1, emb dim]
        output, (hidden, cell) = self.lstm(step_input, (hidden, cell))
        prediction = self.fc_out(output.squeeze(1))  # [batch size, output dim]
        return prediction, hidden, cell


# Define the Seq2Seq model
class Seq2Seq(nn.Module):
    """Wrapper that runs the encoder once and then steps the decoder over the target length.

    NOTE(review): several lines in forward() look inconsistent with batched
    inputs and with the decoder's output size; see the inline notes and confirm
    before relying on this class.
    """
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # Preload the spaCy vocabulary words and their vectors to improve efficiency
        # NOTE(review): this dict is filled here but not read anywhere else in this class.
        self.words_vectors = {}
        for word in nlp.vocab:
            if word.has_vector:  # only consider words that have a vector
                self.words_vectors[word.text] = word.vector
    
    def token_to_vector(self, word):
        """
        Given a predicted word, return the corresponding word vector.

        The word is run through the module-level ``nlp`` pipeline and the
        vector of the first resulting token is returned as a new tensor.
        """
        doc = nlp(word)  # use spaCy to get the word's vector
        return torch.tensor(doc[0].vector)  # return spaCy's word vector

    def forward(self, src, trg, teacher_forcing_ratio=0.5, mode='train'):
        """
        Forward pass for training and inference, including the use of teacher forcing.
        mode: 'train' or 'inference'

        Returns a tensor of shape [batch size, trg len, len(nlp.vocab)] holding
        the decoder output of every step; position 0 along the time axis is
        left as zeros because the loop starts at t = 1. For any other ``mode``
        value the decoder input is never updated after the first step.
        """
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        # vocabulary size, determined by spaCy's vocab
        # NOTE(review): the assignment into `outputs` below presumably requires the
        # decoder's output_dim to equal this value — confirm.
        trg_vocab_size = len(nlp.vocab)

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        
        hidden, cell = self.encoder(src)  # get the encoder's hidden state and cell state
        
        # The decoder's first input is the first vector of the target sequence (normally the <sos> token)
        input_vec = trg[:, 0, :]  # [batch size, emb dim]
        
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input_vec, hidden, cell)
            outputs[:, t, :] = output
            
            if mode == 'train':
                # Use either teacher forcing or the model's prediction as the next step's input
                top1 = output.argmax(1)  # index of the largest output value, treated as the predicted word ID
                if torch.rand(1).item() < teacher_forcing_ratio:
                    # use the ground-truth target vector as the next input
                    input_vec = trg[:, t, :]
                else:
                    # use the word predicted by the model as the next input
                    # NOTE(review): .item() needs a single-element tensor, so this path
                    # presumably only works with batch size 1 — confirm.
                    # NOTE(review): confirm that indexing nlp.vocab.strings with an argmax
                    # position returns the intended word.
                    predicted_word = nlp.vocab.strings[top1.item()]  # convert the predicted index to a word
                    # NOTE(review): token_to_vector returns a single vector with no batch
                    # dimension and no .to(self.device) — confirm the decoder accepts it.
                    input_vec = self.token_to_vector(predicted_word)  # look up the vector of the predicted word
            elif mode == 'inference':
                # Inference mode: no teacher forcing, always feed back the model's own prediction
                # NOTE(review): the same batch-size, index-lookup and device caveats as in the
                # non-teacher-forced training branch apply here.
                top1 = output.argmax(1)  # index of the largest output value, treated as the predicted word ID
                predicted_word = nlp.vocab.strings[top1.item()]  # convert the predicted index to a word
                input_vec = self.token_to_vector(predicted_word)  # look up the vector of the predicted word
        
        return outputs


# Step