In [1]:
import torch
import json
import spacy
import numpy as np
from pprint import pprint
from spacy.language import Language
from spacy.tokens import Doc
from spacy.tokenizer import Tokenizer
import re
import unicodedata
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
print(torch.__version__)

2.0.1+cu117


In [2]:
def unicodeToAscii(s):
    # Convert a Unicode string 's' to plain ASCII.
    # This is done by first normalizing the string into its decomposed form using 'NFD',
    # which separates characters from their accents. Then, it filters out all nonspacing marks (Mn).
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalize_whitespace(text):
  return re.sub(r'\s+', ' ', text).strip()

def preprocess_sentence(s:str) -> str:
    """
    Preprocesses sentence text for consistency
    """
    s = s.strip()
    s = normalize_whitespace(s)
    s = unicodeToAscii(s)
    s = s.strip()
    return s


In [3]:
!python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')

# Create a custom component to merge entities
@Language.component("entity_merger")
def entity_merger(doc):
    """
    Custom component of the spacy nlp pipeline which merges geographical location entity tokens into a single token
    For example: 'New York' would noramlly be split into 2 tokens 'New' and 'York' but this will combine into a single 'New York' token
    This is implemented because city_name type variables could have the value 'New York' and for effective tagging we aim to keep the tokenisation scheme consistent to the dataset
    """
    # Iterate over the entities in reverse order (to avoid index issues when merging)
    with doc.retokenize() as retokenizer:
        for ent in reversed(list(doc.ents)):
            # Merge the entity tokens into one token
            if(ent.label_ in ["GPE", "ORG"]):
                attrs = {"LEMMA": ent.text}
                retokenizer.merge(ent, attrs=attrs)
    return doc

# Add the custom component after NER
nlp.add_pipe("entity_merger", after="ner")

def whitespace_tokenizer(nlp):
    # Create a custom tokenizer that splits only on whitespace
    return Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)

nlp.tokenizer = whitespace_tokenizer(nlp)


Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     --------------------------------------- 3.7/400.7 MB 31.3 MB/s eta 0:00:13
      ------------------------------------- 10.5/400.7 MB 31.2 MB/s eta 0:00:13
     - ------------------------------------ 16.0/400.7 MB 27.2 MB/s eta 0:00:15
     - ------------------------------------ 21.0/400.7 MB 28.8 MB/s eta 0:00:14
     -- ----------------------------------- 27.3/400.7 MB 26.6 MB/s eta 0:00:15
     -- ----------------------------------- 31.5/400.7 MB 27.7 MB/s eta 0:00:14
     --- ---------------------------------- 38.0/400.7 MB 26.3 MB/s eta 0:00:14
     --- ---------------------------------- 41.9/400.7 MB 26.9 MB/s eta 0:00:14
     ---- --------------------------------- 49.5/400.7 MB 26.5 MB/s eta 0:00:14
     ----- -------------------


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
class ATISClassificationDataset(Dataset):

    def __init__(self, dataset_loc, nlp, split_type='query', split=['train'], tokenizer=None):

        self.nlp = nlp
        self.data = []

        self.variable_names = set()
        self.sql_templates = set()

        # Reads Json
        with open(dataset_loc) as f:
            dataset_json = json.load(f)

        for sample in dataset_json:

            processed_sample = {}

            # All valid sql queries for this examples soretd by length
            sql = sorted(sample['sql'],key=len)
            # Adds shortest sql query template to list of sql templates
            self.sql_templates.add(sql[0])

            # Check query split
            query_split = sample['query-split']
            if split_type == "query" and query_split not in split:
                continue

            # Adds to variables set
            variables_metadata = sample["variables"]
            for var in variables_metadata:
                self.variable_names.add(var.get("name"))

            # Process each sentence
            for sentence in sample['sentences']:

                # Check question split
                if split_type == "question" and sentence['question-split'] not in split:
                    continue

                # variables/placeholder mapping dictionary
                variables = sentence['variables']

                # Sentence text with variables/placeholders
                text_with_vars = sentence['text']

                # Replacing variables/placeholders in current sentence and sql query with their values from the variables dictionary
                text_with_vars_replaced = text_with_vars
                sql_with_vars_replaced = sql.copy()

                # Replace sentence and all sql variables with their values
                for var in variables:
                    text_with_vars_replaced = text_with_vars_replaced.replace(var,variables[var])

                    # sql_with_vars_replaced = sql_with_vars_replaced.replace(var,variables[var])
                    sql_with_vars_replaced = [query.replace(var,variables[var]) for query in sql_with_vars_replaced]

                # Taggingg expected output
                sentence_var_tagging_labels = []
                for word in text_with_vars.split():
                    if(word in variables):
                        # Use variable name as tag
                        sentence_var_tagging_labels.append(word)
                    else:
                        # Use '-' for non-variable words
                        sentence_var_tagging_labels.append("no_var")

                # Appends preprocessed dictionary of current sentence to the processesed_dataset list
                self.data.append({
                    "text_with_vars": text_with_vars,
                    "text_with_vars_replaced":text_with_vars_replaced,
                    "tagging_labels":sentence_var_tagging_labels,
                    "variables":variables,
                    "sql_with_vars": sql,
                    "shortest_sql_with_vars":sql[0],
                    "sql_with_vars_replaced": sql_with_vars_replaced,
                    "shortest_sql_with_vars_replaced": sql_with_vars_replaced[0]
                })

        # Setup tagging label encoder
        # For tagging task - include all variable names plus "-" for non-variables
        all_tags = ["-","no_var"] + list(self.variable_names)
        all_tags = np.array(all_tags)
        self.tag_encoder = LabelEncoder()
        self.tag_encoder.fit(all_tags)

        # Setup SQL template label encoder
        self.sql_encoder = LabelEncoder()
        self.sql_encoder.fit(np.array(list(self.sql_templates)))

        # Process sentences using spacy nlp pipeline to get docs, corresponding label ids and tag ids
        self.docs = []
        self.tag_labels = []
        self.sql_labels = []

        for sample in self.data:
            # Convert sentence with variables into spacy doc
            doc = self.nlp(preprocess_sentence(sample['text_with_vars_replaced']))
            self.docs.append(doc)

            # Covert variable tags into ids foir tagging
            tags = sample["tagging_labels"]
            # if(len(doc) != len(tags)):
            #   print([token for token in doc])
            #   print(tags)
            self.tag_labels.append(self.tag_encoder.transform(tags))

            # Convert sql query with variables into ids for classification
            sql_template = sample["shortest_sql_with_vars"]
            self.sql_labels.append(self.sql_encoder.transform([sql_template])[0])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Get a single sample from the dataset"""
        doc = self.docs[idx]
        raw_item = self.data[idx]

        # Get token vectors from SpaCy document
        # vectors = [token.vector for token in doc]

        # Create tensor from vectors
        token_vectors = torch.tensor(doc.tensor, dtype=torch.float32)

        # Get token texts (needed for variable replacement during inference)
        tokens = [token.text for token in doc]

        # Get tag labels
        tag_labels = torch.tensor(self.tag_labels[idx], dtype=torch.long)

        # Get SQL label
        sql_label = torch.tensor(self.sql_labels[idx], dtype=torch.long)

        return {
            "token_vectors": token_vectors,
            "tokens": tokens,
            "tag_labels": tag_labels,
            "sql_label": sql_label,
            "raw_item": raw_item,
            "doc_len": len(doc),
            "true_sql_text": raw_item['shortest_sql_with_vars_replaced'] # For inference
        }

    def get_dataloader(self, batch_size=32, shuffle=True, num_workers=0):
        """Helper function to create a DataLoader with custom collate function"""
        return DataLoader(
            self,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
            collate_fn=self.collate_fn,
            pin_memory=True
        )

    def collate_fn(self, batch):
      """Custom collate function that pads sequences to the longest in the batch"""

      # Find max length for both token vectors and tag labels in this batch
      max_token_len = max([item["doc_len"] for item in batch])
      max_tag_len = max([len(item["tag_labels"]) for item in batch])

      # Ensure both use the same max length for consistent padding
      max_len = max(max_token_len, max_tag_len)


      # PRepare lists to collect tensors and data
      token_vectors_list = []
      tag_labels_list = []
      attention_masks = []
      sql_labels = []
      tokens_list = []
      raw_items = []
      true_sql_text_list = []

      for item in batch:
          # Get original tensors and data
          token_vecs = item["token_vectors"]
          tags = item["tag_labels"]
          tokens = item["tokens"]

          # Create attention mask (1 for real tokens, 0 for padding)
          seq_len = len(token_vecs)
          tag_len = len(tags)
          attention_mask = torch.ones(seq_len, dtype=torch.long)

          # If padding is needed for tokens
          if(seq_len < max_len):
            # Pad token vectors
            padding = torch.zeros(max_len - seq_len, token_vecs.shape[1], dtype=torch.long)
            token_vecs = torch.cat([token_vecs,padding], dim=0)
            attention_mask = torch.cat([attention_mask, torch.zeros(max_len - seq_len, dtype=torch.long)])

            # Pad tokens list
            tokens.extend([""] * (max_len - seq_len))

          # Pad tag labels separately to ensure they all have the same length
          if(tag_len < max_len):
            padding = torch.zeros(max_len - tag_len, dtype=torch.long)
            tags = torch.cat([tags, padding], dim=0)

          # Add to lists
          token_vectors_list.append(token_vecs)
          tag_labels_list.append(tags)
          attention_masks.append(attention_mask)
          sql_labels.append(item["sql_label"])
          tokens_list.append(tokens)
          raw_items.append(item["raw_item"])
          true_sql_text_list.append(item["true_sql_text"])



      # Stack tensors
      return {
          "token_vectors": torch.stack(token_vectors_list),
          "tag_labels": torch.stack(tag_labels_list),
          "attention_mask": torch.stack(attention_masks),
          "sql_labels": torch.stack(sql_labels),
          "tokens": tokens_list,
          "raw_items": raw_items,
          "true_sql_texts":true_sql_text_list
      }

    def get_tag_vocab_size(self):
        """Returns the size of the tag vocabulary"""
        return len(self.tag_encoder.classes_)

    def get_sql_vocab_size(self):
        """Returns the number of unique SQL templates"""
        return len(self.sql_encoder.classes_)

    def get_vector_dim(self):
        """Returns the dimensionality of token vectors"""
        return self.nlp.get_pipe("tok2vec").model.get_dim("nO")

    def decode_tag(self, tag_id):
        """Convert tag ID back to original variable name or '-' """
        return self.tag_encoder.inverse_transform([tag_id])[0]

    def decode_sql_template(self, sql_id):
        """Convert SQL template ID back to original SQL template"""
        return self.sql_encoder.inverse_transform([sql_id])[0]


In [5]:


class LSTMTaggerClassifer(nn.Module):
    def __init__(self,
                 input_dim:int,
                 hidden_dim:int,
                 tag_vocab_size:int,
                 sql_vocab_size:int,
                 num_layers:int = 1,
                 dropout:float = 0.25):
        """
        LSTM model for both token tagging and SQL template classification

        Args:
            input_dim: Dimensionality of input vectors
            hidden_dim: Hidden dimension of LSTM
            tag_vocab_size: Size of tag vocabulary for tagging task
            sql_vocab_size: Number of unique SQL templates
            num_layers: Number of LSTM layers
            dropout: Dropout probability
        """
        super(LSTMTaggerClassifer, self).__init__()

        #LSTM layer
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
        )

        # Dropout
        self.dropout = nn.Dropout(dropout)

        # Tagging layer (token classification)
        # hidden * 2 is for bidirectionality
        self.tag_classifier = nn.Linear(hidden_dim * 2, tag_vocab_size)

        # SQL Template classification layer
        # hidden * 2 is for bidirectionality
        self.sql_classifier = nn.Linear(hidden_dim * 2, sql_vocab_size)

    def forward(self, token_vectors, attention_mask=None):
        """
        Forward pass

        Args:
            token_vectors: Token vectors from SpaCy [batch_size, seq_len, input_dim]
            attention_mask: Attention mask indicating valid tokens [batch_size, seq_len]

        Returns:
            tag_logits: Token classification logits [batch_size, seq_len, tag_vocab_size]
            sql_logits: SQL template classification logits [batch_size, sql_vocab_size]
        """
        batch_size = token_vectors.shape[0]
        seq_len = token_vectors.shape[1]

        # Ignore padded vectrors in batch
        packed = nn.utils.rnn.pack_padded_sequence(token_vectors, attention_mask.sum(dim=1).cpu().long(), batch_first=True, enforce_sorted=False)
        # LSTM
        output, (hidden, _) = self.lstm(packed)
        # Unpack output back to padded sequences
        output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True,total_length=seq_len)
        # Appply dropout
        output = self.dropout(output)
        # Feedforward layer to get tagging logits
        tag_logits = self.tag_classifier(output)  # shape: (B, T, num_tags)

        # Classification from last hidden states (concat of both directions)
        final_hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        # Apply dropout to final hidden output
        # final_hidden = self.dropout(final_hidden)
        # Feedforward layer to get SQL classification
        sql_logits = self.sql_classifier(final_hidden)

        return tag_logits, sql_logits



In [6]:
def evaluate_model(model, data_loader, device="cuda" if torch.cuda.is_available() else "cpu", dataset=None):
    """
    Evaluate the LSTM model with detailed metrics

    Args:
        model: LSTMTaggerClassifier model
        data_loader: DataLoader with evaluation data
        device: Device to run evaluation on
        dataset: SpacyTextDataset instance (for decoding predictions)

    Returns:
        Dictionary of evaluation metrics
    """
    model.eval()
    tag_criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
    sql_criterion = nn.CrossEntropyLoss(reduction='sum')

    total_tag_loss = 0
    total_sql_loss = 0
    total_correct_tags = 0
    total_valid_tags = 0
    total_correct_sql = 0
    total_sql_match = 0
    total_samples = 0

    with torch.no_grad():
        for batch in data_loader:
            # Get batch data
            token_vectors = batch["token_vectors"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            tag_labels = batch["tag_labels"].to(device)
            sql_labels = batch["sql_labels"].to(device)
            raw_items = batch["raw_items"]
            tokens = batch["tokens"]

            # Forward pass
            tag_logits, sql_logits = model(token_vectors, attention_mask)

            # Calculate lossess
            tag_loss = tag_criterion(tag_logits.view(-1, tag_logits.size(-1)), tag_labels.view(-1))
            sql_loss = sql_criterion(sql_logits, sql_labels)

            # Calculate metrics
            tag_preds = torch.argmax(tag_logits, dim=-1)
            sql_preds = torch.argmax(sql_logits, dim=-1)

            # Only count valid tags (non-padding)
            valid_mask = (tag_labels != 0) & (attention_mask == 1)
            total_correct_tags += (tag_preds[valid_mask] == tag_labels[valid_mask]).sum().item()
            total_valid_tags += valid_mask.sum().item()

            total_correct_sql += (sql_preds == sql_labels).sum().item()

            # Calculate SQL match accuracy (check if final SQL is in the list of valid SQLs)
            if dataset:
                for i in range(len(raw_items)):
                    # get predicted template
                    template = dataset.decode_sql_template(sql_preds[i].item())

                    # Get predicted tags and tokens
                    valid_length = attention_mask[i].sum().item()
                    item_tokens = tokens[i][:valid_length]
                    item_tag_preds = [dataset.decode_tag(t.item()) for t in tag_preds[i, :valid_length]]

                    # Build identified variables
                    variables = {}
                    for j, (token, tag) in enumerate(zip(item_tokens, item_tag_preds)):
                        if(tag != '-' or tag != 'no_var'):
                            variables[tag] = token

                    # Replace variables in template
                    final_sql = template.copy()
                    for var_name, var_value in variables.items():
                        final_sql = final_sql.replace(var_name, var_value)

                    if preprocess_sentence(final_sql) in [preprocess_sentence(x) for x in raw_items[i]["sql_with_vars_replaced"]]:
                      # print(f"SQL Template predicted:\n{template}\n")
                      # print(f"Variables identified:\n{variables}\n")
                      # print(f"SQL model predicted:\n{final_sql}\n")
                      # print(f"True SQL:\n{raw_items[i]['sql_with_vars_replaced']}")
                      total_sql_match += 1

            total_samples += len(token_vectors)
            total_tag_loss += tag_loss.item()
            total_sql_loss += sql_loss.item()

    metrics = {
        "tag_loss": total_tag_loss / total_valid_tags if total_valid_tags > 0 else 0,
        "sql_loss": total_sql_loss / total_samples,
        "tag_acc": total_correct_tags / total_valid_tags if total_valid_tags > 0 else 0,
        "sql_acc": total_correct_sql / total_samples,
    }

    if dataset:
        metrics["sql_match_acc"] = total_sql_match / total_samples

    # print(f"EVAL METRICS:")
    # print(metrics)
    return metrics




In [7]:
def train_model(model, train_loader, val_loader=None, epochs=1-0, lr=1e-3, device="cuda" if torch.cuda.is_available() else "cpu"):
    """
    Train LSTMTaggerClassfier model
    """
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    tag_criterion = nn.CrossEntropyLoss()
    sql_criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        # Set model to training model
        model.train()
        total_loss = 0

        # Training loop
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
          # Get batch data
          token_vectors = batch["token_vectors"].to(device)
          attention_mask = batch["attention_mask"].to(device)
          tag_labels = batch["tag_labels"].to(device)
          sql_labels = batch["sql_labels"].to(device)

          # Forward pass
          tag_logits, sql_logits = model(token_vectors, attention_mask)

          # Handle dimension mismatches
          tag_seq_len = tag_logits.size(1)
          label_seq_len = tag_labels.size(1)

          if tag_seq_len != label_seq_len:
              # Either pad or truncate logits/labels to match
              if tag_seq_len < label_seq_len:
                  # If logits are shorter, truncate labels
                  tag_labels = tag_labels[:, :tag_seq_len]
              else:
                  # If labels are shorter, truncate logits
                  tag_logits = tag_logits[:, :label_seq_len, :]

          # Calculate losses
          # tag_labels shape: torch.Size([32, 19])
          # tag_logits shaep: torch.Size([32, 18, 63])
          # if(tag_labels.size(1) != tag_logits.size(1)):
          #   print(tag_labels[0])
          #   print(tag_logits[1])

          # print(f"tag_labels shape: {tag_labels.size()}")
          # print(f"tag_logits shaep: {tag_logits.size()}")
          # print(f"input shape: {tag_logits.view(-1, tag_logits.size(-1)).shape}")
          # print(f"target shape: {tag_labels.view(-1).shape}")
          # print("*"*50)

          # Calculate losses
          tag_loss = tag_criterion(tag_logits.view(-1, tag_logits.size(-1)), tag_labels.view(-1))
          sql_loss = sql_criterion(sql_logits, sql_labels)
          loss = tag_loss + sql_loss

          # Backwrad pass
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()

          total_loss += loss.item()

        avg_loss = total_loss/len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_loss:.4f}")

        # Validation
        if(val_loader):
            val_metrics = evaluate_model(model, val_loader, device)
            print(f"Validation - Tag Acc: {val_metrics['tag_acc']:.4f}, SQL Acc: {val_metrics['sql_acc']:.4f}")

    return model

In [8]:
class SQLPipeline:
    def __init__(self,model, dataset, device="cuda" if torch.cuda.is_available() else "cpu"):
        """
        End-to-end SQL generation pipeline

        Args:
            model: Trained LSTMTaggerClassifier model
            dataset: SpacyTextDataset instance used for training
            device: Device to run inference on
        """
        self.model = model.to(device)
        self.model.eval()
        self.dataset = dataset
        self.device = device

    def predict(self, text, nlp=None):
        """
        Generate SQL query for input text

        Args:
            text: Input text query
            nlp: Optional SpaCy pipeline (if not provided, uses the dataset's)

        Returns:
            Dict containing the predicted SQL query, identified variables, and template
        """

        nlp = nlp or self.dataset.nlp

        # Process text with SpaCy
        doc = nlp(text)

        # Extract token vectors and texts
        # squeeze(0) used add batch dimension bcos model expects batch but we are processing a single sample
        token_vectors = torch.tensor(doc.tensor, dtype=torch.float32).unsqueeze(0).to(self.device)
        tokens = [token.text for token in doc]

        # Create attention mask
        attention_mask = torch.ones(1, len(doc), dtype=torch.float32).to(self.device)

        # Forward pass
        with torch.no_grad():
            tag_logits, sql_logits =self.model(token_vectors, attention_mask)

            # Get predicted tag for each token
            tag_preds = torch.argmax(tag_logits, dim=-1)[0].cpu().numpy()

            # Get predicted SQL template
            sql_pred = torch.argmax(sql_logits, dim=-1)[0].item()

        # Convert predictions back to human-readable form
        predicted_tags = [self.dataset.decode_tag(tag) for tag in tag_preds[:len(tokens)]]
        predicted_template = self.dataset.decode_sql_template(sql_pred)

        # Extract identified variables
        identified_variables = {}
        for i, (token, tag) in enumerate(zip(tokens, predicted_tags)):
            if tag != "-" or tag != 'no_var':
                # Tag is a variable name
                identified_variables[tag] = token

        # Replace variables in the SQL template
        final_sql = predicted_template
        for var_name, var_value in identified_variables.items():
            final_sql = final_sql.replace(var_name, var_value)

        return {
            "tokens": tokens,
            "predicted_tags": predicted_tags,
            "identified_variables": identified_variables,
            "predicted_template": predicted_template,
            "final_sql": final_sql
        }

    def batch_predict(self, dataloader):
        """
        Generate SQL queries for a batch of inputs

        Args:
            dataloader: DataLoader with test data

        Returns:
            List of prediction results
        """
        results = []

        for batch in tqdm(dataloader, desc="Generating SQL queries"):
            # Move batch to device
            token_vectors = batch["token_vectors"].to(self.device)
            attention_mask = batch["attention_mask"].to(self.device)
            tokens = batch["tokens"]
            raw_items = batch["raw_items"]

            # Forward pass
            with torch.no_grad():
                tag_logits, sql_logits = self.model(token_vectors, attention_mask)

                # Get predictions
                tag_preds = torch.argmax(tag_logits, dim=-1).cpu().numpy()
                sql_preds = torch.argmax(sql_logits, dim=-1).cpu().numpy()

            # Process each item in batch
            for i in range(len(tokens)):
                # Get token and tag sequences
                item_tokens = tokens[i]
                valid_tokens = [t for t in item_tokens if t] # Filter out padding

                # Get predicted tags for valid tokens
                item_tags = [self.dataset.decode_tag(tag) for tag in tag_preds[i, :len(valid_tokens)]]

                # Get predicted SQL template
                predicted_template = self.dataset.decode_sql_template(sql_preds[i])

                # Extract identified variables
                identified_variables = {}
                for j, (token, tag) in enumerate(zip(valid_tokens, item_tags)):
                    if tag != "-" or tag != 'no_var':
                        identified_variables[tag] = token

                # Replace variables in the SQL template
                final_sql = predicted_template
                for var_name, var_value in identified_variables.items():
                    final_sql = final_sql.replace(var_name, var_value)

                # Add to results
                results.append({
                    "tokens": valid_tokens,
                    "predicted_tags": item_tags,
                    "identified_variables": identified_variables,
                    "predicted_template": predicted_template,
                    "final_sql": final_sql,
                    "raw_item": raw_items[i]
                })

        return results

In [9]:
# Create datasets
train_dataset = ATISClassificationDataset(
    dataset_loc="atis.json",
    nlp=nlp,
    split_type="question",
    split=["train"]
)

val_dataset = ATISClassificationDataset(
    dataset_loc="atis.json",
    nlp=nlp,
    split_type="question",
    split=["dev"]
)

test_dataset = ATISClassificationDataset(
    dataset_loc="atis.json",
    nlp=nlp,
    split_type="question",
    split=["test"]
)

# Create dataloaders
train_loader = train_dataset.get_dataloader(batch_size=32)
val_loader = val_dataset.get_dataloader(batch_size=32)
test_loader = test_dataset.get_dataloader(batch_size=32)

In [10]:
# Create model
model = LSTMTaggerClassifer(
    input_dim=train_dataset.get_vector_dim(),
    hidden_dim=256,
    tag_vocab_size=train_dataset.get_tag_vocab_size(),
    sql_vocab_size=train_dataset.get_sql_vocab_size(),
    num_layers=3
)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\slh\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\slh\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "C:\Users\slh\AppData\Roaming\Python\Python310\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\slh\AppData\Roaming\Python\Python310\site-packages\traitlets\config\application.py", l

In [11]:
# Train model
trained_model = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=50,
    lr=1e-3
)

Epoch 1/50: 100%|██████████| 136/136 [00:03<00:00, 38.40it/s]


Epoch 1/50, Training Loss: 7.6358
Validation - Tag Acc: 0.8616, SQL Acc: 0.2243


Epoch 2/50: 100%|██████████| 136/136 [00:03<00:00, 40.50it/s]


Epoch 2/50, Training Loss: 6.1835
Validation - Tag Acc: 0.8893, SQL Acc: 0.3292


Epoch 3/50: 100%|██████████| 136/136 [00:03<00:00, 39.80it/s]


Epoch 3/50, Training Loss: 5.2792
Validation - Tag Acc: 0.9123, SQL Acc: 0.3827


Epoch 4/50: 100%|██████████| 136/136 [00:03<00:00, 41.26it/s]


Epoch 4/50, Training Loss: 4.5304
Validation - Tag Acc: 0.9217, SQL Acc: 0.4403


Epoch 5/50: 100%|██████████| 136/136 [00:03<00:00, 41.00it/s]


Epoch 5/50, Training Loss: 3.8765
Validation - Tag Acc: 0.9366, SQL Acc: 0.4835


Epoch 6/50: 100%|██████████| 136/136 [00:03<00:00, 40.75it/s]


Epoch 6/50, Training Loss: 3.3218
Validation - Tag Acc: 0.9395, SQL Acc: 0.5247


Epoch 7/50: 100%|██████████| 136/136 [00:03<00:00, 41.37it/s]


Epoch 7/50, Training Loss: 2.8018
Validation - Tag Acc: 0.9447, SQL Acc: 0.5617


Epoch 8/50: 100%|██████████| 136/136 [00:03<00:00, 41.09it/s]


Epoch 8/50, Training Loss: 2.3726
Validation - Tag Acc: 0.9461, SQL Acc: 0.5782


Epoch 9/50: 100%|██████████| 136/136 [00:03<00:00, 40.90it/s]


Epoch 9/50, Training Loss: 1.9984
Validation - Tag Acc: 0.9465, SQL Acc: 0.6008


Epoch 10/50: 100%|██████████| 136/136 [00:03<00:00, 40.60it/s]


Epoch 10/50, Training Loss: 1.6436
Validation - Tag Acc: 0.9519, SQL Acc: 0.5967


Epoch 11/50: 100%|██████████| 136/136 [00:03<00:00, 41.61it/s]


Epoch 11/50, Training Loss: 1.3707
Validation - Tag Acc: 0.9523, SQL Acc: 0.6235


Epoch 12/50: 100%|██████████| 136/136 [00:03<00:00, 40.10it/s]


Epoch 12/50, Training Loss: 1.1697
Validation - Tag Acc: 0.9550, SQL Acc: 0.6564


Epoch 13/50: 100%|██████████| 136/136 [00:03<00:00, 39.87it/s]


Epoch 13/50, Training Loss: 0.9829
Validation - Tag Acc: 0.9581, SQL Acc: 0.6420


Epoch 14/50: 100%|██████████| 136/136 [00:03<00:00, 39.75it/s]


Epoch 14/50, Training Loss: 0.8259
Validation - Tag Acc: 0.9548, SQL Acc: 0.6543


Epoch 15/50: 100%|██████████| 136/136 [00:03<00:00, 39.29it/s]


Epoch 15/50, Training Loss: 0.7249
Validation - Tag Acc: 0.9588, SQL Acc: 0.6523


Epoch 16/50: 100%|██████████| 136/136 [00:03<00:00, 39.48it/s]


Epoch 16/50, Training Loss: 0.6101
Validation - Tag Acc: 0.9615, SQL Acc: 0.6667


Epoch 17/50: 100%|██████████| 136/136 [00:03<00:00, 39.34it/s]


Epoch 17/50, Training Loss: 0.5471
Validation - Tag Acc: 0.9621, SQL Acc: 0.6749


Epoch 18/50: 100%|██████████| 136/136 [00:03<00:00, 40.33it/s]


Epoch 18/50, Training Loss: 0.4869
Validation - Tag Acc: 0.9641, SQL Acc: 0.6584


Epoch 19/50: 100%|██████████| 136/136 [00:03<00:00, 40.25it/s]


Epoch 19/50, Training Loss: 0.4303
Validation - Tag Acc: 0.9656, SQL Acc: 0.6584


Epoch 20/50: 100%|██████████| 136/136 [00:03<00:00, 40.32it/s]


Epoch 20/50, Training Loss: 0.3877
Validation - Tag Acc: 0.9662, SQL Acc: 0.6708


Epoch 21/50: 100%|██████████| 136/136 [00:03<00:00, 40.08it/s]


Epoch 21/50, Training Loss: 0.3467
Validation - Tag Acc: 0.9648, SQL Acc: 0.6605


Epoch 22/50: 100%|██████████| 136/136 [00:03<00:00, 40.16it/s]


Epoch 22/50, Training Loss: 0.3076
Validation - Tag Acc: 0.9643, SQL Acc: 0.6728


Epoch 23/50: 100%|██████████| 136/136 [00:03<00:00, 41.03it/s]


Epoch 23/50, Training Loss: 0.2649
Validation - Tag Acc: 0.9679, SQL Acc: 0.6687


Epoch 24/50: 100%|██████████| 136/136 [00:03<00:00, 40.65it/s]


Epoch 24/50, Training Loss: 0.2359
Validation - Tag Acc: 0.9646, SQL Acc: 0.6872


Epoch 25/50: 100%|██████████| 136/136 [00:03<00:00, 39.43it/s]


Epoch 25/50, Training Loss: 0.2114
Validation - Tag Acc: 0.9652, SQL Acc: 0.6749


Epoch 26/50: 100%|██████████| 136/136 [00:03<00:00, 40.20it/s]


Epoch 26/50, Training Loss: 0.2268
Validation - Tag Acc: 0.9646, SQL Acc: 0.6728


Epoch 27/50: 100%|██████████| 136/136 [00:03<00:00, 39.39it/s]


Epoch 27/50, Training Loss: 0.2082
Validation - Tag Acc: 0.9660, SQL Acc: 0.6770


Epoch 28/50: 100%|██████████| 136/136 [00:03<00:00, 39.36it/s]


Epoch 28/50, Training Loss: 0.1918
Validation - Tag Acc: 0.9656, SQL Acc: 0.6770


Epoch 29/50: 100%|██████████| 136/136 [00:03<00:00, 38.11it/s]


Epoch 29/50, Training Loss: 0.1945
Validation - Tag Acc: 0.9658, SQL Acc: 0.6502


Epoch 30/50: 100%|██████████| 136/136 [00:03<00:00, 40.17it/s]


Epoch 30/50, Training Loss: 0.2198
Validation - Tag Acc: 0.9656, SQL Acc: 0.6543


Epoch 31/50: 100%|██████████| 136/136 [00:03<00:00, 39.76it/s]


Epoch 31/50, Training Loss: 0.1722
Validation - Tag Acc: 0.9633, SQL Acc: 0.6708


Epoch 32/50: 100%|██████████| 136/136 [00:03<00:00, 40.00it/s]


Epoch 32/50, Training Loss: 0.1333
Validation - Tag Acc: 0.9668, SQL Acc: 0.6667


Epoch 33/50: 100%|██████████| 136/136 [00:03<00:00, 39.96it/s]


Epoch 33/50, Training Loss: 0.1152
Validation - Tag Acc: 0.9679, SQL Acc: 0.6790


Epoch 34/50: 100%|██████████| 136/136 [00:03<00:00, 40.33it/s]


Epoch 34/50, Training Loss: 0.0988
Validation - Tag Acc: 0.9671, SQL Acc: 0.6934


Epoch 35/50: 100%|██████████| 136/136 [00:03<00:00, 39.84it/s]


Epoch 35/50, Training Loss: 0.0902
Validation - Tag Acc: 0.9702, SQL Acc: 0.6852


Epoch 36/50: 100%|██████████| 136/136 [00:03<00:00, 38.02it/s]


Epoch 36/50, Training Loss: 0.0770
Validation - Tag Acc: 0.9695, SQL Acc: 0.6934


Epoch 37/50: 100%|██████████| 136/136 [00:03<00:00, 39.41it/s]


Epoch 37/50, Training Loss: 0.0648
Validation - Tag Acc: 0.9714, SQL Acc: 0.6996


Epoch 38/50: 100%|██████████| 136/136 [00:03<00:00, 39.01it/s]


Epoch 38/50, Training Loss: 0.0804
Validation - Tag Acc: 0.9660, SQL Acc: 0.6872


Epoch 39/50: 100%|██████████| 136/136 [00:03<00:00, 39.63it/s]


Epoch 39/50, Training Loss: 0.1193
Validation - Tag Acc: 0.9675, SQL Acc: 0.6687


Epoch 40/50: 100%|██████████| 136/136 [00:03<00:00, 40.45it/s]


Epoch 40/50, Training Loss: 0.1456
Validation - Tag Acc: 0.9637, SQL Acc: 0.6749


Epoch 41/50: 100%|██████████| 136/136 [00:03<00:00, 40.04it/s]


Epoch 41/50, Training Loss: 0.1213
Validation - Tag Acc: 0.9687, SQL Acc: 0.6564


Epoch 42/50: 100%|██████████| 136/136 [00:03<00:00, 39.76it/s]


Epoch 42/50, Training Loss: 0.1201
Validation - Tag Acc: 0.9666, SQL Acc: 0.6831


Epoch 43/50: 100%|██████████| 136/136 [00:03<00:00, 40.22it/s]


Epoch 43/50, Training Loss: 0.0781
Validation - Tag Acc: 0.9668, SQL Acc: 0.6790


Epoch 44/50: 100%|██████████| 136/136 [00:03<00:00, 39.82it/s]


Epoch 44/50, Training Loss: 0.0782
Validation - Tag Acc: 0.9693, SQL Acc: 0.6893


Epoch 45/50: 100%|██████████| 136/136 [00:03<00:00, 40.30it/s]


Epoch 45/50, Training Loss: 0.0554
Validation - Tag Acc: 0.9714, SQL Acc: 0.6914


Epoch 46/50: 100%|██████████| 136/136 [00:03<00:00, 40.48it/s]


Epoch 46/50, Training Loss: 0.0437
Validation - Tag Acc: 0.9679, SQL Acc: 0.6893


Epoch 47/50: 100%|██████████| 136/136 [00:03<00:00, 40.15it/s]


Epoch 47/50, Training Loss: 0.0357
Validation - Tag Acc: 0.9720, SQL Acc: 0.6934


Epoch 48/50: 100%|██████████| 136/136 [00:03<00:00, 40.05it/s]


Epoch 48/50, Training Loss: 0.0356
Validation - Tag Acc: 0.9708, SQL Acc: 0.6975


Epoch 49/50: 100%|██████████| 136/136 [00:03<00:00, 40.95it/s]


Epoch 49/50, Training Loss: 0.0304
Validation - Tag Acc: 0.9689, SQL Acc: 0.6934


Epoch 50/50: 100%|██████████| 136/136 [00:03<00:00, 41.40it/s]


Epoch 50/50, Training Loss: 0.0310
Validation - Tag Acc: 0.9681, SQL Acc: 0.6934


In [12]:
# Evaluating model on validation set
evaluate_model(trained_model, val_loader,dataset=val_dataset)

{'tag_loss': 0.17478251636892125,
 'sql_loss': 3.2299980548183616,
 'tag_acc': 0.9681159420289855,
 'sql_acc': 0.6934156378600823,
 'sql_match_acc': 0.448559670781893}

In [13]:
# Evaluating model on test set
evaluate_model(trained_model, test_loader,dataset=test_dataset)

{'tag_loss': 0.24567377020023065,
 'sql_loss': 7.839518092622693,
 'tag_acc': 0.965818363273453,
 'sql_acc': 0.465324384787472,
 'sql_match_acc': 0.33557046979865773}