In [None]:
# Mount Google Drive at /content/drive; the dataset files and the checkpoint
# path used later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Preprocessing Functions

In [None]:
!pip install fuzzywuzzy -q

In [None]:
import re
import pandas as pd
import nltk
import json
from nltk.tokenize import word_tokenize
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [None]:
# word_tokenize needs the Punkt tokenizer data. Older NLTK releases look it up
# as 'punkt', newer ones as 'punkt_tab', so request both to keep the notebook
# runnable regardless of the installed NLTK version.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def normalize_text(text):
    """Return ``text`` lower-cased with every disallowed character removed.

    Characters that survive: ASCII letters and digits, whitespace, and the
    symbols ``, . * = < > ( )``. Anything else is deleted outright (it is not
    replaced by a space).
    """
    lowered = text.lower()
    cleaned = re.sub(r'[^a-zA-Z0-9\s,.*=<>()]', '', lowered)
    return cleaned

def tokenize_text(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def parse_schema(schema):
    """Map each table name in ``schema`` to the list of its column names.

    Args:
        schema: Mapping with a ``'table_names'`` sequence and a
            ``'column_names'`` sequence whose entries are indexable as
            ``(table_index, column_name)``.

    Returns:
        dict: ``{table_name: {'columns': [column_name, ...]}}`` with columns
        kept in their original order. Column entries whose table index does
        not correspond to a position in ``'table_names'`` are left out, and a
        table without columns maps to an empty list. If a table name occurs
        more than once, the last occurrence wins.
    """
    # Bucket the columns by table index in a single pass instead of
    # rescanning the whole column list once per table.
    columns_by_table = {}
    for column in schema['column_names']:
        columns_by_table.setdefault(column[0], []).append(column[1])

    return {
        table_name: {'columns': columns_by_table.get(table_idx, [])}
        for table_idx, table_name in enumerate(schema['table_names'])
    }

def link_question_to_schema(question, tables, threshold=80):
    """Link question tokens to the schema element they most closely resemble.

    Every token of ``question`` is compared, case-insensitively and with
    ``fuzz.partial_ratio``, against each table name and each column name in
    ``tables``. The single best-scoring element is kept for the token when its
    score is strictly greater than ``threshold``.

    Args:
        question: Question text; it is split with ``tokenize_text``.
        tables: Mapping ``{table_name: {'columns': [column_name, ...]}}``.
        threshold: Score a match must exceed to be recorded. Defaults to 80.

    Returns:
        dict: ``{token: ('table', table_name)}`` or
        ``{token: ('column', column_name)}`` for every linked token. A token
        that occurs several times appears once. On equal scores the element
        encountered first wins (tables in mapping order, each table name
        before its own columns).
    """
    # NOTE(review): partial_ratio rewards substring matches, so very short
    # tokens may link spuriously - consider a minimum token length; confirm
    # against real questions before changing.

    # Lower-case every schema name once, preserving the scan order that
    # decides ties, instead of redoing it for every token.
    candidates = []
    for table_name, info in tables.items():
        candidates.append((('table', table_name), table_name.lower()))
        for column in info['columns']:
            candidates.append((('column', column), column.lower()))

    linked_elements = {}
    for token in tokenize_text(question):
        token_lower = token.lower()
        best_match = None
        highest_score = 0
        for match, name_lower in candidates:
            score = fuzz.partial_ratio(token_lower, name_lower)
            # Strict '>' keeps the earliest candidate on ties.
            if score > highest_score:
                best_match, highest_score = match, score
        if highest_score > threshold:
            linked_elements[token] = best_match
    return linked_elements

In [None]:
def preprocess_data(data, tables):
    """Normalize, tokenize and schema-link every example in ``data``.

    Args:
        data: Iterable of mappings that provide ``'question'`` and ``'query'``.
        tables: Parsed schema handed to ``link_question_to_schema``.

    Returns:
        tuple: ``(processed_data, questions, schema_links, sql_queries)``.
        ``processed_data`` holds one dict per example with the keys
        ``'original_question'``, ``'normalized_question'``,
        ``'tokenized_question'``, ``'schema_links'`` and ``'sql_query'``. The
        other three lists hold, in the same order, the original questions,
        the schema links and the SQL queries.
    """
    processed_data = []
    for entry in data:
        raw_question = entry['question']
        cleaned = normalize_text(raw_question)
        record = {
            'original_question': raw_question,
            'normalized_question': cleaned,
            'tokenized_question': tokenize_text(cleaned),
            # Linking runs on the normalized text, not on the raw question.
            'schema_links': link_question_to_schema(cleaned, tables),
            'sql_query': entry['query'],
        }
        processed_data.append(record)

    # The parallel lists are plain projections of the collected records.
    questions = [record['original_question'] for record in processed_data]
    schema_links = [record['schema_links'] for record in processed_data]
    sql_queries = [record['sql_query'] for record in processed_data]
    return processed_data, questions, schema_links, sql_queries

In [None]:
def preprocess_all_data(train_data, db_schemas):
    """Preprocess every training example against the schema of its database.

    Args:
        train_data: Iterable of examples; each provides ``'db_id'`` plus the
            fields ``preprocess_data`` reads.
        db_schemas: Iterable of schema mappings, each with a ``'db_id'`` and
            the fields ``parse_schema`` reads.

    Returns:
        tuple: ``(all_processed_data, df)`` where ``all_processed_data`` is
        the list of per-example records and ``df`` is a DataFrame with the
        columns ``question``, ``schema_links`` and ``sql_query``.

    Raises:
        KeyError: If an example names a ``db_id`` missing from ``db_schemas``.
    """
    # Parse each schema once up front, keyed by database id.
    tables_by_db = {}
    for db_schema in db_schemas:
        tables_by_db[db_schema['db_id']] = parse_schema(db_schema)

    all_processed_data = []
    questions_all = []
    schema_links_all = []
    sql_queries_all = []

    for example in train_data:
        tables = tables_by_db[example['db_id']]
        # preprocess_data works on a batch, so wrap the single example.
        records, questions, links, queries = preprocess_data([example], tables)
        all_processed_data += records
        questions_all += questions
        schema_links_all += links
        sql_queries_all += queries

    # Tabular view built from the accumulated parallel lists.
    df = pd.DataFrame({
        'question': questions_all,
        'schema_links': schema_links_all,
        'sql_query': sql_queries_all
    })
    return all_processed_data, df


### Loading JSON Data and Preprocessing It

In [None]:
import json

In [None]:
def load_json_file(file_path):
    """Read and parse the JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (for example a list or a dict).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode as UTF-8 explicitly so non-ASCII text is read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [None]:
# Load the schema definitions (tables.json) and the training examples
# (train_spider.json) from the Drive folder mounted earlier.
db_schemas = load_json_file('/content/drive/MyDrive/ Text to Sql/spider/tables.json')
train_data = load_json_file('/content/drive/MyDrive/ Text to Sql/spider/train_spider.json')

In [None]:
# Preprocess every training example against its own database schema; yields the
# list of per-example records and a DataFrame view of the same data.
processed_train_data, df_preprocessed_training_data_all = preprocess_all_data(train_data, db_schemas)

### Model Architecture

In [None]:
from transformers import BertModel, BertTokenizer
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

In [None]:
class TextToSQLModel(nn.Module):
    """Encoder-decoder network that scores vocabulary tokens per target position.

    A pretrained ``bert-base-uncased`` encoder reads the question; a 6-layer
    ``nn.TransformerDecoder`` (``d_model=768``, 8 heads) attends over the
    encoder states while processing the embedded target tokens, and a linear
    layer projects each decoder state to vocabulary logits.

    Args:
        tokenizer_vocab_size: Size of the target vocabulary; used for both the
            target embedding table and the output projection.
    """

    def __init__(self, tokenizer_vocab_size):
        super(TextToSQLModel, self).__init__()
        self.encoder = BertModel.from_pretrained('bert-base-uncased')
        # batch_first=True because both the encoder states and the target
        # embeddings fed to the decoder are laid out as (batch, seq, 768);
        # the default (seq, batch, dim) layout would mix up the two axes.
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=768, nhead=8, batch_first=True),
            num_layers=6,
        )
        self.embedding = nn.Embedding(tokenizer_vocab_size, 768)
        self.output_linear = nn.Linear(768, tokenizer_vocab_size)

    def forward(self, input_ids, attention_mask, tgt_tokens):
        """Compute vocabulary logits for every position of ``tgt_tokens``.

        Args:
            input_ids: Question token ids, shape ``(batch, src_len)``.
            attention_mask: Mask for ``input_ids`` with 1 on real tokens and 0
                on padding, or ``None`` to attend to every source position.
            tgt_tokens: Target token ids, shape ``(batch, tgt_len)``.

        Returns:
            Tensor of shape ``(batch, tgt_len, tokenizer_vocab_size)``. The
            logits at position ``t`` depend only on target tokens ``<= t``.
        """
        encoder_hidden_states = self.encoder(input_ids=input_ids, attention_mask=attention_mask)[0]
        tgt_embeddings = self.embedding(tgt_tokens)  # (batch, tgt_len, 768)

        # Causal mask (True = blocked): a target position must not attend to
        # later positions, otherwise the decoder can read tokens it has not
        # produced yet.
        tgt_len = tgt_tokens.size(1)
        causal_mask = torch.triu(
            torch.ones(tgt_len, tgt_len, device=tgt_tokens.device), diagonal=1
        ).bool()

        # Keep cross-attention away from padded source positions
        # (True = ignore this encoder position).
        memory_padding_mask = None
        if attention_mask is not None:
            memory_padding_mask = attention_mask == 0

        decoder_outputs = self.decoder(
            tgt_embeddings,
            encoder_hidden_states,
            tgt_mask=causal_mask,
            memory_key_padding_mask=memory_padding_mask,
        )
        return self.output_linear(decoder_outputs)


In [None]:
class SQLDataset(Dataset):
    """Dataset that turns preprocessed examples into fixed-length id tensors.

    Args:
        data: Indexable collection of records providing
            ``'normalized_question'`` and ``'sql_query'``.
        tokenizer: Object exposing ``encode_plus`` and ``encode``.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode example ``idx``.

        Returns:
            dict with the tensors ``'input_ids'`` and ``'attention_mask'``
            (from the normalized question) and ``'labels'`` (from the SQL
            query).
        """
        example = self.data[idx]

        # Question and query share one length / padding / truncation policy.
        shared = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        encoded_question = self.tokenizer.encode_plus(
            example['normalized_question'],
            None,
            return_attention_mask=True,
            **shared
        )
        target_ids = self.tokenizer.encode(example['sql_query'], **shared)

        return {
            'input_ids': torch.tensor(encoded_question['input_ids']),
            'attention_mask': torch.tensor(encoded_question['attention_mask']),
            'labels': torch.tensor(target_ids),
        }


In [None]:
from tqdm import tqdm

def train_model(model, data_loader, optimizer, device, epochs=3, pad_token_id=0):
    """Train ``model`` with teacher forcing for ``epochs`` passes over the data.

    Args:
        model: Module called as ``model(input_ids, attention_mask, tgt_tokens)``
            that returns logits shaped ``(batch, tgt_len, vocab)``.
        data_loader: Sized iterable of batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer that updates the model's parameters.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``data_loader``.
        pad_token_id: Label id left out of the loss. The default 0 is assumed
            to be the ``[PAD]`` id of the BERT tokenizer used in this notebook
            (confirm with ``tokenizer.pad_token_id``). Pass ``None`` to score
            every position.

    Returns:
        str or None: Summary line of the last epoch
        (``'End of Epoch N, Average Loss: X'``), or ``None`` when ``epochs``
        is 0. The summary of every epoch is also printed.
    """
    # -100 is CrossEntropyLoss's own default ignore value; real token ids are
    # never negative, so it disables ignoring when no pad id is given.
    ignore_index = -100 if pad_token_id is None else pad_token_id
    criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)

    model.train()
    summary = None
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(data_loader, desc=f'Epoch {epoch+1}/{epochs}', leave=False)
        for batch in progress_bar:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Teacher forcing: the decoder reads tokens [0, T-1) and is scored
            # on tokens [1, T), i.e. it must predict the *next* token rather
            # than echo the one it was just given.
            decoder_input = labels[:, :-1]
            targets = labels[:, 1:]

            outputs = model(input_ids, attention_mask, decoder_input)
            # CrossEntropyLoss expects the class dimension second.
            loss = criterion(outputs.transpose(1, 2), targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix({'Batch Loss': loss.item()})

        avg_loss = total_loss / len(data_loader)
        summary = f'End of Epoch {epoch+1}, Average Loss: {avg_loss:.4f}'
        print(summary)

    # Returned only after the final epoch; returning inside the loop would
    # stop training after the first pass.
    return summary




In [None]:
def evaluate_model(model, data_loader, device, pad_token_id=0):
    """Return the mean next-token loss of ``model`` over ``data_loader``.

    The model is put in eval mode and run without gradient tracking. As in
    teacher-forced training, the decoder reads label tokens ``[0, T-1)`` and
    is scored against tokens ``[1, T)``.

    Args:
        model: Module called as ``model(input_ids, attention_mask, tgt_tokens)``
            that returns logits shaped ``(batch, tgt_len, vocab)``.
        data_loader: Sized iterable of batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to.
        pad_token_id: Label id left out of the loss. The default 0 is assumed
            to be the ``[PAD]`` id of the BERT tokenizer used in this notebook
            (confirm with ``tokenizer.pad_token_id``). Pass ``None`` to score
            every position.

    Returns:
        float: Average of the per-batch losses.

    Raises:
        ZeroDivisionError: If ``data_loader`` is empty.
    """
    # -100 is CrossEntropyLoss's own default ignore value; real token ids are
    # never negative, so it disables ignoring when no pad id is given.
    ignore_index = -100 if pad_token_id is None else pad_token_id
    criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)

    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Shift by one so each position is scored on the following token.
            outputs = model(input_ids, attention_mask, labels[:, :-1])
            # CrossEntropyLoss expects the class dimension second.
            loss = criterion(outputs.transpose(1, 2), labels[:, 1:])
            total_loss += loss.item()
    return total_loss / len(data_loader)


In [None]:
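# NOTE: superseded draft kept for reference only. TextToSQLModel() below omits
# the required tokenizer_vocab_size argument; the next cell is the working setup.
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = TextToSQLModel().to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)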




In [None]:
# Tokenizer shared by the question inputs and the SQL targets, and the device
# (GPU when available, otherwise CPU) that the model and batches are placed on.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Obtain the vocabulary size from the tokenizer; it sizes both the target
# embedding table and the output projection of the model.
tokenizer_vocab_size = tokenizer.vocab_size

# Initialize the model with the required vocabulary size and move it to the device
model = TextToSQLModel(tokenizer_vocab_size).to(device)

# Set up the optimizer over all model parameters (encoder included)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Wrap the preprocessed records in a Dataset (default max_length of 512) and
# serve them in shuffled batches of 8.
train_dataset = SQLDataset(processed_train_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)



In [None]:
# Run training with train_model's default epoch setting; the return value is a
# formatted summary string, not a numeric loss.
train_loss = train_model(model, train_loader, optimizer, device)



In [None]:
# train_loss holds the summary text returned by train_model, so this prints a
# sentence (epoch number and average loss) after the label.
print("Training loss:", train_loss)

Training loss: End of Epoch 1, Average Loss: 0.1412


### Save the Model

In [None]:
def save_model(model, optimizer, save_path):
    """Write a checkpoint holding the model and optimizer state to ``save_path``.

    The checkpoint is a dict with the keys ``'model_state_dict'`` and
    ``'optimizer_state_dict'``, serialized with ``torch.save``.
    """
    checkpoint = dict(
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
    )
    torch.save(checkpoint, save_path)

# Example usage: persist the trained model and optimizer state to Drive so the
# checkpoint outlives the Colab session.
save_path = '/content/drive/MyDrive/ Text to Sql/model_checkpoint.pth'
save_model(model, optimizer, save_path)