In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import re
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def load_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

In [None]:
tables = load_json('/content/drive/MyDrive/ Text to Sql/spider/tables.json')
train_data = load_json('/content/drive/MyDrive/ Text to Sql/spider/train_spider.json')

In [None]:
def extract_and_enrich_queries(data, schema):
    enriched_pairs = []
    for item in data:
        db_id = item['db_id']
        question = item['question']
        sql_query = item['query']
        # Find the corresponding schema
        db_schema = next((s for s in schema if s['db_id'] == db_id), None)
        if db_schema:
            # Optionally enrich or simplify sql_query here using db_schema
            # For example, substituting table and column names with placeholders (simplified)
            for table in db_schema['table_names_original']:
                if table in sql_query:
                    sql_query = sql_query.replace(table, 'table')
            for column_info in db_schema['column_names_original']:
                if column_info[1] in sql_query:
                    sql_query = sql_query.replace(column_info[1], 'column')
        enriched_pairs.append((question, sql_query))
    return enriched_pairs

In [None]:
query_pairs = extract_and_enrich_queries(train_data, tables)


In [None]:
query_pairs

[('How many heads of the departments are older than 56 ?',
  'SELECT count(column) FROM table WHERE column  >  56'),
 ('List the name, born state and age of the heads of departments ordered by age.',
  'SELECT column ,  column ,  column FROM table ORDER BY column'),
 ('List the creation year, name and budget of each department.',
  'SELECT creation ,  column ,  budget_in_billions FROM table'),
 ('What are the maximum and minimum budget of the departments?',
  'SELECT max(budget_in_billions) ,  min(budget_in_billions) FROM table'),
 ('What is the average number of employees of the departments whose rank is between 10 and 15?',
  'SELECT avg(num_employees) FROM table WHERE ranking BETWEEN 10 AND 15'),
 ('What are the names of the heads who are born outside the California state?',
  "SELECT column FROM table WHERE column != 'California'"),
 ("What are the distinct creation years of the departments managed by a secretary born in state 'Alabama'?",
  "SELECT DISTINCT T1.creation FROM table 

In [None]:
def tokenize_and_convert_to_sequences(pairs):
    vocab = {}
    sequences = []
    for question, sql in pairs:
        tokenized_question = word_tokenize(question.lower())
        tokenized_sql = word_tokenize(sql.lower())
        sequences.append((tokenized_question, tokenized_sql))
        for word in tokenized_question + tokenized_sql:
            if word not in vocab:
                vocab[word] = len(vocab) + 1  # Incremental index
    vocab['<pad>'] = 0  # Padding token
    return sequences, vocab

In [None]:
sequences, vocab = tokenize_and_convert_to_sequences(query_pairs)


In [None]:
sequences

[(['how',
   'many',
   'heads',
   'of',
   'the',
   'departments',
   'are',
   'older',
   'than',
   '56',
   '?'],
  ['select',
   'count',
   '(',
   'column',
   ')',
   'from',
   'table',
   'where',
   'column',
   '>',
   '56']),
 (['list',
   'the',
   'name',
   ',',
   'born',
   'state',
   'and',
   'age',
   'of',
   'the',
   'heads',
   'of',
   'departments',
   'ordered',
   'by',
   'age',
   '.'],
  ['select',
   'column',
   ',',
   'column',
   ',',
   'column',
   'from',
   'table',
   'order',
   'by',
   'column']),
 (['list',
   'the',
   'creation',
   'year',
   ',',
   'name',
   'and',
   'budget',
   'of',
   'each',
   'department',
   '.'],
  ['select',
   'creation',
   ',',
   'column',
   ',',
   'budget_in_billions',
   'from',
   'table']),
 (['what',
   'are',
   'the',
   'maximum',
   'and',
   'minimum',
   'budget',
   'of',
   'the',
   'departments',
   '?'],
  ['select',
   'max',
   '(',
   'budget_in_billions',
   ')',
   ',',
   'mi

In [None]:
vocab

{'how': 1,
 'many': 2,
 'heads': 3,
 'of': 4,
 'the': 5,
 'departments': 6,
 'are': 7,
 'older': 8,
 'than': 9,
 '56': 10,
 '?': 11,
 'select': 12,
 'count': 13,
 '(': 14,
 'column': 15,
 ')': 16,
 'from': 17,
 'table': 18,
 'where': 19,
 '>': 20,
 'list': 21,
 'name': 22,
 ',': 23,
 'born': 24,
 'state': 25,
 'and': 26,
 'age': 27,
 'ordered': 28,
 'by': 29,
 '.': 30,
 'order': 31,
 'creation': 32,
 'year': 33,
 'budget': 34,
 'each': 35,
 'department': 36,
 'budget_in_billions': 37,
 'what': 38,
 'maximum': 39,
 'minimum': 40,
 'max': 41,
 'min': 42,
 'is': 43,
 'average': 44,
 'number': 45,
 'employees': 46,
 'whose': 47,
 'rank': 48,
 'between': 49,
 '10': 50,
 '15': 51,
 'avg': 52,
 'num_employees': 53,
 'ranking': 54,
 'names': 55,
 'who': 56,
 'outside': 57,
 'california': 58,
 '!': 59,
 '=': 60,
 "'california": 61,
 "'": 62,
 'distinct': 63,
 'years': 64,
 'managed': 65,
 'a': 66,
 'secretary': 67,
 'in': 68,
 "'alabama": 69,
 't1.creation': 70,
 'as': 71,
 't1': 72,
 'join': 7

In [None]:
def convert_to_ids(sequences, vocab):
    numerical_data = []
    for question_tokens, sql_tokens in sequences:
        question_ids = [vocab[token] for token in question_tokens if token in vocab]
        sql_ids = [vocab[token] for token in sql_tokens if token in vocab]
        numerical_data.append((question_ids, sql_ids))
    return numerical_data

In [None]:
numerical_data = convert_to_ids(sequences, vocab)

In [None]:
print("Sample preprocessed data:", numerical_data[:5])

Sample preprocessed data: [([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [12, 13, 14, 15, 16, 17, 18, 19, 15, 20, 10]), ([21, 5, 22, 23, 24, 25, 26, 27, 4, 5, 3, 4, 6, 28, 29, 27, 30], [12, 15, 23, 15, 23, 15, 17, 18, 31, 29, 15]), ([21, 5, 32, 33, 23, 22, 26, 34, 4, 35, 36, 30], [12, 32, 23, 15, 23, 37, 17, 18]), ([38, 7, 5, 39, 26, 40, 34, 4, 5, 6, 11], [12, 41, 14, 37, 16, 23, 42, 14, 37, 16, 17, 18]), ([38, 43, 5, 44, 45, 4, 46, 4, 5, 6, 47, 48, 43, 49, 50, 26, 51, 11], [12, 52, 14, 53, 16, 17, 18, 19, 54, 49, 50, 26, 51])]


### Dataset

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [None]:
# Configuration
INPUT_DIM = 1000   # Adjust as per your vocabulary size
OUTPUT_DIM = 1000  # Adjust as per your vocabulary size
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
BATCH_SIZE = 32
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Encoder
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, bidirectional=True)
        self.fc = nn.Linear(hid_dim * 2, hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)))
        return outputs, hidden

# Attention
class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 3, 1, bias=False)

    def forward(self, hidden, encoder_outputs, mask):
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = energy.squeeze(2)
        attention = attention.masked_fill(mask == 0, -1e10)
        return torch.softmax(attention, dim=1)

# Decoder
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(hid_dim + emb_dim, hid_dim)
        self.fc_out = nn.Linear(hid_dim * 3, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs, mask):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden, encoder_outputs, mask)
        a = a.unsqueeze(1)
        weighted = torch.bmm(a, encoder_outputs.permute(1, 0, 2))
        weighted = weighted.permute(1, 0, 2)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        assert (output == hidden).all()
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        return prediction, hidden.squeeze(0), a.squeeze(1)


In [None]:
! pip install accelerate



In [None]:
!pip install torch transformers bitsandbytes accelerate sqlparse openai

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import re

model_name = "defog/sqlcoder-7b-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
    use_cache=True,
)

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting openai
  Downloading openai-1.30.2-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidi

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/515 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

ImportError: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install accelerate`