In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


### Load Model and Tokenizer

In [2]:
tokenizer = T5Tokenizer.from_pretrained('C:/Users/DELL/Desktop/Projects/2. NL2SQL/NL2SQL/Backend/model_and_tokenizer/tokenizer-final')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
model = T5ForConditionalGeneration.from_pretrained('C:/Users/DELL/Desktop/Projects/2. NL2SQL/NL2SQL/Backend/model_and_tokenizer/model-final')


### CUDA Activation if available

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Activating Model

In [5]:
model = model.to(device)
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [6]:
def generate_sql(input_prompt):
    # Tokenize the input prompt
    inputs = tokenizer(input_prompt, padding=True, truncation=True, return_tensors="pt").to(device)

    # Forward pass
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=512)

    # Decode the output IDs to a string (SQL query in this case)
    generated_sql = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return generated_sql

In [7]:
# Test the function
input_prompt = "tables:\n" + "CREATE TABLE student_course_attendance (student_id VARCHAR); CREATE TABLE students (student_id VARCHAR)" + "\n" + "query for:" + "List the id of students who never attends courses?"

generated_sql = generate_sql(input_prompt)

print(f"The generated SQL query is: {generated_sql}")
#OUTPUT: The generated SQL query is: SELECT student_id FROM students WHERE NOT student_id IN (SELECT student_id FROM student_course_attendance)

The generated SQL query is: SELECT student_id FROM students WHERE NOT student_id IN (SELECT student_id FROM student_course_attendance)


In [8]:
input_prompt = "tables:\n" + "CREATE TABLE Catalogs (date_of_latest_revision VARCHAR)" + "\n" +"query for: Find the dates on which more than one revisions were made."

generated_sql = generate_sql(input_prompt)

print(f"The generated SQL query is: {generated_sql}")

The generated SQL query is: SELECT date_of_latest_revision FROM Catalogs GROUP BY date_of_latest_revision HAVING COUNT(*) > 1


In [9]:
input_prompt = "tables:\n" + "CREATE TABLE table_22767 ( \"Year\" real, \"World\" real, \"Asia\" text, \"Africa\" text, \"Europe\" text, \"Latin America/Caribbean\" text, \"Northern America\" text, \"Oceania\" text )" + "\n" +"query for:what will the population of Asia be when Latin America/Caribbean is 783 (7.5%)?."

generated_sql = generate_sql(input_prompt)

print(f"The generated SQL query is: {generated_sql}")

The generated SQL query is: SELECT "Asia" FROM table_22767 WHERE "Latin America/Caribbean" = '783 (7.5%)'


In [10]:
input_prompt = "tables:\n" + "CREATE TABLE procedures ( subject_id text, hadm_id text, icd9_code text, short_title text, long_title text ) CREATE TABLE diagnoses ( subject_id text, hadm_id text, icd9_code text, short_title text, long_title text ) CREATE TABLE lab ( subject_id text, hadm_id text, itemid text, charttime text, flag text, value_unit text, label text, fluid text ) CREATE TABLE demographic ( subject_id text, hadm_id text, name text, marital_status text, age text, dob text, gender text, language text, religion text, admission_type text, days_stay text, insurance text, ethnicity text, expire_flag text, admission_location text, discharge_location text, diagnosis text, dod text, dob_year text, dod_year text, admittime text, dischtime text, admityear text ) CREATE TABLE prescriptions ( subject_id text, hadm_id text, icustay_id text, drug_type text, drug text, formulary_drug_cd text, route text, drug_dose text )" + "\n" +"query for:" + "what is the total number of patients who were diagnosed with icd9 code 2254?"

generated_sql = generate_sql(input_prompt)

print(f"The generated SQL query is: {generated_sql}")

The generated SQL query is: SELECT COUNT(DISTINCT demographic.subject_id) FROM demographic INNER JOIN diagnoses ON demographic.hadm_id = diagnoses.hadm_id WHERE diagnoses.icd9_code = "2254"


In [11]:
input_prompt = "tables:\n" + "CREATE TABLE table_22767 (Year, World, Asia, Africa, Europe, Latin America/Caribbean, Northern America, Oceania) CREATE TABLE table_83210(Year, Country, GDP)" + "\n" +"query for: what is the GDP of countries where the population of Asia exceeds 500 million in the year 2020?"
 

generated_sql = generate_sql(input_prompt)

print(f"The generated SQL query is: {generated_sql}")


The generated SQL query is: SELECT AVG(PIB) FROM table_83210 WHERE Asia > 500 AND Asia = "2020" AND YEAR = "2020"


In [None]:
input_prompt = "tables: CREATE TABLE departments("department_id" int NOT NULL AUTO_INCREMENT,
  "department_name" varchar(100) NOT NULL,
  PRIMARY KEY ("department_id")
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; CREATE TABLE "projects" (
  "project_id" int NOT NULL AUTO_INCREMENT,
  "project_name" varchar(100) NOT NULL,
  "department_id" int DEFAULT NULL,
  PRIMARY KEY ("project_id"),
  KEY "department_id" ("department_id"),
  CONSTRAINT "projects_ibfk_1" FOREIGN KEY ("department_id") REFERENCES "departments" ("department_id")
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
query for: test"