In [1]:
!nvidia-smi

Sun Mar 19 23:55:08 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.18                 Driver Version: 531.18       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 Ti    WDDM | 00000000:0B:00.0  On |                  N/A |
| 53%   42C    P2               76W / 310W|   7733MiB /  8192MiB |      4%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## T5 Model Version and Tokenizer

In [2]:
from transformers import AutoTokenizer, T5ForConditionalGeneration

### Version

In [3]:
CKPT = 'output/t5_base_finetuned_cosql'

### Model and Tokenizer

In [4]:
model = T5ForConditionalGeneration.from_pretrained(CKPT)
tokenizer = AutoTokenizer.from_pretrained(CKPT)

## Data Loading and Preprocessing

In [5]:
import json
import pandas as pd
from datasets import load_dataset, Dataset

### Loading

In [6]:
with open('data\cosql_dataset\sql_state_tracking\cosql_train.json', 'r') as f:
    train_data = json.load(f)
with open('data\cosql_dataset\sql_state_tracking\cosql_dev.json', 'r') as f:
    test_data = json.load(f)

In [7]:
def process_cosql(dataset, scope: str) -> pd.DataFrame:
    scope_valid = ['all', 'final', 'interaction']
    if scope not in scope_valid:
        raise ValueError(f'scope must be one of {scope_valid}')

    processed_input = []
    processed_target = []
    for dialog in dataset:
        if scope in ['all', 'final']:
            processed_input.append(dialog['final']['utterance'])
            processed_target.append(dialog['final']['query'])
        if scope in ['all', 'interaction']:
            for turn in dialog['interaction']:
                processed_input.append(turn['utterance'])
                processed_target.append(turn['query'])
                
    processed_dataset = pd.DataFrame({
        'input': processed_input,
        'target': processed_target
    })

    return processed_dataset

In [8]:
train_df_final = process_cosql(train_data, scope='final')
test_df_final = process_cosql(test_data, scope='final')

In [9]:
train_dataset_final = Dataset.from_pandas(train_df_final)
test_dataset_final = Dataset.from_pandas(test_df_final)

### Transformation

In [10]:
def format_dataset(example):
    return {'input': 'translate to SQL: ' + example['input'], 'target': example['target']}

In [11]:
train_dataset = train_dataset_final.map(format_dataset, remove_columns=train_dataset_final.column_names)
test_dataset = test_dataset_final.map(format_dataset, remove_columns=test_dataset_final.column_names)

Map:   0%|          | 0/2159 [00:00<?, ? examples/s]

Map:   0%|          | 0/293 [00:00<?, ? examples/s]

### Tokenization

In [12]:
# tokenize the examples
def convert_to_features(example_batch):
    input_encodings = tokenizer.batch_encode_plus(example_batch['input'], pad_to_max_length=True, max_length=64)
    target_encodings = tokenizer.batch_encode_plus(example_batch['target'], pad_to_max_length=True, max_length=64)

    encodings = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

In [13]:
train_dataset = train_dataset.map(convert_to_features, batched=True, remove_columns=train_dataset.column_names)
test_dataset = test_dataset.map(convert_to_features, batched=True, remove_columns=test_dataset.column_names)

columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']

train_dataset.set_format(type='torch', columns=columns)
test_dataset.set_format(type='torch', columns=columns)

Map:   0%|          | 0/2159 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Map:   0%|          | 0/293 [00:00<?, ? examples/s]

## Model Training

In [14]:
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments

### Trainer Args

In [20]:
# set training arguments - Feel free to adapt it
training_args = Seq2SeqTrainingArguments(
    output_dir="output/t5_base_wikisql_finetuned_cosql",
    per_device_train_batch_size=16,
    num_train_epochs=5,
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    do_train=True,
    do_eval=True,
    logging_strategy="epoch",
    # logging_steps=100,
    save_strategy="epoch",
    #save_steps=1000,
    #eval_steps=1000,
    overwrite_output_dir=True,
    save_total_limit=3,
    load_best_model_at_end=True,
    push_to_hub=False,
    #fp16=True, 
    log_level="info",
    logging_dir="output/t5_base_wikisql_finetuned_cosql/log",
    report_to="all",
)

PyTorch: setting up devices


### Metrics

In [21]:
from datasets import load_metric
import evaluate
rouge = load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

### Trainer

In [22]:
# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [23]:
# trainer.evaluate()

In [24]:
trainer.train()

***** Running training *****
  Num examples = 2159
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 675
  Number of trainable parameters = 222903552


  0%|          | 0/675 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 293
  Batch size = 16
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



{'loss': 0.3031, 'learning_rate': 4e-05, 'epoch': 1.0}


  0%|          | 0/19 [00:00<?, ?it/s]

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

{'eval_loss': 0.6724526286125183, 'eval_rouge2_precision': 0.4076, 'eval_rouge2_recall': 0.2013, 'eval_rouge2_fmeasure': 0.2577, 'eval_runtime': 9.2895, 'eval_samples_per_second': 31.541, 'eval_steps_per_second': 2.045, 'epoch': 1.0}


Model weights saved in output/t5_base_wikisql_finetuned_cosql\checkpoint-135\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 293
  Batch size = 16
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



{'loss': 0.2741, 'learning_rate': 3e-05, 'epoch': 2.0}


  0%|          | 0/19 [00:00<?, ?it/s]

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

{'eval_loss': 0.6812426447868347, 'eval_rouge2_precision': 0.4198, 'eval_rouge2_recall': 0.2091, 'eval_rouge2_fmeasure': 0.2679, 'eval_runtime': 8.5741, 'eval_samples_per_second': 34.173, 'eval_steps_per_second': 2.216, 'epoch': 2.0}


Model weights saved in output/t5_base_wikisql_finetuned_cosql\checkpoint-270\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 293
  Batch size = 16
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



{'loss': 0.2716, 'learning_rate': 2e-05, 'epoch': 3.0}


  0%|          | 0/19 [00:00<?, ?it/s]

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

{'eval_loss': 0.6743648648262024, 'eval_rouge2_precision': 0.4255, 'eval_rouge2_recall': 0.2112, 'eval_rouge2_fmeasure': 0.2708, 'eval_runtime': 8.4997, 'eval_samples_per_second': 34.472, 'eval_steps_per_second': 2.235, 'epoch': 3.0}


Model weights saved in output/t5_base_wikisql_finetuned_cosql\checkpoint-405\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 293
  Batch size = 16
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



{'loss': 0.2871, 'learning_rate': 1e-05, 'epoch': 4.0}


  0%|          | 0/19 [00:00<?, ?it/s]

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

{'eval_loss': 0.668492317199707, 'eval_rouge2_precision': 0.4426, 'eval_rouge2_recall': 0.216, 'eval_rouge2_fmeasure': 0.2784, 'eval_runtime': 8.4727, 'eval_samples_per_second': 34.582, 'eval_steps_per_second': 2.242, 'epoch': 4.0}


Model weights saved in output/t5_base_wikisql_finetuned_cosql\checkpoint-540\pytorch_model.bin
Deleting older checkpoint [output\t5_base_wikisql_finetuned_cosql\checkpoint-135] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 293
  Batch size = 16
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



{'loss': 0.3187, 'learning_rate': 0.0, 'epoch': 5.0}


  0%|          | 0/19 [00:00<?, ?it/s]

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_

{'eval_loss': 0.6602781414985657, 'eval_rouge2_precision': 0.4398, 'eval_rouge2_recall': 0.2153, 'eval_rouge2_fmeasure': 0.2777, 'eval_runtime': 9.0312, 'eval_samples_per_second': 32.443, 'eval_steps_per_second': 2.104, 'epoch': 5.0}


Model weights saved in output/t5_base_wikisql_finetuned_cosql\checkpoint-675\pytorch_model.bin
Deleting older checkpoint [output\t5_base_wikisql_finetuned_cosql\checkpoint-270] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from output/t5_base_wikisql_finetuned_cosql\checkpoint-675 (score: 0.6602781414985657).


{'train_runtime': 236.3041, 'train_samples_per_second': 45.683, 'train_steps_per_second': 2.856, 'train_loss': 0.2909071802209925, 'epoch': 5.0}


TrainOutput(global_step=675, training_loss=0.2909071802209925, metrics={'train_runtime': 236.3041, 'train_samples_per_second': 45.683, 'train_steps_per_second': 2.856, 'train_loss': 0.2909071802209925, 'epoch': 5.0})

In [25]:
trainer.save_model()

Saving model checkpoint to output/t5_base_wikisql_finetuned_cosql
Configuration saved in output/t5_base_wikisql_finetuned_cosql\config.json
Configuration saved in output/t5_base_wikisql_finetuned_cosql\generation_config.json
Model weights saved in output/t5_base_wikisql_finetuned_cosql\pytorch_model.bin


In [28]:
tokenizer.save_pretrained('output/t5_base_wikisql_finetuned_cosql')

tokenizer config file saved in output/t5_base_wikisql_finetuned_cosql\tokenizer_config.json
Special tokens file saved in output/t5_base_wikisql_finetuned_cosql\special_tokens_map.json
Copy vocab file to output/t5_base_wikisql_finetuned_cosql\spiece.model


('output/t5_base_wikisql_finetuned_cosql\\tokenizer_config.json',
 'output/t5_base_wikisql_finetuned_cosql\\special_tokens_map.json',
 'output/t5_base_wikisql_finetuned_cosql\\spiece.model',
 'output/t5_base_wikisql_finetuned_cosql\\added_tokens.json',
 'output/t5_base_wikisql_finetuned_cosql\\tokenizer.json')

## Model Inference on WikiSQL Test Set

### Fine Tuned Model

In [29]:
CKPT = 'output/t5_base_wikisql_finetuned_cosql'
model_tuned = T5ForConditionalGeneration.from_pretrained(CKPT)
tokenizer_tuned = AutoTokenizer.from_pretrained(CKPT)

loading configuration file output/t5_base_wikisql_finetuned_cosql\config.json
Model config T5Config {
  "_name_or_path": "output/t5_base_finetuned_cosql",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
   

### Inference Set

In [30]:
test_data[0]['final']['utterance']

'How many car models are produced by each maker? List the count and the maker full name.'

In [31]:
inference_data = test_data

In [32]:
def translate_to_sql(text, tokenizer, model):
    inputs = tokenizer(text, padding='longest', max_length=64, return_tensors='pt')
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask
    output = model.generate(input_ids, attention_mask=attention_mask, max_length=64)

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [33]:
for i in range(0,100,10):
    print('translate to SQL: ' + inference_data[i]['final']['utterance'])
    print('Predict. :' + translate_to_sql('translate to SQL: ' + inference_data[i]['final']['utterance'], tokenizer=tokenizer_tuned, model=model_tuned))
    print('Expected: ' + inference_data[i]['final']['query'])
    print('=================================\n')
     

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



translate to SQL: How many car models are produced by each maker? List the count and the maker full name.


Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Predict. :SELECT count(*), T1.maker_full_name FROM car_model AS T1 JOIN maker AS T2 ON T1.car_model_id = T2.car_model_id GROUP BY T1.maker_full_name
Expected: SELECT Count(*) ,  T2.FullName ,  T2.id FROM MODEL_LIST AS T1 JOIN CAR_MAKERS AS T2 ON T1.Maker  =  T2.Id GROUP BY T2.id;

translate to SQL: Find the manager name and district of the shop whose number of products is the largest.


Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Predict. :SELECT T1.manager_name, T1.shop_district FROM products AS T1 JOIN shop AS T2 ON T1.shop_id = T2.shop_id GROUP BY T1.manager_name ORDER BY count(*) DESC
Expected: SELECT manager_name ,  district FROM shop ORDER BY number_products DESC LIMIT 1

translate to SQL: Show distinct names of singers that have songs with sales more than 300000.


Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Predict. :SELECT DISTINCT T1.Name FROM artist AS T1 JOIN song AS T2 ON T1.Song_ID = T2.Song_ID WHERE T2.Sales > 300000
Expected: SELECT DISTINCT T1.Name FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID  =  T2.Singer_ID WHERE T2.Sales  >  300000

translate to SQL: List the title of all cartoon directed by "Ben Jones" or "Brandon Vietti".


Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Predict. :SELECT title FROM cartoon WHERE director = "Ben Jones" OR director = "Brandon Vietti"
Expected: SELECT Title FROM Cartoon WHERE Directed_by = "Ben Jones" OR Directed_by = "Brandon Vietti";

translate to SQL: What is the birth date of the poker player with the lowest earnings?


Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Predict. :SELECT birth_date FROM player ORDER BY earnings LIMIT 1
Expected: SELECT T1.Birth_Date FROM people AS T1 JOIN poker_player AS T2 ON T1.People_ID  =  T2.People_ID ORDER BY T2.Earnings ASC LIMIT 1

translate to SQL: Show distinct names of singers that have songs with sales more than 300000.


Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Predict. :SELECT DISTINCT T1.Name FROM artist AS T1 JOIN song AS T2 ON T1.Song_ID = T2.Song_ID WHERE T2.Sales > 300000
Expected: SELECT DISTINCT T1.Name FROM singer AS T1 JOIN song AS T2 ON T1.Singer_ID  =  T2.Singer_ID WHERE T2.Sales  >  300000

translate to SQL: How many cars have a larger accelerate than the car with the largest horsepower?


Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Predict. :SELECT count(*) FROM car WHERE acceleration > (SELECT max(acceleration) FROM car WHERE horsepower = )
Expected: SELECT COUNT(*) FROM CARS_DATA WHERE Accelerate  >  ( SELECT Accelerate FROM CARS_DATA ORDER BY Horsepower DESC LIMIT 1 );

translate to SQL: Who is the first student to register? List the first name, middle name and last name.


Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Predict. :SELECT T1.first_name, T1.mid_name, T1.last_name FROM Student AS T1 JOIN Student_Registration AS T2 ON T1.student_id = T2.student_id GROUP BY T1.student_
Expected: SELECT first_name ,  middle_name ,  last_name FROM Students ORDER BY date_first_registered ASC LIMIT 1

translate to SQL: Find the codes of countries that have more than 50 players.


Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Predict. :SELECT country_code FROM player GROUP BY country_code HAVING count(*) > 50
Expected: SELECT country_code FROM players GROUP BY country_code HAVING count(*)  >  50

translate to SQL: How many times at most can a course enrollment result show in different transcripts? Also show the course enrollment id.
Predict. :SELECT T1.course_enrollment_id FROM Course_Enrolment AS T1 JOIN Course_Enrolment AS T2 ON T1.course_id = T2.course_id GROUP BY T1.course_enrollment_id
Expected: SELECT count(*) ,  student_course_id FROM Transcript_Contents GROUP BY student_course_id ORDER BY count(*) DESC LIMIT 1

