# Fine-Tuning T5 Transformer Model on NL2SQL Dataset

This notebook fine-tunes the T5 model architecture on a custom NL2SQL dataset.

Adapted From:
* https://github.com/DorBernsohn/CodeLM/tree/main/SQLM

## Install Packages

In [None]:
#Note: There are issues with the Jupyter notebook progress bar, so you can skip this installation
#! pip install ipywidgets
#!jupyter nbextension enable --py widgetsnbextension
#! conda install nodejs -c conda-forge --repodata-fn=repodata.json #Run it in terminal if not working here
#! jupyter labextension install @jupyter-widgets/jupyterlab-manager

## Import Packages

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append('../src/')

In [None]:
import os
import pandas as pd
import json
import argparse
from nlp import load_metric
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
from transformers import T5Tokenizer
import shutil
from sklearn.model_selection import train_test_split

from model_miguel import T5FineTuner, set_seed
import dataset as ds
from callback import LoggingCallback, logger
from config import model_params

from modelperformance import *

In [None]:
#!pip install -U pytorch-lightning

In [None]:
#!pip install spacy
#! python -m spacy download en_core_web_sm

In [None]:
pl.__version__

## Data Preprocessing

In [None]:
# Tag interpolated into the names of the input CSV files below.
SPLIT_LEVEL = 'input-level'

# Row limit handed to pd.read_csv when loading the data; None reads every row.
NROWS = None

# DATA_DIR holds the input CSVs; OUTPUT_DIR receives the down-sampled CSVs.
DATA_DIR = '/home/ec2-user/SageMaker/efs/data/pilot_nl2sql_dev/processed_equivalent_questions/'
OUTPUT_DIR = '/home/ec2-user/SageMaker/efs/data/pilot_nl2sql_dev/final_equivalent_questions/'

TRAIN_FNAME = f'pilot_questions_for_labeling_{SPLIT_LEVEL}_training_updated.csv'
VAL_FNAME = f'pilot_questions_for_labeling_{SPLIT_LEVEL}_val_updated.csv'
TEST_FNAME = f'pilot_questions_for_labeling_{SPLIT_LEVEL}_test_updated.csv'

# Only train and test are read from DATA_DIR; the validation set is split off
# from the training data later in the notebook, so VAL_FNAME is used only to
# build the output path.
TRAIN_DATA_PATH = os.path.join(DATA_DIR, TRAIN_FNAME)
TEST_DATA_PATH = os.path.join(DATA_DIR, TEST_FNAME)

# Output names reuse the input names with '_updated' swapped for '_sampled'.
OUT_TRAIN_DATA_PATH = os.path.join(OUTPUT_DIR, TRAIN_FNAME.replace('_updated', '_sampled'))
OUT_VAL_DATA_PATH = os.path.join(OUTPUT_DIR, VAL_FNAME.replace('_updated', '_sampled'))
OUT_TEST_DATA_PATH = os.path.join(OUTPUT_DIR, TEST_FNAME.replace('_updated', '_sampled'))

# Fraction of the unique training questions held out for validation.
VAL_SIZE = 0.15
# Maximum number of rows kept per unique 'Input' question when sampling.
NUM_SAMPLES_PER_QUESTION = 10

# Data ready for model training: the sampled CSVs are copied to these paths.
FINAL_OUTPUT_DIR = '/home/ec2-user/SageMaker/efs/data/pilot_nl2sql_dev/t5_tuning2/data/'
FINAL_TRAIN_DATA_PATH = os.path.join(FINAL_OUTPUT_DIR, 'train.csv')
FINAL_VAL_DATA_PATH = os.path.join(FINAL_OUTPUT_DIR, 'validation.csv')
FINAL_TEST_DATA_PATH = os.path.join(FINAL_OUTPUT_DIR, 'test.csv')

# NOTE(review): MODEL_CONFIG_PATH and SET_SEED are not referenced anywhere else
# in the visible part of this notebook - confirm whether they are still needed.
MODEL_CONFIG_PATH = 'config.py'

SET_SEED = False

In [None]:
# Load the train and test CSVs and rename 'Outputs' to 'question', the column
# name used by the rest of the notebook (e.g. row['question'] at inference).
df_train0, df_test0 = (
    pd.read_csv(csv_path, nrows=NROWS).rename({'Outputs': 'question'}, axis=1)
    for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
)

print(f'Train: {df_train0.shape}, Test: {df_test0.shape}')
df_train0.head()

In [None]:
n_train_questions = df_train0['Input'].nunique()
n_test_questions = df_test0['Input'].nunique()
print(f'Train Unique Questions: {n_train_questions}, Test Unique Questions: {n_test_questions}')

In [None]:
#Split Train to Train & Val
n_val = int(VAL_SIZE * n_train_questions)
n_val

In [None]:
#Get unique questions from train
questions = list(df_train0['Input'].unique())
len(questions)

In [None]:
# Hold out n_val unique questions for validation. Splitting the unique 'Input'
# values (rather than individual rows) keeps every row of a given question on
# the same side once the rows are selected with isin() below.
# NOTE(review): no random_state is passed, so the split differs on every run.
train_questions, val_questions = train_test_split(questions, test_size=n_val)

In [None]:
len(train_questions), len(val_questions)

In [None]:
df_train = df_train0[df_train0['Input'].isin(train_questions)]
df_val = df_train0[df_train0['Input'].isin(val_questions)]

df_train0.shape, df_train.shape, df_val.shape

In [None]:
# Randomly keep at most NUM_SAMPLES_PER_QUESTION equivalent questions
# (rows) for every unique 'Input' in each split.
def _sample_group(group):
    """Return a random sample of `group`, capped at NUM_SAMPLES_PER_QUESTION rows."""
    return group.sample(min(group.shape[0], NUM_SAMPLES_PER_QUESTION))

df_train = df_train.groupby('Input').apply(_sample_group).reset_index(drop=True)
df_val = df_val.groupby('Input').apply(_sample_group).reset_index(drop=True)
df_test = df_test0.groupby('Input').apply(_sample_group).reset_index(drop=True)
print(f'Train: {df_train.shape}, Val: {df_val.shape}, Test: {df_test.shape}')

In [None]:
df_train.head()

In [None]:
# Write the three sampled splits to OUTPUT_DIR as CSV files without the index.
os.makedirs(OUTPUT_DIR, exist_ok=True)
for frame, out_path in (
    (df_train, OUT_TRAIN_DATA_PATH),
    (df_val, OUT_VAL_DATA_PATH),
    (df_test, OUT_TEST_DATA_PATH),
):
    frame.to_csv(out_path, index=False)

In [None]:
# Copy the sampled CSVs to the file names used for model training
# (train.csv / validation.csv / test.csv under FINAL_OUTPUT_DIR).
os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)
for src_path, dst_path in (
    (OUT_TRAIN_DATA_PATH, FINAL_TRAIN_DATA_PATH),
    (OUT_VAL_DATA_PATH, FINAL_VAL_DATA_PATH),
    (OUT_TEST_DATA_PATH, FINAL_TEST_DATA_PATH),
):
    shutil.copy(src_path, dst_path)
print('Success!')

## Model Training

In [None]:
model_params

In [None]:
# Call set_seed only when the config provides a truthy seed value.
# NOTE: a seed of 0 is falsy, so it is skipped by this check just like None.
if model_params["seed"]:
    set_seed(model_params["seed"])

In [None]:
args = argparse.Namespace(**model_params)

In [None]:
# Checkpointing: monitor `val_loss` (lower is better) and keep the 3 best
# checkpoints under args.output_dir.
# NOTE(review): `prefix` looks like an argument of older pytorch-lightning
# releases - confirm it is accepted by the installed version (pl.__version__).
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=3
)

# Keyword arguments forwarded to pl.Trainer; all values come from model_params.
# To resume training, set `resume_from_checkpoint` in the config (it is passed
# through below).
# NOTE(review): several of these keywords (`gpus`, `amp_level`,
# `resume_from_checkpoint`, a callback object as `checkpoint_callback`,
# `accelerator="dp"`, `automatic_optimization`) appear to target an older
# pytorch-lightning Trainer API - confirm against the installed version.
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    # 16-bit precision only when the config enables fp_16, otherwise 32-bit.
    precision= 16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    resume_from_checkpoint=args.resume_from_checkpoint,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=checkpoint_callback,
    val_check_interval=args.val_check_interval,
    logger=None,
    callbacks=[LoggingCallback()],
    
    # Newly added options.
    accelerator="dp", # one of "dp" | "ddp" | "ddp2"
    # The same flag is also set on the model itself in a later cell.
    automatic_optimization = False
)

In [None]:
model = T5FineTuner(args)

In [None]:
model.automatic_optimization = False

In [None]:
trainer = pl.Trainer(**train_params)

In [None]:
trainer.fit(model)

## Model Inference

In [None]:
df_test = pd.read_csv(FINAL_TEST_DATA_PATH)
df_test.head()

In [None]:
question = df_test.iloc[0]['question']
question

In [None]:
batch_temp = model.tokenizer.batch_encode_plus(["SELECT people FROM peoples where age > 10"], max_length=150, 
                                                     padding='max_length', truncation=True, return_tensors="pt")

In [None]:
batch_temp['input_ids'].shape

In [None]:
def get_sql(question, model):
    """Translate a natural-language question into a SQL string.

    The question is prefixed with the "translate English to SQL: " task
    prompt, tokenized to a fixed length of 150 tokens, and decoded with
    2-beam search.

    Args:
        question: The natural-language question text.
        model: Fine-tuned wrapper exposing ``tokenizer`` (with
            ``batch_encode_plus`` / ``decode``), ``model`` (with
            ``generate``), and the ``eval()`` / ``to(device)`` methods.

    Returns:
        The generated SQL with ``<pad>`` / ``</s>`` markers removed and
        surrounding whitespace stripped.
    """
    # Local import: torch is not imported at notebook level.
    import torch

    # Use the GPU when one is present (the original behaviour), otherwise
    # fall back to CPU instead of failing.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    input_text = f"translate English to SQL: {question}"
    inputs = model.tokenizer.batch_encode_plus(
        [input_text],
        max_length=150,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )

    # Inference must run in eval mode so dropout is disabled; the model may
    # still be in training mode right after trainer.fit().
    model.eval()
    model.to(device)
    outs = model.model.generate(
        inputs["input_ids"].to(device),
        attention_mask=inputs["attention_mask"].to(device),
        use_cache=False,
        max_length=150,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
    )

    # A single question was encoded, so only the first sequence is needed.
    sql = model.tokenizer.decode(outs[0])
    sql = sql.replace('<pad>', '').replace('</s>', '')
    return sql.strip()

def predict(row, model):
    """Return the SQL predicted for one row.

    Reads the row's 'question' field, strips surrounding whitespace, and
    passes it to get_sql together with `model`.
    """
    return get_sql(row['question'].strip(), model)

In [None]:
_ = get_sql(question, model)

In [None]:
df_test['prediction'] = df_test.apply(predict, args=(model,), axis=1)
df_test.head()

In [None]:
true_values = df_test['query'].values
pred_values = df_test['prediction'].values

In [None]:
acc_if(true_values, pred_values, token=True)

In [None]:
# NOTE(review): `entites` looks like a misspelling of `entities`; it has to
# match the keyword declared by acc_ex (presumably in modelperformance) - confirm.
acc_ex(true_values, pred_values, entites=['test'])

### Next Steps
* Add model performance metrics