# Pilot Data Split for Fine-Tuning T5

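This notebook prepares the pilot data for fine-tuning T5: it splits the labeled training set into train and validation subsets by unique input question (the test set is loaded as already split), optionally subsamples the equivalent questions for each input, and writes the resulting train/validation/test sets to CSV.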

## Import Packages

In [15]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell runs and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
import os
import pandas as pd
import json
import argparse
from nlp import load_metric
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
from transformers import T5Tokenizer
import shutil
from sklearn.model_selection import train_test_split

from model_miguel import T5FineTuner, set_seed
import dataset as ds
from callback import LoggingCallback, logger
from config import model_params

## Data Preprocessing

In [29]:
# --- Split configuration ---------------------------------------------------
# Tag for the split granularity; it is embedded in the input file names and in
# the output directory path built below.
SPLIT_LEVEL = 'input-level'

# Row cap handed to pd.read_csv when loading the inputs (None reads every row).
NROWS = None

# Fraction of the unique training inputs that is held out for validation.
VAL_SIZE = 0.15
# Maximum number of rows kept per unique 'Input' in each split.
NUM_SAMPLES_PER_QUESTION = 100 #None for all

# Fail fast on a meaningless cap: 0 (or a negative value) would otherwise
# either produce empty splits or crash later inside the sampling step.
if NUM_SAMPLES_PER_QUESTION is not None and NUM_SAMPLES_PER_QUESTION < 1:
    raise ValueError('NUM_SAMPLES_PER_QUESTION must be a positive integer or None')

# Use the same "is not None" test as the sampling cell, so the output folder
# name always agrees with whether sampling is actually applied.
if NUM_SAMPLES_PER_QUESTION is not None:
    SAMPLE_FOLDER = str(NUM_SAMPLES_PER_QUESTION)
else:
    SAMPLE_FOLDER = 'all'

# --- Locations -------------------------------------------------------------
DATA_DIR = '/home/ec2-user/SageMaker/efs/data/pilot_nl2sql_dev/processed_equivalent_questions/'
OUTPUT_DIR = f'/home/ec2-user/SageMaker/efs/data/pilot_nl2sql_dev/t5_tuning2/{SPLIT_LEVEL}/{SAMPLE_FOLDER}/data/'

# Input file names. Only the training and test files are turned into paths
# below; VAL_FNAME is defined but not read anywhere in this cell.
TRAIN_FNAME = f'pilot_questions_for_labeling_{SPLIT_LEVEL}_training_updated.csv'
VAL_FNAME = f'pilot_questions_for_labeling_{SPLIT_LEVEL}_val_updated.csv'
TEST_FNAME = f'pilot_questions_for_labeling_{SPLIT_LEVEL}_test_updated.csv'

TRAIN_DATA_PATH = os.path.join(DATA_DIR, TRAIN_FNAME)
TEST_DATA_PATH = os.path.join(DATA_DIR, TEST_FNAME)

#DATA READY FOR MODEL TRAINING
OUT_TRAIN_DATA_PATH = os.path.join(OUTPUT_DIR, 'train.csv')
OUT_VAL_DATA_PATH = os.path.join(OUTPUT_DIR, 'validation.csv')
OUT_TEST_DATA_PATH = os.path.join(OUTPUT_DIR, 'test.csv')

MODEL_CONFIG_PATH = 'config.py'

# NOTE(review): no code in the cells shown here reads this flag — confirm
# whether it is meant to control seeding of the split/sampling steps.
SET_SEED = False

In [30]:
# Load the pre-split training and test files (optionally capped at NROWS rows)
# and rename 'Outputs' to 'question' so the column name is compatible with
# model training.
column_renames = {'Outputs': 'question'}
df_train0 = pd.read_csv(TRAIN_DATA_PATH, nrows=NROWS).rename(columns=column_renames)
df_test = pd.read_csv(TEST_DATA_PATH, nrows=NROWS).rename(columns=column_renames)

print(f'Train: {df_train0.shape}, Test: {df_test.shape}')
df_train0.head()

Train: (548490, 4), Test: (247845, 4)


Unnamed: 0,question,Input,query,final_query_example
0,number of patients grouped by gender and year ...,Number of patients grouped by year of birth an...,"SELECT year_of_birth, gender, COUNT( DISTINCT ...","SELECT year_of_birth, gender, COUNT( DISTINCT ..."
1,number of patients grouped by gender and birth...,Number of patients grouped by year of birth an...,"SELECT year_of_birth, gender, COUNT( DISTINCT ...","SELECT year_of_birth, gender, COUNT( DISTINCT ..."
2,number of patients grouped by gender and yob.,Number of patients grouped by year of birth an...,"SELECT year_of_birth, gender, COUNT( DISTINCT ...","SELECT year_of_birth, gender, COUNT( DISTINCT ..."
3,number of patients grouped by sex and year of ...,Number of patients grouped by year of birth an...,"SELECT year_of_birth, gender, COUNT( DISTINCT ...","SELECT year_of_birth, gender, COUNT( DISTINCT ..."
4,number of patients grouped by sex and birth year.,Number of patients grouped by year of birth an...,"SELECT year_of_birth, gender, COUNT( DISTINCT ...","SELECT year_of_birth, gender, COUNT( DISTINCT ..."


In [31]:
# Number of distinct 'Input' values in the training and test frames.
n_train_questions, n_test_questions = (
    frame['Input'].nunique() for frame in (df_train0, df_test)
)
print(f'Train Unique Questions: {n_train_questions}, Test Unique Questions: {n_test_questions}')

Train Unique Questions: 44, Test Unique Questions: 12


In [32]:
#Split Train to Train & Val
# Number of whole questions held out for validation. int() truncates, so a
# small question pool (e.g. when NROWS caps the load) could round down to 0,
# which train_test_split does not accept as a test_size; keep at least one.
n_val = max(1, int(VAL_SIZE * n_train_questions))
n_val

6

In [33]:
# Collect the distinct 'Input' values of the full training frame as a plain
# Python list; the split below is made over these rather than over rows.
questions = df_train0['Input'].unique().tolist()
len(questions)

44

In [34]:
# Hold out n_val whole questions for validation, so the split is made over
# distinct 'Input' values instead of individual rows.
# The shuffle is reproducible only when SET_SEED is True; with SET_SEED False
# every run draws a new split (the original behaviour).
# NOTE(review): SET_SEED is reused here as the reproducibility switch —
# confirm it is not reserved for model training only.
split_random_state = 42 if SET_SEED else None
train_questions, val_questions = train_test_split(
    questions,
    test_size=n_val,
    random_state=split_random_state,
)

In [35]:
# Show how many questions landed in the train and validation groups.
tuple(len(group) for group in (train_questions, val_questions))

(38, 6)

In [36]:
# Route each row of the full training frame to train or validation according
# to which group its 'Input' value was assigned to.
in_train = df_train0['Input'].isin(train_questions)
in_val = df_train0['Input'].isin(val_questions)
df_train = df_train0.loc[in_train]
df_val = df_train0.loc[in_val]

# Shapes of the full frame and of the two partitions, for a quick sanity check.
df_train0.shape, df_train.shape, df_val.shape

((548490, 4), (492495, 4), (55995, 4))

In [37]:
###Randomly sample equivalent questions
def sample_per_question(df, n_samples, seed=None):
    """Return at most ``n_samples`` randomly chosen rows per distinct 'Input'.

    Args:
        df: Frame with an 'Input' column to group by.
        n_samples: Upper bound on the rows kept for each group; a group with
            fewer rows is kept whole.
        seed: Optional integer. When given, sampling is reproducible and each
            group receives its own derived seed, so equally sized groups do
            not all pick the same row positions. When None, pandas' default
            (unseeded) sampling is used.

    Returns:
        A new frame with a fresh 0..n-1 index holding the sampled rows.
    """
    import random

    seed_source = random.Random(seed) if seed is not None else None

    def _take(group):
        # DataFrame.sample accepts an int seed in [0, 2**32 - 1].
        state = seed_source.randrange(2 ** 32) if seed_source is not None else None
        return group.sample(min(len(group), n_samples), random_state=state)

    return df.groupby('Input').apply(_take).reset_index(drop=True)


if NUM_SAMPLES_PER_QUESTION is not None:
    # Reproducible only when SET_SEED is True; otherwise unseeded as before.
    # NOTE(review): SET_SEED is reused here as the reproducibility switch —
    # confirm it is not reserved for model training only.
    sampling_seed = 42 if SET_SEED else None
    df_train = sample_per_question(df_train, NUM_SAMPLES_PER_QUESTION, seed=sampling_seed)
    df_val = sample_per_question(df_val, NUM_SAMPLES_PER_QUESTION, seed=sampling_seed)
    df_test = sample_per_question(df_test, NUM_SAMPLES_PER_QUESTION, seed=sampling_seed)
print(f'Train: {df_train.shape}, Val: {df_val.shape}, Test: {df_test.shape}')

Train: (3800, 4), Val: (600, 4), Test: (1200, 4)


In [38]:
# Preview the first five rows of the training split.
df_train.head(n=5)

Unnamed: 0,question,Input,query,final_query_example
0,tell me how many people did each race group in...,Count of patients grouped by race.,"SELECT race, COUNT(DISTINCT pe1.person_id) AS ...","SELECT race, COUNT(DISTINCT pe1.person_id) AS ..."
1,How many people end up in each race group?,Count of patients grouped by race.,"SELECT race, COUNT(DISTINCT pe1.person_id) AS ...","SELECT race, COUNT(DISTINCT pe1.person_id) AS ..."
2,calculate how many subjects does each race gro...,Count of patients grouped by race.,"SELECT race, COUNT(DISTINCT pe1.person_id) AS ...","SELECT race, COUNT(DISTINCT pe1.person_id) AS ..."
3,count of subjects grouped by race.,Count of patients grouped by race.,"SELECT race, COUNT(DISTINCT pe1.person_id) AS ...","SELECT race, COUNT(DISTINCT pe1.person_id) AS ..."
4,How many individuals end up in race groups?,Count of patients grouped by race.,"SELECT race, COUNT(DISTINCT pe1.person_id) AS ...","SELECT race, COUNT(DISTINCT pe1.person_id) AS ..."


In [39]:
# Write the three splits as CSV files (without the index column) under
# OUTPUT_DIR, creating the directory first if it does not exist yet.
os.makedirs(OUTPUT_DIR, exist_ok=True)
for split_frame, split_path in (
    (df_train, OUT_TRAIN_DATA_PATH),
    (df_val, OUT_VAL_DATA_PATH),
    (df_test, OUT_TEST_DATA_PATH),
):
    split_frame.to_csv(split_path, index=False)