# Preprocessing the dataset
### Here we convert the JSONL dataset to CSV, keeping only the 'question' and 'query' columns.

In [5]:
import json
import csv

In [7]:
input_file = 'raw_dataset/validation.jsonl'
output_file = 'validation.csv'


def jsonl_to_csv(jsonl_path, csv_path, fields=('question', 'query')):
    """Convert a JSONL file into a CSV that keeps only the selected fields.

    Args:
        jsonl_path: Path of the input file; one JSON object per line.
        csv_path: Path of the CSV file to create (overwritten if it exists).
        fields: Keys to copy from every JSON object, in column order. A key
            missing from an object is written as an empty string.

    Returns:
        The number of data rows written (the header row is not counted).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Open the input first: if it is missing or unreadable we fail before the
    # output CSV is created or truncated.
    with open(jsonl_path, 'r', encoding='utf-8') as jsonlfile:
        with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(fields)

            rows_written = 0
            for line in jsonlfile:
                line = line.strip()
                # Blank lines (e.g. a trailing newline at the end of the file)
                # are not JSON documents; skip them instead of crashing.
                if not line:
                    continue
                entry = json.loads(line)
                writer.writerow([entry.get(field, '') for field in fields])
                rows_written += 1
    return rows_written


# Assign the result so the notebook cell does not echo the row count.
rows_written = jsonl_to_csv(input_file, output_file)

In [13]:
import pandas as pd
import json_lines
import csv

In [14]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

## Fine-tune the model on a custom dataset. First, convert the pandas DataFrames into Hugging Face `datasets` objects

https://charanhu.medium.com/converting-pandas-dataframe-into-a-dataset-and-pushing-to-hugging-face-146e2ccac38d

In [15]:
import pandas as pd
from datasets import Dataset
# Load each split into a Pandas DataFrame.
# Every column is read as text and NaN inference is disabled: by default
# pandas turns empty cells (and strings such as "NULL" or "NA") into float NaN,
# which cannot be concatenated with the prompt prefix or tokenized later on.
CSV_READ_OPTIONS = {"dtype": str, "keep_default_na": False}
train_df = pd.read_csv('train.csv', **CSV_READ_OPTIONS)
test_df = pd.read_csv('test.csv', **CSV_READ_OPTIONS)
validation_df = pd.read_csv('validation.csv', **CSV_READ_OPTIONS)
# Convert the DataFrames into Hugging Face Datasets
train = Dataset.from_pandas(train_df)
test = Dataset.from_pandas(test_df)
validation = Dataset.from_pandas(validation_df)

In [16]:
# Checkpoint identifier shared by the model and the tokenizer
MODEL_NAME = "google/flan-t5-small"

# Instantiate the pretrained model and the tokenizer from the same checkpoint
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# The collator is given both the model and the tokenizer
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [49]:
# Instruction prepended to every question before it is tokenized
prefix = "Please translate Turkish to SQL query: "


def preprocess_function(examples):
    """Tokenize a batch of question/query pairs.

    Every entry of ``examples["question"]`` is prepended with ``prefix`` and
    tokenized (truncated to 128 tokens). ``examples["query"]`` is tokenized as
    the target text (truncated to 512 tokens) and its ``input_ids`` are stored
    under the ``"labels"`` key of the returned encoding.
    """
    # Model inputs: prefixed questions
    prompts = [prefix + question for question in examples["question"]]
    model_inputs = tokenizer(prompts, truncation=True, max_length=128)

    # Labels: token ids of the target queries
    target_encoding = tokenizer(
        text_target=examples["query"],
        truncation=True,
        max_length=512,
    )
    model_inputs["labels"] = target_encoding["input_ids"]

    return model_inputs

In [50]:
# Tokenize the training split, passing examples to the function in batches
tokenized_train = train.map(
    function=preprocess_function,
    batched=True,
)

Map:   0%|          | 0/7566 [00:00<?, ? examples/s]

In [51]:
# Tokenize the validation split, passing examples to the function in batches
tokenized_validation = validation.map(
    function=preprocess_function,
    batched=True,
)

Map:   0%|          | 0/1081 [00:00<?, ? examples/s]

In [52]:
# nltk.sent_tokenize needs the Punkt sentence-tokenizer data. Older NLTK
# releases load it from the "punkt" resource, recent releases from "punkt_tab";
# fetch both so evaluation does not fail with a LookupError on either.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# ROUGE metric used by compute_metrics
metric = evaluate.load("rouge")

In [53]:
def compute_metrics(eval_preds):
    """Compute ROUGE scores for a batch of generated predictions.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            token ids.

    Returns:
        The result of ``metric.compute`` on the decoded predictions and labels.
    """
    preds, labels = eval_preds

    # Some models hand back a tuple of outputs; the token ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks positions to ignore and is not a valid token id, so it cannot
    # be decoded. Swap it for the pad token in both arrays (predictions can be
    # padded with -100 too when batches of different length are gathered).
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return result

In [54]:
# Hyperparameters consumed by the training configuration below
L_RATE = 3e-4                # learning rate
WEIGHT_DECAY = 0.01          # weight decay
NUM_EPOCHS = 3               # number of training epochs
BATCH_SIZE = 8               # per-device training batch size
PER_DEVICE_EVAL_BATCH = 4    # per-device evaluation batch size
SAVE_TOTAL_LIM = 3           # value passed as save_total_limit

# Training configuration: evaluate once per epoch and generate predictions
# during evaluation; nothing is pushed to the Hub.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    num_train_epochs=NUM_EPOCHS,
    learning_rate=L_RATE,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    eval_strategy="epoch",
    predict_with_generate=True,
    save_total_limit=SAVE_TOTAL_LIM,
    push_to_hub=False,
)

In [55]:
# Bundle the model, data, tokenizer, collator, and metric function into a trainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    compute_metrics=compute_metrics,
)

In [58]:
#print("Model:", model)
#print("Tokenizer:", tokenizer)
#print("Training Arguments:", training_args)
#print("Training Dataset:", tokenized_train)
#print("Validation Dataset:", tokenized_validation)

In [57]:
# Run fine-tuning with the trainer configured above. The call is left as the
# last expression of the cell so its return value is displayed by the notebook.
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 