In [None]:
import os
import re
import gc
import pandas as pd
from datasets import Dataset
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, 
    DataCollatorForSeq2Seq, EarlyStoppingCallback, get_linear_schedule_with_warmup
)
from peft import LoraConfig, get_peft_model, TaskType
from accelerate import Accelerator
import torch
import numpy as np
from transformers.integrations import TensorBoardCallback

# Configure PyTorch's CUDA caching allocator through its environment variable
# (option string: max_split_size_mb:128).
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128")

# Read the generated input/output pairs into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./data/generated_input_output_pairs.csv')

# Normalize and preprocess text
def normalize_text(text):
    """Lowercase ``text``, strip punctuation and collapse whitespace.

    Args:
        text: The raw cell value. Normally a string; ``None`` and float NaN
            (what pandas yields for an empty CSV cell) are treated as missing,
            and any other non-string value is converted with ``str()``.

    Returns:
        str: The normalized text, or ``""`` when the value is missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    # Drop every character that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace runs into a single space and trim both ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run both text columns through the same normalizer, then wrap the frame
# in a Hugging Face Dataset.
df['input'] = df['input'].map(normalize_text)
df['output'] = df['output'].map(normalize_text)
dataset = Dataset.from_pandas(df)

# Hold out 10% of the rows for validation (fixed seed for a repeatable split).
splits = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, val_dataset = splits["train"], splits["test"]

# Checkpoint to fine-tune, with its tokenizer and seq2seq model.
model_name = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Turn on gradient checkpointing for memory optimization.
model.gradient_checkpointing_enable()

# Freeze every parameter of the first three encoder blocks.
for param in model.encoder.block[:3].parameters():
    param.requires_grad_(False)

# LoRA adapter settings: rank-8 adapters on the "q" and "v" modules,
# no bias training.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the LoRA adapters.
model = get_peft_model(model, lora_config)

# Tokenize function
def tokenize_function(examples):
    """Tokenize a batch of input/output text pairs for seq2seq training.

    Args:
        examples: A batch mapping whose ``'input'`` and ``'output'`` entries
            hold the source and target texts.

    Returns:
        The tokenized inputs with an added ``"labels"`` entry holding the
        target token ids, where padding positions are replaced by ``-100``.
    """
    model_inputs = tokenizer(examples['input'], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(examples['output'], padding="max_length", truncation=True, max_length=128)
    # The targets are already padded to max_length here, so no later step
    # re-pads the labels. Mask the pad ids ourselves: -100 is the label value
    # the loss ignores, otherwise the model is trained to predict padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in targets["input_ids"]
    ]
    return model_inputs

# Tokenize both splits in batches, using 8 worker processes.
train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=8)
val_dataset = val_dataset.map(tokenize_function, batched=True, num_proc=8)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# No manual Accelerator here: Seq2SeqTrainer handles device placement and
# fp16 mixed precision itself (fp16=True in the training arguments).
# accelerator.prepare() only acts on models, optimizers, schedulers and
# DataLoaders, so passing Dataset objects and the collator through it did
# nothing, and preparing the model a second time outside the Trainer only
# double-wrapped it.

# Optimizer: hand AdamW only the parameters that are still trainable
# (frozen weights never receive gradients).
optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad], lr=3e-5
)

# Scheduler
# The schedule must span the number of *optimizer updates*, not the number of
# batches. These three values mirror per_device_train_batch_size,
# gradient_accumulation_steps and num_train_epochs in the training arguments
# (single-device training assumed); keep them in sync.
train_batch_size = 16
gradient_accumulation_steps = 4
num_train_epochs = 3
# -(-a // b) is ceiling division: the last partial batch / accumulation group
# still counts.
batches_per_epoch = -(-len(train_dataset) // train_batch_size)
updates_per_epoch = max(-(-batches_per_epoch // gradient_accumulation_steps), 1)
num_training_steps = updates_per_epoch * num_train_epochs
# Warm up linearly over the first 10% of the updates.
num_warmup_steps = int(0.1 * num_training_steps)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
)

# Training arguments, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # Output and logging locations.
    output_dir='./results',
    logging_dir="./logs",
    report_to="tensorboard",
    logging_steps=200,
    # Schedule length and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    # Evaluate and checkpoint on the same 200-step cadence so the best
    # checkpoint can be restored at the end of training.
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    load_best_model_at_end=True,
    # Optimization settings.
    # NOTE(review): an explicit (optimizer, scheduler) pair is passed to the
    # trainer below, so these may only be informational - confirm.
    learning_rate=3e-5,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    max_grad_norm=1.5,
    # Mixed precision and data loading.
    fp16=True,
    dataloader_pin_memory=True,
    dataloader_num_workers=4,
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # Evaluate on a fixed, shuffled sample of at most 100 validation examples;
    # min() keeps select() from raising when the split has fewer rows.
    eval_dataset=val_dataset.shuffle(seed=42).select(range(min(100, len(val_dataset)))),
    data_collator=data_collator,
    optimizers=(optimizer, lr_scheduler),
    # TensorBoard logging is already enabled through report_to="tensorboard"
    # in the training arguments; adding TensorBoardCallback() here as well
    # registered a second copy of the same callback.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Train the model
print(f"Initial Learning Rate: {training_args.learning_rate}")
# Keep the return value: it carries the training loss for the whole run.
train_output = trainer.train()

# Evaluate
results = trainer.evaluate()
print(f"Evaluation results: {results}")

# Save model and tokenizer
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

# Save results
# Make sure the target directory exists before writing the CSV.
os.makedirs('./results', exist_ok=True)
results_df = pd.DataFrame({
    "epoch": [trainer.state.epoch],
    # trainer.state.best_metric is the best *evaluation* metric, not a
    # training loss; report the loss returned by trainer.train() instead.
    "train_loss": [train_output.training_loss],
    "eval_loss": [results.get("eval_loss")],
})
results_df.to_csv('./results/model_training_evaluation_results.csv', index=False)

print("Training complete. Results saved to './results/model_training_evaluation_results.csv'.")


In [None]:
# Fine-tuning with ROUGE score
# import os
# import pandas as pd
# from datasets import Dataset
# from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback
# from peft import LoraConfig, get_peft_model, TaskType
# from accelerate import Accelerator
# import evaluate

# # Enable dynamic GPU memory allocation
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# # Load the dataset
# df = pd.read_csv('./data/generated_input_output_pairs.csv')
# dataset = Dataset.from_pandas(df)

# # Split dataset into train and validation sets
# train_dataset, val_dataset = dataset.train_test_split(test_size=0.1, seed=42).values()

# # Load pre-trained T5 tokenizer and model
# model_name = "google/flan-t5-small"
# tokenizer = T5Tokenizer.from_pretrained(model_name, use_fast=True,legacy=False)  # Use non-legacy tokenizer behavior
# model = T5ForConditionalGeneration.from_pretrained(model_name)

# # Apply LoRA using PEFT
# lora_config = LoraConfig(
#     task_type=TaskType.SEQ_2_SEQ_LM,  # Task type for T5
#     r=8,  # Low-rank matrix dimension
#     lora_alpha=32,  # Scaling factor
#     lora_dropout=0.1,  # Dropout for LoRA layers
#     target_modules=["q", "v"],  # Target attention weights for adaptation
#     bias="none",
# )
# model = get_peft_model(model, lora_config)

# # Tokenization function with increased max_length
# def tokenize_function(examples):
#     inputs = tokenizer(
#         examples['input'], padding="max_length", truncation=True, max_length=128  # Increase max_length
#     )
#     outputs = tokenizer(
#         examples['output'], padding="max_length", truncation=True, max_length=128  # Increase max_length
#     )
#     inputs["labels"] = outputs["input_ids"]
#     return inputs

# # Tokenize datasets
# train_dataset = train_dataset.map(tokenize_function, batched=True)
# val_dataset = val_dataset.map(tokenize_function, batched=True)

# # Define DataCollator for efficient padding
# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# # Load evaluation metrics
# bleu_metric = evaluate.load("bleu")
# rouge_metric = evaluate.load("rouge")

# # Compute metrics
# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     if isinstance(predictions, tuple):
#         predictions = predictions[0]
#     predictions = predictions.argmax(axis=-1) if predictions.ndim > 2 else predictions
#     labels = labels.tolist()
#     for i in range(len(labels)):
#         labels[i] = [label if label != -100 else tokenizer.pad_token_id for label in labels[i]]
#     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
#     bleu = bleu_metric.compute(predictions=decoded_preds, references=[[lbl] for lbl in decoded_labels])
#     rouge = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
#     rouge_l_score = rouge.get("rouge-l", {}).get("mid", {}).get("fmeasure", 0)
#     return {
#         "bleu": bleu["bleu"],
#         "rouge": rouge_l_score,
#     }

# # Define training arguments
# training_args = Seq2SeqTrainingArguments(
#     output_dir='./results',
#     # num_train_epochs=,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     gradient_accumulation_steps=16,
#     warmup_steps=10,
#     logging_dir='./logs',
#     logging_steps=100,
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     save_total_limit=1,
#     load_best_model_at_end=True,
#     report_to="tensorboard",
#     fp16=True,  # Use mixed precision for faster training
#     dataloader_pin_memory=True,
#     dataloader_num_workers=4,
#     overwrite_output_dir=True,
# )

# # Setup Accelerator
# accelerator = Accelerator(mixed_precision="fp16")

# # Prepare model, datasets, and data collator for Accelerator
# model, train_dataset, val_dataset, data_collator = accelerator.prepare(
#     model, train_dataset, val_dataset, data_collator
# )

# # Define Trainer with early stopping callback
# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     compute_metrics=compute_metrics,
#     data_collator=data_collator,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
# )

# # Train the model
# trainer.train()

# # Save the model and tokenizer
# model.save_pretrained("./flanfine_tuned_model")
# tokenizer.save_pretrained("./flanfine_tuned_model")

# # Evaluate and save results
# results = trainer.evaluate()
# results_df = pd.DataFrame({
#     "epoch": [trainer.state.epoch],
#     "train_loss": [trainer.state.best_metric],
#     "eval_loss": [results.get("eval_loss")],
#     "bleu": [results.get("eval_bleu", float("nan"))],
#     "rouge": [results.get("eval_rouge", float("nan"))],
# })
# results_df.to_csv('./results/model_training_evaluation_results.csv', index=False)

# print("Training and evaluation complete. Results saved to './results/model_training_evaluation_results.csv'.")
