# Importing the Libraries

In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
import torch
from torch.utils.data import DataLoader
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    get_linear_schedule_with_warmup
)
import peft
from peft import LoraConfig, TaskType, get_peft_model
import ast
import gc
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from evaluate import load as load_metric
import os

# Fetch the NLTK 'punkt' resource (this call needs network access the first time it runs).
# NOTE(review): no visible cell uses 'punkt' directly - presumably it is needed by the
# evaluation code; confirm.
nltk.download('punkt')

In [None]:
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Creating the directories for storing the results

In [None]:
# Create the output folders under /content/drive/MyDrive; exist_ok=True makes the
# cell safe to re-run when the folders are already there.
os.makedirs("/content/drive/MyDrive/results", exist_ok=True)
os.makedirs("/content/drive/MyDrive/logs", exist_ok=True)
os.makedirs("/content/drive/MyDrive/final_model", exist_ok=True)

# Dataset Preparation

In [None]:
# Load the 'train' split of the humarin/chatgpt-paraphrases dataset ...
dataset = load_dataset('humarin/chatgpt-paraphrases')['train']
# ... and keep only the rows whose 'source' column equals 'quora'.
dataset = dataset.filter(lambda x: x['source'] == 'quora')

In [None]:
def parse_paraphrases(example):
    """Decode the stringified ``paraphrases`` field of one dataset row.

    The field holds the source text of a Python literal (presumably a list of
    strings). It is parsed with ``ast.literal_eval``, which accepts literals
    only and never executes arbitrary code.

    Args:
        example: Mapping for a single row that has a ``"paraphrases"`` entry.

    Returns:
        The same ``example`` object, with ``"paraphrases"`` replaced in place
        by the parsed value.
    """
    raw_value = example["paraphrases"]
    parsed_value = ast.literal_eval(raw_value)
    example["paraphrases"] = parsed_value
    return example

# Parse every row's paraphrase string and drop the 'category' and 'source' columns.
dataset1 = dataset.map(parse_paraphrases, remove_columns=['category', 'source'])

In [None]:
# Display the first parsed row as a quick sanity check.
dataset1[0]

In [None]:
def expand_dataset(ds, batch_size=100):
    """Turn each row's paraphrase list into individual two-way training pairs.

    For every paraphrase of a row, two records are produced: one mapping the
    original text to the paraphrase and one mapping the paraphrase back to the
    original text.

    Args:
        ds: Dataset whose rows expose a ``'text'`` string and an iterable
            ``'paraphrases'`` field.
        batch_size: Number of source rows selected from ``ds`` at a time. It
            only controls how ``ds`` is sliced; the resulting pairs are the
            same for any value.

    Returns:
        A ``Dataset`` built from a pandas DataFrame with the columns ``'text'``
        and ``'paraphrases'``, one row per generated pair.
    """
    pair_rows = []
    total_rows = len(ds)

    for start in range(0, total_rows, batch_size):
        stop = min(start + batch_size, total_rows)
        chunk = ds.select(range(start, stop))

        for record in chunk:
            source_text = record['text']

            for variant in record['paraphrases']:
                # Forward direction first, then the reversed pair.
                pair_rows.extend((
                    {'text': source_text, 'paraphrases': variant},
                    {'text': variant, 'paraphrases': source_text},
                ))

    return Dataset.from_pandas(pd.DataFrame(pair_rows))

# Build the expanded pair dataset, reading the source rows in slices of 50.
expanded_dataset = expand_dataset(dataset1, batch_size=50)

print(expanded_dataset)

# Print the first five pairs as a sanity check (assumes at least five rows exist).
for i in range(5):
    print(expanded_dataset[i])
    print()

# Splitting the dataset

In [None]:
# Hold out 10% of the pairs and keep 90% for training (fixed seed for reproducibility).
# NOTE(review): the split is done after expand_dataset has emitted both directions of
# every pair, so a sentence seen in training can also appear in validation/test -
# consider splitting the source rows before expanding them.
splits = expanded_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = splits['train']

# Halve the held-out 10% into validation and test (5% of the expanded data each).
test_val_splits = splits['test'].train_test_split(test_size=0.5, seed=42)
val_dataset = test_val_splits['train']
test_dataset = test_val_splits['test']

In [None]:
# Display the training split summary.
train_dataset

In [None]:
# Display the validation split summary.
val_dataset

In [None]:
# Display the test split summary.
test_dataset

# Preparing the model and tokenizer

In [None]:
def prepare_model(model_name="t5-base"):
    """Load a pretrained T5 checkpoint together with its tokenizer.

    Args:
        model_name: Checkpoint identifier passed to both ``from_pretrained``
            calls. Defaults to ``"t5-base"``, the value that used to be
            hard-coded, so existing ``prepare_model()`` calls behave the same.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    return model, tokenizer

# Preprocessing the dataset

In [None]:
def preprocess_function(examples, tokenizer, max_length=128):
    """Tokenize a batch of (text, paraphrase) pairs for seq2seq training.

    Every source text is prefixed with ``"paraphrase: "``. Inputs and targets
    are both padded and truncated to ``max_length`` tokens.

    Args:
        examples: Batch mapping with a ``"text"`` list and a ``"paraphrases"`` list.
        tokenizer: Callable tokenizer that exposes ``pad_token_id``.
        max_length: Fixed token length used for both inputs and targets.

    Returns:
        The tokenizer output for the inputs, extended with a ``"labels"`` entry
        in which every pad token id is replaced by -100 (the label value
        conventionally ignored by the training loss).
    """
    prefix = "paraphrase: "
    tokenize_options = dict(max_length=max_length, padding="max_length", truncation=True)

    prefixed_sources = [prefix + sentence for sentence in examples["text"]]
    model_inputs = tokenizer(prefixed_sources, **tokenize_options)
    target_encoding = tokenizer(examples["paraphrases"], **tokenize_options)

    masked_labels = []
    for token_ids in target_encoding["input_ids"]:
        masked_labels.append(
            [-100 if token_id == tokenizer.pad_token_id else token_id for token_id in token_ids]
        )

    target_encoding["input_ids"] = masked_labels
    model_inputs["labels"] = masked_labels

    return model_inputs

In [None]:
def preprocess_all_datasets(train_dataset, val_dataset, test_dataset, tokenizer):
    """Tokenize the train, validation and test splits with ``preprocess_function``.

    Each split is mapped in batched mode and its original columns are removed,
    so only the tokenizer outputs remain.

    Args:
        train_dataset: Training split.
        val_dataset: Validation split.
        test_dataset: Test split.
        tokenizer: Tokenizer forwarded to ``preprocess_function``.

    Returns:
        A 4-tuple ``(train_processed, val_processed, test_processed, test_dataset)``;
        the last element is the untouched test split that was passed in.
    """

    def tokenize_split(split, description):
        # One batched map per split; the description labels the progress bar.
        return split.map(
            lambda examples: preprocess_function(examples, tokenizer),
            batched=True,
            remove_columns=split.column_names,
            desc=description,
        )

    train_dataset_processed = tokenize_split(train_dataset, "Preprocessing training dataset")
    val_dataset_processed = tokenize_split(val_dataset, "Preprocessing validation dataset")
    test_dataset_processed = tokenize_split(test_dataset, "Preprocessing test dataset")

    return (
        train_dataset_processed,
        val_dataset_processed,
        test_dataset_processed,
        test_dataset,
    )


In [None]:
# Load the base model and tokenizer, then tokenize all three splits.
# test_dataset_original is the untokenized test split returned alongside the processed ones.
model, tokenizer = prepare_model()
train_dataset_processed, val_dataset_processed, test_dataset_processed, test_dataset_original = preprocess_all_datasets(train_dataset, val_dataset, test_dataset, tokenizer)

# Full Fine-Tuning

In [None]:
def full_finetune(train_dataset_processed, val_dataset_processed, model, tokenizer):
    """Train ``model`` on the paraphrase data and save it with its tokenizer.

    Training runs for 5 epochs at a learning rate of 5e-5 with a per-device
    batch size of 8. A checkpoint is written every epoch and at most two are
    kept.

    NOTE(review): the validation set is handed to the ``Trainer`` but no
    evaluation strategy is configured here, so it is presumably not evaluated
    during training - confirm.

    Args:
        train_dataset_processed: Tokenized training split.
        val_dataset_processed: Tokenized validation split.
        model: Model to fine-tune.
        tokenizer: Tokenizer used by the data collator and saved next to the model.

    Returns:
        A ``(model, model_path)`` tuple, where ``model_path`` is the directory
        the final model and tokenizer were saved to.
    """
    model_path = "/content/drive/MyDrive/final_model/t5-paraphrase-full"

    hyperparameters = dict(
        output_dir="/content/drive/MyDrive/results/t5-paraphrase-full",
        learning_rate=5e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        report_to="none",
        weight_decay=0.01,
        save_total_limit=2,
        num_train_epochs=5,
        logging_dir="./logs",
        logging_steps=500,
        push_to_hub=False,
        save_strategy="epoch",
    )
    training_args = TrainingArguments(**hyperparameters)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset_processed,
        eval_dataset=val_dataset_processed,
        tokenizer=tokenizer,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
    )
    trainer.train()

    # Store the model and the tokenizer in the same directory.
    trainer.save_model(model_path)
    tokenizer.save_pretrained(model_path)

    return model, model_path


In [None]:
# Stage 1: fine-tune the loaded model and remember where it was saved.
model, fully_finetuned_model_path = full_finetune(train_dataset_processed, val_dataset_processed, model, tokenizer)

# NOTE(review): evaluate_model is not defined in the visible part of this notebook -
# confirm that the cell defining it runs before this one.
full_results, full_examples = evaluate_model(model, tokenizer, test_dataset_original, test_dataset_processed, "Full Fine-tuned Model (Step 1)")

In [None]:
# Collect Python garbage first, then release PyTorch's cached GPU memory
# (the second step only applies when CUDA is available).
import gc

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# LoRA Fine-Tuning

In [None]:
def lora_finetune(train_dataset_processed, val_dataset_processed, tokenizer, fully_finetuned_model_path):
    """Continue training the saved fine-tuned model with LoRA adapters.

    The model is reloaded from ``fully_finetuned_model_path`` and wrapped with
    a LoRA configuration (rank 8, alpha 32, dropout 0.1) that targets the
    modules named ``"q"`` and ``"v"``. Training runs for 5 epochs at a
    learning rate of 1e-4 with a per-device batch size of 16.

    NOTE(review): ``save_pretrained`` on the PEFT-wrapped model presumably
    writes only the adapter weights - confirm before loading the saved
    directory on its own.

    Args:
        train_dataset_processed: Tokenized training split.
        val_dataset_processed: Tokenized validation split.
        tokenizer: Tokenizer used by the data collator and saved next to the model.
        fully_finetuned_model_path: Directory of the model to start from.

    Returns:
        A ``(model, model_path)`` tuple: the PEFT-wrapped model and the
        directory it was saved to.
    """
    base_model = T5ForConditionalGeneration.from_pretrained(fully_finetuned_model_path)

    adapter_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q", "v"],
    )
    model = get_peft_model(base_model, adapter_config)

    # Report how many parameters the adapters leave trainable.
    model.print_trainable_parameters()

    hyperparameters = dict(
        output_dir="/content/drive/MyDrive/results/t5-paraphrase-lora",
        learning_rate=1e-4,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        report_to="none",
        weight_decay=0.01,
        save_total_limit=2,
        num_train_epochs=5,
        logging_dir="./logs",
        logging_steps=500,
        push_to_hub=False,
        save_strategy="epoch",
    )
    training_args = TrainingArguments(**hyperparameters)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset_processed,
        eval_dataset=val_dataset_processed,
        tokenizer=tokenizer,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
    )
    trainer.train()

    print("Saving the final model (fully fine-tuned + LoRA)...")
    model_path = "/content/drive/MyDrive/final_model/t5-paraphrase-full-then-lora"
    for artifact in (model, tokenizer):
        artifact.save_pretrained(model_path)

    return model, model_path

In [None]:
# Stage 2: add LoRA adapters on top of the model saved by stage 1 and train again.
lora_model, final_model_path = lora_finetune(train_dataset_processed, val_dataset_processed, tokenizer, fully_finetuned_model_path)

# NOTE(review): evaluate_model is not defined in the visible part of this notebook -
# confirm that the cell defining it runs before this one.
lora_results, lora_examples = evaluate_model(lora_model, tokenizer, test_dataset_original, test_dataset_processed, "Full+LoRA Fine-tuned Model (Step 2)")

# Generating the paraphrased sentences

In [None]:
def generate_paraphrases(model, tokenizer, texts, num_variations=3, max_length=128, prefix="paraphrase: ", device="cuda"):
    """Generate up to ``num_variations`` distinct paraphrases for each input text.

    Each text is prefixed, tokenized once, and passed to ``model.generate``
    ``num_variations`` times with sampling enabled. A candidate is dropped
    when it equals the input text or repeats an earlier candidate, so a text
    may end up with fewer than ``num_variations`` paraphrases (possibly none).

    Args:
        model: Generation model; it is moved to ``device`` and put in eval mode.
        tokenizer: Tokenizer matching ``model``.
        texts: Iterable of input strings.
        num_variations: Number of generation attempts per text.
        max_length: Maximum token length for both the input and the output.
        prefix: Task prefix prepended to every text.
        device: Target device. When a CUDA device is requested (the default)
            but CUDA is unavailable, the CPU is used instead of raising.

    Returns:
        A list with one entry per input text, each a list of paraphrase strings.
    """
    # The default is "cuda"; fall back to the CPU on machines without a GPU
    # instead of crashing inside model.to().
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        device = "cpu"

    model.to(device)
    model.eval()

    all_paraphrases = []

    # Inference only: no gradients are needed for any of the calls below.
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(
                prefix + text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            ).to(device)

            text_paraphrases = []
            for _ in range(num_variations):
                # Sampling is enabled, so repeated calls can yield different outputs.
                outputs = model.generate(
                    input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    max_length=max_length,
                    num_beams=5,
                    no_repeat_ngram_size=2,
                    top_p=0.92,
                    do_sample=True,
                )
                paraphrase = tokenizer.decode(outputs[0], skip_special_tokens=True)

                # Keep only candidates that are new and differ from the input.
                if paraphrase != text and paraphrase not in text_paraphrases:
                    text_paraphrases.append(paraphrase)

            all_paraphrases.append(text_paraphrases)

    return all_paraphrases