# Importing libraries

In [None]:
import random
from datasets import load_dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
import torch
import ast

# Dataset Preparation

In [None]:
# Take the 'train' split of the paraphrase corpus and keep only the rows
# whose 'source' field is 'quora'.
dataset = (
    load_dataset("humarin/chatgpt-paraphrases")['train']
    .filter(lambda row: row['source'] == 'quora')
)
dataset

In [None]:
def parse_paraphrases(example):
    """Replace the string-encoded 'paraphrases' field with the parsed literal.

    The field is evaluated with ``ast.literal_eval``; the row is updated in
    place and returned.
    """
    encoded = example["paraphrases"]
    example["paraphrases"] = ast.literal_eval(encoded)
    return example

# Parse the 'paraphrases' field on every row and drop the 'category' and
# 'source' columns.
dataset = dataset.map(parse_paraphrases, remove_columns=['category', 'source'])

In [None]:
# Show the first row as the cell output.
dataset[0]

In [None]:
def select_paraphrase(example):
    """Set 'target' to one randomly chosen paraphrase of the row.

    A row with no paraphrases gets an empty string as its target. The row is
    updated in place and returned.
    """
    candidates = example['paraphrases']
    example['target'] = random.choice(candidates) if candidates else ""
    return example

# Add a 'target' field to every row (one randomly chosen paraphrase, or ""
# when the row has none).
dataset = dataset.map(select_paraphrase)

In [None]:
# Show the first row again, now that 'target' has been added.
dataset[0]

In [None]:
# Drop the 'paraphrases' column now that 'target' has been derived from it.
dataset = dataset.remove_columns(['paraphrases'])
# Show the dataset as the cell output.
dataset

# Splitting the dataset

In [None]:
# Hold out 10% of the rows, then halve that held-out part into validation
# and test. Both splits use seed 42.
splits = dataset.train_test_split(test_size=0.1, seed=42)
test_val_splits = splits['test'].train_test_split(test_size=0.5, seed=42)

train_dataset = splits['train']
val_dataset, test_dataset = test_val_splits['train'], test_val_splits['test']

In [None]:
# Show the training split as the cell output.
train_dataset

In [None]:
# Show the validation split as the cell output.
val_dataset

In [None]:
# Show the test split as the cell output.
test_dataset

# Preparing the model and tokenizer

In [None]:
# Load the tokenizer and the seq2seq model from the same pretrained
# checkpoint name.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocessing the dataset

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of source/target pairs for seq2seq fine-tuning.

    Args:
        examples: Batch mapping with a "text" list (source sentences) and a
            "target" list (reference paraphrases), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the "paraphrase: "-prefixed sources, padded
        and truncated to 512 tokens, plus a "labels" entry holding the target
        token ids padded and truncated to 128 tokens, with every padding
        position set to -100.
    """
    inputs = ["paraphrase: " + text for text in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # `text_target` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["target"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    # Padding positions must be -100 so the cross-entropy loss ignores them;
    # left as pad ids they would be trained on as if they were real tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs



In [None]:
# Run the batched preprocessing over the train, validation and test splits,
# in that order.
tokenized_train_dataset, tokenized_val_dataset, tokenized_test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Fine-tuning T5-small

In [None]:
# Checkpoint cadence expressed in training examples, converted to a step
# count below.
num_examples_per_save = 50000
batch_size = 16
# 50000 // 16 = 3125 steps between checkpoints.
# NOTE(review): this equals 50,000 examples only with one device and no
# gradient accumulation — confirm for the target runtime.
save_steps = num_examples_per_save // batch_size
# Directory used for checkpoints and the saved tokenizer.
# NOTE(review): the path looks like a Colab Google Drive mount — confirm the
# drive is mounted before training.
output_dir = "/content/drive/MyDrive/results/t5_paraphrase_results"

In [None]:
# Training configuration: 3 epochs, a checkpoint every `save_steps` steps
# (at most 5 kept), a log line every 100 steps, no external reporting.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_steps=save_steps,
    logging_steps=100,
    # NOTE(review): 3e-3 is well above the 1e-4 to 3e-4 range the Transformers
    # T5 docs suggest for AdamW. Left unchanged; confirm it is intentional.
    learning_rate=3e-3,
    weight_decay=0.01,
    save_total_limit=5,
    # fp16 mixed precision needs a CUDA device; a hard-coded True makes
    # TrainingArguments fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    report_to="none",
)


# NOTE(review): eval_dataset is supplied but no evaluation schedule is set
# above, so it appears to be used only if trainer.evaluate() is called
# explicitly — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

trainer.train()

In [None]:
# Write the fine-tuned weights next to the tokenizer so output_dir can be
# reloaded with from_pretrained(); periodic checkpoints go into checkpoint-*
# subdirectories, not into output_dir itself.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
def generate_paraphrases(input_text, num_return_sequences=3, num_beams=5):
    """Generate paraphrases of ``input_text`` with beam search.

    Args:
        input_text: A sentence, or a list of sentences, to paraphrase.
        num_return_sequences: Number of candidates returned per input
            sentence; beam search requires it to be at most ``num_beams``.
        num_beams: Beam width used for decoding.

    Returns:
        A list of decoded paraphrase strings with special tokens removed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)
    # Training leaves the model in train mode; switch to eval so dropout is
    # disabled while decoding.
    model.eval()

    # Use the same task prefix the model saw during fine-tuning.
    prefix = "paraphrase: "
    if isinstance(input_text, str):
        prompt = prefix + input_text
    else:
        prompt = [prefix + text for text in input_text]

    encoding = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    input_ids = encoding["input_ids"].to(device)
    # Pass the mask so padded positions in batched input are not attended to.
    attention_mask = encoding["attention_mask"].to(device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=128,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        early_stopping=True,
    )

    paraphrases = [
        tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for output in outputs
    ]
    return paraphrases


# Demo: request three paraphrases of a sample question using five beams.
user_input = "Can we connect pendrive by using otg cable to iPhone?"
generated_paraphrases = generate_paraphrases(user_input, num_return_sequences=3, num_beams=5)

# Print the candidates as a list numbered from 1.
print("Generated Paraphrases:")
for idx, para in enumerate(generated_paraphrases, 1):
    print(f"{idx}: {para}")