# Importing libraries

In [2]:
import random
from datasets import load_dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
import torch
import ast

# Dataset Preparation

In [3]:
dataset = load_dataset("humarin/chatgpt-paraphrases")
dataset = dataset['train']
dataset = dataset.filter(lambda x: x['source'] == 'quora')
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

chatgpt_paraphrases.csv:   0%|          | 0.00/265M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/419197 [00:00<?, ? examples/s]

Filter:   0%|          | 0/419197 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'paraphrases', 'category', 'source'],
    num_rows: 247138
})

In [4]:
def parse_paraphrases(example):
    example["paraphrases"] = ast.literal_eval(example["paraphrases"])
    return example

dataset = dataset.map(parse_paraphrases, remove_columns=['category', 'source'])

Map:   0%|          | 0/247138 [00:00<?, ? examples/s]

In [None]:
dataset[0]

{'text': 'What is the step by step guide to invest in share market in india?',
 'paraphrases': ['Can you provide a detailed procedure for investing in the Indian stock market?',
  'What are the sequential instructions for investing in shares in India?',
  'Could you outline the step-by-step process for investing in the Indian share market?',
  'What is the systematic guide to investing in the Indian stock exchange?',
  'Can you provide a comprehensive guide on how to invest in the Indian share market?']}

In [5]:
def select_paraphrase(example):
    if example['paraphrases']:
        example['target'] = random.choice(example['paraphrases'])
    else:
        example['target'] = ""
    return example

dataset = dataset.map(select_paraphrase)

Map:   0%|          | 0/247138 [00:00<?, ? examples/s]

In [None]:
dataset[0]

{'text': 'What is the step by step guide to invest in share market in india?',
 'paraphrases': ['Can you provide a detailed procedure for investing in the Indian stock market?',
  'What are the sequential instructions for investing in shares in India?',
  'Could you outline the step-by-step process for investing in the Indian share market?',
  'What is the systematic guide to investing in the Indian stock exchange?',
  'Can you provide a comprehensive guide on how to invest in the Indian share market?'],
 'target': 'Could you outline the step-by-step process for investing in the Indian share market?'}

In [6]:
dataset = dataset.remove_columns(['paraphrases'])
dataset

Dataset({
    features: ['text', 'target'],
    num_rows: 247138
})

# Splitting the dataset

In [7]:
splits = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = splits['train']

test_val_splits = splits['test'].train_test_split(test_size=0.5, seed=42)
val_dataset = test_val_splits['train']
test_dataset = test_val_splits['test']

In [None]:
train_dataset

Dataset({
    features: ['text', 'target'],
    num_rows: 222424
})

In [None]:
val_dataset

Dataset({
    features: ['text', 'target'],
    num_rows: 12357
})

In [None]:
test_dataset

Dataset({
    features: ['text', 'target'],
    num_rows: 12357
})

# Preparing the model and tokenizer

In [8]:
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# Preprocessing the dataset

In [9]:
def preprocess_function(examples):
    inputs = ["paraphrase: " + text for text in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["target"], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



In [10]:
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/222424 [00:00<?, ? examples/s]



Map:   0%|          | 0/12357 [00:00<?, ? examples/s]

Map:   0%|          | 0/12357 [00:00<?, ? examples/s]

# Fine tunning by T5-small

In [11]:
num_examples_per_save = 50000
batch_size = 16
save_steps = num_examples_per_save // batch_size
output_dir = "/content/drive/MyDrive/results/t5_paraphrase_results"

In [14]:
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_steps=save_steps,
    logging_steps=100,
    learning_rate=3e-3,
    weight_decay=0.01,
    save_total_limit=5,
    fp16=True,
    report_to="none",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

trainer.train()

Step,Training Loss


KeyboardInterrupt: 

In [None]:
tokenizer.save_pretrained(output_dir)

('/content/drive/MyDrive/t5_paraphrase_results/tokenizer_config.json',
 '/content/drive/MyDrive/t5_paraphrase_results/special_tokens_map.json',
 '/content/drive/MyDrive/t5_paraphrase_results/spiece.model',
 '/content/drive/MyDrive/t5_paraphrase_results/added_tokens.json')

In [None]:
def generate_paraphrases(input_text, num_return_sequences=3, num_beams=5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)

    encoding = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    input_ids = encoding["input_ids"].to(device)

    outputs = model.generate(
        input_ids=input_ids,
        max_length=128,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        early_stopping=True,
    )

    paraphrases = [
        tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for output in outputs
    ]
    return paraphrases


user_input = "Can we connect pendrive by using otg cable to iPhone?"
generated_paraphrases = generate_paraphrases(user_input, num_return_sequences=3, num_beams=5)

print("Generated Paraphrases:")
for idx, para in enumerate(generated_paraphrases, 1):
    print(f"{idx}: {para}")

Generated Paraphrases:
1: Is it possible to connect a pendrive with an iPhone using otg cable to connect with an iPhone?
2: Is it feasible to connect a pendrive using otg cable to an iPhone?
3: Can a USB cable be used to connect a pendrive to an iPhone using an otg cable?
