# Importing libraries

In [None]:
import random
import os
from datasets import load_dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
import torch
import ast
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
nltk.download('punkt')

# Dataset Preparation

In [None]:
# Load the train split and keep only the rows whose source is Quora.
dataset = load_dataset("humarin/chatgpt-paraphrases")["train"].filter(
    lambda row: row["source"] == "quora"
)
dataset

In [28]:
def parse_paraphrases(example):
    """Turn the stringified ``paraphrases`` field into a Python object.

    The string is evaluated with ``ast.literal_eval`` and written back onto
    the same ``example`` mapping, which is then returned.
    """
    parsed = ast.literal_eval(example["paraphrases"])
    example["paraphrases"] = parsed
    return example

# Parse every row's paraphrase string and drop the columns no longer needed.
dataset1 = dataset.map(
    parse_paraphrases,
    remove_columns=["category", "source"],
)

In [29]:
def select_paraphrase(example):
    """Attach a ``target`` field holding one randomly chosen paraphrase.

    ``target`` is the empty string when ``example['paraphrases']`` is falsy.
    The same (mutated) ``example`` mapping is returned.
    """
    candidates = example['paraphrases']
    example['target'] = random.choice(candidates) if candidates else ""
    return example

# select_paraphrase draws from Python's global `random` generator, so seed it
# first: the paraphrase picked for each row is then the same on every fresh run.
random.seed(42)
dataset1 = dataset1.map(select_paraphrase)

In [30]:
# Guarded so this cell can be re-run safely: removing a column that is already
# gone raises an error.
if 'paraphrases' in dataset1.column_names:
    dataset1 = dataset1.remove_columns('paraphrases')

In [None]:
dataset1[0]

In [None]:
dataset1

# Splitting the dataset

In [17]:
# 80 / 10 / 10 train / validation / test split.
# A fixed seed keeps the three splits, and everything derived from the train
# split such as the limited vocabulary, identical across kernel restarts.
split_dataset = dataset1.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset['train']
# Halve the held-out 20% into validation and test.
temp_dataset = split_dataset['test'].train_test_split(test_size=0.5, seed=42)
val_dataset = temp_dataset['train']
test_dataset = temp_dataset['test']

In [None]:
train_dataset

In [None]:
val_dataset

In [None]:
test_dataset

In [None]:
train_dataset[0]

# Building a Limited Vocabulary

In [36]:
def top_words(dataset, n=1000):
    """Count token frequencies over a dataset and return the ``n`` most common.

    The ``text`` and ``target`` fields of every sample are lower-cased and
    split with ``word_tokenize``.

    Returns:
        A tuple ``(most_frequent, frequencies)``: the list of the ``n`` most
        frequent tokens (most frequent first) and the ``Counter`` holding the
        count of every token seen.
    """
    frequencies = Counter()
    for sample in dataset:
        # Source text first, then target, so tokens are counted in the order
        # they appear in the data.
        for field in ('text', 'target'):
            frequencies.update(word_tokenize(sample[field].lower()))

    most_frequent = [token for token, _count in frequencies.most_common(n)]
    return most_frequent, frequencies

# The vocabulary is built from the training split only.
top_1000_words, word_counts = top_words(train_dataset, n=1000)

In [None]:
print(len(word_counts))

In [None]:
top_1000_words[:10]

# Create a custom T5 Tokenizer with a Limited Vocabulary

In [40]:
# Stock pretrained t5-small tokenizer; the vocabulary limit is applied on top of it.
original_tokenizer = T5Tokenizer.from_pretrained("t5-small")

def create_limited_tokenizer(original_tokenizer, top_words):
    """Collect the sub-word tokens needed to spell out ``top_words``.

    Args:
        original_tokenizer: tokenizer providing ``tokenize(str)`` and,
            optionally, the ``pad_token`` / ``eos_token`` / ``bos_token`` /
            ``unk_token`` / ``sep_token`` attributes.
        top_words: iterable of words whose sub-word pieces are allowed.

    Returns:
        ``(limited_vocab_list, original_tokenizer)``. The first item is the
        sorted list of allowed token strings, with every special token the
        tokenizer defines included. The second is the same, unmodified
        tokenizer object that was passed in: this function only computes the
        allowed vocabulary; it does not alter the tokenizer.
    """
    special_token_attrs = ("pad_token", "eos_token", "bos_token", "unk_token", "sep_token")
    limited_vocab = set()
    for attr in special_token_attrs:
        # A tokenizer may not define every special token; skip the missing ones.
        token = getattr(original_tokenizer, attr, None)
        if token is not None:
            limited_vocab.add(token)

    for word in top_words:
        limited_vocab.update(original_tokenizer.tokenize(word))

    # Sort so the list, and the vocabulary file written from it, has the same
    # order on every run; plain set iteration order is not stable between runs.
    limited_vocab_list = sorted(limited_vocab)

    return limited_vocab_list, original_tokenizer

# `limited_tokenizer` is the object returned alongside the allowed-token list.
limited_vocab_list, limited_tokenizer = create_limited_tokenizer(
    original_tokenizer,
    top_1000_words,
)

# Preparing Data for Training

In [43]:
def preprocess_function(examples, tokenizer, max_length=128):
    """Tokenize a batch of ``text`` / ``target`` pairs for seq2seq training.

    Args:
        examples: batch mapping with ``'text'`` and ``'target'`` lists of strings.
        tokenizer: callable tokenizer exposing ``pad_token_id``.
        max_length: length that both inputs and targets are truncated and
            padded to.

    Returns:
        The tokenizer output for the prefixed inputs, with an added
        ``'labels'`` entry in which every pad id is replaced by ``-100``.
    """
    inputs = ["paraphrase: " + text for text in examples['text']]
    targets = examples['target']

    # Honour the caller's max_length instead of a hard-coded 128.
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=max_length, truncation=True, padding='max_length')

    pad_token_id = tokenizer.pad_token_id
    # Padding positions are set to -100 in the labels.
    model_inputs['labels'] = [
        [-100 if token_id == pad_token_id else token_id for token_id in label]
        for label in labels['input_ids']
    ]
    return model_inputs

In [44]:
def preprocess_limited_vocab(examples, tokenizer, limited_vocab_list, max_length=128, prefix="paraphrase: "):
    """Tokenize a batch and replace every out-of-vocabulary token with ``unk``.

    Args:
        examples: batch mapping with ``'text'`` and ``'target'`` lists of strings.
        tokenizer: callable tokenizer exposing ``pad_token_id``,
            ``unk_token_id`` and ``convert_ids_to_tokens``.
        limited_vocab_list: iterable of token strings that may be kept.
        max_length: length that inputs and targets are truncated and padded to.
        prefix: string prepended to every input text.

    Returns:
        A dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        In ``input_ids``, tokens outside the limited vocabulary become the unk
        id. In ``labels``, pad ids become ``-100`` and other tokens outside
        the vocabulary become the unk id. The attention mask is passed
        through unchanged.
    """
    inputs = [prefix + text for text in examples['text']]
    targets = examples['target']

    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=max_length, truncation=True, padding='max_length')

    pad_token_id = tokenizer.pad_token_id
    unk_token_id = tokenizer.unk_token_id

    # Set membership replaces a linear scan of the list for every token.
    allowed_tokens = set(limited_vocab_list)
    # Each distinct id is converted and checked once per call, then reused.
    allowed_by_id = {}

    def is_allowed(token_id):
        if token_id not in allowed_by_id:
            token = tokenizer.convert_ids_to_tokens([token_id])[0]
            allowed_by_id[token_id] = token in allowed_tokens
        return allowed_by_id[token_id]

    def keep_or_unk(token_id):
        return token_id if is_allowed(token_id) else unk_token_id

    input_ids_filtered = [
        [keep_or_unk(token_id) for token_id in input_ids]
        for input_ids in model_inputs['input_ids']
    ]

    labels_filtered = [
        [-100 if token_id == pad_token_id else keep_or_unk(token_id) for token_id in label]
        for label in labels['input_ids']
    ]

    return {
        'input_ids': input_ids_filtered,
        'attention_mask': model_inputs['attention_mask'],
        'labels': labels_filtered,
    }


In [None]:
def _to_limited_vocab(split):
    """Run ``preprocess_limited_vocab`` over one dataset split, in batches."""
    return split.map(
        lambda batch: preprocess_limited_vocab(batch, limited_tokenizer, limited_vocab_list),
        batched=True,
    )


# All three splits get exactly the same preprocessing.
train_dataset_limited = _to_limited_vocab(train_dataset)
validation_dataset_limited = _to_limited_vocab(val_dataset)
test_dataset_limited = _to_limited_vocab(test_dataset)

# Fine-Tuning

In [None]:
# Evaluate and checkpoint once per epoch; the best checkpoint is reloaded when
# training finishes.
training_args = TrainingArguments(
    output_dir="./t5_small_limited_vocab",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)

limited_model = T5ForConditionalGeneration.from_pretrained('t5-small')

data_collator = DataCollatorForSeq2Seq(
    tokenizer=limited_tokenizer,
    model=limited_model,
    padding=True
)

# No `compute_metrics` callback is passed: none is defined anywhere in this
# notebook, so referencing one raised NameError on a fresh kernel.
# Evaluation therefore reports the loss only.
limited_trainer = Trainer(
    model=limited_model,
    args=training_args,
    train_dataset=train_dataset_limited,
    eval_dataset=validation_dataset_limited,
    tokenizer=limited_tokenizer,
    data_collator=data_collator,
)

# Train the model
limited_trainer.train()



# Saving the model

In [None]:
# Save the fine-tuned weights and the tokenizer files side by side.
limited_model_dir = "./t5_small_limited_final"
limited_trainer.save_model(limited_model_dir)
limited_tokenizer.save_pretrained(limited_model_dir)

# Save the allowed-token list too, one token per line.
# UTF-8 is forced because the platform default encoding may be unable to
# represent non-ASCII token text (SentencePiece pieces typically carry the
# "▁" word-boundary marker), which would make the write fail.
with open(os.path.join(limited_model_dir, "limited_vocab_list.txt"), "w", encoding="utf-8") as file:
    for word in limited_vocab_list:
        file.write(f"{word}\n")

# Loading the saved model

In [69]:
# Directory the fine-tuned checkpoint was written to.
limited_model_dir = "./t5_small_limited_final"

# Restore the tokenizer and the model weights from that directory.
tokenizer = T5Tokenizer.from_pretrained(limited_model_dir)
model = T5ForConditionalGeneration.from_pretrained(limited_model_dir)

# Generating the paraphrases

In [None]:
import torch

def generate_paraphrase_limited_vocab(model, tokenizer, limited_vocab_list, input_text,
                                      max_length=64, num_beams=5):
    """Paraphrase ``input_text`` after masking out-of-vocabulary input tokens.

    The prompt ``"paraphrase: " + input_text`` is encoded. Every token that
    is neither the pad token nor in ``limited_vocab_list`` is replaced by the
    tokenizer's unk id, and the filtered ids are fed to ``model.generate``
    with beam search.

    Args:
        model: generation model exposing ``eval()`` and ``generate()``.
        tokenizer: tokenizer exposing ``encode``, ``decode``,
            ``convert_ids_to_tokens``, ``pad_token_id`` and ``unk_token_id``.
        limited_vocab_list: iterable of token strings that may be kept.
        input_text: sentence to paraphrase.
        max_length: ``max_length`` forwarded to ``generate``.
        num_beams: ``num_beams`` forwarded to ``generate``.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    model.eval()
    prompt = "paraphrase: " + input_text

    token_ids = tokenizer.encode(prompt, return_tensors="pt")[0].tolist()

    # Set membership replaces a linear scan of the list for every token.
    allowed_tokens = set(limited_vocab_list)
    pad_token_id = tokenizer.pad_token_id
    unk_token_id = tokenizer.unk_token_id

    filtered_ids = [
        token_id
        if token_id == pad_token_id
        or tokenizer.convert_ids_to_tokens([token_id])[0] in allowed_tokens
        else unk_token_id
        for token_id in token_ids
    ]
    filtered_input_ids = torch.tensor([filtered_ids])

    # Put the inputs on the model's device when it reports one, so the function
    # also works with a model that lives on an accelerator.
    device = getattr(model, "device", None)
    if device is not None:
        filtered_input_ids = filtered_input_ids.to(device)

    with torch.no_grad():
        outputs = model.generate(
            filtered_input_ids,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            num_return_sequences=1,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def generate_paraphrase_full_vocab(model, tokenizer, input_text, max_length=64, num_beams=5):
    """
    Paraphrase ``input_text`` with no vocabulary filtering of the input ids.

    The prompt ``"paraphrase: " + input_text`` is encoded with ``tokenizer``
    and passed unchanged to ``model.generate`` (beam search). The first
    generated sequence is decoded with special tokens skipped and returned.
    """
    model.eval()
    encoded_prompt = tokenizer.encode("paraphrase: " + input_text, return_tensors="pt")

    generation_kwargs = dict(
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
        num_return_sequences=1,
    )
    with torch.no_grad():
        generated = model.generate(encoded_prompt, **generation_kwargs)

    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

## Testing

In [None]:
# Probe questions passed through the limited-vocabulary generation path.
questions = [
    "In what ways does globalization exacerbate socioeconomic disparities across various geopolitical regions?",
    "How does the proliferation of misinformation on social media platforms undermine democratic institutions and public discourse?",
    "How can sustainable urban development be achieved without compromising economic growth and infrastructural expansion?",
    "What are the cognitive and psychological consequences of excessive screen time among adolescents in the digital age?",
    "To what extent does climate change influence the frequency and intensity of meteorological anomalies worldwide?",
    "How do multinational corporations navigate complex regulatory environments while maintaining corporate social responsibility?",
    "In what manner does linguistic relativism affect cross-cultural communication and perception?",
]

# Print each question followed by its limited-vocabulary paraphrase.
for question in questions:
    generated_paraphrase = generate_paraphrase_limited_vocab(
        model,
        tokenizer,
        limited_vocab_list,
        question,
    )
    print(f"Question: {question}")
    print(f"Generated Paraphrase:{generated_paraphrase}")
    print("\n")


Question: In what ways does globalization exacerbate socioeconomic disparities across various geopolitical regions?
Generated Paraphrase:what ways does global acrosst?


Question: How does the proliferation of misinformation on social media platforms undermine democratic institutions and public discourse?
Generated Paraphrase:does the of on social media and public?


Question: How can sustainable urban development be achieved without compromising economic growth and infrastructural expansion?
Generated Paraphrase:can development be without growth and inra?


Question: What are the cognitive and psychological consequences of excessive screen time among adolescents in the digital age?
Generated Paraphrase:are the and consequences of screen time among in the digital age?


Question: To what extent does climate change influence the frequency and intensity of meteorological anomalies worldwide?
Generated Paraphrase:what does change the and of?


Question: How do multinational corporations n

In [109]:
# Same question through both paths: filtered input ids vs. unfiltered input ids.
sample_question = 'How does photosynthesis help plants grow?'

para_limited = generate_paraphrase_limited_vocab(
    model, tokenizer, limited_vocab_list, sample_question
)
print(para_limited)

para_full = generate_paraphrase_full_vocab(model, original_tokenizer, sample_question)
print(para_full)


is the role of in?
does photosynthesis contribute to the growth of plants?


In [108]:
# Second comparison of the two paths; the question text is passed exactly as written.
sample_question = 'What is the captial city of india?'

para_limited = generate_paraphrase_limited_vocab(
    model, tokenizer, limited_vocab_list, sample_question
)
print(para_limited)

para_full = generate_paraphrase_full_vocab(model, original_tokenizer, sample_question)
print(para_full)

is the most city in?
is the city of?
