# In [None]:
import pandas as pd
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from sklearn.utils import resample

# Pretrained Pegasus checkpoint fine-tuned for paraphrasing; the tokenizer is
# loaded first, then the conditional-generation model, both from the same name.
model_name = "tuner007/pegasus_paraphrase"
tokenizer, model = (
    loader.from_pretrained(model_name)
    for loader in (PegasusTokenizer, PegasusForConditionalGeneration)
)

# Function to generate paraphrases
def paraphrase(sentence: str, num_return_sequences: int = 3, num_beams: int = 10) -> list:
    """Generate paraphrases of ``sentence`` using the module-level Pegasus model.

    Args:
        sentence: Text to paraphrase. Input is truncated to 60 tokens.
        num_return_sequences: Number of paraphrases to return.
        num_beams: Beam width for beam search. Beam search cannot return more
            sequences than it has beams, so the width is raised to
            ``num_return_sequences`` when a smaller value is given.

    Returns:
        A list of ``num_return_sequences`` decoded strings with special
        tokens stripped.
    """
    # generate() rejects num_return_sequences > num_beams; widen the beam
    # instead of failing.
    num_beams = max(num_beams, num_return_sequences)

    # Tokenize the input sentence and place the tensors on the model's device
    # so the function keeps working if the model has been moved to a GPU.
    inputs = tokenizer(
        [sentence],
        max_length=60,
        truncation=True,
        padding="longest",
        return_tensors="pt",
    ).to(model.device)

    # Pass the full encoding (input_ids AND attention_mask) to the model.
    # No `temperature` is given: it only affects sampling (do_sample=True) and
    # is ignored by plain beam search, where it merely triggers a warning.
    outputs = model.generate(
        **inputs,
        max_length=60,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )

    # Decode every returned sequence back to text.
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Load the dataset
# Assume the CSV has two columns: 'text' for the review content and 'label' for the sentiment label
df = pd.read_csv("hotel_reviews.csv")

# Compute the class distribution once and reuse it below
class_counts = df['label'].value_counts()
print("Class distribution before augmentation:\n", class_counts)

# Identify the minority class (the label with the fewest rows)
minority_class = class_counts.idxmin()
minority_class_count = class_counts.min()

# Extract minority class data
minority_data = df[df['label'] == minority_class]

# Augment the minority class using paraphrasing.
# Rows with a missing 'text' are skipped: pandas reads empty cells as NaN
# (a float), which the tokenizer cannot process.
# NOTE: every minority row receives a fixed number of paraphrases regardless
# of how far the class is from the others, so the minority class can end up
# larger than the rest; check the distribution printed at the end.
augmented_texts = []
for original_text in minority_data['text'].dropna():
    # Generate paraphrases
    augmented_texts.extend(
        paraphrase(str(original_text), num_return_sequences=3)  # You can adjust this number
    )
augmented_labels = [minority_class] * len(augmented_texts)

# Create a DataFrame for the augmented data
augmented_df = pd.DataFrame({
    'text': augmented_texts,
    'label': augmented_labels,
})

# Combine the original dataset with the augmented data
augmented_df_combined = pd.concat([df, augmented_df], ignore_index=True)

# Shuffle the dataset to ensure mixed distribution
augmented_df_combined = augmented_df_combined.sample(frac=1).reset_index(drop=True)

# Save the augmented dataset
augmented_df_combined.to_csv("augmented_hotel_reviews.csv", index=False)

# Print new class distribution
print("Class distribution after augmentation:\n", augmented_df_combined['label'].value_counts())
