In [1]:
import random
import pandas as pd

# -------------------------------
# Utility: random helpers
# -------------------------------
def choice(x): return random.choice(x)

# Word pools
subjects = ["The boy", "The girl", "The teacher", "The cat", "The chef", "The dog", "The student", "The singer", "The driver", "The police officer"]
objects  = ["the ball", "the book", "the cake", "the mouse", "the car", "the song", "the homework", "the painting", "the phone", "the door"]
verbs    = ["kicked", "read", "baked", "chased", "washed", "opened", "painted", "fixed", "caught", "wrote"]

names = ["John", "Mary", "Alex", "Ravi", "Sarah", "Priya", "Tom", "Liam", "Asha", "Noah"]
feelings = ["happy", "tired", "angry", "sad", "excited", "bored", "confused"]
likes = ["apples", "books", "music", "movies", "coffee", "chocolate"]

positive_adjectives = ["good", "tasty", "beautiful", "clean", "smart", "bright", "strong", "peaceful", "amazing", "fun"]
negative_adjectives = ["bad", "dirty", "boring", "ugly", "weak", "dull", "sad", "noisy", "terrible", "lazy"]

data = []

# -------------------------------
# 1. Active <-> Passive
# -------------------------------
for _ in range(60):
    s = choice(subjects)
    o = choice(objects)
    v = choice(verbs)
    active = f"{s} {v} {o}."
    passive = f"{o.capitalize()} was {v} by {s.lower()}."
    data.append([active, passive, "Active to Passive"])
    data.append([passive, active, "Passive to Active"])

# -------------------------------
# 2. Direct <-> Indirect Speech
# -------------------------------
for _ in range(40):
    name = choice(names)
    adj = choice(feelings)
    direct = f'{name} said, "I am {adj}."'
    indirect = f"{name} said that he was {adj}."
    data.append([direct, indirect, "Direct to Indirect"])
    data.append([indirect, direct, "Indirect to Direct"])

    like_item = choice(likes)
    direct2 = f'{name} said, "I like {like_item}."'
    indirect2 = f"{name} said that he liked {like_item}."
    data.append([direct2, indirect2, "Direct to Indirect"])
    data.append([indirect2, direct2, "Indirect to Direct"])

# -------------------------------
# 3. Positive <-> Negative
# -------------------------------
for _ in range(50):
    subj = choice(subjects)
    adj = choice(positive_adjectives)
    pos = f"{subj} is {adj}."
    neg = pos.replace(" is ", " is not ")
    data.append([pos, neg, "Positive to Negative"])
    data.append([neg, pos, "Negative to Positive"])

# -------------------------------
# 4. Shuffle and save
# -------------------------------
random.shuffle(data)
df = pd.DataFrame(data, columns=["original_sentence", "transformed_sentence", "label"])

print("✅ Generated examples per label:")
print(df['label'].value_counts())
print("\nSample:")
print(df.head(10))

df.to_csv("sentence_transformation_dataset.csv", index=False)
print("\nSaved as 'sentence_transformation_dataset.csv' successfully!")


✅ Generated examples per label:
label
Direct to Indirect      80
Indirect to Direct      80
Passive to Active       60
Active to Passive       60
Negative to Positive    50
Positive to Negative    50
Name: count, dtype: int64

Sample:
                    original_sentence                transformed_sentence  \
0  The cake was caught by the driver.         The driver caught the cake.   
1      Noah said, "I like chocolate."  Noah said that he liked chocolate.   
2        Ravi said that he was happy.            Ravi said, "I am happy."   
3        The driver is not beautiful.            The driver is beautiful.   
4                   The chef is good.               The chef is not good.   
5            The boy chased the ball.     The ball was chased by the boy.   
6           The cat washed the mouse.    The mouse was washed by the cat.   
7        The police officer is tasty.    The police officer is not tasty.   
8    Sarah said that he liked coffee.        Sarah said, "I like coffee.