**Imports**
---

In [None]:
import random
from pathlib import Path

import pandas as pd
from tqdm import tqdm

**Topics and Keywords**
---

In [None]:
# Extended topics for each category
#
# `category_topics` maps a category name to a flat list of 50 short topic
# phrases. Each list is referenced as the "topics" entry of the matching
# category in `category_data`, and the generation loop samples two distinct
# list positions at a time to fill the `{}` placeholders of a prompt template.
#
# NOTE: a few phrases occur in more than one category (e.g. "AI ethics" and
# "mythical creatures"); prompt uniqueness is only enforced per category.
category_topics = {
    "Learning and Education": [
        "machine learning", "neuroscience", "educational games", "online learning platforms",
        "critical thinking", "data analysis", "learning disabilities", "study techniques",
        "virtual classrooms", "academic writing", "cognitive science", "language acquisition",
        "teacher training", "interactive learning", "remote education", "student assessment",
        "early childhood education", "STEM education", "curriculum development", "gamification in education",
        "digital literacy", "inclusive education", "memory techniques", "learning analytics",
        "language immersion", "educational psychology", "open-source learning", "vocational training",
        "adaptive learning", "peer tutoring", "project-based learning", "self-paced learning",
        "adult education", "standardized testing", "collaborative learning", "critical pedagogy",
        "study motivation", "blended learning", "home schooling", "character education",
        "flipped classrooms", "personalized learning", "special education", "educational technology",
        "school counseling", "study habits", "higher education policies", "problem-solving skills",
        "creative thinking", "digital textbooks"
    ],
    "Science and Technology": [
        "quantum computing", "biotechnology", "robotics", "nanotechnology", 
        "3D printing", "internet of things", "space travel", "AI ethics",
        "genomics", "autonomous vehicles", "sustainable tech", "5G networks",
        "renewable energy sources", "machine vision", "biomedical engineering", "human-computer interaction",
        "deep learning", "neural interfaces", "environmental engineering", "space exploration",
        "cybersecurity", "network infrastructure", "computational biology", "virtual reality",
        "augmented reality", "bioinformatics", "microbiology", "particle physics",
        "green technology", "biodegradable materials", "digital twins", "cloud computing",
        "solar power", "fusion energy", "self-healing materials", "AI in healthcare",
        "smart cities", "bioplastics", "space telescopes", "blockchain",
        "wearable technology", "holography", "brain-computer interfaces", "underwater exploration",
        "quantum cryptography", "evolutionary biology", "eco-friendly packaging", "hydrogen fuel",
        "satellite communication", "human augmentation"
    ],
    "Creative Writing and Storytelling": [
        "fantasy worlds", "science fiction", "mystery plots", "character development",
        "alternate realities", "urban legends", "mythology", "symbolism in art",
        "dystopian futures", "historical fiction", "metaphorical writing", "villain backstories",
        "first-person narratives", "plot twists", "stream of consciousness", "magical realism",
        "heroic journeys", "folktales", "morality tales", "allegory",
        "paranormal elements", "tragic flaws", "metafiction", "fables",
        "epic sagas", "alternate histories", "non-linear narratives", "ghost stories",
        "time-travel scenarios", "mythological creatures", "archetypes", "flash fiction",
        "world-building", "poetic justice", "literary genres", "emotional subtext",
        "magical creatures", "epistolary novels", "second-person perspective", "quest narratives",
        "story arcs", "epic battles", "humor in storytelling", "tales of revenge",
        "bildungsroman", "satire", "love triangles", "dual timelines",
        "anti-heroes", "supernatural elements"
    ],
    "Philosophy and Ethics": [
        "existentialism", "moral relativism", "utilitarianism", "free will",
        "AI ethics", "cognitive biases", "ethics of genetic engineering", "philosophy of mind",
        "consequentialism", "ethical dilemmas", "virtue ethics", "epistemology",
        "social contract", "deontology", "human rights", "justice theory",
        "bioethics", "moral absolutism", "determinism", "animal rights",
        "privacy in the digital age", "environmental ethics", "consciousness", "the nature of reality",
        "meaning of life", "nihilism", "transhumanism", "theories of happiness",
        "knowledge and skepticism", "cultural relativism", "capital punishment ethics", "freedom of speech",
        "ethics of war", "identity and self", "moral luck", "thought experiments",
        "political philosophy", "dualism", "intention vs outcome", "virtue and vice",
        "freedom and responsibility", "the trolley problem", "personal identity", "moral development",
        "social justice", "philosophy of language", "theism vs atheism", "existential angst",
        "philosophy of science", "value theory"
    ],
    "Health and Wellness": [
        "mental health awareness", "nutrition science", "physical fitness", "sleep hygiene",
        "stress management", "emotional intelligence", "public health policies", "alternative medicine",
        "yoga and meditation", "immune system health", "holistic wellness", "personal hygiene",
        "exercise routines", "dietary supplements", "chronic illness management", "healthcare accessibility",
        "substance abuse prevention", "mindfulness practices", "self-care routines", "workplace wellness",
        "cardiovascular health", "injury prevention", "holistic nutrition", "spiritual wellness",
        "mental resilience", "men's health", "women's health", "aging gracefully",
        "diabetes prevention", "health screening", "aromatherapy", "pediatric health",
        "healthy relationships", "rehabilitation", "gut health", "brain health",
        "fitness technology", "seasonal wellness", "hydration", "nutrition for children",
        "fitness for seniors", "healthy sleep patterns", "posture correction", "mental clarity",
        "phobias", "emotional release", "immune boosters", "recovery strategies",
        "genetic predispositions", "occupational health"
    ],
    "Environment and Nature": [
        "wildlife conservation", "pollution control", "climate resilience", "renewable energy",
        "sustainable agriculture", "ocean acidification", "habitat restoration", "recycling initiatives",
        "biodiversity hotspots", "deforestation", "carbon footprint", "green technology",
        "endangered species", "ecosystem services", "landscape restoration", "invasive species",
        "urban ecology", "zero waste living", "climate adaptation", "water scarcity",
        "soil health", "waste management", "reforestation", "clean energy sources",
        "marine biodiversity", "sustainable cities", "ozone layer protection", "environmental activism",
        "forest management", "desertification", "plastic pollution", "carbon neutrality",
        "glacier melting", "coastal erosion", "recycled materials", "animal migration",
        "environmental justice", "plant biodiversity", "fossil fuel alternatives", "sustainable tourism",
        "renewable resources", "natural disaster response", "wildlife tracking", "wetlands conservation",
        "species reintroduction", "biodegradable products", "eco-friendly lifestyle", "endangered habitats",
        "sustainable fishing", "geothermal energy"
    ],
    "History and Culture": [
        "ancient civilizations", "world wars", "cultural festivals", "historical figures",
        "traditional art forms", "colonial history", "the renaissance", "cultural revolutions",
        "folklore and myths", "medieval Europe", "historical trade routes", "archaeological discoveries",
        "indigenous cultures", "philosophers", "historical literature", "architectural heritage",
        "imperialism", "migration patterns", "musical traditions", "language evolution",
        "mythical creatures", "historical battles", "cultural rituals", "trade routes",
        "the age of exploration", "cultural diffusion", "historical inventions", "empires",
        "ancient religions", "maritime history", "the cold war", "political ideologies",
        "human rights movements", "slavery and emancipation", "military strategies", "social hierarchies",
        "ancient philosophies", "explorers", "medieval trade", "historical epics",
        "gender roles in history", "historical art movements", "early civilizations", "globalization impact",
        "revolutionary figures", "diaspora", "architectural landmarks", "oral traditions",
        "global languages", "cultural syncretism"
    ],
    "Business and Economics": [
        "supply chain management", "stock market", "digital currency", "entrepreneurship",
        "consumer behavior", "economic globalization", "branding strategies", "business ethics",
        "market research", "startups and innovation", "microeconomics", "corporate social responsibility",
        "e-commerce", "business models", "financial planning", "investment strategies",
        "taxation policies", "labor markets", "venture capital", "competition law",
        "marketing analytics", "foreign exchange", "business negotiations", "mergers and acquisitions",
        "advertising psychology", "business development", "public relations", "economic forecasting",
        "crowdfunding", "business innovation", "management theory", "human resources",
        "customer experience", "business law", "profit maximization", "digital marketing",
        "crisis management", "corporate culture", "cost-benefit analysis", "sustainable business practices",
        "real estate markets", "organizational behavior", "business cycles", "retail management",
        "investment portfolios", "employee motivation", "trade regulations", "social media marketing",
        "data-driven decisions", "intellectual property"
    ],
    "Personal Development and Motivation": [
        "goal setting", "time management", "self-discipline", "emotional resilience",
        "overcoming failure", "work-life balance", "positive habits", "mindfulness",
        "communication skills", "conflict resolution", "self-compassion", "career growth",
        "personal branding", "self-confidence", "assertiveness", "life coaching",
        "stress reduction", "mental clarity", "decision making", "self-reflection",
        "personal values", "building resilience", "habit formation", "self-care routines",
        "gratitude practices", "public speaking", "mentorship", "productivity techniques",
        "financial planning", "assertive communication", "mindset shifts", "problem-solving skills",
        "goal tracking", "journaling", "emotional intelligence", "self-awareness",
        "learning agility", "breaking bad habits", "healthy boundaries", "critical thinking",
        "dealing with criticism", "adaptability", "growth mindset", "building self-esteem",
        "meditation practices", "creativity enhancement", "career transitions", "personal achievements",
        "handling rejection", "visualization techniques"
    ],
    "Fun and Hypothetical Scenarios": [
        "alien encounters", "time travel possibilities", "zombie apocalypse survival", 
        "superpowers", "virtual reality vacations", "magic realism", "teleportation technology",
        "alternate universes", "robots in daily life", "cyberpunk futures", "space colonization",
        "undiscovered animals", "lost civilizations", "AI takeover scenarios", "haunted houses",
        "underwater cities", "dream manipulation", "living in a video game", "parallel dimensions",
        "mythical creatures", "weather control", "telepathy", "reincarnation", "genetic enhancements",
        "fantasy kingdoms", "time loops", "human-animal hybrids", "supernatural abilities",
        "robot-human relationships", "reality TV in space", "post-apocalyptic survival",
        "mass teleportation", "dragon riding", "magic schools", "first contact with aliens",
        "reverse aging", "villain rehabilitation", "dinosaurs in modern times", "mind control",
        "cloning celebrities", "ghost hunting", "living in a VR simulation", "human immortality",
        "ancient prophecies", "superhero societies", "instant learning", "personal AI assistants",
        "time machine malfunction", "city on Mars", "emotional robots"
    ]
}

**Templates**
---

In [None]:
# Prompt templates for every category, three per category.
# Templates hold one or two positional `{}` placeholders. The generator always
# calls `.format(topic1, topic2)`, and `str.format` ignores surplus positional
# arguments, so a single-placeholder template simply uses the first topic.
category_templates = {
    "Learning and Education": [
        "Explain the importance of {} in {}.",
        "What are some strategies for improving {} in {}?",
        "How can {} help to solve issues in {}?",
    ],
    "Science and Technology": [
        "Describe how {} affects {}.",
        "What are some benefits of {} and how can it help with {}?",
        "How has {} evolved in the context of {}?",
    ],
    "Creative Writing and Storytelling": [
        "Write a short story about {}.",
        "Imagine {}. What might happen?",
        "What are some unique perspectives on {} in {}?",
    ],
    "Philosophy and Ethics": [
        "Provide a reasoned argument about {} and {}.",
        "What are the ethical implications of {} in {}?",
        "Compare the roles of {} and {}.",
    ],
    "Health and Wellness": [
        "Discuss the impact of {} on {}.",
        "What challenges does {} face in {}?",
        "What are some benefits of {} and how can it help with {}?",
    ],
    "Environment and Nature": [
        "Explain how {} can be applied in {}.",
        "How can {} impact the future of {}?",
        "What are some misconceptions about {} in {}?",
    ],
    "History and Culture": [
        "How has {} evolved in the context of {}?",
        "Summarize recent developments in {} related to {}.",
        "What are the social implications of {} in {}?",
    ],
    "Business and Economics": [
        "Discuss the impact of {} on {}.",
        "How can {} help to solve issues in {}?",
        "What challenges does {} face in {}?",
    ],
    "Personal Development and Motivation": [
        "What are some strategies for improving {} in {}?",
        "What role does {} play in advancing {}?",
        "How can {} impact personal growth in {}?",
    ],
    "Fun and Hypothetical Scenarios": [
        "Imagine {}. What might happen?",
        "What are some unique perspectives on {} in {}?",
        "Provide a hypothetical scenario about {} and {}.",
    ],
}

# Bundle each category's templates with its topic list from `category_topics`.
# The topic lists are referenced, not copied, and the category order follows
# `category_templates`.
category_data = {
    name: {"templates": templates, "topics": category_topics[name]}
    for name, templates in category_templates.items()
}

**Datagen**
---

In [None]:
# Generate PROMPTS_PER_CATEGORY unique prompts per category with progress bars.
#
# Prompts are drawn by rejection sampling: pick a random template and two
# distinct topic positions, format, and keep the result only if it has not been
# produced for this category yet. Uniqueness is enforced per category only.
PROMPTS_PER_CATEGORY = 1000
SEED = 42  # fixed seed so re-running this cell regenerates the same dataset
# Written to the location the following cell reads with `pd.read_csv`, so the
# notebook works under Restart & Run All instead of depending on a manual move.
OUTPUT_PATH = Path("LLMWM_dataset") / "10k_category_prompts.csv"

random.seed(SEED)

prompts = []
for category, data in category_data.items():
    templates, topics = data["templates"], data["topics"]

    # Enumerate every prompt the sampler could ever produce. If fewer than the
    # requested number exist, the rejection loop below would spin forever, so
    # fail loudly instead. (Single-placeholder templates collapse many topic
    # pairs onto the same string, which is why this cannot be a simple product.)
    reachable_prompts = {
        template.format(first, second)
        for template in templates
        for i, first in enumerate(topics)
        for j, second in enumerate(topics)
        if i != j
    }
    if len(reachable_prompts) < PROMPTS_PER_CATEGORY:
        raise ValueError(
            f"Category {category!r} can only produce {len(reachable_prompts)} "
            f"unique prompts, but {PROMPTS_PER_CATEGORY} were requested."
        )

    unique_prompts = set()
    print(f"Generating prompts for category: {category}")
    with tqdm(total=PROMPTS_PER_CATEGORY) as pbar:
        while len(unique_prompts) < PROMPTS_PER_CATEGORY:
            template = random.choice(templates)
            topic1, topic2 = random.sample(topics, 2)
            # Surplus positional arguments are ignored by str.format, so
            # one-placeholder templates only consume topic1.
            prompt = template.format(topic1, topic2)

            # Ensure uniqueness within each category
            if prompt not in unique_prompts:
                unique_prompts.add(prompt)
                prompts.append({"category": category, "prompt": prompt})
                pbar.update(1)

# Create a DataFrame and save to CSV (to_csv does not create missing folders)
df_prompts = pd.DataFrame(prompts)
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df_prompts.to_csv(OUTPUT_PATH, index=False)
print(f"File saved as {OUTPUT_PATH}")

**Sorted (categories interleaved in batches of 10)**
---

In [None]:
# Reload the generated prompts from disk; the CSV is expected to have a
# 'category' column (this rebinds df_prompts to the file's contents).
df_prompts = pd.read_csv("LLMWM_dataset/10k_category_prompts.csv")

# Categories in order of first appearance, each mapped to its own rows.
categories = df_prompts['category'].unique()
category_groups = {
    name: df_prompts.loc[df_prompts['category'] == name]
    for name in categories
}

# Round-robin interleave: every round contributes the next 10 rows (or fewer,
# at the tail) of each category that still has rows left. Rows keep their
# original order within a category; no shuffling is performed here.
sorted_prompts = []
offset = 0
while True:
    round_batches = [
        rows.iloc[offset:offset + 10]
        for rows in category_groups.values()
        if offset < len(rows)
    ]
    if not round_batches:
        # Every category has been fully consumed.
        break
    sorted_prompts.extend(round_batches)
    offset += 10

# Stitch the batches together with a fresh 0..n-1 index.
final_sorted_df = pd.concat(sorted_prompts, ignore_index=True)

# Persist the interleaved ordering next to the input file.
final_sorted_df.to_csv("LLMWM_dataset/10k_category_prompts_sorted.csv", index=False)
print("File saved as 10k_category_prompts_sorted.csv")