<a href="https://colab.research.google.com/github/ashwath-tech/llama-3.2-grumpy-it-finetune/blob/main/data-generation/SyntheticDatasetGenerator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# installs
!pip install -q json-repair
!pip install -q --upgrade bitsandbytes accelerate

In [None]:
# imports
import torch
from huggingface_hub import login
from transformers import pipeline
from google.colab import userdata
from datasets import load_dataset
from diffusers import DiffusionPipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import random
import json
import re
import gc

In [None]:
# login
# The Hugging Face token is read from the Colab secret named "HF_TOKEN";
# it is never written into the notebook itself.
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)

In [None]:
# quantisation configuration
# 4-bit NF4 weights with double quantisation; compute runs in float16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# get model
# Single source of truth for the checkpoint used by both the model and the tokenizer.
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quant_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Reuse the end-of-sequence token for padding (the generation calls pass the
# same id as pad_token_id).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Pool of technical subjects for the generated conversations.
# first_model_message() draws one entry at random per conversation and
# interpolates it verbatim into the prompt, so each entry should read
# naturally after "question about ...".
topics = [
    # --- LINUX & SYSTEMS ---
    "Why 'chmod 777' is bad practice",
    "How to kill a Zombie process that won't die",
    "Disk space full but 'du' shows space available (deleted open files)",
    "Difference between hard links and soft links",
    "Why 'sudo rm -rf /' is blocked",
    "Understanding Linux file permissions (rwx)",
    "Using 'tail -f' vs 'less' for log files",
    "Why the cron job didn't run (environment variables)",
    "Systemd service failing to start (exit code 1)",
    "High Load Average vs High CPU usage",

    # --- GIT & VERSION CONTROL ---
    "Fixing a 'Detached HEAD' state in Git",
    "Resolving a merge conflict in package-lock.json",
    "Why you shouldn't force push to main",
    "Gitignore file not ignoring files that are already tracked",
    "Difference between 'git merge' and 'git rebase'",
    "Recovering a deleted branch with 'git reflog'",
    "Squashing commits before a pull request",
    "Why 'git stash' pop caused a conflict",
    "Submodule initialization errors",
    "Committing API keys to a public repo (and how to fix it)",

    # --- DOCKER & KUBERNETES ---
    "Docker container exits immediately (PID 1 issue)",
    "Why 'localhost' inside Docker isn't the host machine",
    "Docker volume mounting creates an empty directory",
    "Reducing Docker image size (multi-stage builds)",
    "Kubernetes Pod stuck in 'CrashLoopBackOff'",
    "Difference between a Pod and a Deployment",
    "Why the container is OOMKilled (Out of Memory)",
    "Connecting two containers on the same bridge network",
    "Docker Compose 'depends_on' vs actual service readiness",
    "Persisting data in a stateless container",

    # --- NETWORKING & WEB ---
    "CORS errors (Access-Control-Allow-Origin)",
    "Why DNS changes take 48 hours (TTL propagation)",
    "Difference between TCP and UDP",
    "Why the SSL certificate is marked 'Untrusted'",
    "HTTP 401 vs 403 Forbidden",
    "Port 80 vs Port 443 redirection",
    "Debugging a 502 Bad Gateway error",
    "Why 'ping' works but 'curl' times out (Firewall)",
    "Understanding CIDR notation (e.g., /24)",
    "Websocket connection failing (Upgrade header)",

    # --- DATABASE (SQL & NO-SQL) ---
    "Why 'SELECT *' is killing the database performance",
    "The N+1 query problem in ORMs",
    "Fixing a database deadlock situation",
    "SQL Injection risks in raw queries",
    "Why the index isn't being used (Full Table Scan)",
    "Difference between INNER JOIN and LEFT JOIN",
    "ACID properties in transactions",
    "Why the database connection pool is full",
    "Redis cache eviction policies (LRU)",
    "Migration script failing due to Foreign Key constraint",

    # --- CODING & PYTHON ---
    "Python circular import errors",
    "Mutable default arguments in Python functions",
    "Why floating point math is wrong (0.1 + 0.2 != 0.3)",
    "Memory leaks in long-running Node.js processes",
    "Blocking the Event Loop in JavaScript",
    "Dependency hell (pip/npm version conflicts)",
    "Encoding issues (UTF-8 vs ASCII)",
    "Race conditions in multi-threaded code",
    "Recursion depth exceeded errors",
    "Why you should use Virtual Environments (venv)",
]
# Pool of off-topic remarks the "Clueless User" may throw into a conversation.
# first_model_message() draws one entry at random per conversation; whether it
# is actually used depends on the scenario type chosen there.
stupid_distractions = [
    "Do you think my wireless mouse is dying because I haven't changed the batteries in two years?",
    "Can you log into the building's mainframe and redirect the AC vent? It's blowing on my left shoulder.",
    "I deleted the 'Internet' icon from my desktop. Does that mean the whole company is offline now?",
    "Why can't I find Comic Sans in the email dropdown? Can you install it on the mail server for me?",
    "I unplugged a grey cable under my desk to charge my phone. Is that why the servers are beeping?",
    "The break room coffee switched to dark roast. Does IT have a budget to fix this injustice?",
    "Since you're good with 'computers,' can you look at my kid's slow iPad during lunch?",
    "The printer isn't working. I haven't tried to print yet, but I have a 'feeling' it's going to jam.",
    "I found a random USB drive in the parking lot. Should I plug it in to see who it belongs to?",
    "My monitor is too low. Do you have any old phone books I can use as a stand?",
    "Which flavor of chips in the break room provides the best brain power for coding?",
    "Can you help me move my desk closer to the window? I need more 'natural light' for my RAM.",
    "Is it okay if I use 'Password123' for everything? It’s the only one I can remember.",
    "Why is my cursor so big today? Did the 'Internet' get wider overnight?",
    "I spilled a little bit of kale smoothie on my laptop. Should I put it in a bowl of rice?",
    "Does the Wi-Fi work better if I sit closer to the router, or should I hold my laptop up high?",
    "Can you teach me how to Excel? I told the boss I was an expert, but I don't know what a 'cell' is.",
    "The keyboard is making too much noise. Can you make the keys 'squishier' for me?",
    "I saw a movie where a hacker used a black screen with green text. Can you make my Word look like that?",
    "Is it true that if I don't restart my computer every night, the 'bits' get tired and slow down?"
]

In [None]:
def first_model_message():
    """Build the tokenized prompt for the raw-dialogue generation pass.

    Picks a random technical topic, a random distraction and one of four
    scenario types (EARLY / MIDDLE / LATE / NEVER, i.e. where in the
    conversation the distraction appears), then renders a system + user
    prompt through the tokenizer's chat template.

    Returns:
        A mapping of tensors (including ``input_ids``) moved to
        ``model.device`` and suitable for ``model.generate(**prompt)``,
        or ``None`` when either source pool is empty.
    """
    # random.choice raises IndexError on an empty sequence, so guard both pools.
    if not topics or not stupid_distractions:
        return None

    selected_topic = random.choice(topics)
    stupid_topic = random.choice(stupid_distractions)

    scenario_type = random.choice(["EARLY", "MIDDLE", "LATE", "NEVER"])

    if scenario_type == "EARLY":
        structure = f"""
        - Turn 1 (User): Asks about {selected_topic} but mixes in a stupid comment about {stupid_topic}.
        - Turn 2 (Manager): Ignores the tech part, mocks the {stupid_topic} comment, then reluctantly answers the tech part.
        - Turn 3 (User): Apologizes and asks a clean follow-up on {selected_topic}.
        - Turn 4 (Manager): Grumpy technical deep-dive.
        """

    elif scenario_type == "MIDDLE":
        structure = f"""
        - Turn 1 (User): Legitimate question about {selected_topic}.
        - Turn 2 (Manager): Condescending technical explanation.
        - Turn 3 (User): Abruptly interrupts to ask about {stupid_topic}.
        - Turn 4 (Manager): Explodes with sarcasm, then pivots back to {selected_topic}.
        """

    elif scenario_type == "LATE":
        structure = f"""
        - Turn 1 (User): Legitimate question about {selected_topic}.
        - Turn 2 (Manager): Condescending technical explanation.
        - Turn 3 (User): Follow-up technical question.
        - Turn 4 (Manager): Even more complex explanation.
        - Turn 5 (User): "Oh thanks! By the way, {stupid_topic}?"
        - Turn 6 (Manager): Final angry shutdown. Ends conversation.
        """

    else:
        # "NEVER": a purely technical exchange with no distraction.
        structure = f"""
        - Turn 1 (User): Confused question about {selected_topic}.
        - Turn 2 (Manager): Sarcastic but accurate explanation.
        - Turn 3 (User): Ask for clarification on a specific detail.
        - Turn 4 (Manager): Mocking the user's lack of knowledge, but explaining the detail perfectly.
        - Turn 5 (User): "Okay, I think I get it."
        - Turn 6 (Manager): "Good. Now stop bothering me."
        """

    system_prompt = f"You are a specialist dialogue generator. SCENARIO: {scenario_type} Interruption."

    user_prompt = f"""
    Generate a conversation between a Clueless User and a Grumpy IT Manager.
    TOPIC: {selected_topic}
    DISTRACTION: {stupid_topic} (Only use if scenario calls for it)

    FLOW TO GENERATE:
    {structure}

    FORMAT: Use 'User: [Text]' and 'Manager: [Text]'.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    # add_generation_prompt=True ends the prompt with the assistant header so the
    # model answers instead of continuing the user turn (matches
    # second_model_message). return_dict=True yields a mapping, which the caller
    # needs for `model.generate(**prompt)` and `prompt['input_ids']`.
    return tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

In [None]:
def second_model_message(raw_answer):
    """Build the tokenized prompt for the JSON-formatting pass.

    Args:
        raw_answer: The free-text 'User: ...' / 'Manager: ...' dialogue
            produced by the first generation pass.

    Returns:
        A mapping of tensors (including ``input_ids``) moved to
        ``model.device`` and suitable for ``model.generate(**prompt)``.
    """
    second_system_prompt = '''You are a synthetic data formatter.
    Convert raw dialogue into a SINGLE JSON object.

    RULES:
    - Output ONLY valid JSON.
    - Do NOT use placeholders. Use actual text.
    - Do NOT repeat turns.
    '''

    # Doubled braces are literal braces inside the f-string; only {raw_answer}
    # is interpolated.
    second_user_prompt = f'''
    Extract the dialogue into this JSON structure.
    RAW DIALOGUE:
    {raw_answer}

    JSON STRUCTURE:
    {{
      "messages": [
        {{"role": "system", "content": "You are a highly competent but grumpy and sarcastic IT specialist who provides technically correct answers."}},
        {{"role": "user", "content": "..."}},
        {{"role": "assistant", "content": "..."}},
        {{"role": "user", "content": "..."}},
        {{"role": "assistant", "content": "..."}},
        {{"role": "user", "content": "..."}},
        {{"role": "assistant", "content": "..."}}
      ]
    }}
    REMEMBER THE ROLES PROPERLY AND DONT CHANGE ANYTHING
    OUTPUT ONLY THE JSON.
    '''

    messages = [
        {"role": "system", "content": second_system_prompt},
        {"role": "user", "content": second_user_prompt}
    ]
    # return_dict=True yields a mapping, which the caller needs for
    # `model.generate(**prompt)` and `prompt['input_ids']`.
    return tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

In [None]:
filename = "grumpy_it_dataset.jsonl"

# NOTE: if generation stops in between, check the file to see how many rows have been made
# and set START_ROW accordingly before re-running. Rows are appended to the file, never overwritten.
START_ROW = 0
TOTAL_ROWS = 500

# Persona system prompt stored as the first message of every saved row.
sys_msg = {"role": "system", "content": "You are a highly competent but grumpy and sarcastic IT specialist who provides technically correct answers."}

# Message contents that are template placeholders rather than real dialogue.
_GHOST_CONTENT = ("assistant", "user", "system", "...", "turn 1", "turn 2")
# A final assistant turn not ending in one of these is treated as truncated.
_COMPLETE_ENDINGS = ('.', '!', '?', '"', '}')


def _generate_text(prompt_inputs, **generation_kwargs):
    """Run model.generate on a tokenized prompt and decode only the newly generated tokens."""
    # Accept a bare input_ids tensor as well as a mapping of tensors.
    if isinstance(prompt_inputs, torch.Tensor):
        prompt_inputs = {"input_ids": prompt_inputs}
    prompt_length = prompt_inputs["input_ids"].shape[1]
    output_ids = model.generate(
        **prompt_inputs,
        pad_token_id=tokenizer.eos_token_id,
        **generation_kwargs,
    )
    return tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True).strip()


def _extract_messages(raw_text):
    """Parse the formatter output and return its list of message dicts ([] when unusable)."""
    clean_text = re.sub(r'^(assistant|json|output)[:\s]*', '', raw_text, flags=re.IGNORECASE).strip()
    clean_text = re.sub(r'```json|```', '', clean_text).strip()

    try:
        data = json.loads(clean_text)
    except json.JSONDecodeError:
        print("⚠️ JSON Decode Error. Attempting regex recovery...")
        # Retry on the span from the first '{' to the last '}', which drops any
        # chatter the model wrote before or after the JSON object.
        candidate = re.search(r'\{.*\}', clean_text, flags=re.DOTALL)
        if candidate is None:
            return []
        try:
            data = json.loads(candidate.group(0))
        except json.JSONDecodeError:
            return []

    # Expected shape is {"messages": [...]}; a bare list of messages is accepted too.
    if isinstance(data, dict):
        data = data.get("messages")
    if not isinstance(data, list):
        return []
    return [entry for entry in data if isinstance(entry, dict)]


def _clean_dialogue(fragments):
    """Normalise parsed messages into an alternating user/assistant list that starts on user and ends on assistant."""
    dialogue = []
    for m in fragments:
        content = m.get("content")
        role = m.get("role")
        # Null or non-string fields cannot be real dialogue text.
        if not isinstance(content, str):
            continue
        content = content.strip()
        role = role.strip().lower() if isinstance(role, str) else ""

        # Ghost Turns & Empty Content
        if not content or content.lower() in _GHOST_CONTENT:
            continue

        # Skip existing System roles (the persona prompt is re-added on save)
        if role == "system":
            continue

        # Normalize Role Names: anything that is not the user is the assistant
        actual_role = "user" if role == "user" else "assistant"

        # If the new role is the same as the previous role, glue them together
        if dialogue and dialogue[-1]["role"] == actual_role:
            dialogue[-1]["content"] += "\n" + content
            continue

        # If roles are different, append as a new message
        dialogue.append({"role": actual_role, "content": content})

    # Sequence Guard (Must start with User)
    if dialogue and dialogue[0]["role"] == "assistant":
        dialogue.pop(0)

    # Truncation Guard (Remove incomplete final turn)
    if dialogue and dialogue[-1]["role"] == "assistant":
        if not dialogue[-1]["content"].endswith(_COMPLETE_ENDINGS):
            print("⚠️ Truncation detected. Removing incomplete final turn.")
            dialogue.pop()

    # Tail Check (Must end on Assistant)
    while dialogue and dialogue[-1]["role"] == "user":
        dialogue.pop()

    return dialogue


for i in range(START_ROW, TOTAL_ROWS):
    print("--------------------------------------------")
    print(f"Loop number {i+1}")

    # Generation: creative pass that writes the raw 'User:' / 'Manager:' dialogue
    tokenized_prompt = first_model_message()
    if tokenized_prompt is None: break

    raw_answer = _generate_text(
        tokenized_prompt,
        max_new_tokens=1024,
        temperature=0.85,
        do_sample=True,
    )

    # Formatting: near-deterministic pass that converts the dialogue to JSON
    second_prompt = second_model_message(raw_answer)
    raw_answer2 = _generate_text(
        second_prompt,
        max_new_tokens=1200,
        temperature=0.1,
        repetition_penalty=1.1,
        do_sample=True,
    )

    # Extraction & Cleaning
    clean_dialogue = _clean_dialogue(_extract_messages(raw_answer2))

    # Save (at least one user/assistant exchange is required)
    if len(clean_dialogue) >= 2:
        final_row = {"messages": [sys_msg] + clean_dialogue}

        # Append one JSON object per line so a crash never loses earlier rows.
        with open(filename, "a", encoding="utf-8") as f:
            f.write(json.dumps(final_row, ensure_ascii=False) + "\n")

        print(f"✅ Saved row with {len(clean_dialogue)} turns.")
    else:
        print("❌ Failed: Conversation invalid or too short.")

    # Clear per-iteration tensors and cached GPU memory before the next round
    del raw_answer, raw_answer2, tokenized_prompt, second_prompt
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Load the generated JSONL rows back as a Hugging Face dataset.
dataset = load_dataset("json", data_files="grumpy_it_dataset.jsonl")

# Push to Hub
hub_repo_id = "XXXXX/grumpy-it-dataset"  #insert name
dataset.push_to_hub(hub_repo_id, private=False)