🔧 1. Setup & Configuration

In [1]:
!pip uninstall -y gradio gradio_client pydantic fastapi starlette uvicorn

Found existing installation: gradio 5.25.0
Uninstalling gradio-5.25.0:
  Successfully uninstalled gradio-5.25.0
Found existing installation: gradio_client 1.8.0
Uninstalling gradio_client-1.8.0:
  Successfully uninstalled gradio_client-1.8.0
Found existing installation: pydantic 2.11.3
Uninstalling pydantic-2.11.3:
  Successfully uninstalled pydantic-2.11.3
Found existing installation: fastapi 0.115.12
Uninstalling fastapi-0.115.12:
  Successfully uninstalled fastapi-0.115.12
Found existing installation: starlette 0.46.2
Uninstalling starlette-0.46.2:
  Successfully uninstalled starlette-0.46.2
Found existing installation: uvicorn 0.34.1
Uninstalling uvicorn-0.34.1:
  Successfully uninstalled uvicorn-0.34.1


In [2]:
!pip install gradio pydantic fastapi starlette uvicorn

Collecting gradio
  Using cached gradio-5.25.0-py3-none-any.whl.metadata (16 kB)
Collecting pydantic
  Using cached pydantic-2.11.3-py3-none-any.whl.metadata (65 kB)
Collecting fastapi
  Using cached fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting starlette
  Using cached starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Collecting uvicorn
  Using cached uvicorn-0.34.1-py3-none-any.whl.metadata (6.5 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Using cached gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Using cached gradio-5.25.0-py3-none-any.whl (46.9 MB)
Using cached gradio_client-1.8.0-py3-none-any.whl (322 kB)
Using cached pydantic-2.11.3-py3-none-any.whl (443 kB)
Using cached fastapi-0.115.12-py3-none-any.whl (95 kB)
Using cached starlette-0.46.2-py3-none-any.whl (72 kB)
Using cached uvicorn-0.34.1-py3-none-any.whl (62 kB)
Installing collected packages: uvicorn, starlette, pydantic, gradio-client, fastapi, gradio
Successfully installed fastapi-0.115.1

In [3]:
!pip install -q openai anthropic transformers

In [4]:
import gradio, pydantic

# Show which versions were actually installed, so the environment is recorded in the output.
print(f"Gradio: {gradio.__version__}")
print(f"Pydantic: {pydantic.__version__}")

Gradio: 5.25.0
Pydantic: 2.11.3


🛠️ 2. Import Libraries & Set Up API Keys

In [5]:
import os
import random
import tempfile
from typing import List, Dict

import anthropic
import pandas as pd
import torch
from huggingface_hub import login
from openai import OpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [6]:
# Credentials come from the environment; os.getenv returns None for any variable that is unset.
openai_api_key = os.getenv("OPENAI_API_KEY")
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
hf_token = os.getenv("HF_TOKEN")

# OpenAI client. The generator functions below call it through this name (openai.chat.completions...).
openai = OpenAI(api_key=openai_api_key)

# Anthropic (Claude) client
anthropic_client = anthropic.Anthropic(api_key=anthropic_api_key)

# Hugging Face login (for LLaMA access)
login(token=hf_token)

# Load Meta-Llama-3.1-8B-Instruct.
llama_model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# `token=` replaces the deprecated `use_auth_token=` argument.
tokenizer = AutoTokenizer.from_pretrained(llama_model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    llama_model_name,
    torch_dtype=torch.float16,
    device_map="auto",  # weights are placed automatically on whatever devices are available
    token=hf_token,
)
llama_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk.
Device set to use mps


📜 3. Prompt Templates for Each Category

In [7]:
def build_prompt(category: str, label: str, add_noise: bool = False) -> dict:
    """Assemble the system and user prompts for one synthetic SMS request.

    Args:
        category: Key selecting the user instruction (for example "phishing"
            or "ham").
        label: "spam" or "ham" adds the matching guidance to the system
            prompt; any other value adds no label-specific guidance.
        add_noise: When True, the system prompt also asks for misspellings,
            missing punctuation, improper grammar or informal abbreviations.

    Returns:
        A dict with the keys "system" and "user" holding the two prompts.
        The system prompt has surrounding whitespace stripped.

    Raises:
        ValueError: If ``category`` is not one of the known categories.
    """
    instructions_by_category = {
        "phishing": "Write a fake alert SMS from a bank asking the recipient to click a login link to verify their account.",
        "romance": "Write a scam SMS from someone pretending to be romantically interested and asking to connect or chat.",
        "delivery_scam": "Write a fake text message about a missed delivery or package held by UPS, FedEx, or USPS.",
        "crypto": "Create an SMS promoting a fake crypto opportunity with high returns or celebrity backing.",
        "authority": "Write a scam SMS pretending to be from the IRS, FBI, or a law enforcement agency requesting urgent action.",
        "tech_support": "Write an SMS pretending to be from Apple or Microsoft warning about a virus and giving a phone number to call.",
        "health_product": "Write an SMS promoting a shady health supplement or prescription offer.",
        "retail": "Write a spam SMS offering a limited-time retail deal, discount, or gift card.",
        "toll_violation": "Write a scam SMS pretending to be a toll road violation notice with a payment link. It should sound urgent and impersonate a government or toll agency.",
        "ham": "Write a natural and casual SMS message between friends, family, or coworkers about everyday topics like plans, updates, or reminders."
    }

    # Reject unknown categories up front; nothing below has side effects.
    if category not in instructions_by_category:
        raise ValueError(f"Unknown category: {category}")

    # The system prompt is built from ordered segments: base, optional noise, optional label guidance.
    system_segments = [
        (
            "You are a system that generates realistic SMS messages for training a machine learning classifier. "
            "Your goal is to create short, natural-sounding text messages based on the user's instructions. "
            "Do not include explanations or disclaimers. Respond only with a single message.\n"
        )
    ]

    if add_noise:
        system_segments.append(
            "Add realistic noise to the message, such as misspellings, missing punctuation, improper grammar, or informal abbreviations. "
            "Make the message still understandable, but messy or deceptive in style.\n"
        )

    if label == "spam":
        system_segments.append(
            "If the prompt involves spam, simulate realistic scam or spam messages. These could involve phishing, financial fraud, fake package alerts, "
            "impersonation, crypto scams, romance baiting, or health/product spam. Do not use disclaimers or reveal that it's a fake.\n"
        )
    elif label == "ham":
        system_segments.append(
            "If the prompt involves ham (non-spam), generate messages typical of friends, coworkers, family, or professional services. "
            "These messages should be warm, mundane, helpful, or casual in nature.\n"
        )

    return {
        "system": "".join(system_segments).strip(),
        "user": instructions_by_category[category],
    }

In [8]:
# Preview the prompts built for a noisy phishing (spam) request.
build_prompt("phishing", "spam", add_noise=True)

{'system': "You are a system that generates realistic SMS messages for training a machine learning classifier. Your goal is to create short, natural-sounding text messages based on the user's instructions. Do not include explanations or disclaimers. Respond only with a single message.\nAdd realistic noise to the message, such as misspellings, missing punctuation, improper grammar, or informal abbreviations. Make the message still understandable, but messy or deceptive in style.\nIf the prompt involves spam, simulate realistic scam or spam messages. These could involve phishing, financial fraud, fake package alerts, impersonation, crypto scams, romance baiting, or health/product spam. Do not use disclaimers or reveal that it's a fake.",
 'user': 'Write a fake alert SMS from a bank asking the recipient to click a login link to verify their account.'}

🧱 Step-by-Step LLM Generator Functions

🔹 1. GPT-4 (OpenAI)

In [9]:
def generate_with_openai_prompt(system: str, user: str) -> str:
    """Request a single GPT-4 chat completion and return its text.

    Args:
        system: Content of the system message.
        user: Content of the user message.

    Returns:
        The content of the first returned choice, with surrounding
        whitespace stripped.
    """
    chat_messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    completion = openai.chat.completions.create(
        model="gpt-4",
        messages=chat_messages,
        temperature=0.7,
        max_tokens=300,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

🔹 2. Claude 3 Opus (Anthropic)

In [10]:
def generate_with_anthropic_prompt(system: str, user: str) -> str:
    """Request a single Claude message and return its text.

    Args:
        system: System prompt, passed through the dedicated ``system`` argument.
        user: Content of the only user turn.

    Returns:
        The text of the first content block of the reply, with surrounding
        whitespace stripped.
    """
    user_turn = {"role": "user", "content": user}
    reply = anthropic_client.messages.create(
        model="claude-3-opus-20240229",
        system=system,
        messages=[user_turn],
        temperature=0.7,
        max_tokens=300,
    )
    first_block = reply.content[0]
    return first_block.text.strip()

🔹 3. Meta-LLaMA-3.1–8B-Instruct (Hugging Face)

* LLaMA 3.1 Instruct expects its own chat format (special header tokens around each system/user/assistant turn), so the prompt must follow that template rather than being sent as plain text.

In [11]:
def generate_with_llama_prompt(system: str, user: str) -> str:
    """Generate one reply from the local LLaMA text-generation pipeline.

    The conversation is passed as role-tagged messages so that the pipeline
    applies the tokenizer's own chat template. A hand-written template is easy
    to get wrong: the earlier version left out the end-of-turn token after the
    system message.

    Args:
        system: Content of the system turn.
        user: Content of the user turn.

    Returns:
        The content of the newly generated assistant turn, with surrounding
        whitespace stripped.
    """
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    outputs = llama_pipeline(conversation, max_new_tokens=300, do_sample=True, temperature=0.7)
    # For chat-style input, "generated_text" is the conversation with the new
    # assistant message appended as its last element.
    assistant_turn = outputs[0]["generated_text"][-1]
    return assistant_turn["content"].strip()

Controller Methods

In [12]:
def generate_message(model_name: str, category: str, label: str, add_noise: bool = False) -> str:
    """Build the prompts for a request and send them to the chosen backend.

    Args:
        model_name: One of "GPT-4", "Claude" or "LLaMA-3.1-8B-Instruct".
        category: Prompt category, forwarded to ``build_prompt``.
        label: Message label, forwarded to ``build_prompt``.
        add_noise: Noise flag, forwarded to ``build_prompt``.

    Returns:
        Whatever the selected generator function returns.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    prompts = build_prompt(category, label, add_noise)
    system_text, user_text = prompts["system"], prompts["user"]

    # Each supported name returns straight from its backend; anything else falls through to the error.
    if model_name == "GPT-4":
        return generate_with_openai_prompt(system_text, user_text)
    if model_name == "Claude":
        return generate_with_anthropic_prompt(system_text, user_text)
    if model_name == "LLaMA-3.1-8B-Instruct":
        return generate_with_llama_prompt(system_text, user_text)

    raise ValueError(f"Unsupported model: {model_name}")

In [None]:
def llm_noisify(text: str, model_name: str = "GPT-4") -> str:
    """Rewrite a message with light, realistic noise using GPT-4.

    Args:
        text: The message to rewrite.
        model_name: Backend to use. Only "GPT-4" is implemented.

    Returns:
        The rewritten message, with surrounding whitespace stripped.

    Raises:
        NotImplementedError: If ``model_name`` is anything other than "GPT-4".
    """
    system_prompt = """
    You're an assistant that rewrites messages to sound like a real-world scam or spam SMS text message. 
    Apply just a little bit of casual grammar mistakes or misspellings, informal phrasing, and occasional urgency. 
    Avoid overdoing emoji use — keep it to 0–2 max. 
    Keep the core message readable and believable, like a real scammer would write. 
    Do NOT use weird capitalization everywhere or excessive text speak. Make it subtle and realistic.
    """

    user = f"Original message: {text}\n\nReturn the same message with noise added."

    if model_name == "GPT-4":
        response = openai.chat.completions.create(
            model="gpt-4",
            messages=[
                # Fixed: this previously referenced an undefined name `system`,
                # so every call raised NameError.
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user}
            ],
            temperature=0.8,
            max_tokens=300
        )
        return response.choices[0].message.content.strip()
    else:
        raise NotImplementedError(f"Noisifier currently supports GPT-4 only (got: {model_name})")

In [14]:
def _collect_messages(model_name, category, label, count, add_noise, tag):
    """Generate up to ``count`` rows for one label, skipping attempts that fail."""
    rows = []
    for attempt in range(1, count + 1):
        try:
            # Noise is applied as a separate rewrite step, so the generator itself is asked for clean text.
            msg = generate_message(model_name, category, label=label, add_noise=False)
            if add_noise:
                msg = llm_noisify(msg)
            rows.append({"message": msg, "label": label})
        except Exception as e:
            # Best effort: report the failure and continue with the remaining attempts.
            print(f"⚠️ {tag} gen error {attempt}/{count}: {str(e)}")
    return rows


def generate_dataset(
    model_name: str,
    category: str,
    spam_percentage: float,
    num_messages: int,
    add_noise: bool = False
) -> pd.DataFrame:
    """Generate a labelled spam/ham SMS dataset.

    Spam rows use ``category``; ham rows always use the "ham" category.
    Attempts that raise are reported and skipped, so the result may contain
    fewer than ``num_messages`` rows.

    Args:
        model_name: Backend name, forwarded to ``generate_message``.
        category: Prompt category used for the spam rows.
        spam_percentage: Fraction of spam rows, between 0 and 1.
        num_messages: Total number of messages to attempt (at most 20).
        add_noise: When True, every generated message is passed through
            ``llm_noisify``.

    Returns:
        A DataFrame with the columns "message" and "label", spam rows first.
        The columns are present even when no message could be generated.

    Raises:
        AssertionError: If ``spam_percentage`` is outside [0, 1] or
            ``num_messages`` exceeds 20.
    """
    assert 0 <= spam_percentage <= 1, "spam_percentage must be between 0 and 1"
    assert num_messages <= 20, "Limit of 20 messages enforced"

    # UI sliders may deliver the count as a float; range() needs an int.
    num_messages = int(num_messages)
    spam_count = round(num_messages * spam_percentage)
    ham_count = num_messages - spam_count

    # NOTE(review): ham messages also go through llm_noisify, whose system prompt asks for
    # scam/spam-style rewrites. Confirm that this is intended for the ham class.
    data = _collect_messages(model_name, category, "spam", spam_count, add_noise, "Spam")
    data += _collect_messages(model_name, "ham", "ham", ham_count, add_noise, "Ham")

    # Explicit columns keep the schema stable when every attempt failed.
    return pd.DataFrame(data, columns=["message", "label"])

✅ Gradio App in Colab

In [15]:
def run_dataset_generator(model_name, category, spam_percent, num_messages, add_noise):
    """Generate a dataset and write it to a temporary CSV file.

    Args:
        model_name: Backend name, forwarded to ``generate_dataset``.
        category: Spam category, forwarded to ``generate_dataset``.
        spam_percent: Spam share on a 0-100 scale; converted to a 0-1 ratio.
        num_messages: Number of messages to request.
        add_noise: Noise flag, forwarded to ``generate_dataset``.

    Returns:
        A ``(df, path)`` tuple. ``path`` is the CSV file path, or an empty
        string when the frame is empty or the file is missing after writing.
    """
    spam_ratio = spam_percent / 100.0
    print(f"🔧 Params → Model: {model_name}, Category: {category}, Spam%: {spam_percent}, Num: {num_messages}, Noise: {add_noise}")

    df = generate_dataset(
        model_name=model_name,
        category=category,
        spam_percentage=spam_ratio,
        num_messages=num_messages,
        add_noise=add_noise
    )

    if df.empty:
        print("❌ Empty dataframe. Returning empty file path.")
        return df, ""  # ← important: return an empty string, NOT None

    # Reserve a unique path, then close the handle before pandas writes to it.
    # Leaving it open leaks a file descriptor and blocks the write on Windows.
    # delete=False keeps the file on disk after the handle is closed.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
        csv_path = tmp_file.name
    df.to_csv(csv_path, index=False)

    if not os.path.exists(csv_path):
        print("❌ CSV file path missing. Returning safe fallback.")
        return df, ""

    return df, csv_path

# --- Gradio Interface ---

In [16]:
import gradio as gr
import tempfile
import os
import pandas as pd

# Gradio-facing adapter: the second return value is always a string
def gradio_wrapper(model_name, category, spam_percent, num_messages, add_noise):
    """Run the dataset generator and normalise the returned file path.

    Args:
        model_name: Forwarded to ``run_dataset_generator``.
        category: Forwarded to ``run_dataset_generator``.
        spam_percent: Forwarded to ``run_dataset_generator``.
        num_messages: Forwarded to ``run_dataset_generator``.
        add_noise: Forwarded to ``run_dataset_generator``.

    Returns:
        A ``(df, file_path)`` tuple. ``file_path`` is replaced by an empty
        string unless it is a string naming an existing path.
    """
    request = dict(
        model_name=model_name,
        category=category,
        spam_percent=spam_percent,
        num_messages=num_messages,
        add_noise=add_noise,
    )
    df, file_path = run_dataset_generator(**request)

    # Anything that is not an existing string path is replaced by "".
    path_is_usable = isinstance(file_path, str) and os.path.exists(file_path)
    if not path_is_usable:
        file_path = ""

    print("Returned:", type(df), type(file_path), "→", file_path)

    return df, file_path

In [17]:
# Options offered by the two dropdowns.
model_choices = ["GPT-4", "Claude", "LLaMA-3.1-8B-Instruct"]
category_choices = [
    "phishing", "romance", "delivery_scam", "crypto", "authority",
    "tech_support", "health_product", "retail", "toll_violation", "ham"
]

# Input widgets, in the same order as gradio_wrapper's parameters.
input_components = [
    gr.Dropdown(model_choices, label="LLM Model"),
    gr.Dropdown(category_choices, label="Message Category"),
    gr.Slider(0, 100, value=80, step=5, label="Spam Percentage"),
    gr.Slider(1, 20, value=10, step=1, label="Number of Messages"),
    gr.Checkbox(label="Add Noise (typos, grammar issues)", value=True),
]

# Output widgets, matching the (DataFrame, file path) pair that gradio_wrapper returns.
output_components = [
    gr.Dataframe(label="Generated Messages"),
    gr.File(label="⬇️ Download CSV"),
]

demo = gr.Interface(
    fn=gradio_wrapper,
    inputs=input_components,
    outputs=output_components,
    title="📲 Synthetic SMS Generator",
    description="Generate spam/ham messages using GPT-4, Claude, or LLaMA. Includes toll violation scams!",
)

# share=True also exposes a public URL; debug=True keeps the cell attached to the running server.
demo.launch(share=True, debug=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://8cf64b49a1f550f163.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


🔧 Params → Model: GPT-4, Category: tech_support, Spam%: 80, Num: 10, Noise: True
Returned: <class 'pandas.core.frame.DataFrame'> <class 'str'> → /var/folders/kd/w1l5sb7s7_s1n7hx4jbk2bjc0000gn/T/tmp3kau8od6.csv
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://8cf64b49a1f550f163.gradio.live


