# Synthetic Vehicle Data Generator

## About
This Colab notebook uses large language models (LLMs) to generate synthetic vehicle test data.

## Package installs

In [1]:
# Installs
# Third-party packages used by this notebook; -q keeps pip's console output short.
!pip install -q gradio requests torch bitsandbytes transformers accelerate openai

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m78.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# imports
import re
import os
import sys
import gc
import io
import json
import gradio as gr
import requests
import subprocess
import google.generativeai as ggai
import torch
import tempfile
import shutil
from io import StringIO
import pandas as pd
from google.colab import userdata
from huggingface_hub import login
from openai import OpenAI
from pathlib import Path
from datetime import datetime
from IPython.display import Markdown, display, update_display
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig

### Domain-level Exception

In [3]:
class SyntheticDataGeneratorException(Exception):
  def __init__(self, message: str, cause: Exception):
    self._message = message
    self._cause = cause

  @property
  def message(self) -> str:
    return self._message

  @property
  def cause(self) -> Exception | None:
    return self._cause

## HuggingFace Setup

In [4]:
# Sign in to HuggingFace Hub

# The token is looked up under the key 'HF_TOKEN' via google.colab's userdata helper,
# so it never has to be written into the notebook itself.
hf_token = userdata.get('HF_TOKEN')
# Authenticate with the Hub; add_to_git_credential=True is passed through to huggingface_hub.login.
login(hf_token, add_to_git_credential=True)
print("Logged in to HuggingFace Hub")

Logged in to HuggingFace Hub


## OpenAI Setup

In [5]:
# Module-level OpenAI client used by get_chatgpt_response; the API key is looked up
# under 'OPENAI_API_KEY' via google.colab's userdata helper.
openai_client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))
print("Successfully configured OpenAI client")

Successfully configured OpenAI client


## Defining Prompts

In [6]:
# System prompt used as the "system" chat message for every generation request (see run_inference).
_SYSTEM_PROMPT = """
You are a synthetic vehicle dataset generator. Your role is to create a synthetic dataset which infers structured data schemas from business scenarios given by the user.
The business scenarios will always relate to vehicle maintenance and telemetry data. Every vehicle belongs to a fleet. A fleet has a desired goal to keep vehicle downtime
to a minimum.

Your task is to:
1. Understand the user's business problem(s) or use case(s).
2. Identify key fields needed to support the scenario(s).
3. Define appropriate field names, data types, and formats.
4. Generate synthetic records which match the inferred schema.

Guidelines:
- Use realistic field names and values.
- Chose sensible data types such as string, integer, double-precision, boolean,..
- Respect logical constraints such as age-range, date-range, email-format.

Before generating the data, display the inferred schema in JSON format.
"""
def get_system_prompt() -> str:
  """
  Returns the system prompt for the synthetic data generator.

  The text is the module-level ``_SYSTEM_PROMPT`` constant, returned unchanged.

  :return: The system prompt for the synthetic data generator.
  """
  return _SYSTEM_PROMPT

In [7]:
def get_user_prompt(business_problem, sample_size, file_format) -> str:
  """
  Returns a prompt for generating synthetic data.

  For every format except "markdown" the prompt also asks the model to put the
  dataset between a line of three less-than characters and a line of three
  greater-than characters, which are the delimiters the dataset extractor in
  this notebook searches for. The delimiters are described in words rather than
  written literally so that a model which echoes the prompt back does not
  produce a false delimiter match.

  :param business_problem: The business problem defined by the user.
  :param sample_size: The max number of samples in the dataset.
  :param file_format: The file format of the dataset.
  :return: A prompt for generating synthetic data.
  """
  # Numeric UI inputs may arrive as floats (e.g. 10.0); ask for "10 rows", not "10.0 rows".
  try:
    row_count = int(sample_size)
  except (TypeError, ValueError, OverflowError):
    # Not a usable number: fall back to the raw value, as the original prompt did.
    row_count = sample_size

  user_prompt = f"""The business scenario for which I want you to generate a dataset is defined as: {business_problem}

  Generate a synthetic dataset of {row_count} rows in {file_format} format.
  """
  # Markdown output is previewed as-is in the UI and never extracted, so it needs no delimiters.
  if file_format != "markdown":
    user_prompt += (
        "Output the dataset itself between two delimiter lines: the opening delimiter line consists of "
        "exactly three less-than characters and the closing delimiter line consists of exactly "
        "three greater-than characters. Put nothing except the dataset between the delimiters.\n"
    )
  return user_prompt

## Quantization

In [8]:
def get_quantization_config(quant_type: str = "nf4"):
  """
  Build the bitsandbytes settings used when loading a model in 4-bit mode.

  :param quant_type: Value passed as ``bnb_4bit_quant_type`` (defaults to "nf4").
  :return: A ``BitsAndBytesConfig`` with 4-bit loading, double quantization and
           bfloat16 as the compute dtype.
  """
  four_bit_options = {
      "load_in_4bit": True,
      "bnb_4bit_use_double_quant": True,
      "bnb_4bit_compute_dtype": torch.bfloat16,
      "bnb_4bit_quant_type": quant_type,
  }
  return BitsAndBytesConfig(**four_bit_options)

## HuggingFace Model Inference

In [16]:
def run_hfmodel_and_get_response(prompt, model_name, output_tokens):
  """
  Run a HF model on a given prompt.

  The model is loaded for this single call and released again afterwards, even
  when loading or generation fails.

  :param prompt: The chat messages (list of {"role", "content"} dicts) to run the model on.
  :param model_name: The name of the model to run.
  :param output_tokens: The maximum number of new tokens to generate.
  :return: The generated text: only the model's reply, without the echoed prompt
           or special tokens.
  """
  # Pre-bind the names so the cleanup in `finally` is safe wherever a failure occurs.
  model = inputs = outputs = None
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  try:
    tokenizer.pad_token = tokenizer.eos_token
    if "microsoft/bitnet-b1.58-2B-4T" in model_name:
      model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", trust_remote_code=True)
    elif "tiiuae/Falcon-E-3B-Instruct" in model_name:
      model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
    else:
      # Use quantization
      model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=get_quantization_config())

    # add_generation_prompt=True appends the assistant-turn header so the model answers
    # instead of continuing the user message; return_dict=True also yields the attention
    # mask, which matters because the pad token is set to the EOS token above.
    inputs = tokenizer.apply_chat_template(
        prompt,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    streamer = TextStreamer(tokenizer)
    # Numeric UI inputs may arrive as floats; generate() needs an integer token budget.
    outputs = model.generate(**inputs, max_new_tokens=int(output_tokens), streamer=streamer)
    # Decode only the newly generated tokens so the prompt is not echoed back to the caller.
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
  finally:
    # Drop every reference to model and tensors before asking torch to release cached GPU memory.
    del model, inputs, outputs, tokenizer
    gc.collect()
    torch.cuda.empty_cache()
  return response

## OpenAI Model Interface

In [17]:
# ChatGPT response
def get_chatgpt_response(prompt, model_name, output_tokens):
  """
  Send the chat messages to an OpenAI chat model and return the reply text.

  :param prompt: The chat messages (list of {"role", "content"} dicts) to send.
  :param model_name: The name of the OpenAI model to use.
  :param output_tokens: The maximum number of tokens to generate; None leaves the limit unset.
  :return: The text of the first choice, or an empty string when the reply has no text.
  """
  # Numeric UI inputs may arrive as floats (e.g. 1000.0); send the limit as an integer.
  max_tokens = int(output_tokens) if output_tokens is not None else None
  response = openai_client.chat.completions.create(
      model=model_name,
      messages=prompt,
      max_tokens=max_tokens,
  )
  # message.content may be None; normalise to "" so callers always receive a string.
  return response.choices[0].message.content or ""

## Gradio UI

In [18]:
# Model catalogue offered in the UI, grouped by provider.
OPENAI_MODEL_NAMES = ["gpt-4o-mini", "gpt-4o", "gpt-3.5-turbo"]
HUGGINGFACE_MODELS = [
    "meta-llama/Llama-3.2-3B-Instruct",
    "microsoft/bitnet-b1.58-2B-4T",
    "ByteDance-Seed/Seed-Coder-8B-Instruct",
    "tiiuae/Falcon-E-3B-Instruct",
    "Qwen/Qwen2.5-7B-Instruct",
]

# Maps each model type shown in the "Model Type" dropdown to its selectable model names.
MODEL_NAMES = {"GPT": OPENAI_MODEL_NAMES, "HuggingFace": HUGGINGFACE_MODELS}

# The model types are the catalogue keys, in insertion order.
MODEL_TYPES = list(MODEL_NAMES)

In [19]:
with gr.Blocks() as generator_ui:
    gr.Markdown("# Goodyear Business Scenario → Synthetic Dataset Generator")

    with gr.Row():
      # Left column: what to generate (size, format, scenario) plus the action buttons.
      with gr.Column(scale=3):
        with gr.Row():
          # precision=0 makes the component deliver an int (row counts must be whole numbers).
          dataset_size=gr.Number(value=10, precision=0, minimum=1, label="Enter the number of data samples to generate.", show_label=True)
          # Explicit default so the selected format never depends on the installed Gradio version.
          format=gr.Dropdown(["json", "csv", "txt", "markdown"], value="json", label="Select the format for the dataset", show_label=True)
        with gr.Row():
          scenario=gr.Textbox(label="Business Scenario", lines=5, placeholder="Describe your business scenario here")
        with gr.Row():
          # Hidden until run_inference reports a validation error.
          error = gr.Markdown(visible=False)
        with gr.Row():
          clear = gr.Button("Clear Everything")
          submit = gr.Button("Generate Dataset", variant="primary")

      # Right column: which model to use and how long its answer may be.
      with gr.Column(scale=1):
          # The default model type is set explicitly: the model-name choices below are looked
          # up from it while the UI is built, so it must never be None.
          model_type = gr.Dropdown(MODEL_TYPES, value=MODEL_TYPES[0], label="Model Type", show_label=True, info="Select the model type you want to use")
          model_name = gr.Dropdown(MODEL_NAMES[MODEL_TYPES[0]], value=MODEL_NAMES[MODEL_TYPES[0]][0], label="Model Name", show_label=True, allow_custom_value=True, info="Select the model name or enter one manually")
          # precision=0: token limits are integers for both the OpenAI and the HuggingFace backend.
          output_tokens= gr.Number(value=1000, precision=0, minimum=1, label="Enter the max number of output tokens to generate.", show_label=True, info="This will impact the length of the response containg the dataset")

    with gr.Row():
      # Chatbot Interface
        chatbot = gr.Chatbot(
            type='messages',
            label='Chatbot',
            show_label=True,
            height=300,
            resizable=True,
            elem_id="chatbot",
            avatar_images=("🧑", "🤖",)
        )
    # Extraction controls; hidden until a response has been shown (see show_extract_btn).
    with gr.Row(variant="compact"):
      extract_btn = gr.Button("Extract and Save Dataset", variant="huggingface", visible=False)
      file_name = gr.Textbox(label="Enter file name here (without file extension)", placeholder="e.g. cancer_synthetic, warehouse_synthetic (no digits)", visible=False)
    with gr.Row():
      markdown_preview = gr.Markdown(visible = False)
      dataset_preview = gr.Textbox(label="Dataset Preview",visible=False)
    with gr.Row():
      # Shows the status message returned by save_dataset.
      file_saved = gr.Textbox(visible=False)

    def run_inference(scenario, model_type, model_name, output_tokens, dataset_size, format):
      """
      Validate the UI inputs, run the selected model and build the chat history.

      :param scenario: The business scenario typed by the user.
      :param model_type: "GPT" selects the OpenAI backend; anything else the HuggingFace backend.
      :param model_name: The name of the model to run.
      :param output_tokens: The max number of tokens to generate.
      :param dataset_size: The number of rows to ask for.
      :param format: The requested dataset format.
      :return: A pair (update for the error Markdown, chat history). On a validation
               error the history is empty and the error message is made visible.
      """
      def fail(message):
        # Show the message in the error component and leave the chatbot empty.
        return gr.update(value=message,visible=True), []

      model_type=(model_type or "").lower()
      print(f"scenario: {scenario}")
      print(f"model_type: {model_type}")
      print(f"model_name: {model_name}")
      if not scenario or not scenario.strip():
        return fail("**Error:** Please define a scenario first!")
      if not model_name:
        return fail("**Error:** Please select a model first!")

      # Numeric UI inputs may arrive as floats (or None when the field is cleared);
      # both backends need whole numbers.
      try:
        output_tokens = int(output_tokens)
        dataset_size = int(dataset_size)
      except (TypeError, ValueError, OverflowError):
        return fail("**Error:** The dataset size and the number of output tokens must be whole numbers!")
      if output_tokens < 1 or dataset_size < 1:
        return fail("**Error:** The dataset size and the number of output tokens must be at least 1!")

      user_prompt = get_user_prompt(scenario, dataset_size, format)
      prompt =  [
          {"role": "system", "content": get_system_prompt()},
          {"role": "user", "content": user_prompt},
      ]

      if model_type == "gpt":
        response = get_chatgpt_response(prompt=prompt, model_name=model_name, output_tokens=output_tokens)
      else:
        # The HuggingFace runner releases the model and GPU cache itself before returning.
        response = run_hfmodel_and_get_response(prompt=prompt, model_name=model_name, output_tokens=output_tokens)
      # Only the raw scenario (not the full prompt) is shown as the user's turn.
      history = [
          {"role": "user", "content": scenario},
          {"role": "assistant", "content": response}
      ]
      return gr.update(visible=False), history

    def extract_dataset_string(response):
      """Extract the dataset text from an LLM response.

      Strategies are tried in order, and within each one the last match wins:
      1. a block between two ``<<<>>>`` markers,
      2. a block between ``<<<`` and ``>>>``,
      3. everything after the last ``<<<`` up to the first blank line or trailing note,
      4. the last non-empty Markdown fenced code block (for replies without delimiters).

      Returns the text "Could not extract dataset! Try again with a different model."
      when nothing matches; callers compare against that exact message.
      """
      not_found = "Could not extract dataset! Try again with a different model."
      if not response:
        return not_found

      # Remove known artificial tokens (common in HuggingFace or Claude)
      response = re.sub(r"<\[.*?\]>", "", response)

      # Remove system or prompt echo if repeated before dataset
      response = re.sub(r"(?is)^.*?<<<", "<<<", response.strip(), count=1)

      # 1. Match strict <<<>>>...<<<>>> tag blocks (use last match)
      matches = re.findall(r"<<<>>>[\s\r\n]*(.*?)[\s\r\n]*<<<>>>", response, re.DOTALL)
      if matches:
        return matches[-1].strip()

      # 2. Match loose <<< ... >>> format
      matches = re.findall(r"<<<[\s\r\n]*(.*?)[\s\r\n]*>>>", response, re.DOTALL)
      if matches:
        return matches[-1].strip()

      # 3. Match final fallback: take everything after last <<< as raw data
      last_open = response.rfind("<<<")
      if last_open != -1:
        raw = response[last_open + 3 :].strip()
        # Optionally cut off noisy trailing notes, explanations, etc.
        raw = re.split(r"\n\s*\n|Explanation:|Note:|---", raw)[0]
        return raw.strip()

      # 4. No delimiters at all: use the last non-empty ``` fenced block. The schema is
      #    requested before the data, so the dataset is expected to be the final block.
      fenced = [
          block.strip()
          for block in re.findall(r"```[^\n`]*\n(.*?)```", response, re.DOTALL)
          if block.strip()
      ]
      if fenced:
        return fenced[-1]

      return not_found

    def extract_dataset_from_response(chatbot_history, file_name, file_type):
      """
      Extract the dataset from the latest chat message, save it, and update the UI.

      :param chatbot_history: The chatbot messages; the last entry is treated as the LLM response.
      :param file_name: Target file name without extension.
      :param file_type: The dataset format, used for parsing and as the file extension.
      :return: A pair (update for the status textbox, update for the dataset preview).
      """
      # An empty or missing history has no response to extract from.
      response = chatbot_history[-1]["content"] if chatbot_history else None
      if not response:
        return gr.update(visible=True, value="Could not find LLM Response! Try again."), gr.update(visible=False)

      dataset = extract_dataset_string(response)
      # extract_dataset_string signals failure by returning this exact message.
      if dataset == "Could not extract dataset! Try again with a different model.":
        return gr.update(visible=True, value=dataset), gr.update(visible=False)
      text = save_dataset(dataset, file_type, file_name)
      return gr.update(visible=True, value=text), gr.update(visible=True, value=dataset)

    def save_dataset(dataset, file_format, file_name):
      """
      Save dataset to a file based on the selected format.

      :param dataset: The dataset text to save.
      :param file_format: "json", "csv" or "txt"; also used as the file extension.
      :param file_name: Target file name (or path) without extension.
      :return: A status message for the UI. Failures, including an unsupported
               format or a missing file name, are reported as a message, never raised.
      """
      failure = "Could not save dataset! Try again in another format."
      if not file_name or not file_name.strip():
        return "Could not save dataset! Please enter a file name first."

      # NOTE(review): file_name comes straight from a UI textbox and may contain path
      # separators; restrict it if the app is exposed to untrusted users.
      target = f"{file_name.strip()}.{file_format}"
      print(target)
      try:
        if file_format == "json":
          # Parse first so that invalid JSON never leaves an empty file behind.
          data = json.loads(dataset)
          with open(target, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4)
        elif file_format == "csv":
          # Round-trip through pandas to validate and normalise the CSV text.
          pd.read_csv(StringIO(dataset)).to_csv(target, index=False)
        elif file_format == "txt":
          with open(target, "w", encoding="utf-8") as f:
            f.write(dataset)
        else:
          # Unsupported format (e.g. "markdown"): report it instead of returning None.
          return failure
      except Exception:
        # Parsing or I/O problems are shown to the user as a status message.
        return failure
      return "Dataset saved successfully!"

    def clear_chat():
      """Reset the scenario text and chat history and hide the preview and status boxes."""
      emptied_scenario = ""
      emptied_history = []
      hidden_preview = gr.update(visible=False)
      hidden_status = gr.update(visible=False)
      return emptied_scenario, emptied_history, hidden_preview, hidden_status

    def show_extract_btn(chatbot_history, format):
      """
      Decide which post-generation controls are visible after the chatbot changes.

      :param chatbot_history: The current chatbot messages.
      :param format: The selected dataset format.
      :return: Updates for (markdown preview, extract button, file-name textbox):
               all hidden without a response; for "markdown" only the preview is shown,
               filled with the latest message; otherwise only the extract controls are shown.
      """
      if not chatbot_history:
        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
      if format == "markdown":
        # Use the latest message (the LLM response) instead of a fixed position in the history.
        return gr.update(visible=True, value=chatbot_history[-1]["content"]), gr.update(visible=False), gr.update(visible=False)
      return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)

    # "Extract and Save Dataset": parse the latest response, write the file, show status and preview.
    extract_btn.click(
        fn=extract_dataset_from_response,
        inputs=[chatbot, file_name, format],
        outputs=[file_saved, dataset_preview]
    )

    # Whenever the chat content changes, decide which follow-up controls are visible.
    chatbot.change(
        fn=show_extract_btn,
        inputs=[chatbot, format],
        outputs=[markdown_preview, extract_btn, file_name]
    )

    # Switching the model type swaps the model-name choices and selects the first entry.
    model_type.change(
        fn=lambda x: gr.update(choices=MODEL_NAMES[x], value=MODEL_NAMES[x][0]),
        inputs=[model_type],
        outputs=[model_name]
    )

    # "Generate Dataset": run the selected model; the result fills the error box and the chatbot.
    submit.click(
        fn=run_inference,
        inputs=[scenario, model_type, model_name, output_tokens, dataset_size, format],
        outputs=[error, chatbot],
        show_progress=True
    )

    # "Clear Everything": reset the scenario and chat, and hide the preview and status boxes.
    clear.click(
        clear_chat,
        outputs=[scenario, chatbot, dataset_preview, file_saved]
    )

## Launch UI

In [None]:
# Start the Gradio app. With share=True a public gradio.live URL is created and with
# debug=True the cell keeps running so errors and logs stay visible (see the cell output).
generator_ui.launch(share=True, debug=True, inbrowser=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://fd21213e498169c984.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


scenario: create a dataset for a fleet with 5 vehicles
model_type: gpt
model_name: gpt-4o-mini
