<a href="https://colab.research.google.com/github/aimanalishezan/Synthetic-Data-Generator-Ai/blob/main/Synthetic_Data_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Build a Model to Generate Synthetic Data

In [68]:
!pip install -q gradio

In [69]:
import os
import requests
import json
import torch
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, BitsAndBytesConfig
import gradio as gr

In [81]:
!pip install -U bitsandbytes



In [71]:
hf_token= userdata.get('HF_TOKEN')

!git config --global credential.helper store

login(hf_token,add_to_git_credential=True)

In [72]:
# Model identifier, read from the Colab secret named 'model'.
# generate_dataset forwards this value as the model_id to load.
model = userdata.get('model')

In [73]:
# System prompt: sets the assistant's role for every request.
# Built from adjacent literals so each sentence is separated by a space and
# no stray backslash ends up in the text sent to the model.
system_prompt = (
    "You are an expert in generating synthetic datasets. "
    "Your goal is to generate realistic datasets based on a given business "
    "and its requirements from the user. "
    "You will also be given the desired dataset format. "
    "Do not repeat the instructions."
)

# Generic user instructions with a worked example.
# The example row has exactly as many fields as the header (the price carries
# no thousands separator, which would otherwise split it into two CSV fields).
user_prompt = (
    "Please provide me a dataset for the following business.\n"
    "For example:\n"
    "The Business: A retail store selling luxury watches.\n"
    "The Data Format: CSV.\n"
    "Output:\n"
    "Item,Price,Quantity,Brand,Sale Date\n"
    "Superocean IT,80000$,5,Breitling,2025-04-10\n"
    "If I don't provide you the necessary columns, please create the columns "
    "based on your knowledge about the given business."
)

In [74]:
def dataset_format(data_format, num_records):
  """Build the prompt sentence(s) describing the output format and size.

  Args:
    data_format: 'CSV', 'JSON' or 'Tabular'. Any other value produces no
      format sentence, only the record-count request.
    num_records: Number of records to ask the model for.

  Returns:
    A string such as
    'Please provide the dataset in a CSV format. Please generate 50 records.'
  """
  # Maps the UI choice to the wording used inside the sentence.
  format_names = {'CSV': 'CSV', 'JSON': 'JSON', 'Tabular': 'tabular'}
  count_message = f'Please generate {num_records} records.'
  if data_format not in format_names:
    return count_message
  # A single space separates the two sentences so they do not run together.
  return f'Please provide the dataset in a {format_names[data_format]} format. {count_message}'

In [75]:
def complete_user_prompt(user_input, data_format, num_records):
  """Assemble the chat messages sent to the model.

  Args:
    user_input: Free-text business description entered by the user.
    data_format: Target format name, forwarded to dataset_format.
    num_records: Number of records to request, forwarded to dataset_format.

  Returns:
    A list of three role/content dicts: the system prompt, the user's
    business description, and the generic instructions followed by the
    format/size request.
  """
  # Join with an explicit newline: plain concatenation glued the last word of
  # user_prompt directly onto the first word of the format request.
  instructions = f"{user_prompt.rstrip()}\n{dataset_format(data_format, num_records)}"
  return [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": user_input},
      {"role": "user", "content": instructions},
  ]

In [76]:
# Report whether a CUDA device is visible and, if so, the name of the current one.
print("Cuda Available", torch.cuda.is_available())
if not torch.cuda.is_available():
  print("NO Gpu Found")
else:
  print(
      "GPU_DEVICE",
      torch.cuda.get_device_name(torch.cuda.current_device()),
  )

Cuda Available True
GPU_DEVICE Tesla T4


In [77]:
# Quantization settings passed to AutoModelForCausalLM.from_pretrained in
# generate_model: 4-bit loading, "nf4" quant type, bfloat16 compute dtype,
# double quantization disabled.
# NOTE(review): the recorded run used a Tesla T4; confirm bfloat16 is the
# intended compute dtype on that GPU (float16 may be the better fit) - TODO verify.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

In [78]:
def generate_model(model_id, messages):
  """Load a quantized causal LM and generate a reply to `messages`.

  The tokenizer and model are loaded on every call and released again before
  returning, so GPU memory is not held between requests.

  Args:
    model_id: Model identifier passed to the from_pretrained loaders.
    messages: Chat messages (list of role/content dicts) rendered through the
      tokenizer's chat template.

  Returns:
    The newly generated text (prompt tokens excluded), or a string starting
    with 'Error during generation:' if any exception is raised on the way.
  """
  import gc  # local import: only needed for the cleanup in `finally`

  # Pre-bind so the cleanup below is valid even if loading fails early.
  tokenizer = llm = inputs = outputs = streamer = None
  try:
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    llm = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        device_map="auto",
    )
    # add_generation_prompt=True appends the assistant header so the model
    # answers instead of continuing the user's turn. return_dict=True also
    # yields the attention mask, which is forwarded to generate().
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(llm.device)  # follow the model's device instead of hard-coding "cuda"
    streamer = TextStreamer(tokenizer)
    outputs = llm.generate(**inputs, max_new_tokens=2000, streamer=streamer)
    # generate() returns prompt + completion; keep only the new tokens so the
    # instructions are not echoed back into the result.
    prompt_length = inputs["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
  except Exception as e:
    # Errors are reported as text so the UI can display them.
    return f'Error during generation:{str(e)}'
  finally:
    # Drop every reference, then collect and empty the CUDA cache; this runs
    # on both the success and the error path.
    tokenizer = llm = inputs = outputs = streamer = None
    gc.collect()
    if torch.cuda.is_available():
      torch.cuda.empty_cache()

In [79]:
def generate_dataset(user_input, target_format, model_choices, num_records):
  """Gradio callback: build the prompt and run generation for the chosen model.

  Args:
    user_input: Business description typed by the user.
    target_format: Output format name ('CSV', 'JSON' or 'Tabular').
    model_choices: Model label selected in the UI dropdown.
    num_records: Number of records to request.

  Returns:
    The text returned by generate_model, or an 'Error: ...' string when the
    selected model label is not supported.
  """
  # Previously an unrecognised label left model_id unbound and raised
  # UnboundLocalError; report it as text, like generate_model does for errors.
  if model_choices != "Llama_3.2-8B":
    return f'Error: unsupported model choice: {model_choices}'
  # `model` is the module-level model identifier loaded from the Colab secrets.
  model_id = model
  messages = complete_user_prompt(user_input, target_format, num_records)
  return generate_model(model_id, messages)

In [80]:
# Gradio UI: input controls in the first column, generated dataset in the second.
with gr.Blocks(title="Synthetic Data Generator") as ui:
  gr.Markdown("## Synthetic Data Generator")
  with gr.Row():
    with gr.Column(min_width=600):
      user_inputs=gr.Textbox(
          label="Enter your business details and data requirements",
          placeholder="Enter here",
          lines=15
      )

      model_choices=gr.Dropdown(
          ['Llama_3.2-8B'],
          label="Select Model",
          value="Llama_3.2-8B"
      )
      target_format=gr.Dropdown(
          ['CSV','JSON','Tabular'],
          label="Select Target Format",
          value="CSV"
      )
      num_records=gr.Dropdown(
          [50,100,200,500,1000],
          label="Select Number of Records",
          value=50
      )
      generate_button=gr.Button('Generate')
    with gr.Column():
      output=gr.Textbox(label='Generated Synthetic Data',lines=30)
  # Input order must match generate_dataset's parameters:
  # (user_input, target_format, model_choices, num_records).
  generate_button.click(
      fn=generate_dataset,
      inputs=[user_inputs,target_format,model_choices,num_records],
      outputs=output
  )
# share=True also exposes the app through a temporary public URL.
ui.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8450c4f9938c229d26.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


