# Build a Model to generate Synthetic Data

Code was written in Google Colab. 

## Imports

In [8]:
!pip install -q gradio transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [9]:
import os
import requests
import json

from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch

import gradio as gr

In [10]:
!pip install -U bitsandbytes


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Open Source Models from HF

In [31]:
deepseek_model = 'deepseek-ai/deepseek-llm-7b-chat'
llama_model = 'meta-llama/Meta-Llama-3.1-8B-Instruct'
qwen2 = 'Qwen/Qwen-2.5-1.5B-instruct'

## Use Ori inference Endpoints

In [37]:
API_TOKEN="eyJhbGciOiJSUzUxMiIsImtpZCI6InNpZy0xNzMzMTMwNTg0IiwidHlwIjoiSldUIn0.eyJpYXQiOjE3NDcxMzUyNzEsImlzcyI6Im9nYy5vcmkuY28iLCJzdWIiOiJmMTQ5MzhlMi1mY2UyLTRjMzktYjAzYy1mN2QzNTBjYThjYjkifQ.R822kwa_GT_HwBzpKLC653HVy69RVZmL1PRLOLir7RD1zL3IqTKOU8bYAMT1OVCdUq6GajnoVBVw1E9sABjX1cEZaC_SPoJIPHtMyBd-Nvw4qleHkT_36dKGRSAzlT8OD1W8FqX9m9qlq1rMUChT2m8_Aq7XWvZpsKh0aRmyAvd9H55ieIerTtXEwxNlH1a8Lq8V78jR7E3b0AWh5Icmabr04o5GmjjvnBH0U_PaYqaD3LVzpoaEyy89EHy-wqHQroZzWyt4JFm2hDhWNByhyDB18FEbIl24bvLPyGT3tE-stJGJvGlPs5ynyfKwIZmxumymMnh1hFI-jN8ePZ9oLQ"
os.environ["HF_ENDPOINT"] = "https://llmlearning.inference.ogc.ori.co/openai/v1/"

In [39]:
hf_token = userdata.get('HF_ENDPOINT')
login(hf_token, add_to_git_credential=True)

NameError: name 'userdata' is not defined

## Creating Prompts

In [16]:
system_prompt = "You are an expert in generating synthetic datasets. Your goal is to generate realistic datasets \
based on a given business and its requirements from the user. You will also be given the desired datset format."
system_prompt += "Do not repeat the instructions."

user_prompt = ("Please provide me a dataset for the following business."
"For example:\n"
"The Business: A retail store selling luxury watches.\n"
"The Data Format: CSV.\n"
"Output:\n"
"Item,Price,Quantity,Brand,Sale Date\n"
"Superocean II, 20.000$, 3, Breitling, 2025-04-08 \n"
"If I don't provide you the necessary columns, please create the columns based on your knowledge about the given business")

In [17]:
def dataset_format(data_format, num_records):
    format_message = ''
    if data_format == 'CSV':
        format_message = 'Please provide the dataset in a CSV format.'
    elif data_format == 'JSON':
        format_message =  'Please provide the dataset in a JSON format'
    elif data_format == 'Tabular':
        format_message =  'Please provide the dataset in a Tabular format'

    return format_message + f'Please generate {num_records} records'

In [18]:
def complete_user_prompt(user_input, data_format, num_records):
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_input + user_prompt + dataset_format(data_format, num_records)}
    ]

    return messages

## Accessing the Models

In [19]:
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU-Device:", torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print("No GPU found.")

CUDA available: False
No GPU found.


In [20]:
quant_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = False,
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_quant_type= 'nf4'
)

In [28]:
def generate_model(model_id, messages):
    try:
      tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code = True, token = API_TOKEN)
      inputs = tokenizer.apply_chat_template(messages, return_tensors = 'pt').to('cuda')
      streamer = TextStreamer(tokenizer)
      model = AutoModelForCausalLM.from_pretrained(model_id, token = API_TOKEN, device_map = 'auto', quantization_config = quant_config)
      outputs = model.generate(inputs, max_new_tokens = 2000, streamer = streamer)
      generated_text = tokenizer.decode(outputs[0], skip_special_tokens = True)
      del tokenizer, streamer, model, inputs, outputs
      return generated_text

    except Exception as e:
      return f'Error during generation: {str(e)}'

## Generate Dataset

In [34]:
def generate_dataset(user_input, target_format, model_choice, num_records):
    if model_choice == 'DeepSeek':
        model_id = deepseek_model
    elif model_choice == 'Llama-3.1-8B':
        model_id = llama_model
    elif model_choice == 'Qwen2':
        model_id = '8c71c184-7040-4cb6-87ee-fd46ec8b4054'

    messages = complete_user_prompt(user_input, target_format, num_records)
    return generate_model(model_id, messages)

## Creating Gradio UI

In [35]:
with gr.Blocks(title = 'Synthetic Data Generator') as ui:
    gr.Markdown('# Synthetic Data Generator')

    with gr.Row():
        with gr.Column(min_width=600):
            user_inputs = gr.Textbox(label = 'Enter your Business details and data requirements',
                                     placeholder = 'Type here...', lines = 15)

            model_choice = gr.Dropdown(
                ['DeepSeek', 'Llama-3.1-8B', 'Qwen2'],
                label = 'Choose your Model',
                value = 'DeepSeek'
            )

            target_format = gr.Dropdown(
                ['CSV', 'JSON', 'Tabular'],
                label = 'Choose your Format',
                value = 'CSV'
            )
            num_records = gr.Dropdown(
                [50, 100, 150, 200],
                label = 'Number of Records',
                value = 50
            )

            generate_button = gr.Button('Generate')

        with gr.Column():
            output = gr.Textbox(label = 'Generated Synthetic Data',
                               lines = 30)

    generate_button.click(fn = generate_dataset, inputs = [user_inputs, target_format, model_choice, num_records],
                          outputs = output
                         )

In [36]:
ui.launch()

Running on local URL:  http://127.0.0.1:7875

To create a public link, set `share=True` in `launch()`.


